Piki033 · 2023-03-01 20:22 · Acceptance rate: 100%
Views: 39
Question closed

Unable to export data from a Python web scraper

I'm writing a web scraper in Python. However I tweak the code, it keeps failing with the same error. Could someone take a look and tell me what's going wrong?

import urllib.parse
import random
import requests
from lxml import etree
import re
import json
import time
import xlwt


class QianChengWuYouSpider(object):
    # Initialization
    def __init__(self, city_id, job_type, pages):
        # URL template
        self.url = 'https://search.51job.com/list/{},000000,0000,00,9,99,{},2,{}.html'
        # User-Agent pool
        self.UApool = [
            "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:68.0) Gecko/20100101 Firefox/68.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:75.0) Gecko/20100101 Firefox/75.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:83.0) Gecko/20100101 Firefox/83.0',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0; Touch; MASMJS)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Hot Lingo 2.0)',
            "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
            "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
            "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
        ]

        # Request headers
        self.headers = {
            'User-Agent': random.choice(self.UApool),
            'referer':'https://blog.csdn.net/EricNTH/article/details/104840887',
            # Note: substitute your own Cookie here
            'Cookie': 'guid=5fe585588fded74cf3a82a228c6d9a05; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; ps=needv%3D0; 51job=cuid%3D222881165%26%7C%26cusername%3Dr1iIYNbsPNRfcePxJ5NnZgEj5wBOs3Lekgry9sYJYfs%253D%26%7C%26cpassword%3D%26%7C%26cname%3D%26%7C%26cemail%3D%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0wsHycnvu2wI%26%7C%26cconfirmkey%3D%25241%2524G2r4TO2.%25240IZIc4jYqAIuNUJTameNb0%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D%26%7C%26cnamekey%3D%25241%2524fbxj8Rqp%2524c40fEtLHks8SAV1.ooCtW%252F%26%7C%26to%3Ddfb0153b79106f4855f2546b250424b463fd81e5%26%7C%26; sensor=createDate%3D2023-02-28%26%7C%26identityType%3D1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22222881165%22%2C%22first_id%22%3A%2218696373a9a28f-0fa9aef36da24f-74525470-1395396-18696373a9b1537%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fcn.bing.com%2F%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg2OTYzNzNhOWEyOGYtMGZhOWFlZjM2ZGEyNGYtNzQ1MjU0NzAtMTM5NTM5Ni0xODY5NjM3M2E5YjE1MzciLCIkaWRlbnRpdHlfbG9naW5faWQiOiIyMjI4ODExNjUifQ%3D%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22222881165%22%7D%2C%22%24device_id%22%3A%2218696373a9a28f-0fa9aef36da24f-74525470-1395396-18696373a9b1537%22%7D; ssxmod_itna=YuD=0KBIqfgGCzDX3G7maa+x0xDqH2Tapve0QIaDla2YxA5D8D6DQeGTbnPsbqzK0DW+aqNhrWhaNt6j+RaCKC8p5L7mDB3DEx06Tq0Ci4GG0xBYDQxAYDGDDpRD84DrD72=ZSUxYPG0DGQD3qGyl4tDA8tDb2=nDiUVDDtOB4G2D7tyfwdY5lbDAMmSY2=DjdTD/+xaZ06oH6aNRLtboh2aiL04xBQD7kiyDYoXUeDH+kNKVOoqm0mxBi4K8gm3BBh4lBmq3DPPfBxttYjXNohxRYh4mIjdWx8DG8GoWrD=; ssxmod_itna2=YuD=0KBIqfgGCzDX3G7maa+x0xDqH2Tapve0QzD6EK40HaRo03PvquXvCnD6eTwm57vlcOFQHyIZL0jeUj2j45maC205xmidt64Rq0C9dA7sGESSyQNuKC8=UnqMhkU7MXqXI9CAp=TiDRi=lcuquCDHLrbtQSKkvWDWGEyDUmBNUpeaTgWb0=c+OBRDrIxwqjA1ExcfQ13sEeMGIfgF1I0vYaG3DQ9iDjKD+ghDD===; partner=sem_pcsogouqg_16633; privacy=1677640570; Hm_lvt_1370a11171bd6f2d9b1fe98951541941=1677557447,1677640568; Hm_lpvt_1370a11171bd6f2d9b1fe98951541941=1677640568; slife=lastlogindate%3D20230301%26%7C%26securetime%3DUGxTZlAxBWZVMQA6CjEPYQczVmI%253D',
        }

        # Request parameters
        self.params = {
            "lang": "c",
            "postchannel": "0000",
            "workyear": 99,
            "cotype": 99,
            "degreefrom": 99,
            "jobterm": 99,
            "companysize": 99,
            "ord_field": 0,
            "dibiaoid": 0,
            "line": '',
            "welfare": ''
        }

        # Output filename
        self.filename = "前程无忧网" + job_type + "职位信息.xls"

        # City code
        self.city_id = city_id

        # Job title (URL-encoded)
        self.job_type = urllib.parse.quote(job_type)

        # Number of pages
        self.pages = pages

        # Temporary storage for the scraped rows
        self.words = []

    # Fetch a page
    def parse(self, url):
        response = requests.get(url=url, headers=self.headers, params=self.params)

        # Set the response encoding to gbk
        response.encoding = 'gbk'

        # Return the page source
        return response.text

    # Extract data
    def get_job(self, page_text):
        # Locate the script tags with XPath
        tree = etree.HTML(page_text)
        job_label = tree.xpath('//script[@type="text/javascript"]')

        # Pull out the embedded JSON with a regular expression
        job_str = re.findall('"engine_jds":(.*"adid":""}]),', str(job_label))
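        # NOTE: re.findall returns a list of matched strings; it is empty if "engine_jds" is not in the page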

        # Convert to JSON
        data = json.loads(str(job_str).replace("'", "\""))

        # Field extraction
        for item in data:
            # Job title
            job_name = item['job_name']

            # Job link
            job_href = item['job_href']

            # Company name
            company_name = item['company_name']

            # Company link
            company_href = item['company_href']

            # Monthly salary range
            salary = item['providesalary_text']

            # Work location
            address = item['workarea_text']

            # Other attributes
            info_list = item['attribute_text']

            # Some entries are incomplete; skip them
            if len(info_list) < 3:
                continue

            # Experience requirement
            experience = info_list[1]

            # Education requirement
            education = info_list[2]

            # Posting date
            update_date = item['updatedate']

            # Company type
            company_type = item['companytype_text']

            # Benefits
            job_welf = item['jobwelf']

            # Company industry
            company_status = item['companyind_text']

            # Company size
            company_size = item['companysize_text']

            self.words.append({
                "职位名称": job_name,
                "公司名称": company_name,
                "月薪范围": salary,
                "工作地点": address,
                "经验要求": experience,
                "学历要求": education,
                "发布日期": update_date,
                "公司性质": company_type,
                "公司福利": job_welf,
                "公司行业": company_status,
                "公司规模": company_size,
                "职位链接": job_href,
                "公司链接": company_href,
            })

        print("该页爬取完成")

    # Save data
    def save(self, words, filename, sheet_name='sheet1'):
        try:
            # 1. Create a workbook
            work_book = xlwt.Workbook(encoding="utf-8")
            # 2. Create a sheet
            sheet = work_book.add_sheet(sheet_name)
            # 3. Write the header row
            head = []
            for k in words[0].keys():
                head.append(k)

            for i in range(len(head)):
                sheet.write(0, i, head[i])
            # 4. Write the data rows
            # Row index
            i = 1
            for item in words:
                for j in range(len(head)):
                    sheet.write(i, j, item[head[j]])
                # Move to the next row
                i += 1
            # Save the workbook
            work_book.save(filename)
            print('Data saved successfully')

        except Exception as e:
            print('Failed to save data:', e)

    # Main routine
    def run(self):
        for page in range(1, self.pages + 1):
            # Build the URL for this page
            url = self.url.format(self.city_id, self.job_type, page)

            # Fetch the page
            page_text = self.parse(url)

            # Extract the data
            self.get_job(page_text)

            # Throttle so we don't hit the site too fast
            time.sleep(random.randint(1, 2))

        self.save(words=self.words, filename=self.filename)


if __name__ == '__main__':
    # Instantiate the spider: nationwide job listings for "数据分析" (data analysis)
    # city_id: city code (see the table above)
    # job_type: job title (the more precise, the more relevant the results)
    # pages: number of pages to scrape (don't exceed the total page count)
    spider = QianChengWuYouSpider(city_id="000000", job_type="数据分析", pages=2)

    # Run the spider
    spider.run()


The error it reports is this:

[screenshot of the error message]


3 answers

  • sanbaofengs 2023-03-01 20:54

    Answering takes effort, so please accept if this helps!

    The error is raised on this line:

    [screenshot highlighting the failing line]

    It looks like the page content isn't even being fetched, so the formatting steps after it have nothing to work with: if the HTML doesn't contain the engine_jds payload, re.findall comes back empty and the json.loads conversion falls apart.

    Check whether your Cookie has expired.
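
    A quick way to confirm this is to check whether the raw response actually
    contains the "engine_jds" payload before any regex or json work. A minimal
    sketch (the check_page helper is my own, not part of your code; it only
    reuses the URL template and headers you already build in __init__):

    import requests

    # Fetch one search page and verify that the embedded "engine_jds"
    # JSON payload is present before any parsing is attempted.
    def check_page(url, headers):
        response = requests.get(url=url, headers=headers)
        response.encoding = 'gbk'
        if '"engine_jds"' not in response.text:
            # A stale Cookie typically makes 51job return a login or
            # verification page instead of the search results.
            print('engine_jds not found; refresh the Cookie in the headers')
            print(response.text[:300])  # peek at what actually came back
            return False
        return True

    For example, check_page(spider.url.format('000000', spider.job_type, 1),
    spider.headers) will print the start of whatever page came back if the
    Cookie is stale. It is also worth guarding get_job so an empty re.findall
    result raises a clear error instead of failing further down.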

    This answer was accepted by the asker as the best answer.

(2 more answers not shown)


Question timeline

  • Closed by the system on Mar 10
  • Answer accepted on Mar 2
  • Question edited on Mar 1
  • Question created on Mar 1
