m0_68280323 2022-11-13 11:40 · Acceptance rate: 100%
Views: 72
Question closed

What's going on here? I've been changing it for ages and it still isn't right.



import json
import re
import requests
from lxml import etree
import csv

class Spider(object):

    def __init__(self):
        self.keyword = input("Enter a search keyword: ")
        self.url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html'  # search page URL template
        self.headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
                    'cookie': '_uab_collina=162694245076236003474511; guid=2806f21dc07e57b92a6d38ae6ad831db; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C9%CC%CE%F1%D3%A2%D3%EF%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; acw_tc=2f624a4216269424503425150e0da85d25580876b2bc112b8d45305e878b60; acw_sc__v2=60f92bf226271b72b3adf6a65ef4bd4094c491a1; ssxmod_itna=QqmxnQi=qmuDB7DzOD2YLDkYOIhm0DDTaRweqxKDspbDSxGKidDqxBmnj2wAehYi+zDBIaoe=Du+QLzm2idILYIYfAa/UDB3DEx064btxYAkDt4DTD34DYDixibsxi5GRD0KDF8NVt/yDi3DbE=Di4D+8MQDmqG0DDU7S4G2D7U9R7Dr8q2U7nt3EkDeLA+9D0tdxBLeFpTYOcarBqrb=1K4KDDHD8H=9cDKY8GxHgQDzqODtqNMSMd3yPtVRk0s+SDolhGTK7P4fGAG7BhzlIx=b+hnv0Dbf0D+YbudUiL6xDf57B5QeD==; ssxmod_itna2=QqmxnQi=qmuDB7DzOD2YLDkYOIhm0DDTaRweqxikfoOqDlZ0DjbP6kjY6h6xlUxnzDCA6QPs7q0=PLYWwuza8UL+RLPoiBAFL=xnR6L1xqP8B2Fr8=xT67LRjkMy+62=MZEon/zHzWxTPGiqxdcqqQ0K=y=DbyjqrMCHWWfFrt3bIw2B150E=SnYWY2P7dZIi+=FTE3K80nq7eWfj=Fr=4SI7S+nLg/K0OlKXNZ9jaj96GEkTLSQQv+PbLflkyG3=Y1y1Z68NhO4Us7qqNQStVY90lDw7edKHeoFRuV/c=NnM2v3Z2TQS=dVZUTgKS63hiXTcqSp6K+NC6N8cYY96Mc=PKrujN3Er32PKmohY50Dr3O25tratE3UbbZb3S0K1XdPTkrrOFi8trK9To+iHUih/Uetr0rqrITbFW7/DaDG2z0qYDQKGmBRxtYa6S=AfhQifW9bD08DiQrYD==='
                    }  
        self.header = ['position', 'company', 'wages', 'place', 'education', 'work_experience', 'release_date',
                       'limit_people', 'address', 'company_type', 'company_size', 'industry']
        self.fp = open('{}.csv'.format(self.keyword), 'a', encoding='utf-8-sig', newline='')
        self.writer = csv.DictWriter(self.fp, self.header)
        self.writer.writeheader()

    def get_end_page(self):
        response = requests.get(self.url.format(self.keyword, str(1)), headers=self.headers)
        text = response.content.decode('gb18030')
        json_obj = re.findall('window.__SEARCH_RESULT__ = (.*?)</script>', text)
        print(json_obj)
        py_obj = json.loads(json_obj[0])
        end_page = py_obj['total_page']
        return end_page

    def get_url(self, count):
        response = requests.get(url=self.url.format(self.keyword, count), headers=self.headers)

        text = response.content.decode('gb18030')
        json_obj = re.findall('window.__SEARCH_RESULT__ = (.*?)</script>', text)
        py_obj = json.loads(json_obj[0])
        detail_urls = [i['job_href'] for i in py_obj['engine_search_result']]
        return detail_urls

    def parse_url(self, url):
        response = requests.get(url=url, headers=self.headers)
        try:  # decoding occasionally fails on a few pages with an unusual structure; those can simply be skipped
            text = response.content.decode('gb18030')
            html = etree.HTML(text)
            position = html.xpath("//div[@class='tHeader tHjob']//div[@class='cn']/h1/@title")[0]  # job title
        except Exception as e:
            print("Unusual page ({}); skipping it and moving on to the next detail URL".format(e))
            return
        company = "".join(html.xpath("//div[@class='tHeader tHjob']//div[@class='cn']/p[1]/a[1]//text()"))  # company name
        wages = "".join(html.xpath("//div[@class='tHeader tHjob']//div[@class='cn']/strong/text()"))  # salary
        informations = html.xpath("//div[@class='tHeader tHjob']//div[@class='cn']/p[2]/text()")  # location / experience / education block
        informations = [i.strip() for i in informations]
        place = informations[0]
        education = "".join([i for i in informations if i in '初中及以下高中/中技/中专大专本科硕士博士无学历要求'])  # education level; the literal lists the site's Chinese degree labels, kept verbatim for matching
        work_experience = "".join([i for i in informations if '经验' in i])  # work experience ('经验')
        release_date = "".join([i for i in informations if '发布' in i])  # posting date ('发布')
        limit_people = "".join([i for i in informations if '招' in i])  # headcount ('招')
        address = "".join(html.xpath("//div[@class='tCompany_main']/div[2]/div[@class='bmsg inbox']/p/text()"))  # work address
        company_type = "".join(html.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[1]/@title"))  # company type
        company_size = "".join(html.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[2]/@title"))  # company size
        industry = "".join(html.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[3]/@title"))  # industry

        item = {'position': position, 'company': company, 'wages': wages, 'place': place, 'education': education,
                'work_experience': work_experience, 'release_date': release_date, 'limit_people': limit_people,
                'address': address, 'company_type': company_type, 'company_size': company_size,
                'industry': industry}
        print(item)
        self.writer.writerow(item)

if __name__ == '__main__':
    print("Crawler starting")
    spider = Spider()
    end_page = spider.get_end_page()  # total number of result pages for this keyword
    print("Total pages: {}".format(end_page))
    page = input("Enter the number of pages to scrape: ")
    for count in range(1, int(page) + 1):
        detail_urls = spider.get_url(count)
        for detail_url in detail_urls:
            spider.parse_url(detail_url)
        print("Finished page {}".format(count))

    spider.fp.close()
    print("Crawl finished")

4 answers

  • 游一游走一走 2022-11-13 11:57
    1. Set every decode to utf-8; web pages these days are generally served as utf-8.
    2. Also, this crawler probably still won't run as-is: the site has anti-scraping measures. (See the sketches after the corrected code below.)
    # coding=utf-8
    import json
    import re
    import requests
    from lxml import etree
    import csv
    
    
    class Spider(object):
        def __init__(self):
            self.keyword = input("Enter a search keyword: ")
            self.url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html'  # search page URL template
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
                'cookie': '_uab_collina=162694245076236003474511; guid=2806f21dc07e57b92a6d38ae6ad831db; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C9%CC%CE%F1%D3%A2%D3%EF%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; acw_tc=2f624a4216269424503425150e0da85d25580876b2bc112b8d45305e878b60; acw_sc__v2=60f92bf226271b72b3adf6a65ef4bd4094c491a1; ssxmod_itna=QqmxnQi=qmuDB7DzOD2YLDkYOIhm0DDTaRweqxKDspbDSxGKidDqxBmnj2wAehYi+zDBIaoe=Du+QLzm2idILYIYfAa/UDB3DEx064btxYAkDt4DTD34DYDixibsxi5GRD0KDF8NVt/yDi3DbE=Di4D+8MQDmqG0DDU7S4G2D7U9R7Dr8q2U7nt3EkDeLA+9D0tdxBLeFpTYOcarBqrb=1K4KDDHD8H=9cDKY8GxHgQDzqODtqNMSMd3yPtVRk0s+SDolhGTK7P4fGAG7BhzlIx=b+hnv0Dbf0D+YbudUiL6xDf57B5QeD==; ssxmod_itna2=QqmxnQi=qmuDB7DzOD2YLDkYOIhm0DDTaRweqxikfoOqDlZ0DjbP6kjY6h6xlUxnzDCA6QPs7q0=PLYWwuza8UL+RLPoiBAFL=xnR6L1xqP8B2Fr8=xT67LRjkMy+62=MZEon/zHzWxTPGiqxdcqqQ0K=y=DbyjqrMCHWWfFrt3bIw2B150E=SnYWY2P7dZIi+=FTE3K80nq7eWfj=Fr=4SI7S+nLg/K0OlKXNZ9jaj96GEkTLSQQv+PbLflkyG3=Y1y1Z68NhO4Us7qqNQStVY90lDw7edKHeoFRuV/c=NnM2v3Z2TQS=dVZUTgKS63hiXTcqSp6K+NC6N8cYY96Mc=PKrujN3Er32PKmohY50Dr3O25tratE3UbbZb3S0K1XdPTkrrOFi8trK9To+iHUih/Uetr0rqrITbFW7/DaDG2z0qYDQKGmBRxtYa6S=AfhQifW9bD08DiQrYD==='
            }
            self.header = ['position', 'company', 'wages', 'place', 'education', 'work_experience', 'release_date',
                           'limit_people', 'address', 'company_type', 'company_size', 'industry']
            self.fp = open('{}.csv'.format(self.keyword), 'a', encoding='utf-8-sig', newline='')
            self.writer = csv.DictWriter(self.fp, self.header)
            self.writer.writeheader()
    
    
        def get_end_page(self):
            response = requests.get(self.url.format(self.keyword, str(1)), headers=self.headers)
            text = response.content.decode('utf-8')
            json_obj = re.findall('window.__SEARCH_RESULT__ = (.*?)</script>', text)
            print(json_obj)
            py_obj = json.loads(json_obj[0])
            end_page = py_obj['total_page']
            return end_page
    
    
        def get_url(self, count):
            response = requests.get(url=self.url.format(self.keyword, count), headers=self.headers)
    
            text = response.content.decode('utf-8')
            json_obj = re.findall('window.__SEARCH_RESULT__ = (.*?)</script>', text)
            py_obj = json.loads(json_obj[0])
            detail_urls = [i['job_href'] for i in py_obj['engine_search_result']]
            return detail_urls
    
    
        def parse_url(self, url):
            response = requests.get(url=url, headers=self.headers)
            try:  # decoding occasionally fails on a few pages with an unusual structure; those can simply be skipped
                text = response.content.decode('utf-8')
                html = etree.HTML(text)
                position = html.xpath("//div[@class='tHeader tHjob']//div[@class='cn']/h1/@title")[0]  # job title
            except Exception as e:
                print("Unusual page ({}); skipping it and moving on to the next detail URL".format(e))
                return
            company = "".join(html.xpath("//div[@class='tHeader tHjob']//div[@class='cn']/p[1]/a[1]//text()"))  # company name
            wages = "".join(html.xpath("//div[@class='tHeader tHjob']//div[@class='cn']/strong/text()"))  # salary
            informations = html.xpath("//div[@class='tHeader tHjob']//div[@class='cn']/p[2]/text()")  # location / experience / education block
            informations = [i.strip() for i in informations]
            place = informations[0]
            education = "".join([i for i in informations if i in '初中及以下高中/中技/中专大专本科硕士博士无学历要求'])  # education level; the literal lists the site's Chinese degree labels, kept verbatim for matching
            work_experience = "".join([i for i in informations if '经验' in i])  # work experience ('经验')
            release_date = "".join([i for i in informations if '发布' in i])  # posting date ('发布')
            limit_people = "".join([i for i in informations if '招' in i])  # headcount ('招')
            address = "".join(html.xpath("//div[@class='tCompany_main']/div[2]/div[@class='bmsg inbox']/p/text()"))  # work address
            company_type = "".join(html.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[1]/@title"))  # company type
            company_size = "".join(html.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[2]/@title"))  # company size
            industry = "".join(html.xpath("//div[@class='tCompany_sidebar']/div[1]/div[2]/p[3]/@title"))  # industry
    
            item = {'position': position, 'company': company, 'wages': wages, 'place': place, 'education': education,
                    'work_experience': work_experience, 'release_date': release_date, 'limit_people': limit_people,
                    'address': address, 'company_type': company_type, 'company_size': company_size,
                    'industry': industry}
            print(item)
            self.writer.writerow(item)
    
    
    if __name__ == '__main__':
        print("爬虫开始")
        spider = Spider()
        end_page = spider.get_end_page()  # 获取该职位的总页数
        print("总页数:{}".format(end_page))
        page = input("输入采集页数:")
        for count in range(1, int(page) + 1):
            detail_urls = spider.get_url(count)
        for detail_url in detail_urls:
            spider.parse_url(detail_url)
        print("已爬取第{}页".format(count))
        spider.fp.close()
        print("爬取结束")
    
    This answer was accepted by the asker as the best answer.


Question timeline

  • Closed by the system on Nov 21
  • Answer accepted on Nov 13
  • Question created on Nov 13
