江南弄 2022-12-18 20:29

JSON data conversion in a Jupyter web crawler

A question about the JSON format when scraping data in Jupyter:

url = url0+urlencode(params)
print(url)
# GET request with a 30-second timeout (raises on timeout)
r = requests.get(url,headers=headers,timeout=30)
#print(r.text)
# parse the response text into an HTML tree
html = etree.HTML(r.text)
# locate the script element that carries the search-result data
nr = html.xpath('//script[@type="text/javascript"]/text()')[0].replace('\n','').replace('\t','').replace('window.__SEARCH_RESULT__ = ','')
# convert the string to JSON
datas = json.loads(nr)['engine_search_result']
# loop over the records and pull out the fields
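
To see why json.loads fails, it helps to first check what the XPath actually matched; a minimal probe, assuming the html variable from the snippet above:

scripts = html.xpath('//script[@type="text/javascript"]/text()')
print(len(scripts))            # how many candidate script blocks the page has
print(repr(scripts[0][:80]))   # head of the one the code tries to parse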

[Error screenshots]

Full code:

# -*- coding: utf-8 -*-
import requests
import time
import re
import csv
import json
import pandas as pd
from lxml import etree
from urllib.parse import urlencode

# create the csv file and set its encoding
file = open('qcwy.csv','a+',encoding='gbk')
# write the header row
writer = csv.writer(file)
writer.writerow(['公司','岗位','薪资','福利','工作经验','学历','城市','招聘人数','公司规模','公司方向'])
file.close()
# page loop: range(1, 10) covers pages 1 through 9
for page in range(1,10):
    try:
        url0 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html?'.format(page)
        # set request headers to avoid being identified as a crawler
        headers = {
            'Connection': 'keep-alive',
            'Host': 'search.51job.com',
            'Cookie': 'guid=eafda637f951289cc3971b74087ee992; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22184f1556385bb4-00d9951ae6397368-7a575474-3686400-184f1556386a8d%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg0ZjE1NTYzODViYjQtMDBkOTk1MWFlNjM5NzM2OC03YTU3NTQ3NC0zNjg2NDAwLTE4NGYxNTU2Mzg2YThkIn0%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22184f1556385bb4-00d9951ae6397368-7a575474-3686400-184f1556386a8d%22%7D; ssxmod_itna=eqUxnDu7GQG=6xGq0du7t8mx4=Dgn0phxhx0yDi=TDSxGKidDqxBnWjeDQTdSdfnPh0EGY0A4rwTrGmR051vYj72oElt4B3DEx0=edIixiicDCeDIDWeDiDG4Gml4GtDpxG=Djnz/1lXxGWDmbkDWPDYxDr61KDRxi7DDydCx07DQHk3Rpw9Oo1YABDqnKD9hoDsEbfSnImfRftlEAAmtBovx0kS40OBOHszOoDUDvsg=7NKCR4KQxNdih3EBres7Gt3ZQ4fG6x=Yn45=iepNi43o9Pl0tDi=GtHjUDD; ssxmod_itna2=eqUxnDu7GQG=6xGq0du7t8mx4=Dgn0phxxnIgDi=eDlrGlxjRRieZrTMD6CrLfzOzoWadeRi/qDQ4c2TELIl/2LE3wTZ6DrdD6iaKLXdCbFAdXpdNW6Qq/nx/1lurV+lUkBVILs11ura8igqh/WvFZaIbRPFsP4EOu2F802yhjo5bloHOxc5C=BTesx5x/7mH+8BOw99p6UfQX8E7RKeAEPn=6=SQZxSWEhSDToyAPXLQPdy=3VPfS/aURROGDRe+btCd3805zduC=jm5MQtsKYUHYZWS/naO6y3Nz=Zl194CyT8duyd1CqXndH0NAPe4FvrxK=xm2x=/a0Z=tiEq/EN/==K4QHm0HEwN93/ad+nXh00b+fTS+afXe+EF00M0031m2Oue9Y33YNttfC2s9f893mnuVowUaIE7mp9fICWM=Yn1QGG3qnGR0GKA3LYIpjef8YTDG2KG2WrHiiTYieKc4Qi2iL8grkskCwvxD08DijpYD==; partner=51jobhtml5',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36 Edg/108.0.1462.54'
        }
        # fixed query parameters appended to the url
        params = {
            'lang': 'c',
            'postchannel': '0000',
            'workyear': '99',
            'cotype': '99',
            'degreefrom': '99',
            'jobterm': '99',
            'companysize': '99',
            'ord_field': '0',
            'dibiaoid': '0',
            'line': '',
            'welfare': '',
        }
        # assemble the url
        url = url0+urlencode(params)
        print(url)
        # GET request with a 30-second timeout (raises on timeout)
        r = requests.get(url,headers=headers,timeout=30)
        #print(r.text)
        # parse the response text into an HTML tree
        html = etree.HTML(r.text)
        # locate the script element that carries the search-result data
        nr = html.xpath('//script[@type="text/javascript"]/text()')[0].replace('\n','').replace('\t','').replace('window.__SEARCH_RESULT__ = ','')
        # convert the string to JSON
        datas = json.loads(nr)['engine_search_result']
        # loop over the records and pull out the fields
        for sjs in datas:
            # 4 attributes means city / experience / education / headcount
            if len(sjs['attribute_text']) == 4:
                workyear = sjs['attribute_text'][1]
                education = sjs['attribute_text'][2]
                city = sjs['attribute_text'][0]
                renshu = sjs['attribute_text'][-1]
            else:
                city = sjs['attribute_text'][0]
                renshu = sjs['attribute_text'][-1]
                test = sjs['attribute_text'][1]
                # decide whether test holds experience ('经验') or education
                if '经验' in test:
                    workyear = test
                    education = '无'
                else:
                    education = test
                    workyear = '无'
            company_name = sjs['company_name']
            job_name = sjs['job_name']
            providesalary_text = sjs['providesalary_text'].replace('\\',"")
            jobwelf = sjs['jobwelf'].replace('\\',"")
            companysize_text = sjs['companysize_text'].replace('\\',"")
            companyind_text = sjs['companyind_text'].replace('\\',"")
            # empty fields fall back to '无' (none)
            if not providesalary_text:
                providesalary_text = '无'
            if not jobwelf:
                jobwelf = '无'
            if not companysize_text:
                companysize_text = '无'
            if not companyind_text:
                companyind_text = '无'
            file = open('qcwy.csv', 'a+', encoding='gbk')
            writer = csv.writer(file)
            # write one row per record, then close so the row is flushed to disk
            writer.writerow([company_name,job_name,providesalary_text,jobwelf,workyear,education,city,renshu,companysize_text,companyind_text])
            file.close()
            print(company_name,job_name,providesalary_text,jobwelf,workyear,education,city,renshu,companysize_text,companyind_text)
    # error handling: log the exception and back off briefly
    except Exception as e:
        print(e)
        time.sleep(1)
        # break
# convert the csv to excel (pandas' to_excel needs openpyxl installed)
datas = pd.read_csv('qcwy.csv',encoding='gbk')
datas.to_excel('qcwy.xlsx', index=False)
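
One caveat worth noting: the replace-chain cleanup above is brittle. If the spacing around '=' changes, or a different <script type="text/javascript"> block happens to match the XPath first, json.loads receives something that is not JSON at all. A regex-based extraction is more tolerant; this is only a sketch, assuming the page still embeds window.__SEARCH_RESULT__:

import json
import re

def extract_search_result(page_text):
    # pull out the object literal assigned to window.__SEARCH_RESULT__,
    # stopping at the closing </script>; returns None when the marker is
    # missing (e.g. the site served a verification page instead)
    m = re.search(r'window\.__SEARCH_RESULT__\s*=\s*(\{.*?\})\s*</script>',
                  page_text, re.S)
    if m is None:
        return None
    return json.loads(m.group(1))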


Judging from the error, this should be a problem with the JSON conversion, right?


4 answers

  • gnn_explorer 2022-12-18 22:26

    The value is not in JSON string format, hence the error: {JSONDecodeError}Expecting value: line 1 column 13 (char 12)
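
    A defensive version of the parsing step shows the offending input instead of crashing the page loop; a sketch, reusing the nr variable from the question:

    try:
        datas = json.loads(nr)['engine_search_result']
    except json.JSONDecodeError:
        # nr is not pure JSON here -- typically leftover JavaScript around
        # the object literal, or an anti-bot/verification page entirely
        print('json.loads received:', repr(nr[:120]))
        datas = []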

