I'm using Scrapy to crawl NetEase's social recruitment listings from hr.163.com (the data is loaded via an AJAX POST). The spider below gets a 502 response, yet the same request succeeds when I send it with requests. Why does Scrapy get a 502, and where is the problem?
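The working requests call was not posted, but it presumably looked roughly like the minimal sketch below (URL and payload copied from the spider; the headers are an assumption). One relevant difference: requests.post(..., json=payload) serializes the dict and adds a Content-Type: application/json header automatically, while the scrapy.Request in the spider never sets that header, which is a plausible reason the API rejects it.

import requests

url = 'https://hr.163.com/api/hr163/position/queryPage'
payload = {'currentPage': 1, 'pageSize': 10}
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'),
    'Referer': 'https://hr.163.com/job-list.html',
}

# json= serializes the payload and sets Content-Type: application/json for us
resp = requests.post(url, json=payload, headers=headers)
print(resp.status_code)
print(resp.json())  # the spider's '$..list' query suggests the jobs sit under a "list" key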
import scrapy
import json
from jsonpath import jsonpath
from mySpider2.items import Myspider2Item
class WangyiSpider(scrapy.Spider):
    name = "wangyi"
    allowed_domains = ["163.com"]
    start_urls = ["https://hr.163.com/api/hr163/position/queryPage"]
    page = 1
    def start_requests(self):
        payload = {
            'currentPage': 1,
            'pageSize': 10
        }
        yield scrapy.Request(
            url=self.start_urls[0],
            callback=self.parse,
            method='POST',
            body=json.dumps(payload).encode('utf-8'),  # the body must be a JSON string
            dont_filter=True,  # disable the duplicate filter
            headers={
                # the API expects a JSON body, so declare it; requests adds this header
                # automatically when you pass json=..., scrapy.Request does not
                'Content-Type': 'application/json',
                'x-ehr-uuid': '88ee0a23-727f-4bf8-b96f-f1338ea1b0',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
                'Referer': 'https://hr.163.com/job-list.html',
                'Cookie': '_ntes_nnid=e8d24d6b67619adc709be368c107b5cd,1725195258462; _ntes_nuid=e8d24d6b67619adc709be368c107b5cd; hb_MA-8E16-605C3AFFE11F_source=www.baidu.com; hb_MA-AC55-420C68F83864_source=www.baidu.com; userName=; accountType='
            }
        )
    def parse(self, response):
        print(response.status)
        # extract the data
        print('extracting data')
        jsons = json.loads(response.text)  # response.body is a property, not a method; parse the JSON into a dict
        data_lists = jsonpath(jsons, '$..list')[0]  # jsonpath returns a list of matches; the first match is the job list
        print('-----', len(data_lists))
        print(data_lists)
        for data in data_lists:
            item = Myspider2Item()
            # jsonpath always returns a list of matches, so take the first element
            item['name'] = jsonpath(data, '$.name')[0]
            work_type_mapping = {
                '0': '全职',
                '1': '实习',
                '2': '派遣'
            }
            # workType may come back as a number, so normalize it to str before the lookup
            item['workType'] = work_type_mapping[str(jsonpath(data, '$.workType')[0])]
            item['place'] = jsonpath(data, '$.workPlaceNameList')[0][0]  # first city in the list
            item['produce'] = jsonpath(data, '$.productName')[0]
            item['postType'] = jsonpath(data, '$.firstPostTypeName')[0]
            item['num'] = jsonpath(data, '$.recruitNum')[0]
            item['education'] = jsonpath(data, '$.reqEducationName')[0]
            item['ask'] = jsonpath(data, '$.requirement')[0]
            print(item['name'])
            yield item
        # # pagination: keep requesting the next page until lastPage is true
        # if not jsonpath(jsons, '$..lastPage')[0]:
        #     self.page += 1
        #     payload = {
        #         'currentPage': self.page,
        #         'pageSize': 10
        #     }
        #     yield scrapy.Request(
        #         url=self.start_urls[0],
        #         callback=self.parse,
        #         method='POST',
        #         body=json.dumps(payload),
        #         headers={
        #             'Content-Type': 'application/json',
        #             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
        #         }
        #     )
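A simpler way to send the same POST: Scrapy's JsonRequest serializes the payload and sets the Content-Type: application/json header for you, so you don't have to build the body and headers by hand. A minimal sketch of the first-page request, assuming the same URL and payload as in the spider above (the rest of the spider, including parse, stays unchanged):

import scrapy
from scrapy.http import JsonRequest

class WangyiSpider(scrapy.Spider):
    name = "wangyi"
    start_urls = ["https://hr.163.com/api/hr163/position/queryPage"]

    def start_requests(self):
        payload = {'currentPage': 1, 'pageSize': 10}
        # JsonRequest dumps `data` to JSON, sets Content-Type: application/json,
        # and defaults the method to POST when `data` is given
        yield JsonRequest(
            url=self.start_urls[0],
            data=payload,
            callback=self.parse,
            dont_filter=True,
            headers={'Referer': 'https://hr.163.com/job-list.html'},
        )

In recent Scrapy versions (2.2+) the parsed body is also available as response.json(), so parse could use jsonpath(response.json(), '$..list')[0] instead of calling json.loads by hand.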