用 requests 请求时(headers、动态 cookies 都已添加),但服务器仍然返回“ntaccess deny”。之前返回“操作频繁”的问题已经解决了,现在又返回“ntaccess deny”,求解决办法!
import time, requests, jieba, wordcloud, bs4
import re
import matplotlib.pyplot as plt
import csv
import pprint
import matplotlib
from random import * # 引出库
# Configure matplotlib so Chinese characters render as SimHei glyphs
# instead of empty boxes in chart labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'

# Containers used by the later word-frequency / CSV stages.
dic = {}        # word -> frequency counts
lst = []        # scratch list for parsed records
pn = []         # page numbers / pagination tracking
lst_csv = []    # rows destined for CSV export

# Output paths (raw response dump and the cleaned export).
path0 = r'D://招聘信息.csv'
path1 = r'D://lagou.csv'

# SECURITY FIX: eval() on raw keyboard input executes arbitrary code;
# int() parses the page count and raises ValueError on bad input instead.
x = int(input("输入爬取的页数:"))
# Crawl `x` pages of Lagou's python job listings. Each iteration opens a
# fresh Session, visits the listing page to obtain the anti-scraping
# cookies, then POSTs to the Ajax endpoint with those cookies attached.
#
# Loop-invariant URLs/headers are hoisted out of the loop.
url_login = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
url_crawler = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
headers_1 = {
    'authority': 'www.lagou.com',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    # BUG FIX: was 'list_pyton' — a Referer that does not match a real
    # listing page is one reason the server answers "access deny".
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.73'}
headers_2 = {
    # BUG FIX: the value previously contained stray spaces
    # ('application / json, text / javascript, ...'), producing an
    # invalid Accept header.
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'Connection': 'close',
    'authority': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?city=%C3%A5%C2%85%C2%A8%C3%A5%C2%9B%C2%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.73'}

for i in range(1, x + 1):
    # Fresh session per page so each request carries newly issued cookies.
    s = requests.Session()
    s.get(url_login, headers=headers_1, timeout=5)  # visit listing page to receive cookies
    cookie = s.cookies
    print(cookie)
    time.sleep(randint(1, 5))  # random delay to avoid the "操作频繁" throttle
    try:
        # BUG FIX: 'ture' -> 'true' (typo in the form field value).
        datas = {'first': 'true', 'pn': i, 'kd': 'python'}
        # BUG FIX: the original assigned `timeout = 20` AFTER the request,
        # which had no effect; pass it to post() instead.
        r = s.post(url_crawler, data=datas, headers=headers_2,
                   cookies=cookie, timeout=20)
        r.encoding = 'utf-8'
        pprint.pprint(r.json())
        # Append the raw JSON text to the dump file.
        with open(path0, 'a', encoding='gb18030') as file:
            file.write(r.text)
        print("第{}页采集完成。".format(i))
        time.sleep(randint(1, 5))
    except Exception as ex:
        # Best-effort: report the failing page and keep crawling the rest.
        print("第{}页采集出错,出错原因:{}".format(i, ex))