使用以下代码,会一直返回403响应码,代码中的登录部分已经涂抹掉了,而且代码中代理ip应该已经过期不能使用了。 解决问题时,可以使用1-2个代理ip来模拟就OK。 希望大佬能给出一个运行OK的代码,并指出下面代码的问题。 使用的是pycharm,python3.7 问题代码如下:
import requests from lxml import etree import time import csv import re sess = requests.session() headers = { "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36" } def ip_list(url): resp = requests.get(url) json = resp.json() # print(json) datas = json['data'] ip_list = [] for data in datas: ip = data['ip'] port = data['port'] new_ip = 'https://' + ip + ':' + str(port) ip_list.append(new_ip) print(ip_list) return ip_list def get_token(url): resp = sess.get(url, headers= headers) _token = re.search('<meta name="csrf-token" content="(.*?)">', resp.text).group(1) return _token def login(token,url): data = { "_token": token, "email": "xxxxxxxxxx", "password": "xxxxxxxx", "remember": "on" } resp = sess.post(url, headers= headers, data= data) print(resp.status_code) def get_data(ip_list,fp): list1 = [] for i in range(400): url = 'http://glidedsky.com/level/web/crawler-ip-block-1?page={}'.format(str(i+1)) proxy = {'https': ip_list[0]} print(proxy) resp = sess.get(url, headers=headers,proxies= proxy) print(resp.status_code) html = etree.HTML(resp.text) fp.write(resp.text) ip_list.remove(ip_list[0]) # print(etree.tostring(html)) divs = html.xpath("//div[@class='card-body']//div[@class='col-md-1']//text()") for div in divs: a = div.strip() a = int(a) list1.append(a) print(list1) time.sleep(1) print(list1) sum_code = sum(list1) print(sum_code) return sum_code def main(): ip_list1 = ip_list('http://webapi.http.zhimacangku.com/getip?num=400&type=2&pro=&city=0&yys=0&port=11&time=1&ts=1&ys=0&cs=0&lb=1&sb=0&pb=45&mr=1®ions=') print(ip_list1) url = 'http://glidedsky.com/login' token = get_token(url) login(token,url) sum_code = get_data(ip_list1,fp) print(sum_code) if __name__ == '__main__': main()