def jiexi(url, head, parse_list):
# 循环遍历,二次提取
proxies_list = []
for tr in parse_list:
http_type = tr.xpath('./td[2]/text()').extract_first().replace("代理", "") # extract_first,返回第一个元素,不加first则返回所有元素
http_type = http_type.replace("HTTP,", "")
ip_num = tr.xpath('./td[1]/text()').extract_first()
# port_num = tr.xpath('./td[2]/text()').extract_first()
# print(http_type, ip_num, port_num)
# 构建代理ip字典
# proxies_dict[http_type] = ip_num
'''+ ':' + port_num'''
proxies_dict = {"%s" % http_type: 'HTTPS://%s' % ip_num}
# print(len(proxies_dict))
proxies_list.append(proxies_dict)
# print(type(proxies_list))
# print(len(proxies_list))
print(proxies_list)
# geturl(url, head, proxies_list, proxies_dict)
return proxies_list
``````python
def geturl(url, head, proxies_list, proxies_dict):
html = ""
for proxy in proxies_dict:
response = requests.get(url, headers=head, proxies=random.choice(proxies_list), timeout=3) # 超时报错
if response.status_code != 200:
proxies_dict.remove(proxy)
continue
else:
try:
html = response.text
# print(html)
# print(len(proxies_dict))
gethtml(html) # 提取网页数据
# print(url)
# print(html)
except Exception as error:
print(f"错误异常信息为:{error}")
return html
一个IP爬完一页,下一个IP又重新爬一遍,这种情况应该在哪里写判断。截止19.42发现IP地址在重复写入列表,语法是不是有误,怎么才能一次写入到列表呢