from requests.exceptions import Timeout
from selenium import webdriver
import time
import requests
import lxml
from lxml import etree
import os
import re
import parsel
def check_ip(proxies_list):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
can_use = []
for ip in proxies_list:
try:
requests.get(url='https//www.bilibili.com/', headers = headers, proxies = ip, timeout = 0.1)
if response.status_code == 200:
can_use.append(ip)
except Exception:
print('当前代理ip: ', ip, "请求超时,检测不合格")
finally:
print('当前代理ip: ', ip, '检测通过')
return can_use
proxies_list = []
for page in range(1,8):
print('============正在爬取第{}页数据============'.format(str(page)))
base_url = 'http://www.ip3366.net/free/?stype=1&page{}.format(str(page))'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
response = requests.get(url=base_url , headers = headers )
data = response.text
html_data = parsel.Selector(data)
parse_list = html_data.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
#代理IP的结构 {'ip的协议’:‘ip: ip端口’}
for tr in parse_list:
http_type = tr.xpath('./td[4]/text()').extract_first() #协议类型
ip_num = tr.xpath('./td[1]/text()').extract_first() #IP地址
ip_port = tr.xpath('./td[2]/text()').extract_first() #IP端口
print(http_type, ip_num, ip_port)
proxies_dict = {}
print(http_type)
proxies_dict[http_type] = ip_num + ':' + ip_port
print('保存成功: ' , proxies_dict)
proxies_list.append(proxies_dict)
print(proxies_list)
print('获取到的代理IP数量 :',len(proxies_list))
print('===================正在检测ip质量============================')
yes_can = check_ip(proxies_list)
print('质量高的: ', yes_can)
print('质量高的代理ip数量: ', len(yes_can))
以下是检测ip质量返回的部分结果
不知道为什么对既显示不合格,又显示检测通过
最后返回的还是一个空列表
===================正在检测ip质量============================
当前代理ip: {'HTTPS': '175.43.57.24:9999'} 请求超时,检测不合格
当前代理ip: {'HTTPS': '175.43.57.24:9999'} 检测通过
当前代理ip: {'HTTP': '180.118.128.220:9000'} 请求超时,检测不合格
当前代理ip: {'HTTP': '180.118.128.220:9000'} 检测通过
当前代理ip: {'HTTP': '182.34.20.143:9999'} 请求超时,检测不合格
当前代理ip: {'HTTP': '182.34.20.143:9999'} 检测通过
当前代理ip: {'HTTP': '183.145.58.210:9000'} 请求超时,检测不合格
当前代理ip: {'HTTP': '183.145.58.210:9000'} 检测通过
当前代理ip: {'HTTP': '61.92.188.117:8080'} 请求超时,检测不合格
当前代理ip: {'HTTP': '61.92.188.117:8080'} 检测通过
当前代理ip: {'HTTPS': '182.46.114.174:9999'} 请求超时,检测不合格
当前代理ip: {'HTTPS': '182.46.114.174:9999'} 检测通过
当前代理ip: {'HTTP': '182.105.201.5:9000'} 请求超时,检测不合格
当前代理ip: {'HTTP': '182.105.201.5:9000'} 检测通过