用爬虫爬取ip代理网站的ip时,用try判断ip是否超时,结果在session.get()前面加了一个await 好像并没有进行判断 timeout设置为3s 好像直接跳转到了except中去 所有的ip全部被打印成不可用 但是我自己测试的时候发现是能用的 而且代码执行的很快 压根没有检测是否超时
代码如下
import asyncio
import json
import requests
from bs4 import BeautifulSoup
import aiohttp
import aiofiles
import asyncore
import json
from lxml import etree
import random
async def get_ip(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as f:
a = await f.text()
bsl = BeautifulSoup(a,'html.parser')
bss = bsl.find('table',width="100%").select('tr')[1:]
for list in bss:
ip = list.select('tr td')[0].text
port = list.select('tr td')[1].text
proxies={
f'https':f'https://{ip}:{port}'
}
asyncio.gather(verify(proxies))
async def verify(proxies):
async with aiohttp.ClientSession() as session:
try:
f = session.get('https://www.baidu.com',proxies=random.choice(proxies),async_timeout = 3)
print('可用代理:{}'.format(proxies))
await write_json(proxies)
except:
print('不可用的:{}'.format(proxies))
async def write_json(proxies):
async with aiofiles.open('ip处理池.json','a') as f:
await json.dump(proxies,f)
async def rea_json():
async with aiofiles.open('ip处理池.json','r')as f:
for i in f.readlines():
content = json.loads(i.strip())
print(content)
async def main():
tasks = []
for i in range(100):
url = f'http://www.66ip.cn/{i}.html'
tasks.append(asyncio.create_task(get_ip(url)))
await asyncio.wait(tasks)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(main())