我用httpx写了一个异步爬虫。 我在代码中指定了9个url,按道理 getDomain 会被执行9次。 但是程序运行时发现getDomain只会执行1到3次,其他几次根本没有调用getDomain。 麻烦大家看看是什么原因导致的? 应该不是目标网站屏蔽的问题。
import json
import traceback
import httpx
from datetime import datetime
import time
import re
import asyncio
from httpx import AsyncHTTPTransport, Cookies
cookies =""
async def loginWest(client:httpx.AsyncClient):
headers = {
#'content-length': '100',
'cache-control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'upgrade-insecure-requests': '1',
'content-type': 'application/x-www-form-urlencoded',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'sec-fetch-user': '?1',
'sec-fetch-dest': 'document',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9'
}
payload = 'u_name=tylrr1&u_password=123qwe&adds323sdsdsad=3546234134sadsa233&back_path2=&m=&module=enterzone'
response0 = await client.post("https://www.xxxxx.cn/login.asp", headers=headers, data=payload ,follow_redirects=True)
#print(response0.status_code)
#print(response0.content.decode("gbk"))
if "1902326" in response0.content.decode("gbk") :
print("登录成功")
print(response0.cookies)
#return response0.cookies
async def getDomain(page:int ,client:httpx.AsyncClient):
headers = {
'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
'accept': 'application/json, text/plain, */*',
'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
}
payload = f'domeq=&domkey=&domeq1=&domkey1=&domuneq=&domunkey=&domuneq1=&domunkey1=&domlen1=&domlen2=&topmoney=&topmoneymax=&price=&pricemax=&expday=&expdaymax=&arrdomext=&domclass=&domleiab=&deldate=®year=®yearmax=&freeyd=0&deltype=&ordby=deldate&ordtp=&sogoupr=&sogouprmax=&baidupr=&baiduprmax=&sgsoulu=&sgsoulumax=&bdsoulu=&bdsoulumax=&bdfanlian=&bdfanlianmax=&wailian=&wailianmax=&sitehis=&sitehismax=&siteinfohis=&siteinfohismax=&bdrenzheng=&wxcheck=&qqcheck=&wallcheck=&bdpingjia=&ismiiban=&guonei=&linktype=&isqy=&viewcount=&sitetitle=&icpwzmc=&haveuser=&isbid=&pageno={page}&pagesize=1000&mode=wedel'
try:
response0 = await client.post("https://www.xxxx.cn/services/newlist.asp",
headers=headers, data=payload )
#print(client.headers)
if response0.status_code == 200:
jsonstr = json.loads(response0.text.replace('\\', '\\\\'))
if jsonstr['code'] == 200:
print("获取列表成功")
total = jsonstr['body']['total']
page = jsonstr['body']['pageno']
itemJson = jsonstr['body']['items']
print({"total": total, "page": page})
return {"total": total, "page": page, "itemJson": itemJson}
else:
print("获取列表失败:",jsonstr)
pass
else:
print("获取列表失败:",response0.status_code)
except Exception as e:
print(e)
pass
async def main():
results = []
client = httpx.AsyncClient()
global cookies
if cookies == "":
cookies = await loginWest(client)
#await asyncio.sleep(1)
print(cookies)
tasks = []
for i in range(1, 10):
# 添加一个协程到列表中
tasks.append(asyncio.ensure_future(getDomain(i, client)))
print(tasks)
results = await asyncio.gather(*tasks)
# 检查响应状态码
for task in tasks :
response = task.result()
#print(response)
#res = await getDomain(i, client=client)
#results.append(res)
#await asyncio.sleep(1)
print(f"本次任务共{len(results)}页")
for item in results:
print(str(item)[:120])
#saveData(item)
await asyncio.sleep(60)
print("任务完成")
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
#运行输出结果如下:
获取列表成功
{'total': 2030589, 'page': 1}
本次任务共9页
{'total': 2030589, 'page': 1, 'itemJson': [{'refsmoney': 0, 'domext': 'online', 'domain':
None
None
None
None
None
None
None
None
任务完成
#上面的任务中,只有1个url完成,其他都显示了None。不知道什么原因