tylrr 2023-07-13 16:28 · Closed

httpx async crawling: only some of the pages are fetched successfully

I wrote an async crawler with httpx. The code requests 9 URLs (pages 1 through 9), so getDomain should run 9 times. When the program runs, however, getDomain only succeeds 1 to 3 times; for the remaining pages it seems never to run at all. Could anyone take a look at what is causing this? It does not appear to be the target site blocking me.


import json
import traceback
import httpx
from datetime import datetime
import time
import re
import asyncio
from httpx import AsyncHTTPTransport, Cookies


cookies =""
async def loginWest(client: httpx.AsyncClient):
    headers = {
        #'content-length': '100',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'content-type': 'application/x-www-form-urlencoded',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    payload = 'u_name=tylrr1&u_password=123qwe&adds323sdsdsad=3546234134sadsa233&back_path2=&m=&module=enterzone'
    response0 = await client.post("https://www.xxxxx.cn/login.asp", headers=headers, data=payload ,follow_redirects=True)

    #print(response0.status_code)
    #print(response0.content.decode("gbk"))
    if "1902326" in response0.content.decode("gbk") :
        print("登录成功")
    print(response0.cookies)
    #return response0.cookies
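    # NOTE: the return above is commented out, so loginWest returns None;
    # the login cookies are still stored on the shared AsyncClient itself.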


async def getDomain(page: int, client: httpx.AsyncClient):
    headers = {
        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
        'accept': 'application/json, text/plain, */*',
        'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'x-requested-with': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    }

    payload = f'domeq=&domkey=&domeq1=&domkey1=&domuneq=&domunkey=&domuneq1=&domunkey1=&domlen1=&domlen2=&topmoney=&topmoneymax=&price=&pricemax=&expday=&expdaymax=&arrdomext=&domclass=&domleiab=&deldate=&regyear=&regyearmax=&freeyd=0&deltype=&ordby=deldate&ordtp=&sogoupr=&sogouprmax=&baidupr=&baiduprmax=&sgsoulu=&sgsoulumax=&bdsoulu=&bdsoulumax=&bdfanlian=&bdfanlianmax=&wailian=&wailianmax=&sitehis=&sitehismax=&siteinfohis=&siteinfohismax=&bdrenzheng=&wxcheck=&qqcheck=&wallcheck=&bdpingjia=&ismiiban=&guonei=&linktype=&isqy=&viewcount=&sitetitle=&icpwzmc=&haveuser=&isbid=&pageno={page}&pagesize=1000&mode=wedel'

    try:
        response0 = await client.post("https://www.xxxx.cn/services/newlist.asp",
                                      headers=headers, data=payload)
        #print(client.headers)
        if response0.status_code == 200:
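            # double any backslashes so json.loads tolerates unescaped '\' in the response body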
            jsonstr = json.loads(response0.text.replace('\\', '\\\\'))
            if jsonstr['code'] == 200:
                print("获取列表成功")
                total = jsonstr['body']['total']
                page = jsonstr['body']['pageno']
                itemJson = jsonstr['body']['items']
                print({"total": total, "page": page})
                return {"total": total, "page": page, "itemJson": itemJson}
            else:
                print("获取列表失败:",jsonstr)
                pass
        else:
            print("获取列表失败:",response0.status_code)
    except Exception as e:
        print(e)
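        # any exception is swallowed here, so a failed page silently ends up as None in results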



async def main():
    results = []
    client = httpx.AsyncClient()
    global cookies
    if cookies == "":
        cookies = await loginWest(client)
        #await asyncio.sleep(1)
        print(cookies)

    tasks = []
    for i in range(1, 10):
        # add each coroutine to the task list
        tasks.append(asyncio.ensure_future(getDomain(i, client)))
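        # (asyncio.create_task(...) is the modern equivalent of ensure_future here)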
    print(tasks)
    results = await asyncio.gather(*tasks)



    # check each task's result (the same values were already collected in results above)
    for task in tasks:
        response = task.result()
        #print(response)

    print(f"本次任务共{len(results)}页")
    for item in results:
        print(str(item)[:120])
        #saveData(item)
    await asyncio.sleep(60)
    print("任务完成")


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

# The program output is as follows:

list fetched successfully
{'total': 2030589, 'page': 1}

this run returned 9 pages
{'total': 2030589, 'page': 1, 'itemJson': [{'refsmoney': 0, 'domext': 'online', 'domain':
None
None
None
None
None
None
None
None
task finished

# In the run above only 1 URL completed; all the others came back as None. I can't tell why.
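A minimal diagnostic sketch (a suggestion, not a confirmed fix): the except block in getDomain prints only the exception object and then falls through to return None, so whatever goes wrong stays invisible. Wrapping each page coroutine in a small reporter makes the failure mode explicit. The reportFailures helper and the Semaphore(3) cap below are illustrative additions that are not in the original code; the limit of 3 is arbitrary and only matters if the server turns out to reject many parallel POSTs on one session.

import asyncio
import traceback

sem = asyncio.Semaphore(3)  # arbitrary cap on concurrency, in case the server throttles parallel requests

async def reportFailures(label, coro):
    # await one page coroutine and state explicitly whether it raised or returned None
    async with sem:
        try:
            result = await coro
            if result is None:
                print(f"{label}: returned None (see the error printed inside getDomain)")
            return result
        except Exception:
            print(f"{label}: raised an exception")
            traceback.print_exc()
            return None

# usage inside main(), replacing the plain ensure_future calls:
# tasks = [asyncio.ensure_future(reportFailures(f"page {i}", getDomain(i, client)))
#          for i in range(1, 10)]

Replacing the bare print(e) inside getDomain with traceback.print_exc() (the traceback module is already imported) would likewise show exactly where each request, or the JSON parsing, fails.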


1 answer

  • tylrr 2023-07-14 23:31

    Could anyone offer some guidance?


Question events

  • Closed by the system on Jul 21
  • Question edited on Jul 13
  • Question edited on Jul 13
  • Question created on Jul 13