pino 曹 · 2022-11-07 17:18

Why does my Python coroutine crawler run much slower than expected?

Symptoms and background

I have recently been learning Python web scraping, and I am currently trying to scrape movie information from Dianying Tiantang (电影天堂, dytt8.net). I wanted to compare a multithreaded crawler and a coroutine crawler, each scraping three pages of movie listings. The multithreaded version performs as expected, but the coroutine version is far slower than I anticipated. From what I have read online, a coroutine crawler should even outperform a multithreaded one, yet in my tests the coroutine version runs at essentially the same speed as a plain synchronous crawler. Is there a problem with how I wrote the coroutine code, or is something else causing this?
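For context, asyncio only overlaps work while a coroutine is actually awaiting; any blocking call stalls the entire event loop. A minimal, self-contained demo of the difference (illustrative code, separate from the crawler scripts below): three coroutines that truly await finish in about one second total, while three that make a blocking call run one after another and take about three.

import asyncio
import time

async def non_blocking(i):
    await asyncio.sleep(1)   # yields control back to the event loop

async def blocking(i):
    time.sleep(1)            # blocks the whole event loop for one second

async def demo(coro):
    t0 = time.time()
    await asyncio.gather(*(coro(i) for i in range(3)))
    print(f"{coro.__name__}: {time.time() - t0:.1f}s")

asyncio.run(demo(non_blocking))  # ~1.0s
asyncio.run(demo(blocking))      # ~3.0s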


# Coroutine crawler

import time
import asyncio
import httpx
import requests  # still used (synchronously) for the detail pages, see fetch_content below
from bs4 import BeautifulSoup
from datetime import datetime


def find_id(movie_href: str):  # extract the numeric ID between the last '/' and the last '.' in the link
    char1 = '.'
    char2 = '/'
    npos1 = movie_href.rfind(char1)
    npos2 = movie_href.rfind(char2)
    ID = movie_href[npos2 + 1:npos1]
    return ID


head_list = ["ID", "电影名称", "网页", "磁链", "更新时间"]
# domain = "https://m.dytt8.net/index2.htm"  # 电影天堂域名
# url = "https://m.dytt8.net/html/gndy/dyzz/index.html"  # 电影天堂最新电影
# url = "https://m.dytt8.net/html/gndy/dyzz/list_23_1.html"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 '
                  'Safari/537.36',
    'Connection': 'close',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'referer': ''}
# httpx has no module-level DEFAULT_RETRIES attribute; connection retries are configured on the transport
data_list = []
client = httpx.AsyncClient(timeout=None, transport=httpx.AsyncHTTPTransport(retries=5))
count = 0


async def craw_one_page(i):

    url = "https://m.dytt8.net/html/gndy/dyzz/list_23_" + str(i) + ".html"
    print(f"开始爬取第{i}页,时间:{start}")
    flag = True
    # while flag:
    #     try:
    #         response = await client.get(url, headers=headers)
    #         # await asyncio.sleep(0.01)
    #         flag = False
    #     except Exception as e:
    #         print("频繁访问!"+str(e))
    #         flag = True
    response = await client.get(url, headers=headers)
    await fetch_content(response)


async def fetch_content(response):
    global count
    # response = await client.get(url, headers=headers)
    response.encoding = 'gb2312'  # the site serves gb2312-encoded pages
    homepage = response.text  # listing page HTML
    soup = BeautifulSoup(homepage, "lxml")
    movies_list = soup.find_all('a', class_='ulink')  # all movie links on the current page

    for movie in movies_list:
        name = movie.text
        href = "https://m.dytt8.net" + movie['href']
        ID = find_id(href)
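        # NOTE: requests.get is a synchronous call; while it waits for each
        # detail page, the whole event loop is blocked, so the three page
        # coroutines cannot interleave their downloads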
        resp = requests.get(href, headers=headers)
        resp.encoding = 'gb2312'
        soup2 = BeautifulSoup(resp.text, "lxml")
        Magnet_URI = soup2.select_one('#Zoom > td > a')['href']
        updated_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        movie_dict = {
            "ID": ID,
            "Title": name,
            "URL": href,
            "Magnet": Magnet_URI,
            "Updated": updated_time}
        # data_list.append(movie_dict)
        print(movie_dict)
        count += 1


async def main():
    arr = [1, 2, 3]
    task = [craw_one_page(i) for i in arr]
    await asyncio.gather(*task)

if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())
    # print(data_list)
    print(f"耗时{(time.time()-start)}秒")
    print(f"爬取电影共计{count}条")


# Multithreaded crawler

import time
import requests
import threading
import csv
from bs4 import BeautifulSoup
from datetime import datetime

count = [0, 0, 0]


def find_id(movie_href: str):  # extract the numeric ID between the last '/' and the last '.' in the link
    char1 = '.'
    char2 = '/'
    npos1 = movie_href.rfind(char1)
    npos2 = movie_href.rfind(char2)
    ID = movie_href[npos2 + 1:npos1]
    return ID


head_list = ["ID", "电影名称", "网页", "磁链", "更新时间"]
# domain = "https://m.dytt8.net/index2.htm"  # 电影天堂域名
# url = "https://m.dytt8.net/html/gndy/dyzz/index.html"  # 电影天堂最新电影
# url = "https://m.dytt8.net/html/gndy/dyzz/list_23_1.html"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 '
                  'Safari/537.36',
    'Connection': 'close',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'referer': ''}
# note: requests ignores a module-level DEFAULT_RETRIES attribute; real retry
# configuration needs an HTTPAdapter(max_retries=...) mounted on a Session
data_list = []


def craw_one_page(i):
    global count
    # for i in range(1, 2):
    url = "https://m.dytt8.net/html/gndy/dyzz/list_23_" + str(i) + ".html"
    start = time.time()
    print(f"Starting to crawl page {i}, time: {start}")
    response = requests.get(url, headers=headers)
    time.sleep(0.3)
    response.encoding = 'gb2312'  # the site serves gb2312-encoded pages
    homepage = response.text  # listing page HTML
    soup = BeautifulSoup(homepage, "lxml")
    movies_list = soup.find_all('a', class_='ulink')  # all movie links on the current page

    for movie in movies_list:
        name = movie.text
        href = "https://m.dytt8.net" + movie['href']
        ID = find_id(href)
        resp = requests.get(href, headers=headers)
        resp.encoding = 'gb2312'
        soup2 = BeautifulSoup(resp.text, "lxml")
        Magnet_URI = soup2.select_one('#Zoom > td > a')['href']
        updated_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        movie_dict = {
            "ID": ID,
            "Title": name,
            "URL": href,
            "Magnet": Magnet_URI,
            "Updated": updated_time}
        print(movie_dict)
        count[i - 1] += 1
        data_list.append(movie_dict)
    # each thread reopens test.csv in 'w' mode and rewrites the shared data_list,
    # so the three threads race on the same file
    f = open('test.csv', mode='w', encoding="gb2312", newline="")
    with f:
        w = csv.DictWriter(f, head_list)
        w.writeheader()
        w.writerows(data_list)
    over = time.time()
    print(f"Page {i} finished: {count[i-1]} records, took {over - start}s")


if __name__ == '__main__':
    Sum = 0
    t1 = threading.Thread(target=craw_one_page, args=(1,))
    t2 = threading.Thread(target=craw_one_page, args=(2,))
    t3 = threading.Thread(target=craw_one_page, args=(3,))
    t1.start()
    t2.start()
    t3.start()
    t1.join()
    t2.join()
    t3.join()
    for i in range(len(count)):
        Sum += count[i]
    print(f"总共{Sum}条数据")

#         data_list.append(movie_dict)
# f = open('test.csv', mode='w', encoding="gb2312", newline="")
# with f:
#     w = csv.DictWriter(f, head_list)
#     w.writeheader()
#     w.writerows(data_list)

Runtime results and errors

The coroutine crawler takes around 70-90 seconds, essentially the speed of a synchronous crawler, so the coroutines provide no visible benefit; the multithreaded crawler finishes in about 20-30 seconds. In addition, the coroutine crawler occasionally raises an error after running for a while.
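
The intermittent errors are consistent with the server rejecting rapid-fire requests, which appears to be what the commented-out retry loop in craw_one_page was working around. A sketch of a retry with backoff for the httpx client from the coroutine script; catching httpx.HTTPError is an assumption, since the actual traceback is not shown:

async def get_with_retry(url, retries=5):
    # retry with a growing delay; httpx.HTTPError is the base class for
    # transport- and protocol-level failures in httpx
    for attempt in range(retries):
        try:
            return await client.get(url, headers=headers)
        except httpx.HTTPError as e:
            print(f"request failed ({e}), retrying...")
            await asyncio.sleep(0.5 * (attempt + 1))
    raise RuntimeError(f"giving up on {url} after {retries} attempts")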

What I have tried

I have posted both the coroutine and the multithreaded crawler code above, and I would appreciate help checking whether the problem is in the code itself.

Desired outcome

I would like the coroutine crawler to run as fast as, or faster than, the multithreaded one, since that is what the resources I found online suggest should happen.
