Problem encountered and background
I've recently been learning Python web scraping and am currently trying to collect movie information from the 电影天堂 (dytt) site. I wanted to try both multithreading and coroutines, each crawling three pages of movie listings. The multithreaded version runs as fast as I expected, but the coroutine version is far slower than I anticipated. According to what I've read online, a coroutine crawler should even be faster than a multithreaded one, yet in my tests the coroutine version is barely faster than plain synchronous code. Is there a problem in how I wrote the coroutine code, or is something else causing this?
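For reference, here is a minimal self-contained sketch (not my crawler; fake_request is a made-up stand-in for a real network call) of the behaviour I expected from coroutines: three awaits started through asyncio.gather overlap and finish in roughly the time of one.

# Sketch of the concurrency I expected; fake_request simulates one network request
import asyncio
import time

async def fake_request(i):
    await asyncio.sleep(1)  # non-blocking wait standing in for network I/O
    return i

async def demo():
    return await asyncio.gather(*(fake_request(i) for i in range(3)))

t0 = time.time()
asyncio.run(demo())
print(f"elapsed: {time.time() - t0:.2f}s")  # ~1s, because the three sleeps overlap

Both of my actual crawlers are below.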
# Coroutine crawler
import time
import asyncio

import httpx
import requests  # still used for the detail pages inside fetch_content
from bs4 import BeautifulSoup
from datetime import datetime
def find_id(movie_href: str):  # extract the ID from the movie's URL
char1 = '.'
char2 = '/'
npos1 = movie_href.rfind(char1)
npos2 = movie_href.rfind(char2)
ID = movie_href[npos2 + 1:npos1]
return ID
head_list = ["ID", "电影名称", "网页", "磁链", "更新时间"]
# domain = "https://m.dytt8.net/index2.htm"  # 电影天堂 home page
# url = "https://m.dytt8.net/html/gndy/dyzz/index.html"  # latest movies
# url = "https://m.dytt8.net/html/gndy/dyzz/list_23_1.html"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 '
'Safari/537.36',
'Connection': 'close',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'referer': ''}
# httpx has no DEFAULT_RETRIES setting; connection retries are configured on the transport
transport = httpx.AsyncHTTPTransport(retries=5)
data_list = []
client = httpx.AsyncClient(timeout=None, transport=transport)
count = 0
async def craw_one_page(i):
    url = "https://m.dytt8.net/html/gndy/dyzz/list_23_" + str(i) + ".html"
    print(f"Start crawling page {i}, time: {time.time()}")
    # Retry loop I experimented with, left commented out:
    # flag = True
    # while flag:
    #     try:
    #         response = await client.get(url, headers=headers)
    #         # await asyncio.sleep(0.01)
    #         flag = False
    #     except Exception as e:
    #         print("Too many requests! " + str(e))
    #         flag = True
    response = await client.get(url, headers=headers)  # the listing page is fetched asynchronously
    await fetch_content(response)
async def fetch_content(response):
    global count
    response.encoding = 'gb2312'  # the site is encoded as gb2312
    homepage = response.text  # listing page HTML
    soup = BeautifulSoup(homepage, "lxml")
    movies_list = soup.find_all('a', class_='ulink')  # every movie link on the current page
for movie in movies_list:
name = movie.text
href = "https://m.dytt8.net" + movie['href']
ID = find_id(href)
        resp = requests.get(href, headers=headers)  # note: requests.get is synchronous and blocks the event loop
resp.encoding = 'gb2312'
soup2 = BeautifulSoup(resp.text, "lxml")
Magnet_URI = soup2.select_one('#Zoom > td > a')['href']
updated_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
movie_dict = {
"ID": ID,
"电影名称": name,
"网页": href,
"磁链": Magnet_URI,
"更新时间": updated_time}
# data_list.append(movie_dict)
print(movie_dict)
count += 1
async def main():
    pages = [1, 2, 3]
    tasks = [craw_one_page(i) for i in pages]
    await asyncio.gather(*tasks)
if __name__ == '__main__':
start = time.time()
asyncio.run(main())
# print(data_list)
print(f"耗时{(time.time()-start)}秒")
print(f"爬取电影共计{count}条")
# Multithreaded crawler
import time
import csv
import threading

import requests
from bs4 import BeautifulSoup
from datetime import datetime
count = [0, 0, 0]  # one counter per page/thread
def find_id(movie_href: str):  # extract the ID from the movie's URL
char1 = '.'
char2 = '/'
npos1 = movie_href.rfind(char1)
npos2 = movie_href.rfind(char2)
ID = movie_href[npos2 + 1:npos1]
return ID
head_list = ["ID", "电影名称", "网页", "磁链", "更新时间"]
# domain = "https://m.dytt8.net/index2.htm"  # 电影天堂 home page
# url = "https://m.dytt8.net/html/gndy/dyzz/index.html"  # latest movies
# url = "https://m.dytt8.net/html/gndy/dyzz/list_23_1.html"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 '
'Safari/537.36',
'Connection': 'close',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'referer': ''}
# requests has no DEFAULT_RETRIES setting; retries are configured via an HTTPAdapter mounted on a Session
session = requests.Session()
session.mount("https://", requests.adapters.HTTPAdapter(max_retries=5))
data_list = []
def craw_one_page(i):
    global count
    url = "https://m.dytt8.net/html/gndy/dyzz/list_23_" + str(i) + ".html"
    start = time.time()
    print(f"Start crawling page {i}, time: {start}")
    response = session.get(url, headers=headers)
    time.sleep(0.3)  # brief pause to be polite to the server
    response.encoding = 'gb2312'  # the site is encoded as gb2312
    homepage = response.text  # listing page HTML
    soup = BeautifulSoup(homepage, "lxml")
    movies_list = soup.find_all('a', class_='ulink')  # every movie link on the current page
for movie in movies_list:
name = movie.text
href = "https://m.dytt8.net" + movie['href']
ID = find_id(href)
        resp = session.get(href, headers=headers)
resp.encoding = 'gb2312'
soup2 = BeautifulSoup(resp.text, "lxml")
Magnet_URI = soup2.select_one('#Zoom > td > a')['href']
updated_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
movie_dict = {
"ID": ID,
"电影名称": movie.text,
"网页": href,
"磁链": Magnet_URI,
"更新时间": updated_time}
print(movie_dict)
count[i - 1] += 1
data_list.append(movie_dict)
    # Note: every thread rewrites test.csv with the shared data_list in mode 'w',
    # so the file ends up reflecting whichever thread finishes last.
    with open('test.csv', mode='w', encoding="gb2312", newline="") as f:
        w = csv.DictWriter(f, head_list)
        w.writeheader()
        w.writerows(data_list)
    over = time.time()
    print(f"Page {i} finished: {count[i-1]} entries, took {over - start} seconds")
if __name__ == '__main__':
Sum = 0
t1 = threading.Thread(target=craw_one_page, args=(1,))
t2 = threading.Thread(target=craw_one_page, args=(2,))
t3 = threading.Thread(target=craw_one_page, args=(3,))
t1.start()
t2.start()
t3.start()
t1.join()
t2.join()
t3.join()
for i in range(len(count)):
Sum += count[i]
print(f"总共{Sum}条数据")
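As an aside, the manual Thread/start/join bookkeeping above could also be written with the standard library's thread pool; a minimal equivalent sketch:

# Equivalent thread setup using concurrent.futures from the standard library
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=3) as pool:
    pool.map(craw_one_page, [1, 2, 3])  # the with-block waits for all three pages to finish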
Runtime results and error output
The coroutine crawler takes around 70-90 seconds to run, which is essentially the speed of a synchronous crawler, so the coroutines aren't contributing anything, while the multithreaded crawler finishes in about 20-30 seconds. In addition, the coroutine crawler sometimes throws an error after it has been running for a while.
My approach and what I've tried
I've included both the coroutine and the multithreaded crawler code above, hoping someone can check whether the problem is in the code itself.
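For the occasional errors, which look like the site rejecting overly frequent access (the same situation my commented-out retry loop was meant to handle), one option I'm considering is capping how many requests run at once with an asyncio.Semaphore. A minimal sketch, with the limit of 5 chosen arbitrarily and client and headers reused from my coroutine code:

# Sketch: cap concurrent requests with a semaphore (the limit is a guess)
sem = asyncio.Semaphore(5)

async def guarded_get(url):
    async with sem:  # at most 5 coroutines hold the semaphore at once
        return await client.get(url, headers=headers)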
The result I want to achieve
I'd like the coroutine crawler to run about as fast as, or faster than, the multithreaded one, since that is what the material I found online suggests is the expected behaviour.