使用 Python 协程多并发下载任务数卡住不动

在使用 python 协程下载图片中，最终协程的任务数卡在 97 一直循环，不知道哪里出了问题，有大佬知道什么情况吗，困扰我好久
下图是我运行结果，附上代码。
图片说明

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import time
import os
import pandas as pd
import asyncio
import aiohttp
import aiomysql
from random import randint
import cchardet
import aiofiles
import logging


class sikupicture_Spider(object):
    def __init__(self):
        # self.seens_url = []
        self.loop = asyncio.get_event_loop()
        self.queue = asyncio.PriorityQueue()
        self._workers = 0  # 当前工作数
        self._max_workers = 150  # 最大工作数
        self.overtime = {}  # {url: times,} 记录失败的URL的次数
        self.overtime_threshold = 4
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        }
        self.list_content = []

    async def init_url(self):
        info = pd.read_excel(r"{}".format(os.path.abspath('moban.xlsx'))).fillna('')
        for ite in info.itertuples():
            await self.queue.put((randint(1, 5), getattr(ite, 'url')))

    async def fetch(self, session, url, timeout, headers=None, binary=False, proxy=None):
        _headers = self.headers
        if headers:
            _headers = headers
        try:
            async with session.get(url, headers=_headers, timeout=timeout, proxy=proxy, allow_redirects=False) as resp:
                status_code = resp.status
                if status_code == 403:
                    print("url-403", url)
                    if url in self.overtime:
                        self.overtime[url] += 1
                        if self.overtime[url] > self.overtime_threshold:
                            pass
                        await self.queue.put((randint(1, 5), url))
                    else:
                        self.overtime[url] = 1
                        await self.queue.put((randint(1, 5), url))
                    status_code = 0
                    html = None
                if binary:
                    text = await resp.read()
                    encoding = cchardet.detect(text)
                    html = text.encode(encoding, errors='ignore')
                else:
                    html = await resp.text()

        except TimeoutError:
            print("url-overtime", url)
            if url in self.overtime:
                self.overtime[url] += 1
                if self.overtime[url] > self.overtime_threshold:
                    pass
                await self.queue.put((randint(1, 5), url))
            else:
                self.overtime[url] = 1
                await self.queue.put((randint(1, 5), url))
            status_code = 0
            html = None
        return status_code, html

    async def download_img(self, session, img_url, timeout, url, headers=None, binary=True, proxy=None):
        _headers = self.headers
        if headers:
            _headers = headers
        try:
            async with session.get(img_url, headers=_headers, timeout=timeout, proxy=proxy, allow_redirects=False) as resp:
                status_code = resp.status
                if binary:
                    html = await resp.read()
                else:
                    html = await resp.text()
        except TimeoutError:
            print("url-overtime", img_url)
            if url in self.overtime:
                self.overtime[url] += 1
                if self.overtime[url] > self.overtime_threshold:
                    pass
                else:
                    await self.queue.put((randint(1, 5), url))
            else:
                self.overtime[url] = 1
                await self.queue.put((randint(1, 5), url))
            status_code = 0
            html = None
        return status_code, html

    def parse_source(self, source):
        try:
            response_1 = etree.HTML(source)
        except Exception as err:
            logging.error(f'parse error:{err}')
            url = ""
        else:
            img_url = response_1.xpath("//a[@href='javascript:;']/@supsrc")[0] if len(
                response_1.xpath("//a[@href='javascript:;']/@supsrc")[0]) else ""
        return img_url

    async def process(self, session, url, timeout):
        status, source = await self.fetch(session, url, timeout)
        file_name = url.replace("http://item.secoo.com/", "").replace(".shtml", "")
        if status == 200:
            img_url = self.parse_source(source)
            img_status, img_source = await self.download_img(session, img_url, timeout, url)
            if img_status == 200:
                async with aiofiles.open("F:\\dawnzhu\\picture\\"+file_name+".jpg", "wb") as f:
                    await f.write(img_source)
            self._workers -= 1
            print("任务完成", self._workers, "url_status", status, "img_status", img_status)
        else:
            self._workers -= 1
            print("任务完成", self._workers, "url_status", status,)

    async def loop_crawl(self):
        await self.init_url()
        timeout = aiohttp.ClientTimeout(total=20)
        conn = aiohttp.TCPConnector(loop=self.loop, limit=50, force_close=True, enable_cleanup_closed=True)
        session = aiohttp.ClientSession(connector=conn, timeout=timeout)
        while True:
            if self._workers >= self._max_workers:
                print("work 的判断")
                await asyncio.sleep(5)
                continue
            if self.queue.empty():
                print("队列是否为空....", self._workers)
                await asyncio.sleep(5)
                if self._workers == 0:
                    break
                continue
            _, url = await self.queue.get()
            asyncio.ensure_future(self.process(session, url, timeout))
            self._workers += 1
            print("队列剩余数量", self.queue.qsize(), self._workers)
        await session.close()

    def run(self):
        try:
            self.loop.run_until_complete(self.loop_crawl())
        except KeyboardInterrupt:
            self.loop.close()

if __name__ == '__main__':
    sp = sikupicture_Spider()
    sp.run()

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除
收藏举报

1条回答

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
zqbnqsdsmd 2020-09-17 00:13
关注
https://www.csdn.net/gather_23/MtjakgysMTA3MTAtYmxvZwO0O0OO0O0O.html

解决无用
评论打赏
分享
举报

评论

按下Enter换行，Ctrl+Enter发表内容

报告相同问题？

关注问题

python异步协程和多线程问题 python 有问必答爬虫
2021-11-25 21:18

回答 1 已采纳 aiohttp的高并发用了协程，而request+线程只是多线程，这个不一样。理论上是aiohttp速度比request+线程的快，但爬虫太快很容易被反爬
python使用requests大批量请求卡住怎么解决？ linux python
2019-01-29 16:24

回答 6 已采纳首先你的request最好设置一下超时，如果某个链接长时间没响应，你就会阻塞在那里；然后得百分百确认你的xpath是正确的，且是匹配里面所有url的，这个地方可以打个日志，看看是阻塞在某次循环了，还是
python 的多线程和协程？ python 有问必答
2021-05-25 23:59

回答 2 已采纳协程，英文Coroutines，是一种比线程更加轻量级的存在。正如一个进程可以拥有多个线程一样，一个线程也可以拥有多个协程。最重要的是，协程不是被操作系统内核所管理，而完全是由程序所控制（也就是在用户
Python全栈（四）高级编程技巧之10.Python多任务-协程
2020-02-07 20:32

AI码东道主的博客使用yield完成多任务，消耗的资源比线程、进程更少，yield from相当于一个for循环，并且省去了很多异常处理，协程是Python中另外一种实现多任务的方式，通过yield保存运行状态，用greenlet模块需要人工切换，gevent...
根据python编程从入门到实践书籍安装一个库一直卡着不动 python
2018-08-13 06:32

回答 2 已采纳这个提示你是正在升级brew，brew升级的话本身就是比较慢的，如果你是需要安装pygame库的话，直接pip install pygame 就可以了或者从网站上下载对应的资源就行了（pyt
不同协程执行了同一个任务多次 django python
2023-03-20 09:32

回答 3 已采纳该回答引用GPTᴼᴾᴱᴺᴬᴵ根据您提供的代码，存在以下问题：在主函数中，创建了两个进程，每个进程中又创建了三个线程，每个线程中又创建了五个协程，总共创建了30个协程，但是只放了1000个任务到队列中
Python：多输入数字求和（Python 程序控制结构） python
2021-10-25 22:41

回答 1 已采纳 sum=0 while True: number=input() if number: sum+=int(number) else: break
Python中的并发与协程：从asyncio到aiohttp
2023-09-15 12:29

禅与计算机程序设计艺术的博客随着计算机硬件性能的不断提升，越来越多的应用程序开始采用异步编程模式。协程（Coroutine）是一种比线程更轻量级的执行体，它可以让任务在运行时状态保存上下文信息，并且可以在不同点继续执行。协程通过“yield ...
用Python编程输入的数字以0结束 python
2021-10-29 22:19

回答 1 已采纳 s1=s2=0 while 1: n=int(input()) if n==0: break if n%3==0: s1+=1 if n
Python 多输入数字求和 python
2022-05-25 22:17

回答 3 已采纳空格改成空字符串
python3中怎么利用多线程快速打印数字？ python
2018-08-02 02:11

回答 2 已采纳多线程没办法“快速”打印数字。多线程之所以能提速，是有条件的，它主要解决两个问题，一个是充分利用多个cpu（或者多核cpu），实现并行计算。另一个是异步调用实现延迟隐藏。就打印数字这个
python并发编程多线程/多进程/协程
2022-08-29 17:26

ZHHHHHJ66的博客 python并发编程
[Python]asyncio异步协程停止问题 python 有问必答
2021-11-10 14:03

回答 3 已采纳 RuntimeError: no running event loop的错误没有在运行的事件循环应该是这里的问题吧，loop = asyncio.get_event_loop()
python协程的作用_浅谈Python协程
2020-11-21 00:47

weixin_39705193的博客协程协程，又称微线程，纤程。英文名Coroutine。一句话说明什么是线程：协程是一种用户态的轻量级线程。协程拥有自己的寄存器上下文和栈。协程调度切换时，将寄存器上下文和栈保存到其他地方，在切回来的时候，恢复...
『Python学习笔记』Python实现并发编程(补充joblib和pymysql)
2022-03-06 22:12

AI新视界的博客并发编程都是一项很常用很重要的技巧。比如我们上节课所讲的很常见的爬虫，就被广泛应用在工业界的各个领域。我们每天在各个网站、各个 App 上获取的新闻信息，很大一部分便是通过并发编程版的爬虫获得。 Python连接...
没有解决我的问题, 去提问

悬赏问题

¥50 如何用脚本实现输入法的热键设置
¥20 我想使用一些网络协议或者部分协议也行，主要想实现类似于traceroute的一定步长内的路由拓扑功能
¥30 深度学习，前后端连接
¥15 孟德尔随机化结果不一致
¥15 apm2.8飞控罗盘bad health，加速度计校准失败
¥15 求解O-S方程的特征值问题给出边界层布拉休斯平行流的中性曲线
¥15 谁有desed数据集呀
¥20 手写数字识别运行c仿真时，程序报错错误代码sim211-100
¥15 关于#hadoop#的问题
¥15 (标签-Python|关键词-socket)

使用 Python 协程多并发下载 任务数卡住不动

1条回答

悬赏问题

使用 Python 协程多并发下载任务数卡住不动