python异步爬虫：（视频过大导致的）TimeoutError

爬取目标：原神官网新闻页面上的封面图片及其中的大图与视频
遇到的问题：原神官网视频太大，由于有时限导致任务被取消并报错TimeoutError，结果导致视频字节数据不完整即无法打开

问题出现的部分代码：


    async def request_IV(self, href, special, page_str):
        for src in href:
            src_href = src.group('href')

            self.src_href = src_href

            if self.check('http'):  # 检查得到的内容开头字符串是否是http
                src_href = src_href.replace('\\u002F', '/')
            else:
                special = True
                break

            async with aiohttp.ClientSession() as img_session:
                async with img_session.get(src_href) as picture:
                    # 请求网址，并写入字节数据
                    name = src_href.split('/')[-1]
                    async with aiofiles.open(f"img&video/{name}", 'wb') as f:
                        # 原神官网视频太大，由于有时限导致任务被取消，视频字节数据不完整即无法打开，遇到视频时保证字节写入得以正常进行
                        await f.write(picture.content.read())
                        print(f'下载地址：{src_href} 下载完成！{name}')
        if special:
            await special_href(page_str)

程序整个代码：

import asyncio
import aiohttp
import aiofiles
import re


# 从方法脱离出的函数：
async def cover(value, title):
    if value:
        cover_picture_url = value[0]['url']
        # 请求封面图片网址，获取字节数据并写入：
        # cover_picture = requests.get(cover_picture_url)
        async with aiohttp.ClientSession() as main_session:
            async with main_session.get(cover_picture_url) as cover_picture:
                content = await cover_picture.content.read()
                async with aiofiles.open(f'cover_img/{title}.jpg', 'wb') as cpf:
                    await cpf.write(content)
                    print(f'下载完成！地址：{cover_picture_url}标题名：{title}')


async def special_href(page_str):
    text = await page_str.text()
    special_obj = re.compile(r'window.+?height=.+?src=\\"(?P<href>.+?)\\" width=.+?;</script>')
    if not special_obj.findall(text):
        special_obj = re.compile(r'window.+?height=.+?src=\\"(?P<href>.+?)\\" style=\\"max-width.+?;</script>')
    href_special_obj = special_obj.finditer(text)  # text原：await page_str.text()
    for src in href_special_obj:
        src_href = src.group('href')
        src_href = src_href.replace('\\u002F', '/')
        # special = requests.get(href)
        async with aiohttp.ClientSession() as special_session:
            async with special_session.get(src_href) as special:
                # 请求视频或图片网址，并写入字节数据
                name = src_href.split('/')[-1]
                async with aiofiles.open(f"img&video/{name}", 'wb') as pf:
                    await pf.write(await special.content.read())
                    print(f'下载地址：{src_href} 下载完成！{name}')


class GenshinImgVideo:
    def __init__(self, url: str):
        self.url = url
        # 在后期运行时发现存在的问题：
        self.problem = None
        # 保存匹配到的内容，以检查是否正确：
        self.src_href = None

    # 问题处理
    def check(self, goal_str: str):
        goal = self.src_href[0:4]
        # 检查所获取的网址内容前四位是否是目标字符
        if goal_str == goal:
            return True
        else:
            return False

    async def get_data_list(self):
        tasks = []
        url = self.url
        # resp = requests.get(url)
        async with aiohttp.ClientSession() as data_session:
            async with data_session.get(url) as resp:
                all_dic = await resp.json()
                dicA = all_dic['data']['list']
                # 开始查获相关数据
                for dic in dicA:
                    contentId = dic['contentId']
                    title = dic['title']
                    value = dic['ext'][1]['value']
                    task = asyncio.create_task(self.main(contentId, title, value))
                    tasks.append(task)
                await asyncio.wait(tasks)

    async def request_IV(self, href, special, page_str):
        for src in href:
            src_href = src.group('href')

            self.src_href = src_href

            if self.check('http'):
                src_href = src_href.replace('\\u002F', '/')
            else:
                special = True
                break

            async with aiohttp.ClientSession() as img_session:
                async with img_session.get(src_href) as picture:
                    # 请求网址，并写入字节数据
                    name = src_href.split('/')[-1]
                    async with aiofiles.open(f"img&video/{name}", 'wb') as f:
                        # 原神官网视频太大，由于有时限导致任务被取消，视频字节数据不完整即无法打开，遇到视频时保证字节写入得以正常进行
                        await f.write(picture.content.read())
                        print(f'下载地址：{src_href} 下载完成！{name}')
        if special:
            await special_href(page_str)

    async def enter_to_find(self, contentId):
        # 开始进入子页面
        child_page_url = f'https://ys.mihoyo.com/main/news/detail/{contentId}'

        self.problem = child_page_url  # 应对后期发现的某个问题

        # page_str = requests.get(child_page_url)
        async with aiohttp.ClientSession() as child_session:
            async with child_session.get(child_page_url) as page_str:
                # 寻找文件名并合成视图请求网址
                special = False
                obj_img = re.compile(r'window.+?src=\\"(?P<href>.+?)\\" width=.+?;</script>')
                obj_video = re.compile(r'window.+?src=\\"(?P<href>.+?)\\" style=\\"max-width.+?;</script>')
                obj_video_new = re.compile(r'window.+?poster=.+?src=\\"(?P<href>.+?)\\".+?&hellip;&hellip;', re.S)
                text = await page_str.text()
                href_obj_img = obj_img.finditer(text)
                href_obj_video = obj_video.finditer(text)
                href_obj_video_new = obj_video_new.finditer(text)
                if obj_img.findall(text):
                    await self.request_IV(href_obj_img, special, page_str)
                elif obj_video.findall(text):
                    # 原神官网视频太大，由于有时限导致任务被取消，视频字节数据不完整即无法打开
                    await asyncio.shield(self.request_IV(href_obj_video, special, page_str))
                    """但是这个地方不是问题源头"""
                elif obj_video_new.findall(text):
                    # 原神官网视频太大，由于有时限导致任务被取消，视频字节数据不完整即无法打开
                    await asyncio.shield(self.request_IV(href_obj_video_new, special, page_str))
                    """但是这个地方不是问题源头"""

    async def main(self, contentId, title, value):
        tasks = [
            asyncio.create_task(cover(value, title)),
            asyncio.create_task(self.enter_to_find(contentId))
        ]
        await asyncio.wait(tasks)

    async def run(self):
        try:
            await self.get_data_list()
        except TimeoutError:
            self.write_error(None)
        except aiohttp.ClientPayloadError as eC:
            self.write_error(eC)
        else:
            pass

    def write_error(self, em):
        if em:
            print(em)
            with open('Error log.txt', 'a', encoding='utf-8') as fE:
                fE.write(str(em))
        print(self.problem)
        with open(f'log.txt', 'a', encoding='utf-8') as file:
            file.write(self.problem)
    """
    # 对于后期发现的问题，打印以显示，并保存到日志文件中
    def puts_problem(self, num_time: int):
        print(self.problem)
        with open(f'log{num_time}.txt', 'w', encoding='utf-8') as file:
            file.write(self.problem)"""


async def main(number: int):
    tasks = []
    for i in range(1, number+1):
        url = f'https://content-static.mihoyo.com/content/ysCn/getContentList?pageSize=5&pageNum={i}&channelId=10'
        exe = GenshinImgVideo(url)
        task = asyncio.create_task(exe.run())
        tasks.append(task)
    await asyncio.wait(tasks)


if __name__ == '__main__':
    times = input('请输入所翻页数：')
    try:
        num = int(times)
    except ValueError:
        print('请输入数字！')
    else:
        asyncio.run(main(num))

有人说说用asyncio.shield()进行封装，保证任务不会被取消，但我不太会用，封装picture.content.read()结果一样，封装f.write(picture.content.read())又报TypeError错误，所以谢谢各位帮我解决这个问题了

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除
收藏举报

14条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
这次真没糖 2023-06-04 10:12
关注
我根据你的逻辑没有找到原神视频url，不知道这视频传输是什么，只能建议你这用多线程试试看，另外我发现你这所有的地方异步，但是实际上只需要异步下载图片和视频那里就行了，需要运用一个生产者-多个消费者模型。
我一般就用线程池爬取，只要能够获取到准确的数据就行了，效率高低不太重要，快了被封或者把别人服务器搞崩，那是要喝茶的

本回答被题主选为最佳回答 , 对您是否有帮助呢?

解决无用
评论打赏
分享
举报编辑记录

评论

按下Enter换行，Ctrl+Enter发表内容

查看更多回答(13条)

报告相同问题？

关注问题

关于#python#的问题：python爬虫发送请求时添加cookie过长导致报错 python 爬虫
2022-08-25 15:41

回答 3 已采纳这并不是过长导致的，这是格式错误，参考下面步骤，不用一个一个手写参数，直接生成所有请求参数代码就不会报这种错误了：浏览器抓包找到该请求，右键复制-->以cULR格式复制到https://spi
Python爬虫错误：json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) json python 有问必答爬虫
2022-02-22 13:40

回答 5 已采纳 async+await方法的url多了个斜杠，去掉就行了。要不多的那个斜杠接口出错返回的是html代码了，调用json()出错了，内容不是json字符串测试代码如下 import json im
python中{:2d}是什么意思 python 有问必答
2021-07-07 02:04

回答 2 已采纳 d表示要输出一个整数，2表示这个整数至少要占2个字符，如果这个整数只有一位数不足2个字符，会在前面补充空格。
python 基于aiohttp的异步爬虫实战详解
2022-09-14 20:13

程序员王炸的博客接下来我们会详细介绍aiohttp库的用法和爬取实战。aiohttp 是一个支持异步请求的库，它和 asyncio 配合使用，可以使我们非常方便地实现异步请求...以上就是借助协程async和异步aiohttp两个主要模块完成异步爬虫的内容，
Python网络爬虫中json解析失败 json python 有问必答爬虫
2022-02-26 20:51

回答 2 已采纳这个接口返回的是jsonp数据，不是json，要获取text替换掉回调函数名称和前后的括号后才是json数据
Python爬虫配合VPN爬取出现报错 python 爬虫
2021-12-22 17:33

回答 1 已采纳你这个是VPN代理问题，你可以将VPN设置成部分代理，不要全部代理你的网络。
python爬虫html获取不全 html python 爬虫
2022-06-24 19:43

回答 1 已采纳其实有的，但是这个网站应该是为了懒加载把url用base64密了一下，然后再动态加载，其实我下面发的这个就是url 是base64后的url 解码后就是https://s1.aigei.com/
python 异步io_Python中的异步IO：完整的演练
2020-07-15 01:15

cumei1658的博客 python 异步ioAsync IO is a concurrent programming design that has received dedicated support in Python, evolving rapidly from Python 3.4 through... 异步IO是一种并发编程设计，已获得Python的专门支持，从...
python爬虫：soup.select（）抓取信息路径表达问题 python
2019-07-02 18:16

回答 1 已采纳没用过select，但看样子是这样用的 ``` from bs4 import BeautifulSoup import requests url = 'http://bj.xiaozhu
用python抓取爬虫时无法抓取::before与::after之间的内容
2016-10-06 03:24

回答 3 已采纳可能是Ajax异步加载的。需要用selenium等模拟浏览器
Python爬虫时遇到问题： json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) json python 爬虫
2022-07-23 12:09

回答 3 已采纳
pythonapi异步_python基础教程：异步IO 之 API
2020-11-27 17:43

weixin_39930557的博客到了Python最新稳定版 3.7 这个版本，asyncio又做了比较大的调整，把这个库的API分为了高层级API和低层级API，并引入asyncio.run()这样的高级方法，让编写异步程序更加简洁。本节希望提纲挈领地介绍最新 3.7 版的...
关于使用python实现的网页爬虫程序卡死的问题 python 有问必答爬虫
2021-08-07 13:04

回答 3 已采纳你可以用time模块进行计时，每过10分钟先用os.system()重新打开程序，然后调用sys.exit()关闭旧进程如果有用，希望采纳哦~
python 异步io框架_Python的异步IO：API
2021-01-13 19:59

小肉卷的博客到了Python最新稳定版 3.7 这个版本，asyncio又做了比较大的调整，把这个库的API分为了高层级API和低层级API，并引入asyncio.run()这样的高级方法，让编写异步程序更加简洁。本节希望提纲挈领地介绍最新 3.7 版的...
python异步编程asyncio
2022-03-11 11:21

hbase丶的博客前提概要：python因为GIL锁，所以运行都是单线程，导致python运行的速度慢，为此要解决这个问题有多进程、多线程，但是使用这些方法，我们就要多加考虑线程安全问题，顾很麻烦，所以推出了协程。协程运行在线程上，...
没有解决我的问题, 去提问

问题事件

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
系统已结题 6月18日
关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
已采纳回答 6月10日
关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
创建了问题 6月4日

悬赏问题

¥15 matlab数据降噪处理，提高数据的可信度，确保峰值信号的不损失？
¥15 怎么看我在bios每次修改的日志
¥15 python+mysql图书管理系统
¥15 Questasim Error: (vcom-13)
¥15 船舶旋回实验matlab
¥30 SQL 数组，游标，递归覆盖原值
¥15 为什么我的数据接收的那么慢呀有没有完整的 hal 库并代码呀有的话能不能发我一份并且我用 printf 函数显示处理之后的数据，用 debug 就不能运行了呢
¥20 gitlab 中文路径，无法下载
¥15 用动态规划算法均分纸牌
¥30 udp socket，bind 0.0.0.0 ，如何自动选取用户访问的服务器IP来回复数据

python异步爬虫：（视频过大导致的）TimeoutError

14条回答 默认 最新

问题事件

悬赏问题

14条回答默认最新