爬取目标:原神官网新闻页面上的封面图片及其中的大图与视频
遇到的问题:原神官网视频太大,由于有时限导致任务被取消并报错TimeoutError,结果导致视频字节数据不完整即无法打开
问题出现的部分代码:
async def request_IV(self, href, special, page_str):
for src in href:
src_href = src.group('href')
self.src_href = src_href
if self.check('http'): # 检查得到的内容开头字符串是否是http
src_href = src_href.replace('\\u002F', '/')
else:
special = True
break
async with aiohttp.ClientSession() as img_session:
async with img_session.get(src_href) as picture:
# 请求网址,并写入字节数据
name = src_href.split('/')[-1]
async with aiofiles.open(f"img&video/{name}", 'wb') as f:
# 原神官网视频太大,由于有时限导致任务被取消,视频字节数据不完整即无法打开,遇到视频时保证字节写入得以正常进行
await f.write(picture.content.read())
print(f'下载地址:{src_href} 下载完成!{name}')
if special:
await special_href(page_str)
程序整个代码:
import asyncio
import aiohttp
import aiofiles
import re
# 从方法脱离出的函数:
async def cover(value, title):
if value:
cover_picture_url = value[0]['url']
# 请求封面图片网址,获取字节数据并写入:
# cover_picture = requests.get(cover_picture_url)
async with aiohttp.ClientSession() as main_session:
async with main_session.get(cover_picture_url) as cover_picture:
content = await cover_picture.content.read()
async with aiofiles.open(f'cover_img/{title}.jpg', 'wb') as cpf:
await cpf.write(content)
print(f'下载完成!地址:{cover_picture_url}标题名:{title}')
async def special_href(page_str):
text = await page_str.text()
special_obj = re.compile(r'window.+?height=.+?src=\\"(?P<href>.+?)\\" width=.+?;</script>')
if not special_obj.findall(text):
special_obj = re.compile(r'window.+?height=.+?src=\\"(?P<href>.+?)\\" style=\\"max-width.+?;</script>')
href_special_obj = special_obj.finditer(text) # text原:await page_str.text()
for src in href_special_obj:
src_href = src.group('href')
src_href = src_href.replace('\\u002F', '/')
# special = requests.get(href)
async with aiohttp.ClientSession() as special_session:
async with special_session.get(src_href) as special:
# 请求视频或图片网址,并写入字节数据
name = src_href.split('/')[-1]
async with aiofiles.open(f"img&video/{name}", 'wb') as pf:
await pf.write(await special.content.read())
print(f'下载地址:{src_href} 下载完成!{name}')
class GenshinImgVideo:
def __init__(self, url: str):
self.url = url
# 在后期运行时发现存在的问题:
self.problem = None
# 保存匹配到的内容,以检查是否正确:
self.src_href = None
# 问题处理
def check(self, goal_str: str):
goal = self.src_href[0:4]
# 检查所获取的网址内容前四位是否是目标字符
if goal_str == goal:
return True
else:
return False
async def get_data_list(self):
tasks = []
url = self.url
# resp = requests.get(url)
async with aiohttp.ClientSession() as data_session:
async with data_session.get(url) as resp:
all_dic = await resp.json()
dicA = all_dic['data']['list']
# 开始查获相关数据
for dic in dicA:
contentId = dic['contentId']
title = dic['title']
value = dic['ext'][1]['value']
task = asyncio.create_task(self.main(contentId, title, value))
tasks.append(task)
await asyncio.wait(tasks)
async def request_IV(self, href, special, page_str):
for src in href:
src_href = src.group('href')
self.src_href = src_href
if self.check('http'):
src_href = src_href.replace('\\u002F', '/')
else:
special = True
break
async with aiohttp.ClientSession() as img_session:
async with img_session.get(src_href) as picture:
# 请求网址,并写入字节数据
name = src_href.split('/')[-1]
async with aiofiles.open(f"img&video/{name}", 'wb') as f:
# 原神官网视频太大,由于有时限导致任务被取消,视频字节数据不完整即无法打开,遇到视频时保证字节写入得以正常进行
await f.write(picture.content.read())
print(f'下载地址:{src_href} 下载完成!{name}')
if special:
await special_href(page_str)
async def enter_to_find(self, contentId):
# 开始进入子页面
child_page_url = f'https://ys.mihoyo.com/main/news/detail/{contentId}'
self.problem = child_page_url # 应对后期发现的某个问题
# page_str = requests.get(child_page_url)
async with aiohttp.ClientSession() as child_session:
async with child_session.get(child_page_url) as page_str:
# 寻找文件名并合成视图请求网址
special = False
obj_img = re.compile(r'window.+?src=\\"(?P<href>.+?)\\" width=.+?;</script>')
obj_video = re.compile(r'window.+?src=\\"(?P<href>.+?)\\" style=\\"max-width.+?;</script>')
obj_video_new = re.compile(r'window.+?poster=.+?src=\\"(?P<href>.+?)\\".+?……', re.S)
text = await page_str.text()
href_obj_img = obj_img.finditer(text)
href_obj_video = obj_video.finditer(text)
href_obj_video_new = obj_video_new.finditer(text)
if obj_img.findall(text):
await self.request_IV(href_obj_img, special, page_str)
elif obj_video.findall(text):
# 原神官网视频太大,由于有时限导致任务被取消,视频字节数据不完整即无法打开
await asyncio.shield(self.request_IV(href_obj_video, special, page_str))
"""但是这个地方不是问题源头"""
elif obj_video_new.findall(text):
# 原神官网视频太大,由于有时限导致任务被取消,视频字节数据不完整即无法打开
await asyncio.shield(self.request_IV(href_obj_video_new, special, page_str))
"""但是这个地方不是问题源头"""
async def main(self, contentId, title, value):
tasks = [
asyncio.create_task(cover(value, title)),
asyncio.create_task(self.enter_to_find(contentId))
]
await asyncio.wait(tasks)
async def run(self):
try:
await self.get_data_list()
except TimeoutError:
self.write_error(None)
except aiohttp.ClientPayloadError as eC:
self.write_error(eC)
else:
pass
def write_error(self, em):
if em:
print(em)
with open('Error log.txt', 'a', encoding='utf-8') as fE:
fE.write(str(em))
print(self.problem)
with open(f'log.txt', 'a', encoding='utf-8') as file:
file.write(self.problem)
"""
# 对于后期发现的问题,打印以显示,并保存到日志文件中
def puts_problem(self, num_time: int):
print(self.problem)
with open(f'log{num_time}.txt', 'w', encoding='utf-8') as file:
file.write(self.problem)"""
async def main(number: int):
tasks = []
for i in range(1, number+1):
url = f'https://content-static.mihoyo.com/content/ysCn/getContentList?pageSize=5&pageNum={i}&channelId=10'
exe = GenshinImgVideo(url)
task = asyncio.create_task(exe.run())
tasks.append(task)
await asyncio.wait(tasks)
if __name__ == '__main__':
times = input('请输入所翻页数:')
try:
num = int(times)
except ValueError:
print('请输入数字!')
else:
asyncio.run(main(num))
有人说说用asyncio.shield()进行封装,保证任务不会被取消,但我不太会用,封装picture.content.read()结果一样,封装f.write(picture.content.read())又报TypeError错误,所以谢谢各位帮我解决这个问题了