import os
import re
from urllib.parse import urljoin

import requests
from openpyxl import Workbook
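# NOTE: spreadsheet export is currently disabled. The commented-out lines
# below would create a workbook whose first row is the single header
# "详情介绍" ("detail description").
# wb=Workbook()
# ws =wb.active
# ws.append(["详情介绍"])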
# HTTP headers sent with every request; the User-Agent is a desktop Chrome
# browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/93.0.4577.63 Safari/537.36"
    ),
}

# Listing-page URL template; `{}` takes the page number.
page_url = "https://www.hjutv.cn/" "show_2________{}___.html"

# Detail-page URL template; `{}` takes the numeric id,
# e.g. https://www.hjutv.cn/detail_6119.html
detail_url = "https://www.hjutv.cn/" "detail_{}.html"
# Seconds to wait for any single HTTP request before giving up.
request_timeout = 10

# Patterns are compiled once here instead of once per listing page.
li_pattern = re.compile(r'<li class="vodlist_item .*?">(.*?)</li>', re.S)
href_pattern = re.compile(r'detail_(\d+)', re.S)
# NOTE(review): not used by the code below; presumably meant for the
# description on the detail page -- confirm before removing.
detail_pattern = re.compile(r'<div class="content_desc context clearfix"><span>([^<]+)', re.S)
name_pattern = re.compile(r'<p class="vodlist_title"><a .*?>(.*?)</a></p>', re.S)
# Sample thumbnail markup kept from the original notes:
#   <a class="vodlist_thumb lazyload" href="/index.php/detail_6091.html"
#   data_original="https.jpg" ></a>
# Other attributes may precede the image attribute, and it has been seen
# spelled both `data-original` and `data_original`, so both are accepted.
img_pattern = re.compile(
    r'<a class="vodlist_thumb lazyload"[^>]*?data[-_]original\s*=\s*"?([^"\s>]+)',
    re.S,
)

# Characters that are not allowed in a file name on common file systems.
_unsafe_filename_chars = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')


def safe_filename(name):
    """Return *name* reduced to a string usable as one file-name component.

    Runs of path separators, reserved punctuation and control characters
    are replaced by a single underscore; surrounding spaces and dots are
    removed.  "untitled" is returned when nothing usable remains.
    """
    cleaned = _unsafe_filename_chars.sub("_", name).strip(" .")
    return cleaned or "untitled"


def parse_listing(html):
    """Extract the video entries from the HTML of one listing page.

    Returns a list of ``(name, detail_id, img_src)`` tuples, one for each
    ``<li class="vodlist_item ...">`` element that has both a detail link
    and a title.  ``detail_id`` is the digit string from the ``detail_<id>``
    link, and ``img_src`` is the raw thumbnail attribute value or ``None``
    when no thumbnail anchor is found.  Elements lacking a link or a title
    are skipped instead of raising ``IndexError``.
    """
    entries = []
    for li in li_pattern.findall(html):
        href_match = href_pattern.search(li)
        name_match = name_pattern.search(li)
        if href_match is None or name_match is None:
            continue
        img_match = img_pattern.search(li)
        entries.append((
            name_match.group(1).strip(),
            href_match.group(1),
            img_match.group(1) if img_match else None,
        ))
    return entries


def download_image(img_url, dest_path):
    """Download *img_url* and write the response body to *dest_path*.

    Raises ``requests.RequestException`` on network errors or a non-2xx
    status; the destination file is only opened after a successful fetch.
    """
    response = requests.get(url=img_url, headers=headers, timeout=request_timeout)
    response.raise_for_status()
    with open(dest_path, "wb") as f:
        f.write(response.content)


def scrape_listing_pages(first_page=1, last_page=27, out_dir="./data"):
    """Fetch each listing page and save every entry's thumbnail as a JPEG.

    Pages ``first_page`` through ``last_page`` (inclusive) are requested via
    ``page_url``.  Each entry's name is printed and its thumbnail is stored
    as ``<out_dir>/<name>.jpg``.  A failed page or image request is reported
    on stdout and skipped so that one bad URL does not abort the whole run.
    """
    os.makedirs(out_dir, exist_ok=True)
    for page_no in range(first_page, last_page + 1):
        url = page_url.format(page_no)
        try:
            response = requests.get(url=url, headers=headers, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("skipping page {}: {}".format(url, exc))
            continue
        # requests falls back to ISO-8859-1 when a text response declares no
        # charset, which would garble non-Latin titles; detect it instead.
        if response.encoding is None or response.encoding.lower() == "iso-8859-1":
            response.encoding = response.apparent_encoding

        for name, detail_id, img_src in parse_listing(response.text):
            href = detail_url.format(detail_id)
            print(name)
            if img_src is None:
                print("no thumbnail found for {} ({})".format(name, href))
                continue
            # The attribute may hold a relative or protocol-relative URL.
            img_url = urljoin(url, img_src)
            dest_path = os.path.join(out_dir, "{}.jpg".format(safe_filename(name)))
            try:
                download_image(img_url, dest_path)
            except requests.RequestException as exc:
                print("failed to download {} ({}): {}".format(img_url, href, exc))


if __name__ == "__main__":
    scrape_listing_pages()