import requests
from lxml import etree
from openpyxl import Workbook
wb = Workbook()#实例化工作表
ws = wb.active#激活工作表
ws.append(["剧名","演员","更新状态","详情页链接"]) # 添加表头
#准备url和headers
headers={
"User-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Mobile Safari/537.36"
}
all_url ="https://www.hjutv.cn/show_2________{}___.html"
base_url="https://www.hjutv.cn/type_2_.html" #base_url="https://www.hjutv.cn/"
for i in range(1,28):
url = all_url.format(i)
response = requests.get(url=url, headers=headers)
html =etree.HTML (response.text)
li_list = html.xpath('//ul[@class="vodlist vodlist_wi author*qq3626/95/000 clearfix"]/li')
print(len(li_list))
for li in li_list:
# 1.获取剧名
name = li.xpath('.//p[@class="vodlist_title"]/a/text()')[0]
# 2.获取演员
actor = li.xpath('.//p[@class="vodlist_sub"]//a/text()')
if len(actor)==0:
actor="--"
else:
actor=','.join(actor)
# 3.获取更新状态
updatestatus=li.xpath('.//span[@class="pic_text text_right"]/text()')[0]
# 4.获取详情页链接
detaillink=li.xpath('.//li[@class="vodlist_item num_1"]/a/@href')[0]
href=base_url+detaillink
print(name,actor,updatestatus,href)
# ws.append([name,actor,updatestatus])
# wb.save("韩剧数据表.xlsx")
抓取韩剧网站多个网页的韩剧详情页链接,图片,现在怎么改才能运行出来
- 写回答
- 好问题 0 提建议
- 追加酬金
- 关注问题
- 邀请回答
-
2条回答 默认 最新
- CSDN专家-HGJ 2021-12-10 20:09关注
请求的headers和url都要写正确,添加referer,请求头改为 "user-agent",原代码中detaillink的xpath不正确,多页请求时添加延时。测试代码如下,可获取数据。
import requests from lxml import etree from openpyxl import Workbook import time wb = Workbook()#实例化工作表 ws = wb.active#激活工作表 ws.append(["剧名","演员","更新状态","详情页链接"]) # 添加表头 #准备url和headers headers={ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.43",'referer': 'https://www.hjutv.cn/show_2___________.html', } all_url = "https://www.hjutv.cn/show_2________{}___.html" base_url="https://www.hjutv.cn/" for i in range(1,3): url = all_url.format(i) response = requests.get(url=url, headers=headers) html =etree.HTML (response.text) li_list = html.xpath( '//ul[@class="vodlist vodlist_wi author*qq3626/95/000 clearfix"]/li') #//*[@id="show_page"]/div[2]/div/div[2]/ul[1]/li[1] print(len(li_list)) for li in li_list: # 1.获取剧名 name = li.xpath('.//p[@class="vodlist_title"]/a/text()')[0] # 2.获取演员 actor = li.xpath('.//p[@class="vodlist_sub"]//a/text()') if len(actor)==0: actor="--" else: actor=','.join(actor) # 3.获取更新状态 updatestatus=li.xpath('.//span[@class="pic_text text_right"]/text()')[0] # 4.获取详情页链接 detaillink=li.xpath('.//a/@href')[0] href=base_url+detaillink print(name,actor,updatestatus,href) ws.append([name,actor,updatestatus,href]) time.sleep(1) wb.save("韩剧数据表.xlsx")
如有帮助,请点采纳。
本回答被题主选为最佳回答 , 对您是否有帮助呢?解决 1无用
悬赏问题
- ¥15 2020长安杯与连接网探
- ¥15 关于#matlab#的问题:在模糊控制器中选出线路信息,在simulink中根据线路信息生成速度时间目标曲线(初速度为20m/s,15秒后减为0的速度时间图像)我想问线路信息是什么
- ¥15 banner广告展示设置多少时间不怎么会消耗用户价值
- ¥16 mybatis的代理对象无法通过@Autowired装填
- ¥15 可见光定位matlab仿真
- ¥15 arduino 四自由度机械臂
- ¥15 wordpress 产品图片 GIF 没法显示
- ¥15 求三国群英传pl国战时间的修改方法
- ¥15 matlab代码代写,需写出详细代码,代价私
- ¥15 ROS系统搭建请教(跨境电商用途)