python爬取问题list index out of range

有偿帮忙看一下Python爬取文档信息所出现的问题

import requests
from lxml import etree
import csv
BEST_detail='https://www.ygdy8.net'

url='https://www.ygdy8.net/html/gndy/dyzz/20230630/63864.html'

headers={
    'User_Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
}

def get_detail_url(url):
    response=requests.get(url,headers=headers)

    text=response.content.decode('gbk')


    html=etree.HTML(text)

## print(text)
    detail_url=html.xpath("table[@class='tbspan']//a/@herf")
    ## print(detail_url)
    return detail_url

## get_detail_url(url)

def parse_detail_page(url):
    ## print(url)
    movie={}
    response=requests.get(url,headers=headers)
    text=response.content.decode('gbk')
    ## print(text)
    
    
    html=etree.HTML(text)
    title=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    ## print(title)，加【0】获取列表里的元素
    movie['title']=title
    zoomE=html.xpath("//div[@id='Zoom']")[0]
    ## print(zoomE)
    cover=zoomE.xpath(".//img/@src")[0]
    ## print(cover) 获取海报图片链接地址
    movie['cover']=cover
    movie['grade']=0
    information=zoomE.xpath(".//text()")
    ## print(information)，解析内容，是个列表，需要对内容解析
    
    for index,info in enumerate(information):
        
        ## print(info) 更简洁
        ## print(index) 列表序号下标
        ## 先获得译名
        if info.startswith("◎译　　名"):
            info=info.replace("◎译　　名","").strip()
            ##print(info)
            movie['translate']=info
        if info.startswith("◎年　　代"):
            info=info.replace("◎年　　代","").strip()
            ##print(info)
            movie['year']=info
        if info.startswith("◎产　　地"):
            info=info.replace("◎产　　地","").strip()
            ##print(info)
            movie['area']=info            
        if info.startswith("◎类　　别"):
            info=info.replace("◎类　　别","").strip()
            ##print(info)
            movie['class']=info  
        if info.startswith("◎语　　言"):
            info=info.replace("◎语　　言","").strip()
            ##print(info)
            movie['language']=info              
        if info.startswith("◎字　　幕"):
            info=info.replace("◎字　　幕","").strip()
            ##print(info)
            movie['zimu']=info                     
        if info.startswith("◎上映日期"):
            info=info.replace("◎上映日期","").strip()
            ##print(info)
            movie['data']=info     
        if info.startswith("◎豆瓣评分"):
            info=info.replace("◎豆瓣评分","").strip()
            ##print(info)
            movie['grade']=info               
        if info.startswith("◎片　　长"):
            info=info.replace("◎片　　长","").strip()
            ##print(info)
            movie['time']=info                           
        if info.startswith("◎导　　演"):
            info=info.replace("◎导　　演","").strip()
            ##print(info)
            movie['director']=info        
        if info.startswith("◎编　　剧"):
            info=info.replace("◎编　　剧","").strip()
            ##print(info)
            movie['writer']=info                  
        if info.startswith("◎主　　演"):
            info=info.replace("◎主　　演","").strip()
            actors=[info]
            for x in range(index+1,1000):
                actor=information[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            ##print(actors)
            movie['actors']=actors
        if info.startswith("◎演　　员"):
            info=info.replace("◎演　　员","").strip()
            actors=[info]
            for x in range(index+1,1000):
                actor=information[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            ##print(actors)
            movie['actors']=actors 
        if info.startswith("◎简　　介"):
            info=info.replace("◎简　　介","").strip()
            profile=information[index+1]
            ##print(profile)
            movie['profile']=profile  
    return movie            
## parse_detail_page(url)##上面这个函数没有return时，若要单独输出，必须在最后面加parse_detail_page(url)

def spider():
    base_url='https://www.ygdy8.net/html/gndy/china/list_4_{}.html'
    movies=[]
    for x in range(3,4):
        url=base_url.format(x)
        detail_urls=get_detail_url(url)
        for detail_url in detail_urls:
            detail_url='https://www.ygdy8.net'+detail_url
            print(detail_url)
            movie=parse_detail_page(detail_url)
            movies.append(movie)
    return movies
    
if __name__=='__main__':
    movie=spider()
    keys=movie[0].keys()
    print(keys)
    with open('movie_info.csv','w',nemline='') as output_file:
        dict_writer=csv.DictWriter(output_file,keys)
        dict_writer.writeheader()
        dict_writer.writerows(movie)
    
   ##运行结果
IndexError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_63540/316892137.py in <module>
    138 if __name__=='__main__':
    139     movie=spider()
--> 140     keys=movie[0].keys()
    141     print(keys)
    142     with open('movie_info.csv','w',nemline='') as output_file:

IndexError: list index out of range

写回答
好问题 0 提建议
关注问题
分享
邀请回答
编辑收藏删除
收藏举报

8条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
PhoenixRiser 2023-07-05 20:28
关注
XPath中链接应该是 href 而不是 herf

本回答被题主选为最佳回答 , 对您是否有帮助呢?

解决无用
评论打赏
分享
举报

评论

按下Enter换行，Ctrl+Enter发表内容

查看更多回答(7条)

报告相同问题？

关注问题

python爬取web of science
2021-12-18 14:05

qq_37760746的博客根据作者姓名在某年到某年所发表的文章，对文章的题目，期刊的影响因子进行爬取 from selenium import webdriver from selenium.webdriver.chrome.options import Options import csv import re # from threading ...
python 爬取百度网页图片
2024-04-24 13:40

@一只前端汪的博客 timeout=10) # Get方式获取网页数据 response.raise_for_status() # 如果响应状态码不是200，则抛出异常 json_info = response.json() for index in range(30): list_image_link.append(json_info['data'][index]['...
爬取电商平台数据，python爬取某维商品数据
2022-01-12 16:16

魔王不会哭的博客爬取电商平台数据，python爬取某维商品数据课程亮点动态数据抓包演示 json数据解析 requests模块的使用保存csv 环境介绍 python 3.8 [最好用和老师一样的版本] pycharm 2021.2 专业版 yyds requests >>&...
实战教程：Python 爬取快手短视频数据（热门内容分析）
2025-11-14 23:54

python 爬虫工程师的博客本文将从实战角度出发，详细介绍如何使用 Python 爬取快手短视频数据，并开展热门内容分析，所有操作均严格遵守平台规则及相关法律法规，确保爬虫行为的合规性。本文聚焦快手短视频热门数据的爬取与分析，通过解析...
python爬取微博热搜数据并保存！
2021-02-22 10:23

「已注销」的博客很多人学习python，不知道从何学起。很多人学习python，掌握了基本语法过后，不知道在哪里寻找案例上手。很多已经做案例的人，却不知道如何去学习更加高深的知识。那么针对这三类人，我给大家提供一个好的学习...
实战！Python 爬取小红书笔记标签数据（热门话题分析）
2025-11-14 23:46

python 爬虫工程师的博客摘要：本文基于小红书平台，探讨了利用Python爬取笔记标签数据并进行热门话题分析的方法。通过requests库获取网页数据，结合BeautifulSoup解析HTML内容，提取并清洗笔记标签信息，最终利用词频统计和可视化分析挖掘...
python爬虫：爬取A站视频信息
2024-10-04 13:11

F码崽的博客技术要求：使用python语言，requests库进行数据爬取，bueatifulsoup和正则表达式提取数据。遵守目标网站的robots.txt文件规定，合理控制爬虫的访问频率。二、数据分析对抓取到的数据进行整理和清洗，去除无效或...
利用Python爬取小米有品的信息
2019-10-08 16:56

IT白鸽的博客爬取小米有品：把两个链接放到了一起，运行一次就可以全部获取（约700）使用的是selenium+chrome+lxml的组合（也很快，因为就一个页面）输出：程序会生成三个文件，两个csv和一个xls csv体积小巧，通用性强 ...
Python 爬虫实战：爬取 CSDN 下载资源，实现关键词自动搜索
2025-10-22 23:43

python 爬虫工程师的博客本文介绍了一种利用Python爬虫技术实现CSDN下载资源自动搜索与筛选的方法。通过分析CSDN搜索页面结构，使用requests库构造动态请求，结合BeautifulSoup解析网页内容，可提取资源标题、作者、下载量等关键信息。文章...
项目二：python爬取豆瓣电影信息并分析
2021-02-25 19:51

&黄焖鸡米饭&的博客对豆瓣电影top250的爬取与分析爬虫时主要运用的库是re，request，Beautifulsoup，lxml，分析时主要运用的是pandas，matplotlib。通过 F12 查看网页源代码，ctrl+shift+i ctrl+shift+n，检查元素，定位要爬取的...
没有解决我的问题, 去提问

问题事件

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
系统已结题 7月14日
关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
已采纳回答 7月6日
关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
创建了问题 7月5日

python爬取问题list index out of range

8条回答 默认 最新

问题事件

8条回答默认最新