岚劲 2023-07-05 20:14 采纳率: 100%
浏览 89
已结题

python爬取问题list index out of range

有偿帮忙看一下Python爬取文档信息所出现的问题

import requests
from lxml import etree
import csv
BEST_detail='https://www.ygdy8.net'

url='https://www.ygdy8.net/html/gndy/dyzz/20230630/63864.html'

headers={
    'User_Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67'
}

def get_detail_url(url):
    response=requests.get(url,headers=headers)

    text=response.content.decode('gbk')


    html=etree.HTML(text)

## print(text)
    detail_url=html.xpath("table[@class='tbspan']//a/@herf")
    ## print(detail_url)
    return detail_url

## get_detail_url(url)

def parse_detail_page(url):
    ## print(url)
    movie={}
    response=requests.get(url,headers=headers)
    text=response.content.decode('gbk')
    ## print(text)
    
    
    html=etree.HTML(text)
    title=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    ## print(title),加【0】获取列表里的元素
    movie['title']=title
    zoomE=html.xpath("//div[@id='Zoom']")[0]
    ## print(zoomE)
    cover=zoomE.xpath(".//img/@src")[0]
    ## print(cover) 获取海报图片链接地址
    movie['cover']=cover
    movie['grade']=0
    information=zoomE.xpath(".//text()")
    ## print(information),解析内容,是个列表,需要对内容解析
    
    for index,info in enumerate(information):
        
        ## print(info) 更简洁
        ## print(index) 列表序号下标
        ## 先获得译名
        if info.startswith("◎译  名"):
            info=info.replace("◎译  名","").strip()
            ##print(info)
            movie['translate']=info
        if info.startswith("◎年  代"):
            info=info.replace("◎年  代","").strip()
            ##print(info)
            movie['year']=info
        if info.startswith("◎产  地"):
            info=info.replace("◎产  地","").strip()
            ##print(info)
            movie['area']=info            
        if info.startswith("◎类  别"):
            info=info.replace("◎类  别","").strip()
            ##print(info)
            movie['class']=info  
        if info.startswith("◎语  言"):
            info=info.replace("◎语  言","").strip()
            ##print(info)
            movie['language']=info              
        if info.startswith("◎字  幕"):
            info=info.replace("◎字  幕","").strip()
            ##print(info)
            movie['zimu']=info                     
        if info.startswith("◎上映日期"):
            info=info.replace("◎上映日期","").strip()
            ##print(info)
            movie['data']=info     
        if info.startswith("◎豆瓣评分"):
            info=info.replace("◎豆瓣评分","").strip()
            ##print(info)
            movie['grade']=info               
        if info.startswith("◎片  长"):
            info=info.replace("◎片  长","").strip()
            ##print(info)
            movie['time']=info                           
        if info.startswith("◎导  演"):
            info=info.replace("◎导  演","").strip()
            ##print(info)
            movie['director']=info        
        if info.startswith("◎编  剧"):
            info=info.replace("◎编  剧","").strip()
            ##print(info)
            movie['writer']=info                  
        if info.startswith("◎主  演"):
            info=info.replace("◎主  演","").strip()
            actors=[info]
            for x in range(index+1,1000):
                actor=information[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            ##print(actors)
            movie['actors']=actors
        if info.startswith("◎演  员"):
            info=info.replace("◎演  员","").strip()
            actors=[info]
            for x in range(index+1,1000):
                actor=information[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            ##print(actors)
            movie['actors']=actors 
        if info.startswith("◎简  介"):
            info=info.replace("◎简  介","").strip()
            profile=information[index+1]
            ##print(profile)
            movie['profile']=profile  
    return movie            
## parse_detail_page(url)##上面这个函数没有return时,若要单独输出,必须在最后面加parse_detail_page(url)

def spider():
    base_url='https://www.ygdy8.net/html/gndy/china/list_4_{}.html'
    movies=[]
    for x in range(3,4):
        url=base_url.format(x)
        detail_urls=get_detail_url(url)
        for detail_url in detail_urls:
            detail_url='https://www.ygdy8.net'+detail_url
            print(detail_url)
            movie=parse_detail_page(detail_url)
            movies.append(movie)
    return movies
    
if __name__=='__main__':
    movie=spider()
    keys=movie[0].keys()
    print(keys)
    with open('movie_info.csv','w',nemline='') as output_file:
        dict_writer=csv.DictWriter(output_file,keys)
        dict_writer.writeheader()
        dict_writer.writerows(movie)
    
   ##运行结果
IndexError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_63540/316892137.py in <module>
    138 if __name__=='__main__':
    139     movie=spider()
--> 140     keys=movie[0].keys()
    141     print(keys)
    142     with open('movie_info.csv','w',nemline='') as output_file:

IndexError: list index out of range
 
    
    

  • 写回答

8条回答 默认 最新

  • PhoenixRiser 2023-07-05 20:28
    关注

    XPath中链接应该是 href 而不是 herf

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(7条)

报告相同问题?

问题事件

  • 系统已结题 7月14日
  • 已采纳回答 7月6日
  • 创建了问题 7月5日

悬赏问题

  • ¥15 微信小程序 前端页面内容搜索
  • ¥15 cpu是如何判断当前指令已经执行完毕,然后去执行下条指令的
  • ¥15 C++Codeinject远线程注入
  • ¥15 安装visual studio2022时visualstudiosetup启动不了,闪退。问题代号0x0和0x1389
  • ¥30 java spring boot2.5.3版本websocket连不上
  • ¥15 angular js调外部链接查看pdf
  • ¥15 openFOAM DPMFoam
  • ¥15 将查询到的值,赋值到table指定行中
  • ¥50 docker容器内部启动shell脚本多命令
  • ¥15 请问python的selenium怎么设置referer