俊熙君 2022-08-23 23:23 · Acceptance rate: 85.7%
62 views
Closed

Scraper failing: the function that extracts the details never runs and is simply skipped.

Problem and background

I'm scraping movie details and have really tried many approaches, but I can't find where the problem is.
Right now the loading_mv function in the middle is not executed at all; the script jumps straight to the next step.
The biggest puzzle: I copied and ran someone else's working source code, and the function is still not executed, yet no error is raised.
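One quick way to narrow this down (a minimal diagnostic sketch, not part of the original post): if the list-page request is not served normally, or the CSS selector matches nothing, then list_url_mv stays empty, the loop that calls loading_mv runs zero times, and the script moves on without raising any error, which is exactly the "skipped" behaviour described above.

import requests
from bs4 import BeautifulSoup

# Example browser user-agent only; any normal desktop UA should behave the same.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36"}

r = requests.get("https://movie.douban.com/top250?start=0&filter=", headers=headers)
print(r.status_code)   # anything other than 200 means the list page was not returned normally

soup = BeautifulSoup(r.text, "html.parser")
links = soup.select("#content > div > div.article > ol > li > div > div.info > div.hd > a")
print(len(links))      # 0 here would explain why loading_mv is never called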

Relevant code (please do not paste screenshots)

from pandas import DataFrame
import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63"
}

# NOTE: range(0, 1, 25) only yields [0], so only the first list page (25 films) is requested.
# Use range(0, 250, 25) to cover all ten pages of the Top 250.
start_num = [i for i in range(0, 1, 25)]

list_url_mv = []

# Collect the detail-page URL of every film on each list page.
for start in start_num:
    url = "https://movie.douban.com/top250?start={}&filter=".format(start)
    print("Processing url:", url)

    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    url_mv_list = soup.select("#content > div > div.article > ol > li > div > div.info > div.hd > a")

    for index_url in range(len(url_mv_list)):
        url_mv = url_mv_list[index_url]["href"]
        list_url_mv.append(url_mv)
        print(url_mv)


def loading_mv(url, number):
    """Scrape one detail page and return its fields as a list."""
    list_mv = []
    print("- Processing film no. {} -".format(number + 1))
    list_mv.append(number + 1)

    response_mv = requests.get(url=url, headers=headers)
    soup_mv = BeautifulSoup(response_mv.text, "html.parser")

    # Title
    mv_name = soup_mv.find_all('span', attrs={"property": "v:itemreviewed"})
    list_mv.append(mv_name[0].get_text())

    # Year: "(1994)" -> "1994"
    mv_year = soup_mv.select("span.year")
    list_mv.append(mv_year[0].get_text()[1:5])

    # Directors, joined with "/"
    list_mv_director = []
    mv_director = soup_mv.find_all('a', attrs={'rel': 'v:directedBy'})
    for director in mv_director:
        list_mv_director.append(director.get_text())
    list_mv.append('/'.join(list_mv_director))

    # Leading actors, joined with "/"
    list_mv_star = []
    mv_star = soup_mv.find_all("a", attrs={"rel": "v:starring"})
    if not mv_star:
        list_mv.append(None)
    else:
        for star in mv_star:
            list_mv_star.append(star.get_text().strip())
        list_mv.append('/'.join(list_mv_star))

    # Genres, joined with "/"
    list_mv_type = []
    mv_type = soup_mv.find_all("a", attrs={"property": "v:genre"})
    for genre in mv_type:
        list_mv_type.append(genre.get_text())
    list_mv.append('/'.join(list_mv_type))

    # Rating
    mv_score = soup_mv.select("strong.ll.rating_num")
    list_mv.append(mv_score[0].get_text())

    # Number of ratings
    mv_evaluation = soup_mv.select("a.rating_people")
    list_mv.append(mv_evaluation[0].get_text().strip())

    # Synopsis (the expanded version only exists for some films)
    mv_plot = soup_mv.find_all("span", attrs={"class": "all hidden"})
    if not mv_plot:
        list_mv.append(None)
    else:
        list_mv.append(' '.join(mv_plot[0].get_text().strip().split()))

    list_mv.append(url)

    return list_mv


list_all_mv = []

for number in range(len(list_url_mv)):
    mv_info = loading_mv(list_url_mv[number], number)
    list_all_mv.append(mv_info)
print("- Done -")

df = DataFrame(list_all_mv, columns=['Rank', 'Title', 'Year', 'Director', 'Stars', 'Genre', 'Rating', 'Votes', 'Synopsis', 'URL'])
df.to_excel(r'D:\Python\Python爬虫\豆瓣电影信息爬取\豆瓣电影前Top250.xlsx')

Output and error messages

The output is an empty spreadsheet.
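An empty sheet is exactly what the final two lines of the script produce when list_all_mv is empty, i.e. when loading_mv was never called. A minimal illustration (my own example, not from the original post):

from pandas import DataFrame

# With no scraped rows, the DataFrame is empty and the Excel file
# contains only the column headers.
df = DataFrame([], columns=['Rank', 'Title', 'Year', 'Director', 'Stars',
                            'Genre', 'Rating', 'Votes', 'Synopsis', 'URL'])
df.to_excel('empty_example.xlsx')
print(len(df))  # 0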


1 answer

  • 亖夕 (rising-star contributor in the Python category) · 2022-08-23 23:52

    From what I can see, the loading_mv function is not being skipped; it does run. I've tidied the code up for you:

    [screenshot of the script running, omitted]

    from pandas import DataFrame
    import requests
    from bs4 import BeautifulSoup

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63"
    }

    # NOTE: range(0, 1, 25) only yields [0], so only the first list page (25 films) is requested.
    # Use range(0, 250, 25) to cover all ten pages of the Top 250.
    start_num = [i for i in range(0, 1, 25)]

    list_url_mv = []

    # Collect the detail-page URL of every film on each list page.
    for start in start_num:
        url = "https://movie.douban.com/top250?start={}&filter=".format(start)
        print("Processing url:", url)

        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, "html.parser")

        url_mv_list = soup.select("#content > div > div.article > ol > li > div > div.info > div.hd > a")
        # print(url_mv_list)

        for index_url in range(len(url_mv_list)):
            url_mv = url_mv_list[index_url]["href"]
            list_url_mv.append(url_mv)
            print(url_mv)


    def loading_mv(url, number):
        """Scrape one detail page and return its fields as a list."""
        list_mv = []
        print("- Processing film no. {} -".format(number + 1))
        list_mv.append(number + 1)

        response_mv = requests.get(url=url, headers=headers)
        soup_mv = BeautifulSoup(response_mv.text, "html.parser")

        # Title
        mv_name = soup_mv.find_all('span', attrs={"property": "v:itemreviewed"})
        list_mv.append(mv_name[0].get_text())

        # Year: "(1994)" -> "1994"
        mv_year = soup_mv.select("span.year")
        list_mv.append(mv_year[0].get_text()[1:5])

        # Directors, joined with "/"
        list_mv_director = []
        mv_director = soup_mv.find_all('a', attrs={'rel': 'v:directedBy'})
        for director in mv_director:
            list_mv_director.append(director.get_text())
        list_mv.append('/'.join(list_mv_director))

        # Leading actors, joined with "/"
        list_mv_star = []
        mv_star = soup_mv.find_all("a", attrs={"rel": "v:starring"})
        if not mv_star:
            list_mv.append(None)
        else:
            for star in mv_star:
                list_mv_star.append(star.get_text().strip())
            list_mv.append('/'.join(list_mv_star))

        # Genres, joined with "/"
        list_mv_type = []
        mv_type = soup_mv.find_all("a", attrs={"property": "v:genre"})
        for genre in mv_type:
            list_mv_type.append(genre.get_text())
        list_mv.append('/'.join(list_mv_type))

        # Rating
        mv_score = soup_mv.select("strong.ll.rating_num")
        list_mv.append(mv_score[0].get_text())

        # Number of ratings
        mv_evaluation = soup_mv.select("a.rating_people")
        list_mv.append(mv_evaluation[0].get_text().strip())

        # Synopsis (the expanded version only exists for some films)
        mv_plot = soup_mv.find_all("span", attrs={"class": "all hidden"})
        if not mv_plot:
            list_mv.append(None)
        else:
            list_mv.append(' '.join(mv_plot[0].get_text().strip().split()))

        list_mv.append(url)

        return list_mv


    list_all_mv = []

    for number in range(len(list_url_mv)):
        mv_info = loading_mv(list_url_mv[number], number)
        list_all_mv.append(mv_info)
    print("- Done -")

    df = DataFrame(list_all_mv, columns=['Rank', 'Title', 'Year', 'Director', 'Stars', 'Genre', 'Rating', 'Votes', 'Synopsis', 'URL'])
    df.to_excel(r'豆瓣电影前Top250pppppppp.xlsx')
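    A quick way to confirm that loading_mv really ran once per collected URL is to read the finished spreadsheet back (my own suggestion, assuming pandas with openpyxl is installed and the script above was run in the same directory):

    from pandas import read_excel

    df = read_excel('豆瓣电影前Top250pppppppp.xlsx', index_col=0)
    print(len(df))    # one row per call to loading_mv (25 per list page scraped)
    print(df.head())  # first few films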
    
    
    
    This answer was accepted by the asker as the best answer.


Question timeline

  • Closed by the system on Sep 1
  • Answer accepted on Aug 24
  • Question edited on Aug 23
  • Question created on Aug 23
