Smile_to_destiny 2024-08-12 17:56 采纳率: 50%
浏览 28

DrissionPage代码重构无法正常运行

  • 【DrissionPage】正常运行的代码块,在重构后无法正常运行。

  • 主要问题描述:
    第一个代码获取到的网页数据可以正常向下传递;
    但第二个代码中,小说评论详情页翻页后的新的网页信息无法传入循环中,导致程序仍按照第一次循环的索引进行爬取,结果可想而知,程序必然报错。

  • 以下为正常运行代码:

# -*- coding: utf-8 -*-
"""
@ Project : PycharmProjects
@ File : DrissionPage_webnovel.py
@ IDE : PyCharm
@ Date : 2024/7/15 21:52
"""

import time
import os
import pandas as pd
from DrissionPage import ChromiumPage
from DrissionPage.errors import ElementNotFoundError


def scroll_page():
    # 导入url
    url = 'https://www.webnovel.com/ranking/novel/season/power_rank'
    # 访问网页
    page.get(url)
    # 页面每次加载20个,需要循环滚动
    # 进入循环(该循环滚动速度较慢,可规避反爬虫)
    # i用于计数
    i = 0
    while True:
        try:
            # 尝试寻找End元素
            # 找不到则执行滚动
            # 定位榜单底部END标识
            page.ele('.g_no_result')
            end_button = page.ele('.g_no_result')
            print(f'{end_button.text}:页面已滚动到底部')
            # 找到End元素后,打印进度,并结束循环
            break
        # 预期错误为“找不到该元素”
        except ElementNotFoundError:
            i += 1
            # 打印计数,主要用于显示进度
            print(f'已滚动第{i}次')
            # 操作页面滚动到网页中的等待元素——该元素不可见
            # page.actions.scroll(on_ele='.g_loading j_rank_loading _on')
            # 二选一
            page.scroll.to_bottom()


def get_commentator_info(commentator_name):
    """
    以下代码用于获取评论者的相关信息
    """
    try:
        time.sleep(1)
        print(f'正在获取{commentator_name.text}的用户信息')
        # 模拟鼠标中键点击,以新标签页打开用户主页,并获取数据
        commentator_info_page = commentator_name.click.middle(get_tab=True)
        # 定位到用户数据的div中
        # 定位到用户数据的div中
        commentator_info_div = commentator_info_page.ele('.header mb40')
        # 定位到存放阅读时长、阅读数量的ul
        page_ul = commentator_info_div.ele('tag:ul')
        ul_list = page_ul.eles('tag:strong')
        if len(ul_list) == 3:
            print('该用户是小说作者')
            commentator_if_writer_list.append('Yes')
            # 持续创作时间
            continuous_writing_days = ul_list[0]
            continuous_writing_days_list.append(continuous_writing_days.text)
            # 阅读时长
            reading_time = ul_list[1]
            reading_time_list.append(reading_time.text)
            # 阅读数量
            read_books_number = ul_list[2]
            read_books_number_list.append(read_books_number.text)
            # print(continuous_writing_days.text, reading_time.text, read_books_number.text)
        elif len(ul_list) == 2:
            print('该用户是普通读者')
            commentator_if_writer_list.append('No')
            # 持续创作时间
            continuous_writing_days = 'None'
            continuous_writing_days_list.append(continuous_writing_days)
            # 阅读时长
            reading_time = ul_list[0]
            reading_time_list.append(reading_time.text)
            # 阅读数量
            read_books_number = ul_list[1]
            read_books_number_list.append(read_books_number.text)
            # print(continuous_writing_days, reading_time.text, read_books_number.text)
        # 定位到等级和性别的header
        page_header = commentator_info_div.ele('.lh40 mb16')
        # 等级
        commentator_level_dict = page_header.ele('tag:strong').attrs
        commentator_level = commentator_level_dict['title'].split()[1]
        commentator_level_list.append(commentator_level)
        try:
            # 性别
            gender_dict = page_header.ele('tag:i').attrs
            gender = gender_dict['title']
            gender_list.append(gender)
        except ElementNotFoundError:
            print('该用户未显示性别')
            gender = 'None'
            gender_list.append(gender)
        # 获赞
        good_div = commentator_info_div.ele('.fr fs0')
        good_number_dict = good_div.ele('tag:span').attrs
        good_number = good_number_dict['data-num']
        good_number_list.append(good_number)
        # 定位到注册时间与IP的address
        page_address = commentator_info_div.ele('tag:address')
        strong_list = page_address.eles('tag:strong')
        # 注册时间
        joined_date = strong_list[0].text.split()[0]
        joined_date_list.append(joined_date)
        # IP
        ip = strong_list[1]
        ip_list.append(ip.text)
        # print(commentator_level, gender, good_number, joined_date, ip.text)
        print('该用户的信息已获取完毕,页面自动关闭')
        commentator_info_page.close()
    except Exception as e:
        print(f'获取用户[{commentator_name.text}]的信息失败')



def get_comment(novel_name):
    # 评论
    # 定位评论div,并获取评论列表
    comments_div_all = new_detail_page.ele('.j_revWrap')
    comments_list = comments_div_all.eles('.^j_pageReviewList')
    for comment in comments_list:
            # 定位存放单个评论的div
            comment_div = comment.ele('.oh pr')
            # 评论者
            commentator_name = comment_div.ele('tag:a')
            commentator_name_list.append(commentator_name.text)
            # 评论文本
            comment_text = comment_div.ele('tag:p')
            comment_text_list.append(comment_text.text)
            # 获取评论者的信息
            get_commentator_info(commentator_name)

            # 在此处将获取到的数据保存到对应的文件
            comment_data_filename = f'[{novel_name.text}] comment_text & commentator_info.xlsx'
            if os.path.exists(comment_data_filename):
                commentator_info_dict = {
                    'commentator_name':  commentator_name_list,
                    'commentator_level': commentator_level_list,
                    'commentator_gender': gender_list,
                    'commentator_if_writer': commentator_if_writer_list,
                    'commentator_continuous_writing_days': continuous_writing_days_list,
                    'commentator_reading_time': reading_time_list,
                    'commentator_read_books_number': read_books_number_list,
                    'commentator_get_good_number': good_number_list,
                    'commentator_joined_date': joined_date_list,
                    'commentator_ip': ip_list,
                    'comment_text': comment_text_list
                }

                commentator_info_data = pd.DataFrame(commentator_info_dict)
                with pd.ExcelWriter(comment_data_filename, mode='a', if_sheet_exists='overlay') as writer:
                    commentator_info_data.to_excel(writer, sheet_name='comment&commentator_info', index=False)
                print(f'评论者相关信息已写入【{comment_data_filename}】')
            else:
                print('#' * 10 + f'正在新建【{comment_data_filename}】文件' + '#' * 10)
                file_columns = ['commentator_name', 'commentator_level', 'commentator_gender', 'commentator_if_writer',
                                'commentator_continuous_writing_days',
                                'commentator_reading_time',
                                'commentator_read_books_number', 'commentator_get_good_number',
                                'commentator_joined_date', 'commentator_ip', 'comment_text']
                pd.DataFrame(columns=file_columns).to_excel(comment_data_filename,
                                                            sheet_name='comment&commentator_info',
                                                            index=False)
                commentator_info_dict = {
                    'commentator_name': commentator_name_list,
                    'commentator_level': commentator_level_list,
                    'commentator_gender': gender_list,
                    'commentator_if_writer': commentator_if_writer_list,
                    'commentator_continuous_writing_days': continuous_writing_days_list,
                    'commentator_reading_time': reading_time_list,
                    'commentator_read_books_number': read_books_number_list,
                    'commentator_get_good_number': good_number_list,
                    'commentator_joined_date': joined_date_list,
                    'commentator_ip': ip_list,
                    'comment_text': comment_text_list
                }

                commentator_info_data = pd.DataFrame(commentator_info_dict)
                with pd.ExcelWriter(comment_data_filename, mode='a', if_sheet_exists='overlay') as writer:
                    commentator_info_data.to_excel(writer, sheet_name='comment&commentator_info', index=False)
                print(f'评论者相关信息已写入【{comment_data_filename}】')


if __name__ == '__main__':
    # 用page直接控制浏览器
    page = ChromiumPage()
    # ranking页面若滚动到底,则第二次运行可注释掉
    # scroll_page()

    # 定义空列表
    # 书名
    title_list = []
    # 作者名
    author_list = []
    # 类型名
    type_list = []
    # 标签名
    tag_list = []
    # 简介
    description_list = []
    # 章节数
    chapter_list = []
    # 阅读量
    views_list = []
    # 热度
    hot_number_list = []
    # 星级评分
    star_score_list = []
    # 评分人数
    reviews_number_list = []
    # 评论者
    commentator_name_list = []
    # 评论内容
    comment_text_list = []
    # 持续更新日期
    continuous_writing_days_list = []
    # 阅读时长
    reading_time_list = []
    # 阅读数量
    read_books_number_list = []
    # 评论者等级
    commentator_level_list = []
    # 评论者性别
    gender_list = []
    # 获赞数
    good_number_list = []
    # 注册日期
    joined_date_list = []
    # IP地址
    ip_list = []
    # 是否作者
    commentator_if_writer_list = []
    # 错误次数
    error_list = []


    # 找到存放rank数据的div
    div = page.ele('.j_rank_wrapper')
    # 获取存放rank数据的网页元素列表
    section_list = div.eles('.df g_hr pt16 pb16')
    # 遍历rank中所有的书
    for section in section_list[5:]:

        # 标题
        title = section.ele('.c_l')
        # 获取title中所有属性及值的字典
        attrs_dict = title.attrs
        title_list.append(title.text)
        # 描述
        desc = section.ele('.fw400 lh20 fs14 ls0.2 c_s ells _2 mb4')
        description_list.append(desc.text)
        # 热度
        hot_num = section.ele('.vam fw400 lh16 fs12')
        hot_number_list.append(hot_num.text)

        # print(f'{title.text}的基本信息已获取')

        # 用于点击,进入新页面
        new_detail_page = title.click.middle(get_tab=True)
        new_detail_page.scroll.to_bottom()
        time.sleep(1)

        """
        该部分代码用于获取小说基本内容
        """

        # 分析新页面
        # 获取基本信息:类型、章节数、阅读量
        div_1 = new_detail_page.ele('.mb12 fw700 lh24 det-hd-detail c_000 fs0')
        info_list = div_1.eles('tag:span')

        # 类型
        type_name = info_list[0].text
        type_list.append(type_name)
        # 章节数
        chapters_number = info_list[1].text.split()[0]
        chapter_list.append(chapters_number)
        # 阅读量
        views_number = info_list[2].text.split()[0]
        views_list.append(views_number)
        # 作者名称
        try:
            author_name = new_detail_page.ele('.c_primary').text
            author_list.append(author_name)
        except ElementNotFoundError:
            author_name_div = new_detail_page.ele('.fw700 ell dib vam fs16 fw500')
            author_name = author_name_div.ele('tag:span').text
            author_list.append(author_name)

        # tags
        novel_tags_list = []
        tags_list = new_detail_page.eles('.m-tag dib vam mb8')
        for tag in tags_list:
            novel_tags_list.append(tag.text)
        tag_list.append(novel_tags_list)

        # 星级评分
        try:
            div_star = new_detail_page.ele('.clearfix mb20 rev-tit')
            star_score = div_star.ele('tag:small').text
            star_score_list.append(star_score)
        except ElementNotFoundError:
            star_score = 'Not enough ratings'
            star_score_list.append(star_score)
        # 评论人数
        reviews_number = div_star.ele('tag:i').text
        reviews_number_list.append(reviews_number)
        # 打印查看结果
        # print(type_name, chapters_number, views_number, author_name, novel_tags_list, star_score, reviews_number)
        print(f'{title.text}的基本信息已获取')

        # 保存数据
        novel_rank_info_filename = f'annual_rank_info.xlsx'
        if os.path.exists(novel_rank_info_filename):
            novel_rank_info_dict = {
                'title': title_list,
                'type': type_list,
                'chapter_number': chapter_list,
                'views': views_list,
                'author': author_list,
                'tags': tag_list,
                'star_score': star_score_list,
                'reviews_number': reviews_number_list
            }
            # print(f'novel_rank_info_dic:{novel_rank_info_dict}')
            novel_rank_info_data = pd.DataFrame(novel_rank_info_dict)
            with pd.ExcelWriter(novel_rank_info_filename, mode='a', if_sheet_exists='overlay') as writer:
                novel_rank_info_data.to_excel(writer, sheet_name='novel_rank_info', index=False)
            print(f'小说排行相关信息已写入【{novel_rank_info_filename}】')
        else:
            print('#' * 10 + f'正在新建【{novel_rank_info_filename}】文件' + '#' * 10)
            file_cloumns = ['title', 'type', 'chapter_number',
                            'views', 'author', 'tags', 'star_score', 'reviews_number']
            pd.DataFrame(columns=file_cloumns).to_excel(novel_rank_info_filename,
                                                        sheet_name='novel_rank_info', index=False)
            novel_rank_info_dict = {
                'title': title_list,
                'type': type_list,
                'chapter_number': chapter_list,
                'views': views_list,
                'author': author_list,
                'tags': tag_list,
                'star_score': star_score_list,
                'reviews_number': reviews_number_list
            }
            # print(f'novel_rank_info_dic:{novel_rank_info_dict}')
            novel_rank_info_data = pd.DataFrame(novel_rank_info_dict)
            with pd.ExcelWriter(novel_rank_info_filename, mode='a', if_sheet_exists='overlay') as writer:
                novel_rank_info_data.to_excel(writer, sheet_name='novel_rank_info', index=False)
            print(f'小说排行相关信息已写入【{novel_rank_info_filename}】')
        # new_detail_page.close()

        """
        # 下方代码用于获取评论内容
        """
        try:
            # 找到Next按钮
            # next_button = new_detail_page.ele('.ui-page ui-page-next')
            next_button = new_detail_page.ele('.ui-page ui-page-next')
            # 当前页信息
            # current_page = new_detail_page.ele('.ui-page ui-page-current').text

            # 获取Next按钮中的所有attributes
            next_button_dict = next_button.attrs
            control_number = len(next_button_dict)
            while True:
                # 若Next按钮不能点击,则只有一个attributes
                if control_number != 1:
                    time.sleep(1)
                    
                    # 若Next中的attributes不唯一,则爬取评论后点击下一页
                    
                    get_comment(novel_name=title)

                    next_button = new_detail_page.ele('.ui-page ui-page-next')
                    next_button_dict = next_button.attrs
                    control_number = len(next_button_dict)
                    next_button.click()
                    # print(f'已经获取{title.text}的第{current_page}页的评论数据,并点击进入下一页')
                    print(f'已经获取{title.text}当前页的评论数据,并点击进入下一页')
                else:
                    
                    # 若Next只有一个attributes,则代表Next不能再点击,则爬取评论后,关闭该页面
                    
                    print('该文档无下一页')
                    get_comment(novel_name=title)
                    print('所有评论已获取完成,自动关闭当前页面')

                    new_detail_page.close()
                    break
        except ElementNotFoundError:
            print('该文档无下一页')
            get_comment(novel_name=title)
            print('所有评论已获取完成,自动关闭当前页面')
            new_detail_page.close()



- 以下为尝试重构代码:

```python
# -*- coding: utf-8 -*-
"""
@ Project : PycharmProjects
@ File : test04.py
@ IDE : PyCharm
@ Date : 2024/7/17 18:21
"""
import os
import time
import pandas as pd
from DrissionPage import ChromiumPage
from DrissionPage.errors import ElementNotFoundError


class SpiderWebnovel:
    def __init__(self):
        """
        定义全局变量
        """
        # 用page直接控制浏览器
        self.page = ChromiumPage()
        # 找到存放rank数据的div
        self.div = self.page.ele('.j_rank_wrapper')
        # 获取存放rank数据的网页元素列表
        self.section_list = self.div.eles('.df g_hr pt16 pb16')

        # 定义空列表
        # 书名
        self.title_list = []
        # 作者名
        self.author_list = []
        # 类型名
        self.type_list = []
        # 标签名
        self.tag_list = []
        # 简介
        # self.description_list = []
        # 章节数
        self.chapter_list = []
        # 阅读量
        self.views_list = []
        # 热度
        self.hot_number_list = []
        # 星级评分
        self.star_score_list = []
        # 评分人数
        self.reviews_number_list = []
        # 评论者
        self.commentator_name_list = []
        # 评论内容
        self.comment_text_list = []
        # 是否为小说创作者
        self.commentator_if_writer_list = []
        # 持续更新日期
        self.continuous_writing_days_list = []
        # 阅读时长
        self.reading_time_list = []
        # 阅读数量
        self.read_books_number_list = []
        # 评论者等级
        self.commentator_level_list = []
        # 评论者性别
        self.gender_list = []
        # 获赞数
        self.good_number_list = []
        # 注册日期
        self.joined_date_list = []
        # IP地址
        self.ip_list = []

    def scroll_page(self):
        """
        该部分用于进入排行ranking页面,并操控页面持续滚动,直到加载全部的小说信息
        """
        # 导入url
        url = 'https://www.webnovel.com/ranking/novel/season/power_rank'
        # 访问网页
        self.page.get(url)
        # 页面每次加载20个,需要循环滚动
        # 进入循环(该循环滚动速度较慢,可规避反爬虫)
        # i用于计数
        i = 0
        while True:
            try:
                # 尝试寻找End元素
                # 找不到则执行滚动
                # 定位榜单底部END标识
                self.page.ele('.g_no_result')
                end_button = self.page.ele('.g_no_result')
                print(f'{end_button.text}:页面已滚动到底部')
                # 找到End元素后,打印进度,并结束循环
                break
            # 预期错误为“找不到该元素”
            except ElementNotFoundError:
                i += 1
                # 打印计数,主要用于显示进度
                print(f'已滚动第{i}次')
                # 操作页面滚动到网页中的等待元素——该元素不可见
                # self.page.actions.scroll(on_ele='.g_loading j_rank_loading _on')
                # 二选一
                self.page.scroll.to_bottom()

    def get_novel_info(self):
        """
        该部分代码用于逐个点击小说,进入小说详情页
        获取小说的基本信息:类型、章节数、阅读量
        """

    def get_comment(self, comments_list):
        # comments_list = self.new_detail_page.eles('.^j_pageReviewList')
        for comment in comments_list:

            # 定位存放单个评论的div
            comment_div = comment.ele('.oh pr')
            # 评论者
            self.commentator_name = comment_div.ele('tag:a')
            self.commentator_name_list.append(self.commentator_name.text)
            # 评论文本
            comment_text = comment_div.ele('tag:p')
            self.comment_text_list.append(comment_text.text)
            # 获取评论者的信息
            self.get_commentator_info()
            # 在此处将获取到的数据保存到对应的文件
            self.comment_data_filename = f'[{self.title.text}] comment_text & commentator_info.xlsx'
            if os.path.exists(self.comment_data_filename):
                self.save_commentator_info()
            else:
                print('#' * 10 + f'正在新建【{self.comment_data_filename}】文件' + '#' * 10)
                file_columns = ['commentator_name', 'commentator_level', 'commentator_gender', 'commentator_if_writer',
                                'commentator_continuous_writing_days',
                                'commentator_reading_time',
                                'commentator_read_books_number', 'commentator_get_good_number',
                                'commentator_joined_date', 'commentator_ip', 'comment_text']
                pd.DataFrame(columns=file_columns).to_excel(self.comment_data_filename,
                                                            sheet_name='comment&commentator_info',
                                                            index=False)
                self.save_commentator_info()

    def get_commentator_info(self):
        """
        以下代码用于获取评论者的相关信息
        """
        # time.sleep(1)
        print('*' * 10 + f'正在获取{self.commentator_name.text}的用户信息' + '*' * 10)
        # 模拟鼠标中键点击,以新标签页打开用户主页,并获取数据
        commentator_info_page = self.commentator_name.click.middle(get_tab=True)
        # 定位到用户数据的div中
        # 定位到用户数据的div中
        commentator_info_div = commentator_info_page.ele('.header mb40')
        # 定位到存放阅读时长、阅读数量的ul
        page_ul = commentator_info_div.ele('tag:ul')
        ul_list = page_ul.eles('tag:strong')
        if len(ul_list) == 3:
            print('该用户是小说作者')
            self.commentator_if_writer_list.append('Yes')
            # 持续创作时间
            continuous_writing_days = ul_list[0]
            self.continuous_writing_days_list.append(continuous_writing_days.text)
            # 阅读时长
            reading_time = ul_list[1]
            self.reading_time_list.append(reading_time.text)
            # 阅读数量
            read_books_number = ul_list[2]
            self.read_books_number_list.append(read_books_number.text)
            # print(continuous_writing_days.text, reading_time.text, read_books_number.text)
        elif len(ul_list) == 2:
            print('该用户是普通读者')
            self.commentator_if_writer_list.append('No')
            # 持续创作时间
            continuous_writing_days = 'None'
            self.continuous_writing_days_list.append(continuous_writing_days)
            # 阅读时长
            reading_time = ul_list[0]
            self.reading_time_list.append(reading_time.text)
            # 阅读数量
            read_books_number = ul_list[1]
            self.read_books_number_list.append(read_books_number.text)
            # print(continuous_writing_days, reading_time.text, read_books_number.text)
        # 定位到等级和性别的header
        page_header = commentator_info_div.ele('.lh40 mb16')
        # 等级
        commentator_level_dict = page_header.ele('tag:strong').attrs
        commentator_level = commentator_level_dict['title'].split()[1]
        self.commentator_level_list.append(commentator_level)
        try:
            # 性别
            gender_dict = page_header.ele('tag:i').attrs
            gender = gender_dict['title']
            self.gender_list.append(gender)
        except ElementNotFoundError:
            print('该用户未显示性别')
            gender = 'None'
            self.gender_list.append(gender)
        # 获赞
        good_div = commentator_info_div.ele('.fr fs0')
        good_number_dict = good_div.ele('tag:span').attrs
        good_number = good_number_dict['data-num']
        self.good_number_list.append(good_number)
        # 定位到注册时间与IP的address
        page_address = commentator_info_div.ele('tag:address')
        strong_list = page_address.eles('tag:strong')
        # 注册时间
        joined_date = strong_list[0].text.split()[0]
        self.joined_date_list.append(joined_date)
        # IP
        ip = strong_list[1]
        self.ip_list.append(ip.text)
        # print(commentator_level, gender, good_number, joined_date, ip.text)
        print('*' * 10 + '该用户的信息已获取完毕,页面自动关闭' + '*' * 10)
        commentator_info_page.close()

    def save_rank_info(self):
        novel_rank_info_dict = {
            'title': self.title_list,
            'type': self.type_list,
            'chapter_number': self.chapter_list,
            'views': self.views_list,
            'author': self.author_list,
            'tags': self.tag_list,
            'star_score': self.star_score_list,
            'reviews_number': self.reviews_number_list
        }
        # print(f'novel_rank_info_dic:{novel_rank_info_dict}')
        novel_rank_info_data = pd.DataFrame(novel_rank_info_dict)
        with pd.ExcelWriter(self.novel_rank_info_filename, mode='a', if_sheet_exists='overlay') as writer:
            novel_rank_info_data.to_excel(writer, sheet_name='novel_rank_info', index=False)
        print(f'小说排行相关信息已写入【{self.novel_rank_info_filename}】')

    def save_commentator_info(self):
        commentator_info_dict = {
            'commentator_name': self.commentator_name_list,
            'commentator_level': self.commentator_level_list,
            'commentator_gender': self.gender_list,
            'commentator_if_writer': self.commentator_if_writer_list,
            'commentator_continuous_writing_days': self.continuous_writing_days_list,
            'commentator_reading_time': self.reading_time_list,
            'commentator_read_books_number': self.read_books_number_list,
            'commentator_get_good_number': self.good_number_list,
            'commentator_joined_date': self.joined_date_list,
            'commentator_ip': self.ip_list,
            'comment_text': self.comment_text_list
        }

        commentator_info_data = pd.DataFrame(commentator_info_dict)
        with pd.ExcelWriter(self.comment_data_filename, mode='a', if_sheet_exists='overlay') as writer:
            commentator_info_data.to_excel(writer, sheet_name='comment&commentator_info', index=False)
        print(f'评论者相关信息已写入【{self.comment_data_filename}】')

    def run(self):

        # ranking页面若滚动到底,则第二次运行可注释掉
        # self.scroll_page()

        # 遍历rank中所有的书
        for section in self.section_list:
            # 标题
            self.title = section.ele('.c_l')
            self.title_list.append(self.title.text)
            new_detail_page = self.title.click.middle(get_tab=True)
            new_detail_page.scroll.to_bottom()
            print(f'已进入【{self.title.text}】详情页,并滚动到底部')

            print('=' * 10 + f'正在爬取【{self.title.text}】的评论信息' + '=' * 10)

            try:
                # 找到Next按钮
                next_button = new_detail_page.ele('.ui-page ui-page-next')
                # 当前页信息
                # current_page = new_detail_page.ele('.ui-page ui-page-current').text
                # 获取Next按钮中的所有attributes
                control_number = len(next_button.attrs)
                comments_list = new_detail_page.eles('.^j_pageReviewList')
                # 下方代码用于获取评论内容
                while True:
                    # 若Next按钮不能点击,则只有一个attributes
                    if control_number != 1:
                        # 若Next中的attributes不唯一,则爬取评论后点击下一页
                        self.get_comment(comments_list)
                        next_button.click()
                        comments_list = new_detail_page.eles('.^j_pageReviewList')
                        # print(f'已经获取{title.text}的第{current_page}页的评论数据,并点击进入下一页')
                        print('*' * 10 + f'已经获取当前页的评论数据,并点击进入下一页' + '*' * 10)
                    else:
                        # 若Next只有一个attributes,则代表Next不能再点击,则爬取评论后,关闭该页面
                        print('该评论区无下一页')
                        comments_list = new_detail_page.eles('.^j_pageReviewList')
                        self.get_comment(comments_list)
                        print('*' * 10 + '所有评论已获取完成,自动关闭当前页面' + '*' * 10)
                        new_detail_page.close()
                        break
            except ElementNotFoundError:
                print('该评论区无下一页')
                comments_list = new_detail_page.eles('.^j_pageReviewList')
                self.get_comment(comments_list)
                print('*' * 10 + '所有评论已获取完成,自动关闭当前页面' + '*' * 10)
                new_detail_page.close()


if __name__ == '__main__':
    spider = SpiderWebnovel()
    spider.run()
  • 写回答

1条回答 默认 最新

  • 梦幻精灵_cq 2024-08-12 19:56
    关注
    • 这么“繁杂”的事儿,还是ai方便。您且听听我ai学习搭子“智谱清言”app的建议吧。我认为,如果全盘重构,按原逻辑重写,更能高效防错。如果部分重构,仅对“重构部分动手脚”,确保数据和变量的引用正确,才不易出错。

    • ai说:
      您的重构尝试已经取得了一些进展,但仍然存在一些问题和不一致之处。以下是对您重构代码的分析和建议:
    1. 全局变量和属性

      • SpiderWebnovel 类中,您定义了多个全局变量。这些变量应该在类内部管理,而不是直接定义为类属性。建议将这些变量定义为类的方法属性,并在适当的方法中进行操作。
    2. 异常处理

      • get_commentator_info 方法中,您使用了 ElementNotFoundError 来检查性别信息,但 ElementNotFoundError 是用于检查元素是否被找到的。建议使用 AttributeError 来检查属性是否存在。
    3. 代码重复

      • get_commentator_info 方法中,您多次打印了信息,并且代码结构重复。建议将这些重复的代码提取为单独的方法。
    4. 方法命名

      • 方法命名应该具有描述性,例如 save_rank_info 方法应该保存的是小说排行信息,而不是评论者信息。
    5. 数据提取和保存

      • get_commentator_info 方法中,您多次将信息追加到列表中。建议将这些信息直接添加到字典中,以便更容易地保存到文件。
    6. 文件名和路径

      • save_commentator_info 方法中,您使用了 self.comment_data_filename 变量来创建文件名。这个变量在类定义中没有定义,因此会导致错误。建议使用 self.page.get_absolute_url() 来获取当前页面的绝对URL,并使用这个URL作为文件名的一部分。
    7. 代码结构

      • 建议将爬取和保存数据的方法分开,以便更容易理解和维护。

    以下是对您重构代码的一些具体建议:

    class SpiderWebnovel:
        def __init__(self):
            self.page = ChromiumPage()
            self.div = self.page.ele('.j_rank_wrapper')
            self.section_list = self.div.eles('.df g_hr pt16 pb16')
            self.title_list = []
            self.author_list = []
            self.type_list = []
            self.tag_list = []
            self.description_list = []
            self.chapter_list = []
            self.views_list = []
            self.hot_number_list = []
            self.star_score_list = []
            self.reviews_number_list = []
            self.commentator_name_list = []
            self.comment_text_list = []
            self.commentator_if_writer_list = []
            self.continuous_writing_days_list = []
            self.reading_time_list = []
            self.read_books_number_list = []
            self.commentator_level_list = []
            self.gender_list = []
            self.good_number_list = []
            self.joined_date_list = []
            self.ip_list = []
    
        def get_novel_info(self):
            # ... 省略具体实现 ...
    
        def get_comment(self, comments_list):
            # ... 省略具体实现 ...
    
        def get_commentator_info(self):
            # ... 省略具体实现 ...
    
        def save_rank_info(self):
            # ... 省略具体实现 ...
    
        def save_commentator_info(self):
            # ... 省略具体实现 ...
    
        def run(self):
            # ... 省略具体实现 ...
    
    if __name__ == '__main__':
        spider = SpiderWebnovel()
        spider.run()
    

    这些建议只是一个开始,具体的重构工作需要根据代码的实际用途和结构来进一步进行。希望这些建议能对您有所帮助。


    评论

报告相同问题?

问题事件

  • 创建了问题 8月12日