【DrissionPage】正常运行的代码块,在重构后无法正常运行。
主要问题描述:
第一个代码获取到的网页数据可以正常向下传递;
但第二个代码中,小说评论详情页翻页后的新的网页信息无法传入循环中,导致程序仍按照第一次循环的索引进行爬取,结果可想而知,程序必然报错。以下为正常运行代码:
# -*- coding: utf-8 -*-
"""
@ Project : PycharmProjects
@ File : DrissionPage_webnovel.py
@ IDE : PyCharm
@ Date : 2024/7/15 21:52
"""
import time
import os
import pandas as pd
from DrissionPage import ChromiumPage
from DrissionPage.errors import ElementNotFoundError
def scroll_page():
# 导入url
url = 'https://www.webnovel.com/ranking/novel/season/power_rank'
# 访问网页
page.get(url)
# 页面每次加载20个,需要循环滚动
# 进入循环(该循环滚动速度较慢,可规避反爬虫)
# i用于计数
i = 0
while True:
try:
# 尝试寻找End元素
# 找不到则执行滚动
# 定位榜单底部END标识
page.ele('.g_no_result')
end_button = page.ele('.g_no_result')
print(f'{end_button.text}:页面已滚动到底部')
# 找到End元素后,打印进度,并结束循环
break
# 预期错误为“找不到该元素”
except ElementNotFoundError:
i += 1
# 打印计数,主要用于显示进度
print(f'已滚动第{i}次')
# 操作页面滚动到网页中的等待元素——该元素不可见
# page.actions.scroll(on_ele='.g_loading j_rank_loading _on')
# 二选一
page.scroll.to_bottom()
def get_commentator_info(commentator_name):
"""
以下代码用于获取评论者的相关信息
"""
try:
time.sleep(1)
print(f'正在获取{commentator_name.text}的用户信息')
# 模拟鼠标中键点击,以新标签页打开用户主页,并获取数据
commentator_info_page = commentator_name.click.middle(get_tab=True)
# 定位到用户数据的div中
# 定位到用户数据的div中
commentator_info_div = commentator_info_page.ele('.header mb40')
# 定位到存放阅读时长、阅读数量的ul
page_ul = commentator_info_div.ele('tag:ul')
ul_list = page_ul.eles('tag:strong')
if len(ul_list) == 3:
print('该用户是小说作者')
commentator_if_writer_list.append('Yes')
# 持续创作时间
continuous_writing_days = ul_list[0]
continuous_writing_days_list.append(continuous_writing_days.text)
# 阅读时长
reading_time = ul_list[1]
reading_time_list.append(reading_time.text)
# 阅读数量
read_books_number = ul_list[2]
read_books_number_list.append(read_books_number.text)
# print(continuous_writing_days.text, reading_time.text, read_books_number.text)
elif len(ul_list) == 2:
print('该用户是普通读者')
commentator_if_writer_list.append('No')
# 持续创作时间
continuous_writing_days = 'None'
continuous_writing_days_list.append(continuous_writing_days)
# 阅读时长
reading_time = ul_list[0]
reading_time_list.append(reading_time.text)
# 阅读数量
read_books_number = ul_list[1]
read_books_number_list.append(read_books_number.text)
# print(continuous_writing_days, reading_time.text, read_books_number.text)
# 定位到等级和性别的header
page_header = commentator_info_div.ele('.lh40 mb16')
# 等级
commentator_level_dict = page_header.ele('tag:strong').attrs
commentator_level = commentator_level_dict['title'].split()[1]
commentator_level_list.append(commentator_level)
try:
# 性别
gender_dict = page_header.ele('tag:i').attrs
gender = gender_dict['title']
gender_list.append(gender)
except ElementNotFoundError:
print('该用户未显示性别')
gender = 'None'
gender_list.append(gender)
# 获赞
good_div = commentator_info_div.ele('.fr fs0')
good_number_dict = good_div.ele('tag:span').attrs
good_number = good_number_dict['data-num']
good_number_list.append(good_number)
# 定位到注册时间与IP的address
page_address = commentator_info_div.ele('tag:address')
strong_list = page_address.eles('tag:strong')
# 注册时间
joined_date = strong_list[0].text.split()[0]
joined_date_list.append(joined_date)
# IP
ip = strong_list[1]
ip_list.append(ip.text)
# print(commentator_level, gender, good_number, joined_date, ip.text)
print('该用户的信息已获取完毕,页面自动关闭')
commentator_info_page.close()
except Exception as e:
print(f'获取用户[{commentator_name.text}]的信息失败')
def get_comment(novel_name):
# 评论
# 定位评论div,并获取评论列表
comments_div_all = new_detail_page.ele('.j_revWrap')
comments_list = comments_div_all.eles('.^j_pageReviewList')
for comment in comments_list:
# 定位存放单个评论的div
comment_div = comment.ele('.oh pr')
# 评论者
commentator_name = comment_div.ele('tag:a')
commentator_name_list.append(commentator_name.text)
# 评论文本
comment_text = comment_div.ele('tag:p')
comment_text_list.append(comment_text.text)
# 获取评论者的信息
get_commentator_info(commentator_name)
# 在此处将获取到的数据保存到对应的文件
comment_data_filename = f'[{novel_name.text}] comment_text & commentator_info.xlsx'
if os.path.exists(comment_data_filename):
commentator_info_dict = {
'commentator_name': commentator_name_list,
'commentator_level': commentator_level_list,
'commentator_gender': gender_list,
'commentator_if_writer': commentator_if_writer_list,
'commentator_continuous_writing_days': continuous_writing_days_list,
'commentator_reading_time': reading_time_list,
'commentator_read_books_number': read_books_number_list,
'commentator_get_good_number': good_number_list,
'commentator_joined_date': joined_date_list,
'commentator_ip': ip_list,
'comment_text': comment_text_list
}
commentator_info_data = pd.DataFrame(commentator_info_dict)
with pd.ExcelWriter(comment_data_filename, mode='a', if_sheet_exists='overlay') as writer:
commentator_info_data.to_excel(writer, sheet_name='comment&commentator_info', index=False)
print(f'评论者相关信息已写入【{comment_data_filename}】')
else:
print('#' * 10 + f'正在新建【{comment_data_filename}】文件' + '#' * 10)
file_columns = ['commentator_name', 'commentator_level', 'commentator_gender', 'commentator_if_writer',
'commentator_continuous_writing_days',
'commentator_reading_time',
'commentator_read_books_number', 'commentator_get_good_number',
'commentator_joined_date', 'commentator_ip', 'comment_text']
pd.DataFrame(columns=file_columns).to_excel(comment_data_filename,
sheet_name='comment&commentator_info',
index=False)
commentator_info_dict = {
'commentator_name': commentator_name_list,
'commentator_level': commentator_level_list,
'commentator_gender': gender_list,
'commentator_if_writer': commentator_if_writer_list,
'commentator_continuous_writing_days': continuous_writing_days_list,
'commentator_reading_time': reading_time_list,
'commentator_read_books_number': read_books_number_list,
'commentator_get_good_number': good_number_list,
'commentator_joined_date': joined_date_list,
'commentator_ip': ip_list,
'comment_text': comment_text_list
}
commentator_info_data = pd.DataFrame(commentator_info_dict)
with pd.ExcelWriter(comment_data_filename, mode='a', if_sheet_exists='overlay') as writer:
commentator_info_data.to_excel(writer, sheet_name='comment&commentator_info', index=False)
print(f'评论者相关信息已写入【{comment_data_filename}】')
if __name__ == '__main__':
# 用page直接控制浏览器
page = ChromiumPage()
# ranking页面若滚动到底,则第二次运行可注释掉
# scroll_page()
# 定义空列表
# 书名
title_list = []
# 作者名
author_list = []
# 类型名
type_list = []
# 标签名
tag_list = []
# 简介
description_list = []
# 章节数
chapter_list = []
# 阅读量
views_list = []
# 热度
hot_number_list = []
# 星级评分
star_score_list = []
# 评分人数
reviews_number_list = []
# 评论者
commentator_name_list = []
# 评论内容
comment_text_list = []
# 持续更新日期
continuous_writing_days_list = []
# 阅读时长
reading_time_list = []
# 阅读数量
read_books_number_list = []
# 评论者等级
commentator_level_list = []
# 评论者性别
gender_list = []
# 获赞数
good_number_list = []
# 注册日期
joined_date_list = []
# IP地址
ip_list = []
# 是否作者
commentator_if_writer_list = []
# 错误次数
error_list = []
# 找到存放rank数据的div
div = page.ele('.j_rank_wrapper')
# 获取存放rank数据的网页元素列表
section_list = div.eles('.df g_hr pt16 pb16')
# 遍历rank中所有的书
for section in section_list[5:]:
# 标题
title = section.ele('.c_l')
# 获取title中所有属性及值的字典
attrs_dict = title.attrs
title_list.append(title.text)
# 描述
desc = section.ele('.fw400 lh20 fs14 ls0.2 c_s ells _2 mb4')
description_list.append(desc.text)
# 热度
hot_num = section.ele('.vam fw400 lh16 fs12')
hot_number_list.append(hot_num.text)
# print(f'{title.text}的基本信息已获取')
# 用于点击,进入新页面
new_detail_page = title.click.middle(get_tab=True)
new_detail_page.scroll.to_bottom()
time.sleep(1)
"""
该部分代码用于获取小说基本内容
"""
# 分析新页面
# 获取基本信息:类型、章节数、阅读量
div_1 = new_detail_page.ele('.mb12 fw700 lh24 det-hd-detail c_000 fs0')
info_list = div_1.eles('tag:span')
# 类型
type_name = info_list[0].text
type_list.append(type_name)
# 章节数
chapters_number = info_list[1].text.split()[0]
chapter_list.append(chapters_number)
# 阅读量
views_number = info_list[2].text.split()[0]
views_list.append(views_number)
# 作者名称
try:
author_name = new_detail_page.ele('.c_primary').text
author_list.append(author_name)
except ElementNotFoundError:
author_name_div = new_detail_page.ele('.fw700 ell dib vam fs16 fw500')
author_name = author_name_div.ele('tag:span').text
author_list.append(author_name)
# tags
novel_tags_list = []
tags_list = new_detail_page.eles('.m-tag dib vam mb8')
for tag in tags_list:
novel_tags_list.append(tag.text)
tag_list.append(novel_tags_list)
# 星级评分
try:
div_star = new_detail_page.ele('.clearfix mb20 rev-tit')
star_score = div_star.ele('tag:small').text
star_score_list.append(star_score)
except ElementNotFoundError:
star_score = 'Not enough ratings'
star_score_list.append(star_score)
# 评论人数
reviews_number = div_star.ele('tag:i').text
reviews_number_list.append(reviews_number)
# 打印查看结果
# print(type_name, chapters_number, views_number, author_name, novel_tags_list, star_score, reviews_number)
print(f'{title.text}的基本信息已获取')
# 保存数据
novel_rank_info_filename = f'annual_rank_info.xlsx'
if os.path.exists(novel_rank_info_filename):
novel_rank_info_dict = {
'title': title_list,
'type': type_list,
'chapter_number': chapter_list,
'views': views_list,
'author': author_list,
'tags': tag_list,
'star_score': star_score_list,
'reviews_number': reviews_number_list
}
# print(f'novel_rank_info_dic:{novel_rank_info_dict}')
novel_rank_info_data = pd.DataFrame(novel_rank_info_dict)
with pd.ExcelWriter(novel_rank_info_filename, mode='a', if_sheet_exists='overlay') as writer:
novel_rank_info_data.to_excel(writer, sheet_name='novel_rank_info', index=False)
print(f'小说排行相关信息已写入【{novel_rank_info_filename}】')
else:
print('#' * 10 + f'正在新建【{novel_rank_info_filename}】文件' + '#' * 10)
file_cloumns = ['title', 'type', 'chapter_number',
'views', 'author', 'tags', 'star_score', 'reviews_number']
pd.DataFrame(columns=file_cloumns).to_excel(novel_rank_info_filename,
sheet_name='novel_rank_info', index=False)
novel_rank_info_dict = {
'title': title_list,
'type': type_list,
'chapter_number': chapter_list,
'views': views_list,
'author': author_list,
'tags': tag_list,
'star_score': star_score_list,
'reviews_number': reviews_number_list
}
# print(f'novel_rank_info_dic:{novel_rank_info_dict}')
novel_rank_info_data = pd.DataFrame(novel_rank_info_dict)
with pd.ExcelWriter(novel_rank_info_filename, mode='a', if_sheet_exists='overlay') as writer:
novel_rank_info_data.to_excel(writer, sheet_name='novel_rank_info', index=False)
print(f'小说排行相关信息已写入【{novel_rank_info_filename}】')
# new_detail_page.close()
"""
# 下方代码用于获取评论内容
"""
try:
# 找到Next按钮
# next_button = new_detail_page.ele('.ui-page ui-page-next')
next_button = new_detail_page.ele('.ui-page ui-page-next')
# 当前页信息
# current_page = new_detail_page.ele('.ui-page ui-page-current').text
# 获取Next按钮中的所有attributes
next_button_dict = next_button.attrs
control_number = len(next_button_dict)
while True:
# 若Next按钮不能点击,则只有一个attributes
if control_number != 1:
time.sleep(1)
# 若Next中的attributes不唯一,则爬取评论后点击下一页
get_comment(novel_name=title)
next_button = new_detail_page.ele('.ui-page ui-page-next')
next_button_dict = next_button.attrs
control_number = len(next_button_dict)
next_button.click()
# print(f'已经获取{title.text}的第{current_page}页的评论数据,并点击进入下一页')
print(f'已经获取{title.text}当前页的评论数据,并点击进入下一页')
else:
# 若Next只有一个attributes,则代表Next不能再点击,则爬取评论后,关闭该页面
print('该文档无下一页')
get_comment(novel_name=title)
print('所有评论已获取完成,自动关闭当前页面')
new_detail_page.close()
break
except ElementNotFoundError:
print('该文档无下一页')
get_comment(novel_name=title)
print('所有评论已获取完成,自动关闭当前页面')
new_detail_page.close()
- 以下为尝试重构代码:
```python
# -*- coding: utf-8 -*-
"""
@ Project : PycharmProjects
@ File : test04.py
@ IDE : PyCharm
@ Date : 2024/7/17 18:21
"""
import os
import time
import pandas as pd
from DrissionPage import ChromiumPage
from DrissionPage.errors import ElementNotFoundError
class SpiderWebnovel:
def __init__(self):
"""
定义全局变量
"""
# 用page直接控制浏览器
self.page = ChromiumPage()
# 找到存放rank数据的div
self.div = self.page.ele('.j_rank_wrapper')
# 获取存放rank数据的网页元素列表
self.section_list = self.div.eles('.df g_hr pt16 pb16')
# 定义空列表
# 书名
self.title_list = []
# 作者名
self.author_list = []
# 类型名
self.type_list = []
# 标签名
self.tag_list = []
# 简介
# self.description_list = []
# 章节数
self.chapter_list = []
# 阅读量
self.views_list = []
# 热度
self.hot_number_list = []
# 星级评分
self.star_score_list = []
# 评分人数
self.reviews_number_list = []
# 评论者
self.commentator_name_list = []
# 评论内容
self.comment_text_list = []
# 是否为小说创作者
self.commentator_if_writer_list = []
# 持续更新日期
self.continuous_writing_days_list = []
# 阅读时长
self.reading_time_list = []
# 阅读数量
self.read_books_number_list = []
# 评论者等级
self.commentator_level_list = []
# 评论者性别
self.gender_list = []
# 获赞数
self.good_number_list = []
# 注册日期
self.joined_date_list = []
# IP地址
self.ip_list = []
def scroll_page(self):
"""
该部分用于进入排行ranking页面,并操控页面持续滚动,直到加载全部的小说信息
"""
# 导入url
url = 'https://www.webnovel.com/ranking/novel/season/power_rank'
# 访问网页
self.page.get(url)
# 页面每次加载20个,需要循环滚动
# 进入循环(该循环滚动速度较慢,可规避反爬虫)
# i用于计数
i = 0
while True:
try:
# 尝试寻找End元素
# 找不到则执行滚动
# 定位榜单底部END标识
self.page.ele('.g_no_result')
end_button = self.page.ele('.g_no_result')
print(f'{end_button.text}:页面已滚动到底部')
# 找到End元素后,打印进度,并结束循环
break
# 预期错误为“找不到该元素”
except ElementNotFoundError:
i += 1
# 打印计数,主要用于显示进度
print(f'已滚动第{i}次')
# 操作页面滚动到网页中的等待元素——该元素不可见
# self.page.actions.scroll(on_ele='.g_loading j_rank_loading _on')
# 二选一
self.page.scroll.to_bottom()
def get_novel_info(self):
"""
该部分代码用于逐个点击小说,进入小说详情页
获取小说的基本信息:类型、章节数、阅读量
"""
def get_comment(self, comments_list):
# comments_list = self.new_detail_page.eles('.^j_pageReviewList')
for comment in comments_list:
# 定位存放单个评论的div
comment_div = comment.ele('.oh pr')
# 评论者
self.commentator_name = comment_div.ele('tag:a')
self.commentator_name_list.append(self.commentator_name.text)
# 评论文本
comment_text = comment_div.ele('tag:p')
self.comment_text_list.append(comment_text.text)
# 获取评论者的信息
self.get_commentator_info()
# 在此处将获取到的数据保存到对应的文件
self.comment_data_filename = f'[{self.title.text}] comment_text & commentator_info.xlsx'
if os.path.exists(self.comment_data_filename):
self.save_commentator_info()
else:
print('#' * 10 + f'正在新建【{self.comment_data_filename}】文件' + '#' * 10)
file_columns = ['commentator_name', 'commentator_level', 'commentator_gender', 'commentator_if_writer',
'commentator_continuous_writing_days',
'commentator_reading_time',
'commentator_read_books_number', 'commentator_get_good_number',
'commentator_joined_date', 'commentator_ip', 'comment_text']
pd.DataFrame(columns=file_columns).to_excel(self.comment_data_filename,
sheet_name='comment&commentator_info',
index=False)
self.save_commentator_info()
def get_commentator_info(self):
"""
以下代码用于获取评论者的相关信息
"""
# time.sleep(1)
print('*' * 10 + f'正在获取{self.commentator_name.text}的用户信息' + '*' * 10)
# 模拟鼠标中键点击,以新标签页打开用户主页,并获取数据
commentator_info_page = self.commentator_name.click.middle(get_tab=True)
# 定位到用户数据的div中
# 定位到用户数据的div中
commentator_info_div = commentator_info_page.ele('.header mb40')
# 定位到存放阅读时长、阅读数量的ul
page_ul = commentator_info_div.ele('tag:ul')
ul_list = page_ul.eles('tag:strong')
if len(ul_list) == 3:
print('该用户是小说作者')
self.commentator_if_writer_list.append('Yes')
# 持续创作时间
continuous_writing_days = ul_list[0]
self.continuous_writing_days_list.append(continuous_writing_days.text)
# 阅读时长
reading_time = ul_list[1]
self.reading_time_list.append(reading_time.text)
# 阅读数量
read_books_number = ul_list[2]
self.read_books_number_list.append(read_books_number.text)
# print(continuous_writing_days.text, reading_time.text, read_books_number.text)
elif len(ul_list) == 2:
print('该用户是普通读者')
self.commentator_if_writer_list.append('No')
# 持续创作时间
continuous_writing_days = 'None'
self.continuous_writing_days_list.append(continuous_writing_days)
# 阅读时长
reading_time = ul_list[0]
self.reading_time_list.append(reading_time.text)
# 阅读数量
read_books_number = ul_list[1]
self.read_books_number_list.append(read_books_number.text)
# print(continuous_writing_days, reading_time.text, read_books_number.text)
# 定位到等级和性别的header
page_header = commentator_info_div.ele('.lh40 mb16')
# 等级
commentator_level_dict = page_header.ele('tag:strong').attrs
commentator_level = commentator_level_dict['title'].split()[1]
self.commentator_level_list.append(commentator_level)
try:
# 性别
gender_dict = page_header.ele('tag:i').attrs
gender = gender_dict['title']
self.gender_list.append(gender)
except ElementNotFoundError:
print('该用户未显示性别')
gender = 'None'
self.gender_list.append(gender)
# 获赞
good_div = commentator_info_div.ele('.fr fs0')
good_number_dict = good_div.ele('tag:span').attrs
good_number = good_number_dict['data-num']
self.good_number_list.append(good_number)
# 定位到注册时间与IP的address
page_address = commentator_info_div.ele('tag:address')
strong_list = page_address.eles('tag:strong')
# 注册时间
joined_date = strong_list[0].text.split()[0]
self.joined_date_list.append(joined_date)
# IP
ip = strong_list[1]
self.ip_list.append(ip.text)
# print(commentator_level, gender, good_number, joined_date, ip.text)
print('*' * 10 + '该用户的信息已获取完毕,页面自动关闭' + '*' * 10)
commentator_info_page.close()
def save_rank_info(self):
novel_rank_info_dict = {
'title': self.title_list,
'type': self.type_list,
'chapter_number': self.chapter_list,
'views': self.views_list,
'author': self.author_list,
'tags': self.tag_list,
'star_score': self.star_score_list,
'reviews_number': self.reviews_number_list
}
# print(f'novel_rank_info_dic:{novel_rank_info_dict}')
novel_rank_info_data = pd.DataFrame(novel_rank_info_dict)
with pd.ExcelWriter(self.novel_rank_info_filename, mode='a', if_sheet_exists='overlay') as writer:
novel_rank_info_data.to_excel(writer, sheet_name='novel_rank_info', index=False)
print(f'小说排行相关信息已写入【{self.novel_rank_info_filename}】')
def save_commentator_info(self):
commentator_info_dict = {
'commentator_name': self.commentator_name_list,
'commentator_level': self.commentator_level_list,
'commentator_gender': self.gender_list,
'commentator_if_writer': self.commentator_if_writer_list,
'commentator_continuous_writing_days': self.continuous_writing_days_list,
'commentator_reading_time': self.reading_time_list,
'commentator_read_books_number': self.read_books_number_list,
'commentator_get_good_number': self.good_number_list,
'commentator_joined_date': self.joined_date_list,
'commentator_ip': self.ip_list,
'comment_text': self.comment_text_list
}
commentator_info_data = pd.DataFrame(commentator_info_dict)
with pd.ExcelWriter(self.comment_data_filename, mode='a', if_sheet_exists='overlay') as writer:
commentator_info_data.to_excel(writer, sheet_name='comment&commentator_info', index=False)
print(f'评论者相关信息已写入【{self.comment_data_filename}】')
def run(self):
# ranking页面若滚动到底,则第二次运行可注释掉
# self.scroll_page()
# 遍历rank中所有的书
for section in self.section_list:
# 标题
self.title = section.ele('.c_l')
self.title_list.append(self.title.text)
new_detail_page = self.title.click.middle(get_tab=True)
new_detail_page.scroll.to_bottom()
print(f'已进入【{self.title.text}】详情页,并滚动到底部')
print('=' * 10 + f'正在爬取【{self.title.text}】的评论信息' + '=' * 10)
try:
# 找到Next按钮
next_button = new_detail_page.ele('.ui-page ui-page-next')
# 当前页信息
# current_page = new_detail_page.ele('.ui-page ui-page-current').text
# 获取Next按钮中的所有attributes
control_number = len(next_button.attrs)
comments_list = new_detail_page.eles('.^j_pageReviewList')
# 下方代码用于获取评论内容
while True:
# 若Next按钮不能点击,则只有一个attributes
if control_number != 1:
# 若Next中的attributes不唯一,则爬取评论后点击下一页
self.get_comment(comments_list)
next_button.click()
comments_list = new_detail_page.eles('.^j_pageReviewList')
# print(f'已经获取{title.text}的第{current_page}页的评论数据,并点击进入下一页')
print('*' * 10 + f'已经获取当前页的评论数据,并点击进入下一页' + '*' * 10)
else:
# 若Next只有一个attributes,则代表Next不能再点击,则爬取评论后,关闭该页面
print('该评论区无下一页')
comments_list = new_detail_page.eles('.^j_pageReviewList')
self.get_comment(comments_list)
print('*' * 10 + '所有评论已获取完成,自动关闭当前页面' + '*' * 10)
new_detail_page.close()
break
except ElementNotFoundError:
print('该评论区无下一页')
comments_list = new_detail_page.eles('.^j_pageReviewList')
self.get_comment(comments_list)
print('*' * 10 + '所有评论已获取完成,自动关闭当前页面' + '*' * 10)
new_detail_page.close()
if __name__ == '__main__':
spider = SpiderWebnovel()
spider.run()