为什么爬出来没有数据?
import pandas as pd
import requests
import re
import time
# Scrape the reviews of one Douban Read e-book page by page (pages 1-4) and
# save every extracted comment into a single Excel file.
data_list = []

# The request URL is assembled as: first + page number + last.
first = 'https://read.douban.com/ebook/57468782/comments?page='
last = '&commentType=Review&sort=SCORE_DESC'

# Request headers, including the cookie. Built once, because they are the
# same for every page.
headers = {
    # Which browser the request claims to come from.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63',
    # Page the request is issued from; other sites may expect something else.
    'referer': 'https://read.douban.com/ebook/57468782/comments?page=1&commentType=Review&sort=SCORE_DESC',
    # Identifies the visitor (guest or registered user); a cookie copied from
    # a logged-in session is recommended.
    # NOTE(review): this is a hard-coded session cookie -- presumably it
    # expires and must be refreshed by hand; confirm before relying on it.
    'cookie':'bid=CtjGQQts6qw; _ga=GA1.1.854662765.1661781900; _ga=GA1.3.854662765.1661781900; _gid=GA1.3.317441563.1661781904; __gads=ID=d5118ecfd9b7e0af-2274d7276ad60026:T=1661830434:RT=1661830434:S=ALNI_Mal2pDexGkPbLkfBDTGaasIQBwdVg; _pk_ses.100001.a7dd=*; __gpi=UID=0000093d4aa6ff27:T=1661830434:RT=1661860990:S=ALNI_MaJaflHC8_SKOY1tp75x72g4tBu-g; _ga_RXNMP372GL=GS1.1.1661860989.7.1.1661861051.59.0.0; _pk_id.100001.a7dd=684c82dba8991abc.1661781900.5.1661861051.1661850445.',
}

# Compiled once outside the loop. Captures whatever sits between
# `content":"` and `"title"` in the raw response text; the original author
# expected the comments to be embedded as JSON in the response.
comment_pattern = re.compile('content":"(.*?)"title"')

for i in range(1, 5):
    print("正在爬取第" + str(i) + "页")
    url = first + str(i) + last
    try:
        # The timeout keeps a stalled connection from hanging the script.
        response = requests.get(url, headers=headers, timeout=10)
        # Report HTTP error statuses (403, 404, ...) as a failed page instead
        # of silently searching an error page for comments.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are handled; anything else (including
        # Ctrl-C) propagates so real bugs are not hidden.
        print("本页爬取失败", exc)
    else:
        result = comment_pattern.findall(response.text)
        if not result:
            # Make an empty page visible: the request succeeded but the
            # pattern matched nothing in the returned text.
            print("本页未匹配到任何评论")
        data_list.extend(result)
    # Pause between requests to avoid hitting the server too quickly.
    time.sleep(5)

# One column holding every extracted comment, written to an Excel workbook.
df = pd.DataFrame({"评论": data_list})
df.to_excel("评论_汇总.xlsx")