import requests
from lxml import etree
import time
import pandas as pd
# Listing URL template; ``start`` is the zero-based offset of the first movie.
TOP250_URL = 'https://movie.douban.com/top250?start={}'
# Offset step between consecutive listing pages (start=0, 25, 50, ...).
PAGE_SIZE = 25
REQUEST_HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
# NOTE(review): pandas >= 2.0 no longer ships an ``.xls`` writer (xlwt engine
# removed); on such versions this path makes ``to_excel`` fail and the error
# is reported below. Switch to ``.xlsx`` if that is acceptable -- confirm.
OUTPUT_PATH = r"C:\users\admin\desktop\豆瓣电影Top250.xls"
# Placeholder stored when a movie lacks a field (e.g. no one-line quote).
MISSING = "Nan"
# Output columns, in order: name, URL, score, number of ratings, quote,
# and the "year / country / genre" line.
COLUMNS = ["aa", "bb", "cc", "dd", "ee", "ff"]

# One <li> per movie; the field paths below are relative to that <li>, so the
# values of one movie always stay on the same row.
ITEM_XPATH = '//*[@id="content"]/div/div[1]/ol/li'
FIELD_XPATHS = (
    'div/div[2]/div[1]/a/span[1]/text()',    # aa: movie name
    'div/div[2]/div[1]/a/@href',             # bb: movie URL
    'div/div[2]/div[2]/div/span[2]/text()',  # cc: score
    'div/div[2]/div[2]/div/span[4]/text()',  # dd: number of ratings
    'div/div[2]/div[2]/p[2]/span/text()',    # ee: one-line quote (may be absent)
    'div/div[2]/div[2]/p[1]/text()[2]',      # ff: year / country / genre line
)


def page_url(page: int) -> str:
    """Return the listing URL for the zero-based page index *page*."""
    return TOP250_URL.format(page * PAGE_SIZE)


def first_or_missing(matches: list) -> str:
    """Return the first XPath match, or the ``MISSING`` placeholder if none."""
    return matches[0] if matches else MISSING


def parse_page(tree) -> list:
    """Extract one tuple of field values per movie from a parsed listing page.

    *tree* is the element returned by ``etree.HTML``. Each field is looked up
    relative to the movie's own ``<li>``, so a movie without a quote yields
    ``MISSING`` in that slot instead of shifting the values of later movies.
    """
    return [
        tuple(first_or_missing(item.xpath(path)) for path in FIELD_XPATHS)
        for item in tree.xpath(ITEM_XPATH)
    ]


def rows_to_frame(rows: list) -> pd.DataFrame:
    """Build the output table from the per-movie tuples (one row per movie)."""
    return pd.DataFrame(rows, columns=COLUMNS)


def main(pages: int = 2) -> None:
    """Scrape the first *pages* listing pages and save them to ``OUTPUT_PATH``.

    Rows are accumulated across pages and the whole table is rewritten after
    every page, so the file holds all pages fetched so far rather than only
    the most recent one.
    """
    rows = []
    for a in range(pages):
        print("正在爬取第" + str(a) + "页的数据")
        # The timeout keeps a stalled connection from hanging the script.
        data = requests.get(page_url(a), headers=REQUEST_HEADERS, timeout=10)
        rows.extend(parse_page(etree.HTML(data.text)))

        df = rows_to_frame(rows)
        try:
            # ``encoding`` is intentionally not passed: recent pandas releases
            # reject that argument for ``to_excel``.
            df.to_excel(excel_writer=OUTPUT_PATH)
        except Exception as exc:
            print("当页数据写入失败")
            # Surface the cause so a failed save can actually be diagnosed.
            print("{}: {}".format(type(exc).__name__, exc))

        # Pause between requests to stay polite to the server.
        time.sleep(1)


if __name__ == "__main__":
    main()