import requests
from lxml import etree
import time
# Scrape the Douban movie Top 250 listing page by page and append one
# comma-separated line per entry to the output file:
#     title,rating,pj,href
# `pj` is the text of the fourth <span> in the rating row (presumably the
# number of ratings -- confirm against the live page markup).

PAGE_URL = "https://movie.douban.com/top250?start={}&filter="
PAGE_COUNT = 10
PAGE_SIZE = 25          # the `start` query parameter advances by this per page
REQUEST_TIMEOUT = 10    # seconds; without it a stalled connection blocks forever
ITEM_DELAY = 0.5        # seconds to pause after each extracted entry
OUTPUT_PATH = r"书单"

# Browser-like User-Agent sent with every request; built once, not per page.
HEADERS = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400'
}


def _first(node, path, default=""):
    """Return the first XPath match of *path* under *node*, or *default* if none."""
    matches = node.xpath(path)
    return matches[0] if matches else default


for a in range(PAGE_COUNT):
    url = PAGE_URL.format(a * PAGE_SIZE)

    # Step 1: download the page. Fail loudly on an HTTP error instead of
    # silently parsing an error page that contains no entries.
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # Step 2: extract the useful fields with XPath.
    html = etree.HTML(response.text)
    if html is None:
        # lxml yields None for an empty document; nothing to extract.
        continue
    lis = html.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')
    if not lis:
        continue

    # Step 3: save the data. The file is opened once per page rather than
    # once per entry; append mode keeps results from earlier pages and runs.
    with open(OUTPUT_PATH, 'a', encoding="utf-8") as f:
        for li in lis:
            # A missing node becomes an empty field instead of an IndexError.
            title = _first(li, "./div/div[2]/div[1]/a/span[1]/text()")
            rating_num = _first(li, "./div/div[2]/div[2]/div/span[2]/text()")
            pj = _first(li, "./div/div[2]/div[2]/div/span[4]/text()")
            href = _first(li, "./div/div[2]/div[1]/a/@href")

            # Kept from the original: pausing after every entry also spaces
            # out the page requests.
            time.sleep(ITEM_DELAY)

            # NOTE: fields are joined with bare commas and not quoted, so a
            # value that itself contains a comma yields an ambiguous line.
            f.write("{},{},{},{}\n".format(title, rating_num, pj, href))