为何我写的爬虫在爬取豆瓣 Top250 电影时，重复爬取了前 25 部电影 10 次？
def get_first_text(data_list):
    """Return the first item of ``data_list`` with surrounding whitespace removed.

    The callers in this script pass the list returned by an ``xpath(...)``
    query, so a query that matched nothing arrives here as an empty list.

    Args:
        data_list: Indexable collection whose first item provides ``strip()``.

    Returns:
        ``data_list[0].strip()``, or the fixed placeholder string when there
        is no usable first item.
    """
    try:
        return data_list[0].strip()
    except (IndexError, KeyError, TypeError, AttributeError):
        # Empty result, non-indexable value, or an item without strip():
        # fall back to the placeholder. Unlike a bare ``except:``, this does
        # not swallow KeyboardInterrupt or SystemExit.
        return "未获取到数据"
# Page URLs for the Top 250 list: ten pages, offsets 0, 25, ..., 225.
# The f prefix is essential. Without it "{i*25}" is not interpolated and stays
# in the query string as literal text, so all ten entries are the same URL.
urls = [f'https://movie.douban.com/top250?start={i * 25}&filter=' for i in range(10)]
# `count` starts at 1; nothing in this part of the script reads or updates it.
count = 1

# Relative XPath of every field inside one <li> entry, listed in the same
# order in which the values are unpacked in the inner loop below.
field_xpaths = (
    './div/div[2]/div[1]/a/span[1]/text()',    # -> title
    './div/div[2]/div[1]/a/@href',             # -> src (href of the entry link)
    './div/div[2]/div[2]/p[1]/text()',         # -> director
    './div/div[2]/div[2]/div/span[2]/text()',  # -> score
    './div/div[2]/div[2]/div/span[4]/text()',  # -> comment
    './div/div[2]/div[2]/p[2]/span/text()',    # -> summary
)

# Download each page in turn and extract the fields of every list entry.
for url in urls:
    print(f"爬取:{url}")
    res = requests.get(url, headers=headers, timeout=10)
    # The status code is only printed; it is not checked before parsing.
    print(res.status_code)
    html = etree.HTML(res.text)
    lis = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
    # Number of <li> entries matched on this page.
    print(len(lis))
    for li in lis:
        # Each query result goes through get_first_text, which yields the
        # stripped first match or its placeholder when nothing matched.
        title, src, director, score, comment, summary = (
            get_first_text(li.xpath(path)) for path in field_xpaths
        )