# 根据注释写代码 (write the code according to the comments below)
import requests
from lxml import etree
import time
import csv
import re
class Crawl(object):
    """Scrape the Douban Top-250 movie list page by page into data.csv."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        # Create data.csv to store the scraped records. (1 point)
        # newline='' stops csv.writer from emitting blank lines on Windows.
        self.f = open('data.csv', 'w', encoding='utf-8', newline='')
        # Header must match the item keys written in parse_next: the original
        # listed PPROTAGONIST twice; the seventh column is DIRECTOR.
        row = "NAME,SCORE,VOTES,RELEASE_DATE,REGIONS,GENRES,DIRECTOR,PPROTAGONIST\n"
        self.f.write(row)
        # Offset of the first movie on the current page (0, 25, 50, ...).
        self.page = 0

    def parse(self, url):
        """Fetch one listing page; recurse to the next page while data remains."""
        response = requests.get(url=url, headers=self.headers)
        parse_html = etree.HTML(response.text)
        # Check whether the page still contains movie entries. (1 point)
        li_list = parse_html.xpath('//ol[@class="grid_view"]/li')
        if len(li_list):
            self.parse_next(parse_html)
            # Build the next page's URL: Douban pages step by 25. (1 point)
            self.page += 25
            url = "https://movie.douban.com/top250?start={}&filter=".format(self.page)
            time.sleep(1)  # be polite: throttle requests
            self.parse(url)
        else:
            # Empty page -> past the last page; stop the recursion.
            return

    def parse_next(self, parse_html):
        """Extract every movie on one page and save each as a CSV row."""
        # Parse the page: Title, RatingNum, Number (votes), Info1. (4 points)
        Title = parse_html.xpath('//div[@class="hd"]/a/span[1]/text()')
        RatingNum = parse_html.xpath('//div[@class="star"]/span[@class="rating_num"]/text()')
        Number = parse_html.xpath('//div[@class="star"]/span[4]/text()')
        Info1 = parse_html.xpath('//div[@class="bd"]/p[1]/text()')
        Info = [x.strip() for x in Info1 if x.strip() != '']
        # Info alternates: director/cast line, then year/region/genre line.
        Infos1 = [Info[i] for i in range(0, len(Info), 2)]
        # Was range(1, len(Info)+1, 2): IndexError when len(Info) is odd.
        Infos2 = [Info[i] for i in range(1, len(Info), 2)]
        for title, ratingNum, number, info1, info2 in zip(Title, RatingNum, Number, Infos1, Infos2):
            item = {}
            info_all = info2.strip().split('/')
            info_time, info_state = info_all[:2]
            info_type = info_all[-1]
            item['NAME'] = title
            # Clean the score string of surrounding whitespace. (1 point)
            item['SCORE'] = ratingNum.strip()
            item['VOTES'] = re.findall(r'(\d+)', number)[0]
            item['RELEASE_DATE'] = re.findall(r'(\d+)', info_time)[0]
            item['REGIONS'] = "".join(re.findall(r'([^\d()])', info_state)).strip()
            item['GENRES'] = info_type.strip()
            item['DIRECTOR'] = "".join(re.findall(r'导演:(.*)\xa0', info1)).strip()
            item['PPROTAGONIST'] = "".join(re.findall(r'主演:(.*)', info1))
            self.save_data(item)

    def save_data(self, item):
        """Append one movie record to the open CSV file."""
        f = csv.writer(self.f)
        f.writerow(item.values())

    def main(self):
        """Entry point: start crawling at the current page offset."""
        url = "https://movie.douban.com/top250?start={}&filter=".format(self.page)
        self.parse(url)
        self.f.close()
        print("\n=====数据采集完成=====")
# Script entry guard: the original tested the bare names `name`/"main",
# which raises NameError and never matches — must be __name__/"__main__".
if __name__ == "__main__":
    crawl = Crawl()
    crawl.main()