问题已在代码相应位置
import requests
from lxml import etree
import time
import csv
import codecs
import unicodedata
# 获得每一页的html,一共10页:
def get_source(url):
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
}
respose=requests.get(url,headers=headers)
# print(respose.text.encode('utf-8'))
respose.encoding='utf-8'
# print(respose.text)
html=etree.HTML(respose.text)
# print(html)
return (html)
# 把每一页的需要的电影名 评分 导演 名句等获取,然后形成一个列表里面是字典的形式返回:[{dict1},{dict2}...]
def get_date(html):
all_li=html.xpath('//*[@id="content"]/div/div[1]/ol/li')
# print(all_li)
movielist=[]
for li in all_li:
dict={}
title=li.xpath('div/div[2]/div[1]/a/span/text()')[0]
# print(title)
# print(title[0]+'---'+title[1])
rate_num=li.xpath('div/div[2]/div[2]/div/span[2]/text()')[0]
# print(rate_num)
inq=li.xpath('div//span[@class="inq"]/text()') #### 问题1:为什么这里写了[0]后,就会运行到保存完成第六次后报错????
# print(inq)
movieinfo1=li.xpath('div/div[2]/div[2]/p[1]/text()')[0].strip().split("/")[0]
movieinfo=unicodedata.normalize('NFKC', movieinfo1)
# print(movieinfo)
dict['标题']=title
dict['电影详情']=movieinfo
dict['评分']=rate_num
dict['名言']=inq
# print(dict)
movielist.append(dict)
# print(movielist)
return movielist
time.sleep(5)
# return (title,rate_num,inq,mioveinfo)
# 保存每一个返回的列表,存到csv文件里去
def sverfile(movielist):
with codecs.open('.豆瓣top250信息.csv','w','utf-8') as f:
filehaed=csv.DictWriter(f,fieldnames=['标题','电影详情','评分','名言'])
filehaed.writeheader() ###写入csv表头
for eval in movielist:
# print(eval)
filehaed.writerow(eval)
print("保存完成")
for i in range(10):
movielist1=[]
url="https://movie.douban.com/top250?start={}&filter=".format(i*25)
html = get_source(url)
movielist1=get_date(html)
print(movielist1)
sverfile(movielist1)
time.sleep(5)
# 问题2:为什么保存后打开excle文件后全是乱码,而且只保存了26行