原始代码
import requests
import lxml.html
import csv
doubanUrl = 'https://movie.douban.com/top250?start={}&filter='
def getSource(url):
response = requests.get(url)
response.encoding = 'utf-8'
return response.content
def getEveryItem(source):
selector = lxml.html.document_fromstring(source)
movieitemlist = selector.Xpath('//div[@class="info"]')
movieList = []
for eachMovie in movieitemlist:
movieDict = {}
title = eachMovie.Xpath('div[@class="hd"/a/span[@class="title"]/text()')
otherTitle = eachMovie.Xpath('div[@class="hd"/a/span[@class="other"]/text()')
link = eachMovie.Xpath('div[@class="hd"/a/@href')
star = eachMovie.Xpath('div[@class="bd"/div[@class="star"]/span[@class="rating_num"]/text()')
quote = eachMovie.Xpath('div[@class="bd"/p[@class="quote"]/span/text()')
movieDict['title'] = ''.join(title+otherTitle)
movieDict['url'] = link
movieDict['star'] = star
movieDict['quote'] = quote
print(movieDict)
movieList.append(movieDict)
return movieList
def writeData(movieList):
with open('MovieDouban.csv','w',encoding='UTF-8') as f:
writer = csv.DictWriter(f,fieldnames=['title','star','quote','url'])
writer.writeheader()
for each in movieList:
write.writerow(each)
if __name__=='__main__':
movieList = []
for i in range(10):
pageLink = doubanUrl.format(i * 25)
print(pageLink)
source = getSource(pageLink)
movieList += getEveryItem(source)
#movieList = movieList + getEveryItem(source)
print(movieList[:10])
writeData(movieList)
报错如下
C:\Users\abc\AppData\Local\Programs\Python\Python38-32\python.exe C:/Users/abc/.PyCharmCE2019.3/config/scratches/scratch_1.py
https://movie.douban.com/top250?start=0&filter=
Traceback (most recent call last):
File "C:/Users/abc/.PyCharmCE2019.3/config/scratches/scratch_1.py", line 63, in <module>
movieList += getEveryItem(source)
File "C:/Users/abc/.PyCharmCE2019.3/config/scratches/scratch_1.py", line 18, in getEveryItem
selector = lxml.html.document_fromstring(source)
File "C:\Users\abc\AppData\Local\Programs\Python\Python38-32\lib\site-packages\lxml\html\__init__.py", line 763, in document_fromstring
raise etree.ParserError(
lxml.etree.ParserError: Document is empty
Process finished with exit code 1
系统报错该怎么解决?