from bs4 import BeautifulSoup #网页解析 获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error#制定URL 获取网页数据
import xlwt #进行excel操作
import sqlite3 #进行SQLite数据库操作
def main():
baseurl = "https://movie.douban.com/top250?start="
#爬取网页
dateList = getDate(baseurl)
savepath = ".\\豆瓣电影top250.xls"
# 保存数据
saveDate()
askURL("https://movie.douban.com/top250?start=")
#爬取网页
def getDate(baseurl):
datelist = []
# 逐一解析数据
return dateList
#得到一个指定的一个url的网页内容
def askURL(url):
head = {
"User - Agent":" Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 91.0.4472.77Safari / 537.36"
}
#用户代理,表示告诉豆瓣服务器 我们是什么类型的机器 浏览器(告诉机器,我们能接收到什么水平的)
request=urllib.request.Request(url,headers=head)
html = ""
try:
response = urllib.request.urlopen(request)
html = response.read(),decode("utf-8")
print(html)
except urllib.error.URLError as e:
if hasattr(e,"code"):
print(e.code)
if hasattr(e,"reason"):
print(e.reason)
#保存数据
def saveDate(savepath):
pass
if __name__ == '__main__': #当程序执行时
#调用函数
main()
出现这个错误
Traceback (most recent call last):
File "F:\python\douban\venv\spider.py", line 60, in <module>
main()
File "F:\python\douban\venv\spider.py", line 15, in main
dateList = getDate(baseurl)
File "F:\python\douban\venv\spider.py", line 27, in getDate
return dateList
NameError: name 'dateList' is not defined