import urllib.request,urllib.error
from bs4 import BeautifulSoup
import re
import xlwt
import sqlite3
def main():
baseurl="https://movie.douban.com/top250?star="
#1.爬取网页
datalist=getData(baseurl)
#2.逐一解析网页
#3.保存网页
askURL("https://movie.douban.com/top250?star=")
#爬取网页
def getData(baseurl):
datalist=[]
for i in range(0,10):
url=baseurl+str(i*25)
html=askURL(url) #保存
#2.逐一解析网页
soup=BeautifulSoup(html,"html.parser") #(形成对象 )(BeautifulSoup有2个属性,一个是要解析的文件,一个是解析器)
for item in soup.find_all('div ',class_="item"):
# print(item)
return datalist #调用datalist="",调用了值,但是要有返回值
#得到指定一个URL的网页内容
def askURL(url):
head={ #模拟浏览器头部信息,向豆瓣服务器发送信息
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
}
request=urllib.request.Request(url,headers=head) #发送请求
html=""
try:
response=urllib.request.urlopen(request)
html=response.read().decode("utf-8")
print(html)
except urllib.error.URLError as e:
if hasattr(e,"code"):
print(e.code) #编码问题
if hasattr(e,"reason"):
print(e.reason) #错误原因
return html
if __name__=="__main__": #当程序执行时#程序入口,确定程序从哪执行
#调用函数
main()
爬的豆瓣信息出来了但也有报错
报错:
Traceback (most recent call last):
File "f:/爬虫/1/爬取豆瓣.py", line 53, in <module>
main()
File "f:/爬虫/1/爬取豆瓣.py", line 13, in main
datalist=getData(baseurl)
File "f:/爬虫/1/爬取豆瓣.py", line 28, in getData
soup=BeautifulSoup(html,"html.parser") #(形成对象 )(BeautifulSoup有2个属性,一个是要解析的文件,一个是解析器)
File "D:\python\lib\site-packages\bs4\__init__.py", line 310, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'NoneType' has no len()