import time
import urllib
import urllib.parse  # explicit: urlencode is used below; don't rely on request importing it

from urllib import request
def loadpage(fullurl, filename):
    """Download one page and return the raw response body as bytes.

    Args:
        fullurl: complete URL to request (base URL plus query string).
        filename: display name, used only for the progress message.

    Returns:
        bytes: the raw HTTP response body.
    """
    print("正在下载:", filename)
    # Spoof a desktop-browser User-Agent so the server serves the normal page.
    # NOTE: the original literal was split across lines without quoting
    # (a syntax error); it is reassembled here via implicit concatenation.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/70.0.3538.25 Safari/537.36 "
                      "Core/1.70.3641.400 QQBrowser/10.4.3284.400"
    }
    req = request.Request(fullurl, headers=header)
    # Context manager closes the connection instead of leaking it.
    with request.urlopen(req) as response:
        return response.read()
def writepage(html, filename):
    """Persist downloaded page content to a local file.

    Args:
        html: raw page body as bytes.
        filename: destination path for the saved page.
    """
    print("正在保存:", filename)
    # The payload is bytes, so the file must be opened in binary mode ("wb").
    with open(filename, "wb") as out:
        out.write(html)
    print("------------------")
def tiebaSpider(url, begin, end):
    """Crawl result pages *begin* through *end* (inclusive) and save each one.

    Args:
        url: base search URL (already carrying the kw query string).
        begin: first page number, 1-based.
        end: last page number, inclusive.
    """
    for page in range(begin, end + 1):
        # Tieba paginates by post offset: 50 posts per page.
        offset = (page - 1) * 50
        # Build the per-page request URL (pn must be rendered as text).
        fullurl = "{}&pn={}".format(url, offset)
        # Destination file for this page's HTML.
        filename = "e:/nsxt/第{}页.html".format(page)
        content = loadpage(fullurl, filename)  # fetch the page
        writepage(content, filename)           # persist it locally
# Script entry point.
# BUG FIX 1: the original guard read `if name == 'main'` — `name` is undefined
# (NameError); the dunder form is required.
# BUG FIX 2: the encoded query was built into `a` but `tiebaSpider` was called
# with the bare base `url`, so the requested tieba name was silently ignored.
if __name__ == '__main__':
    kw = input("请输入要爬取的贴吧名:")
    begin = int(input("请输入起始页:"))
    end = int(input("请输入结束页:"))
    url = "http://tieba.baidu.com/f?"
    # URL-encode the tieba name so non-ASCII characters are query-safe.
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, begin, end)
    # Keep the console window open briefly so the final output is readable.
    time.sleep(8)