# coding=utf-8
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
# Pre-compiled patterns applied to the HTML text of one "comment-item" <div>.
# Each pattern has exactly one capture group, which is the value that gets
# stored for the corresponding spreadsheet column.

# Text of an alt="..." attribute (stored as the first field of each row).
findLink = re.compile(r'alt="(.*?)"')
# Star class of the rating element, e.g. "star5".  The opening tag must be
# closed with ">" before "</div>"; without it the pattern can never match.
findImgSrc = re.compile(r'<div class="comment-star (star\d+)"></div>')
# Text inside the comment-time <div>.
findRating = re.compile(r'<div class="comment-time">(.*?)</div>')
# Review text inside <p class="comment-con">.  Python has no "/i" regex
# suffix (that is JavaScript literal syntax and would be matched literally),
# so case-insensitivity is requested with re.I; re.S lets "." span newlines
# so multi-line reviews are captured as well.
findJudge = re.compile(r'<p class="comment-con">(.*?)</p>', re.I | re.S)
def main():
    """Scrape the configured product page and write the result to an .xls file."""
    target_url = "https://item.jd.com/100027211987.html#comment"  # page to scrape
    output_file = "小米手环7pro.xls"
    # Step 1: fetch and parse the page, then persist whatever was extracted.
    rows = getData(target_url)
    saveData(rows, output_file)
def getData(baseurl, comment=None):
    """Download a page and extract one row per ``comment-item`` <div>.

    Args:
        baseurl: URL prefix of the page to fetch.
        comment: Unused; kept only so existing callers stay compatible.

    Returns:
        A list of rows.  Each row is a list of four strings, in the order
        produced by ``findLink``, ``findImgSrc``, ``findRating`` and
        ``findJudge``.  A field whose pattern does not match is stored as
        an empty string instead of raising ``IndexError``.

    NOTE(review): if this returns an empty list even though the request
    succeeded, the fetched HTML contains no ``<div class="comment-item">``
    elements — possibly because the page fills them in with JavaScript.
    Inspect the text returned by ``askURL`` to confirm.
    """
    datalist = []  # rows collected from the page
    # NOTE(review): the literal "10" is appended exactly as in the original
    # code; its purpose is not evident from this file — confirm it is wanted.
    url = baseurl + str(10)
    html = askURL(url)  # raw page source ("" when the request failed)

    # Step 2: parse each comment element in turn.
    soup = BeautifulSoup(html, "html.parser")
    patterns = (findLink, findImgSrc, findRating, findJudge)
    for item in soup.find_all('div', class_="comment-item"):
        # Serialise the current element itself so the regexes can scan it.
        item_html = str(item)
        data = [_first_match(pattern, item_html) for pattern in patterns]
        # Without this append the extracted row would be discarded.
        datalist.append(data)
    return datalist


def _first_match(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ""."""
    match = pattern.search(text)
    return match.group(1) if match else ""
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The decoded body, or an empty string when the request fails with
        ``urllib.error.URLError`` (the error code and/or reason is printed).
    """
    # Browser-like User-Agent.  Tokens use the "product/version" form with no
    # spaces around "/", which is how browsers actually send this header.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries both attributes; a plain URLError only "reason".
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# 保存数据到表格
def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook.

    Args:
        datalist: Sequence of rows; the first four items of each row are
            written.  Only the first 10 rows are saved.
        savepath: Destination path of the workbook.
    """
    print("save.......")
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('小米手环7pro', cell_overwrite_ok=True)

    # Header row.
    headers = ("用户名", "评分", "日期", "评价")
    for column, title in enumerate(headers):
        worksheet.write(0, column, title)

    # Data rows start at spreadsheet row 1, directly below the header.
    row_count = min(len(datalist), 10)
    for row_number in range(1, row_count + 1):
        record = datalist[row_number - 1]
        for column in range(4):
            worksheet.write(row_number, column, record[column])

    workbook.save(savepath)
# TODO: save the data to a database (no database writer is defined in the visible code)
if __name__ == "__main__":
    # Script entry point: run the scraper, then report completion.
    main()
    print("爬取完毕!")
该代码可以成功运行，但生成的 Excel 文件中没有爬取到的数据，请问应如何解决？