import requests
from lxml import etree
import re
from selenium import webdriver
import time
import csv
lst1=[]
lst2=[]
lst3=[]
url = 'https://www.mi.com/shop/category/list'
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
respost = requests.get(url,headers=headers).text
ele = etree.HTML(respost)
ulall = ele.xpath('//ul[@class="children-list clearix"]')
for i in ulall:
    url_all = i.xpath('./li/a/@href')  # collect every product URL in this category
    # complete scheme-relative product links
    for href in url_all:
        if 'https:' in href:
            url1 = href
            headers1 = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
            respost1 = requests.get(url1, headers=headers1).text
            ele1 = etree.HTML(respost1)
            script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
            for aq in script1:
                con1 = aq.split(',')
                for aw in con1:
                    ac = re.findall('"product_id":"(.*?)"', aw)
                    if ac:  # save the extracted ids
                        for xc in ac:
                            lst1.append(xc)
                            # print(xc)
        else:
            url2 = 'https:' + href
            drive = webdriver.Chrome()
            drive.maximize_window()
            drive.get(url2)
            time.sleep(1)  # give the page a second to load
            idall = drive.page_source  # current page source
            ida = re.findall('6.64.2.(.*?)&', idall)  # extract the ids on this page
            for qe in ida:
                if qe.isdigit():  # keep only all-digit ids
                    lst2.append(qe)
            # print(lst2)
            drive.quit()
lst3 = lst1 + lst2
lst4 = list(set(lst3))  # deduplicate: the full list of product IDs
# print(lst4)
lst5=[]
lst6=[]
lst7=[]
lst8=[]
lst9=[]
acx = 0
for w2 in lst4:
    goods_id = w2  # avoid shadowing the built-in id()
    url3 = f'https://api2.service.order.mi.com/user_comment/get_summary?show_all_tag=1&goods_id={goods_id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
    headers3 = {'referer': 'https://www.mi.com/',
                'accept': 'application/json, text/plain, */*',
                'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"Windows"',
                'sec-fetch-dest': 'script',
                'sec-fetch-mode': 'no-cors',
                'sec-fetch-site': 'same-site',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
    respost3 = requests.get(url3, headers=headers3)
    data = respost3.text
    con1 = data.split(',')
    for i in con1:
        idp = re.findall('"product_id":"(.*?)"', i)  # product ID
        if idp:
            lst6.append(idp[0])
        mani = re.findall('"comments_total":(.*)', i)  # total number of comments
        if mani:
            lst7.append(mani[0])
        zop = re.findall('"comments_good":(.*)', i)  # number of positive comments
        if zop:
            lst8.append(zop[0])
        hop = re.findall('"satisfy_per":"(.*?)"', i)  # satisfaction rate
        if hop:
            lst9.append(hop[0])
    url4 = f'https://www.mi.com/shop/comment/{goods_id}.html'
    headers4 = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
    respost4 = requests.get(url4, headers=headers4).text
    name = re.findall('<title>【(.*)怎么样,好不好】用户评价-小米商城</title>', respost4)
    if name:
        lst5.append(name[0])  # product name from the page title
data_list = []
for a, b, c, d, e in zip(lst5, lst6, lst7, lst8, lst9):
    x = {}
    x['商品名称'] = a
    x['id'] = b
    x['总评数'] = c
    x['好评数'] = d
    x['满意度'] = e
    data_list.append(x)
with open('小米商城.csv', 'w', encoding='gbk', newline='') as f:
    write = csv.DictWriter(f, fieldnames=['商品名称', 'id', '总评数', '好评数', '满意度'])
    write.writeheader()
    write.writerows(data_list)
The crawler keeps running in a loop and the process never exits.
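Independent of the loop logic, two things commonly keep a scraper like this from ever exiting: requests.get calls with no timeout can block forever on a stalled connection, and a Chrome driver that is never quit (for example, when an exception fires between webdriver.Chrome() and drive.quit()) keeps its process alive. A minimal sketch of both guards, assuming the same endpoints as above; fetch and fetch_rendered are illustrative helper names, and the 10-second timeout is an arbitrary choice:

import requests
from selenium import webdriver

def fetch(url, headers, timeout=10):
    # an explicit timeout turns an indefinite hang into a catchable exception
    try:
        return requests.get(url, headers=headers, timeout=timeout).text
    except requests.RequestException:
        return ''  # skip this URL instead of stalling the whole crawl

def fetch_rendered(url):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()  # quit in finally so no Chrome process outlives an error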
2 answers
畅游星辰大海 2022-12-26 21:40
import requests
from lxml import etree
import re
from selenium import webdriver
import time
import csv

lst1 = []
lst2 = []
lst3 = []
# define a counter for completed crawl passes
count = 0
# loop condition: run at most five passes
while count < 5:
    url = 'https://www.mi.com/shop/category/list'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
    respost = requests.get(url, headers=headers).text
    ele = etree.HTML(respost)
    ulall = ele.xpath('//ul[@class="children-list clearix"]')
    for i in ulall:
        url_all = i.xpath('./li/a/@href')  # collect every product URL
        # complete scheme-relative product links
        for href in url_all:
            if 'https:' in href:
                url1 = href
                headers1 = {
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
                respost1 = requests.get(url1, headers=headers1).text
                ele1 = etree.HTML(respost1)
                script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
                for aq in script1:
                    con1 = aq.split(',')
                    for aw in con1:
                        ac = re.findall('"product_id":"(.*?)"', aw)
                        if ac:  # save the extracted ids
                            for xc in ac:
                                lst1.append(xc)
            else:
                url2 = 'https:' + href
                drive = webdriver.Chrome()
                drive.maximize_window()
                drive.get(url2)
                time.sleep(1)  # give the page a second to load
                idall = drive.page_source  # current page source
                ida = re.findall('6.64.2.(.*?)&', idall)  # extract the ids on this page
                for qe in ida:
                    if qe.isdigit():  # keep only all-digit ids
                        lst2.append(qe)
                drive.quit()
    lst3 = lst1 + lst2
    lst4 = list(set(lst3))  # deduplicate: all product IDs
    lst5 = []
    lst6 = []
    lst7 = []
    lst8 = []
    lst9 = []
    acx = 0
    for w2 in lst4:
        goods_id = w2
        url3 = f'https://api2.service.order.mi.com/user_comment/get_summary?show_all_tag=1&goods_id={goods_id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
        headers3 = {'referer': 'https://www.mi.com/',
                    'accept': 'application/json, text/plain, */*',
                    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="86", "Google Chrome";v="86"',
                    'sec-fetch-site': 'same-origin',
                    'sec-fetch-mode': 'cors',
                    'sec-fetch-dest': 'empty',
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
        respost3 = requests.get(url3, headers=headers3).text
        # print(respost3)
        ele3 = etree.HTML(respost3)
        script3 = ele3.xpath('//script[@type="text/javascript"]/text()')
        for x in script3:
            con3 = x.split(',')
            for z in con3:
                xc3 = re.findall('"content":"(.*?)"', z)
                if xc3:
                    for ax in xc3:
                        lst5.append(ax)
        good = re.findall('"good":"(.*?)"', respost3)
        for g in good:
            lst6.append(g)
        general = re.findall('"general":"(.*?)"', respost3)
        for g in general:
            lst7.append(g)
        poor = re.findall('"poor":"(.*?)"', respost3)
        for p in poor:
            lst8.append(p)
        ac = re.findall('"all":"(.*?)"', respost3)
        for a in ac:
            lst9.append(a)
        acx = acx + 1
        print(f'Crawled record {acx}')
    # save (append mode, so each pass adds its rows)
    with open('mi.csv', 'a', newline='', encoding='utf-8') as f:
        write = csv.writer(f)
        # zip stops at the shortest list, so mismatched lengths cannot raise IndexError
        for row in zip(lst4, lst5, lst6, lst7, lst8, lst9):
            write.writerow(row)
    # clear the lists for the next pass
    lst1.clear()
    lst2.clear()
    lst3.clear()
    lst4.clear()
    lst5.clear()
    lst6.clear()
    lst7.clear()
    lst8.clear()
    lst9.clear()
    acx = 0
    # termination condition: exit after five crawl passes
    count += 1
This answer was accepted by the asker as the best answer.
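A side note on parsing: both versions above split the get_summary response on commas and regex out individual fields, which silently breaks as soon as a comment body itself contains a comma. The response is JSONP (the callback=__jp6 parameter wraps the payload as __jp6({...})), so stripping the padding and handing the rest to the json module is more robust. A sketch under that assumption; the exact key layout inside the payload is not confirmed by the post, so the function returns the parsed dict as-is:

import json
import re
import requests

def comment_summary(goods_id):
    # same endpoint as in the post, with the callback=__jp6 JSONP wrapper
    url = ('https://api2.service.order.mi.com/user_comment/get_summary'
           f'?show_all_tag=1&goods_id={goods_id}&v_pid=17972&support_start=0'
           '&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0'
           '&callback=__jp6')
    text = requests.get(url, headers={'referer': 'https://www.mi.com/'}, timeout=10).text
    # the body looks like __jp6({...}); keep only the JSON between the parentheses
    m = re.search(r'__jp6\((.*)\)\s*$', text, re.S)
    if not m:
        return None
    return json.loads(m.group(1))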