import requests
from lxml import etree
import re
from selenium import webdriver
import time
import csv
# --- Stage 1: collect product ids -------------------------------------------
# Download the category page, follow every product link found in it and
# gather the product ids those pages expose. The result is `lst4`, a list of
# unique ids in first-seen order.
#
# NOTE(review): this listing had lost its indentation and the statements that
# create/fill `lst1` and `lst2`; the nesting and the if/else split below are
# reconstructed from the variable names and comments - confirm against the
# original script.

# NOTE(review): the start URL is empty in this file; requests rejects a URL
# without a scheme, so it has to be filled in before the script can run.
url = ''
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/ QQBrowser/11.4.5225.400'}
respost = requests.get(url, headers=headers).text
ele = etree.HTML(respost)
ulall = ele.xpath('//ul[@class="children-list clearix"]')

lst1 = []  # ids read from inline <script> blocks (links that carry "https:")
lst2 = []  # ids read from browser-rendered pages (links without a scheme)

# One browser is started on demand and reused for every scheme-less link,
# then always shut down, instead of leaking one Chrome process per link.
drive = None
try:
    for ul in ulall:
        url_all = ul.xpath('./li/a/@href')  # every product URL in this list
        for href in url_all:
            if 'https:' in href:
                # Complete link: fetch it directly and scan the inline
                # scripts, piece by piece, for product ids.
                url1 = href
                respost1 = requests.get(url1, headers=headers).text
                ele1 = etree.HTML(respost1)
                script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
                for aq in script1:
                    for aw in aq.split(','):
                        lst1.extend(re.findall('"product_id":"(.*?)"', aw))
            else:
                # Incomplete link: add the missing scheme and load the page
                # in a real browser before reading its source.
                url2 = 'https:' + href
                if drive is None:
                    drive = webdriver.Chrome()
                drive.get(url2)
                time.sleep(1)  # give the page one second to load
                idall = drive.page_source  # source of the current page
                ida = re.findall('6.64.2.(.*?)&', idall)  # candidate ids
                # Keep only purely numeric captures.
                lst2.extend(qe for qe in ida if qe.isdigit())
finally:
    if drive is not None:
        drive.quit()

lst3 = lst1 + lst2
# De-duplicate while keeping first-seen order so repeated runs process the
# ids in the same sequence.
lst4 = list(dict.fromkeys(lst3))
# --- Stage 2: fetch review statistics and the product name per id -----------
# For every id in `lst4`, request the comment-summary endpoint and the product
# page, and record one entry in each of the five column lists below. Index k
# of every list describes the same product.
#
# NOTE(review): this listing had lost its indentation and the statements that
# create/fill `lst5`..`lst9`; the column order is taken from the
# zip(lst5, lst6, lst7, lst8, lst9) that builds the CSV rows - confirm
# against the original script.
lst5 = []  # product name
lst6 = []  # product id
lst7 = []  # total number of comments
lst8 = []  # number of positive comments
lst9 = []  # satisfaction percentage

# Request headers do not depend on the product, so build them once.
headers3 = {'referer': '',
            'accept': 'application/json, text/plain, */*',
            'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': "Windows",
            'sec-fetch-dest': 'script',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36'}
headers4 = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/ QQBrowser/11.4.5225.400'}

# Field name -> pattern applied to each comma-separated piece of the response.
stat_patterns = {
    'product_id': re.compile('"product_id":"(.*?)"'),      # id
    'comments_total': re.compile('"comments_total":(.*)'),  # total comments
    'comments_good': re.compile('"comments_good":(.*)'),    # positive comments
    'satisfy_per': re.compile('"satisfy_per":"(.*?)"'),     # satisfaction
}
title_pattern = re.compile('<title>【(.*)怎么样,好不好】用户评价-小米商城</title>')

for w2 in lst4:
    product_id = w2
    # NOTE(review): url3 and url4 carry no scheme/host in this file; requests
    # rejects such URLs, so the base addresses have to be restored.
    url3 = f'{product_id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
    respost3 = requests.get(url3, headers=headers3)
    data = respost3.text

    # Keep the first value seen for each field; scanning piece by piece is
    # what stops the greedy "(.*)" patterns at the next comma.
    stats = {}
    for piece in data.split(','):
        for field, pattern in stat_patterns.items():
            if field in stats:
                continue
            match = pattern.search(piece)
            if match:
                stats[field] = match.group(1)
    if len(stats) != len(stat_patterns):
        # Incomplete statistics: skip the product so the columns stay aligned.
        continue

    url4 = f'{product_id}.html'
    respost4 = requests.get(url4, headers=headers4).text
    name = title_pattern.findall(respost4)
    if not name:
        continue

    # Append to all five lists together so zip() pairs the right values.
    lst5.append(name[0])
    lst6.append(stats['product_id'])
    lst7.append(stats['comments_total'])
    lst8.append(stats['comments_good'])
    lst9.append(stats['satisfy_per'])
# --- Stage 3: write the collected columns to a CSV file ---------------------
# Combine the five parallel column lists into one dict per product and write
# them, with a header row, to '小米商城.csv'. zip() stops at the shortest list,
# so only complete rows are written.
fieldnames = ['商品名称', 'id', '总评数', '好评数', '满意度']
data_list = [
    dict(zip(fieldnames, row))
    for row in zip(lst5, lst6, lst7, lst8, lst9)
]

# newline='' is what the csv module expects for files it writes to.
# errors='replace' keeps a character that GBK cannot encode from aborting the
# export; such a character is written as '?'.
with open('小米商城.csv', 'w', encoding='gbk', newline='', errors='replace') as f:
    write = csv.DictWriter(f, fieldnames=fieldnames)
    write.writeheader()
    write.writerows(data_list)
# Non-code trailer kept from the original text: 畅游星辰大海 2022-12-26 21:40
