「已注销」 2022-12-26 21:12 采纳率: 100%
浏览 66
已结题

爬虫一直循环运行,不结束进程


import requests
from lxml import etree
import re
from selenium import webdriver
import time
import csv

# Crawl mi.com category pages, collect product ids (static pages via requests,
# JS-rendered pages via selenium), then query the comment-summary API for each
# id and save name/id/comment statistics to a CSV file.

# One shared User-Agent header; the original re-declared the same dict before
# every request.
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}

lst1 = []  # product ids found in inline <script> blocks of static pages
lst2 = []  # product ids recovered from selenium-rendered pages

url = 'https://www.mi.com/shop/category/list'
respost = requests.get(url, headers=HEADERS).text
ele = etree.HTML(respost)
ulall = ele.xpath('//ul[@class="children-list clearix"]')
for ul_node in ulall:
    url_all = ul_node.xpath('./li/a/@href')  # all product links in this category
    # Distinct loop name: the original reused `i` for both loops, shadowing
    # the outer iteration variable.
    for href in url_all:
        if 'https:' in href:
            # Fully-qualified link: fetch statically and scan inline scripts.
            respost1 = requests.get(href, headers=HEADERS).text
            ele1 = etree.HTML(respost1)
            script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
            for aq in script1:
                for aw in aq.split(','):
                    # extend(): re.findall already returns a list of matches.
                    lst1.extend(re.findall('"product_id":"(.*?)"', aw))
        else:
            # Protocol-relative link: render it in a browser, then pull the id
            # out of the page source.
            full_url = 'https:' + href
            drive = webdriver.Chrome()
            try:
                drive.maximize_window()
                drive.get(full_url)
                time.sleep(1)  # allow the page to load
                idall = drive.page_source  # rendered page HTML
                # NOTE(review): the dots in this pattern are unescaped, so they
                # match any character — kept byte-identical to the original;
                # confirm the intended literal before escaping.
                ida = re.findall('6.64.2.(.*?)&', idall)
                # Keep only purely numeric candidates.
                lst2.extend(qe for qe in ida if qe.isdigit())
            finally:
                # Always release the browser, even if the page errors out;
                # the original leaked a Chrome process on any exception.
                drive.quit()

# Merge both strategies and de-duplicate the collected ids.
lst4 = list(set(lst1 + lst2))

lst5 = []  # product names (from the comment page <title>)
lst6 = []  # product ids echoed by the comment API
lst7 = []  # total comment counts
lst8 = []  # good comment counts
lst9 = []  # satisfaction percentages

# Renamed from `id`: the original shadowed the builtin.
for goods_id in lst4:
    url3 = f'https://api2.service.order.mi.com/user_comment/get_summary?show_all_tag=1&goods_id={goods_id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
    headers3 = {'referer': 'https://www.mi.com/',
                'accept': 'application/json, text/plain, */*',
                'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': "Windows",
                'sec-fetch-dest': 'script',
                'sec-fetch-mode': 'no-cors',
                'sec-fetch-site': 'same-site',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
    data = requests.get(url3, headers=headers3).text
    for part in data.split(','):
        # extend(), not append(): the original stored whole findall lists,
        # so every CSV cell came out looking like "['123']".
        lst6.extend(re.findall('"product_id":"(.*?)"', part))   # ID
        lst7.extend(re.findall('"comments_total":(.*)', part))  # total comments
        lst8.extend(re.findall('"comments_good":(.*)', part))   # good comments
        lst9.extend(re.findall('"satisfy_per":"(.*?)"', part))  # satisfaction
    url4 = f'https://www.mi.com/shop/comment/{goods_id}.html'
    respost4 = requests.get(url4, headers=HEADERS).text
    lst5.extend(re.findall('<title>【(.*)怎么样,好不好】用户评价-小米商城</title>', respost4))

# Build the rows once, after all ids are processed. The original rebuilt
# data_list and re-opened the CSV with mode 'w' INSIDE the loop, so the file
# was clobbered on every iteration and earlier rows were duplicated.
data_list = [
    {'商品名称': a, 'id': b, '总评数': c, '好评数': d, '满意度': e}
    for a, b, c, d, e in zip(lst5, lst6, lst7, lst8, lst9)
]
with open('小米商城.csv', 'w', encoding='gbk', newline='') as f:
    write = csv.DictWriter(f, fieldnames=['商品名称', 'id', '总评数', '好评数', '满意度'])
    write.writeheader()
    write.writerows(data_list)
  • 写回答

2条回答 默认 最新

  • 畅游星辰大海 2022-12-26 21:40
    关注
    import requests
    from lxml import etree
    import re
    from selenium import webdriver
    import time
    import csv

    # Accumulators shared across rounds; cleared at the end of every round.
    lst1 = []
    lst2 = []
    lst3 = []

    # Round counter. The original never incremented it and instead tried to
    # break on `ulall.index(i)` — but `i` there is an href string, never a
    # member of ulall, so .index() raised ValueError and the loop could run
    # forever: exactly the reported symptom.
    count = 0

    # Run at most five scraping rounds.
    while count < 5:
        url = 'https://www.mi.com/shop/category/list'
        headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
        respost = requests.get(url, headers=headers).text
        ele = etree.HTML(respost)
        ulall = ele.xpath('//ul[@class="children-list clearix"]')
        for ul_node in ulall:
            url_all = ul_node.xpath('./li/a/@href')  # all product links
            # Distinct name: the original shadowed the outer loop's `i`.
            for href in url_all:
                if 'https:' in href:
                    # Static page: fetch and scan inline scripts for ids.
                    respost1 = requests.get(href, headers=headers).text
                    ele1 = etree.HTML(respost1)
                    script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
                    for aq in script1:
                        for aw in aq.split(','):
                            lst1.extend(re.findall('"product_id":"(.*?)"', aw))
                else:
                    # Protocol-relative link: render with selenium.
                    drive = webdriver.Chrome()
                    try:
                        drive.maximize_window()
                        drive.get('https:' + href)
                        time.sleep(1)  # allow the page to load
                        idall = drive.page_source  # rendered page HTML
                        # NOTE(review): dots here are unescaped and match any
                        # character — kept as in the original.
                        ida = re.findall('6.64.2.(.*?)&', idall)
                        lst2.extend(qe for qe in ida if qe.isdigit())  # numeric ids only
                    finally:
                        drive.quit()  # release the browser even on error
        # (Removed the stray `l` statement that raised NameError here.)
        lst3 = lst1 + lst2
        lst4 = list(set(lst3))  # de-duplicated ids for this round
        lst5 = []  # comment contents
        lst6 = []  # good counts
        lst7 = []  # general counts
        lst8 = []  # poor counts
        lst9 = []  # totals
        acx = 0  # per-round progress counter
        for goods_id in lst4:  # renamed: the original shadowed builtin `id`
            url3 = f'https://api2.service.order.mi.com/user_comment/get_summary?show_all_tag=1&goods_id={goods_id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
            # The original headers3 literal was truncated mid-string (a
            # SyntaxError: the user-agent value and closing brace were cut
            # off); completed here with the UA used elsewhere in the script.
            headers3 = {'referer': 'https://www.mi.com/',
                        'accept': 'application/json, text/plain, */*',
                        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="86", "Google Chrome";v="86"',
                        'sec-fetch-site': 'same-origin',
                        'sec-fetch-mode': 'cors',
                        'sec-fetch-dest': 'empty',
                        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
            respost3 = requests.get(url3, headers=headers3).text
            ele3 = etree.HTML(respost3)
            script3 = ele3.xpath('//script[@type="text/javascript"]/text()')
            for block in script3:
                for piece in block.split(','):
                    lst5.extend(re.findall('"content":"(.*?)"', piece))
            lst6.extend(re.findall('"good":"(.*?)"', respost3))
            lst7.extend(re.findall('"general":"(.*?)"', respost3))
            lst8.extend(re.findall('"poor":"(.*?)"', respost3))
            lst9.extend(re.findall('"all":"(.*?)"', respost3))
            acx = acx + 1
            print(f'爬取第{acx}条数据')
        # Save. zip() stops at the shortest list, so ragged lists no longer
        # risk the IndexError that lst5[i]/lst6[i]... could raise before.
        with open('mi.csv', 'a', newline='', encoding='utf-8') as f:
            write = csv.writer(f)
            for row in zip(lst4, lst5, lst6, lst7, lst8, lst9):
                write.writerow(list(row))
        # Reset accumulators for the next round.
        lst1.clear()
        lst2.clear()
        lst3.clear()
        lst4.clear()
        lst5.clear()
        lst6.clear()
        lst7.clear()
        lst8.clear()
        lst9.clear()
        acx = 0
        # Actually advance the counter so the while-condition can terminate.
        count += 1
    
    
    
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(1条)

报告相同问题?

问题事件

  • 系统已结题 1月3日
  • 已采纳回答 12月26日
  • 创建了问题 12月26日

悬赏问题

  • ¥15 关于#java#的问题:找一份能快速看完mooc视频的代码
  • ¥15 这种微信登录授权 谁可以做啊
  • ¥15 请问我该如何添加自己的数据去运行蚁群算法代码
  • ¥20 用HslCommunication 连接欧姆龙 plc有时会连接失败。报异常为“未知错误”
  • ¥15 网络设备配置与管理这个该怎么弄
  • ¥20 机器学习能否像多层线性模型一样处理嵌套数据
  • ¥20 西门子S7-Graph,S7-300,梯形图
  • ¥50 用易语言http 访问不了网页
  • ¥50 safari浏览器fetch提交数据后数据丢失问题
  • ¥15 matlab不知道怎么改,求解答!!