月岛圆子 2023-02-15 18:12 采纳率: 40%
浏览 57
已结题

可以运行出来数据,但是爬出来只有一行

爬虫可以运行出来数据,但是结束运行后文件只有一行数据,其他数据没有传到文件中

img

#!coding=utf-8
import requests
import re
import random
import time
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  ###禁止提醒SSL警告


###格式化时间戳
def timestamp_to_date(time_stamp, format_string="%Y-%m-%d %H:%M:%S"):
    time_array = time.localtime(int(time_stamp))
    str_date = time.strftime(format_string, time_array)
    return str_date


def de_duplication(lst):  ##去重不改变原数据顺序
    de_du = list(set(lst))
    de_du.sort(key=lst.index)
    return de_du


class xm(object):

    ###  获取分类
    def get_categoryList(self):
        url = 'https://youpin.mi.com/app/shopv3/pipe'
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '130',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'youpin.mi.com',
            'Origin': 'https://youpin.mi.com',
            'Referer': 'https://youpin.mi.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36'
        }
        data = {
            'data': '{"result": {"model": "Homepage", "action": "GetGroup2ClassInfo", "parameters": {}}}',
        }
        req = requests.post(url=url, headers=headers, data=data, verify=False).json()
        groups = req['result']['result']['data']['groups']
        df = pd.DataFrame(columns=('一级分类ID', '一级分类', '二级分类ID', '二级分类'))
        x = 0
        for i in groups:
            for j in i:
                class1_name = j['class']['name']  ##一级分类
                ucid1 = j['class']['ucid']  ##一级分类ID
                for k in j['sub_class']:
                    class2_name = k['name']  ##二级分类
                    ucid2 = k['ucid']  # 二级分类ID
                    df.loc[x] = [ucid1, class1_name, ucid2, class2_name]
                    x = x + 1
        # df.to_csv('list.csv',index=False, encoding="GB18030")
        return df

    ##获取商品数据
    def get_items_ID(self):
        s = requests.session()
        df = self.get_categoryList()
        cateList = df['一级分类ID'].values.tolist()
        catename = df['一级分类'].values.tolist()
        cateList = de_duplication(cateList)
        catename = de_duplication(catename)
        df_item = pd.DataFrame(columns=(
        '一级分类ID', '一级分类', '二级分类', '二级分类ID', '商品ID', '商品名称', '商品简介', '商品图片', '上架时间',
        '更新时间', '零售价', '商品URL', '评分', '好评率', '评论数', '评论观点'))
        x = 0
        url = 'https://youpin.mi.com/app/shopv3/pipe'
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '145',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cookie': 'youpindistinct_id=1677376dd950-07a9ccd760a322-514b2f1f; UM_distinctid=1677376de97317-013da9709fcd2b-514b2f1f-1fa400-1677376de98e12; Hm_lvt_f60d40663f1e63b337d026672aca065b=1543830429; mjclient=PC; youpin_sessionid=16777d642ba-0b57d1ad5863d-1ee2; CNZZDATA1267968936=1240670798-1543827326-%7C1543899301; Hm_lpvt_f60d40663f1e63b337d026672aca065b=1543903790',
            'Host': 'youpin.mi.com',
            'Origin': 'https://youpin.mi.com',
            'Referer': 'https://youpin.mi.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36'
        }
        s.headers.update(headers)
        for i in range(len(cateList)):  ##一级分类目录商品
            data = {
                'data': '{"uClassList": {"model": "Homepage", "action": "BuildHome", "parameters": {"id": "' + str(
                    cateList[i]) + '"}}}'
            }
            req = s.post(url=url, data=data, verify=False).json()
            itemdata = req['result']['uClassList']['data']
            for j in itemdata:
                if 'content' in j:
                    content_name = j['content']['name']  ##二级分类
                    ucid = j['content']['ucid']  # 二级分类ID
                    for k in j['data']:
                        try:
                            gid = k['gid']  ##商品ID
                            name = k['name']  ##商品名称
                            summary = k['summary']  ##商品简介
                            pic_url = k['pic_url']  ##商品图片
                            ctime = timestamp_to_date(k['ctime'])  ##上架时间
                            utime = timestamp_to_date(k['utime'])  ##更新时间
                            price_min = int(k['price_min']) / 100  ##价格
                            itemurl = k['url']  ##商品链接
                            commentdata = self.get_comment(gid)
                            avg_score = commentdata[0]
                            positive_rate = commentdata[1]
                            count = commentdata[2]
                            comment = commentdata[3]
                            df_item.loc[x] = [cateList[i], catename[i], ucid, content_name, gid, name, summary, pic_url,
                                              ctime, utime, price_min, itemurl, avg_score, positive_rate, count,
                                              comment]
                            print(cateList[i], catename[i], ucid, content_name, gid, name, summary, pic_url, ctime,
                                  utime, price_min, itemurl, avg_score, positive_rate, count, comment)
                            x = x + 1
                        except:
                            print(j)

        df_item.to_csv('df_item.csv', index=False, encoding="GB18030")
        return df_item

    ##获取单个商品ID评论数及评论观点数据
    def get_comment(self, id):
        url = 'https://youpin.mi.com/app/shopv3/pipe'
        UserAgentlist = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.104',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36 Maxthon/5.2.5.4000',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',

        ]
        ran = random.randint(0, len(UserAgentlist) - 1)
        UserAgen = UserAgentlist[ran]
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '364',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'youpin.mi.com',
            'Origin': 'https://youpin.mi.com',
            'Referer': 'https://youpin.mi.com/detail?gid={}'.format(str(id)),
            'User-Agent': UserAgen
        }
        postdata = {
            'data': '{"overView":{"model":"Product","action":"CommentIndexV2","parameters":{"gid":' + str(
                id) + '}},"list":{"model":"Product","action":"CommentListOnly","parameters":{"index_type":0,"gid":' + str(
                id) + ',"pindex":1,"psize":10,"tag_name":null}}}'
        }
        req = requests.post(url=url, data=postdata, headers=headers, verify=False).text
        js = json.loads(req)
        data = js['result']['overView']['data']
        comment = ''
        avg_score = '0'
        positive_rate = '0'
        count = '0'
        if data != []:
            avg_score = data['avg_score']  ##评分
            positive_rate = data['positive_rate']  ####好评率
            count = data['tags'][0]['count']  ##评论数
            for i in data['tags']:
                comment = str(comment) + str(i['name']) + '(' + str(i['count']) + ')'  ##评论观点
        commentdata = [avg_score, positive_rate, count, comment]
        print(commentdata)
        return commentdata


if __name__ == '__main__':
    xm = xm()
    xm.get_items_ID()

img

  • 写回答

1条回答 默认 最新

  • 阳光宅男xxb 2023-02-15 22:21
    关注

    可能写入文件的地方有问题,等我给你调试下:
    错误原因是你的getComment方法没有请求到数据,提示'Model Product not exists action CommentIndexV2' ,应该是你的参数中的action的问题。应该是接口参数变了导致的。望采纳!!!

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论 编辑记录

报告相同问题?

问题事件

  • 系统已结题 2月24日
  • 已采纳回答 2月16日
  • 修改了问题 2月15日
  • 创建了问题 2月15日

悬赏问题

  • ¥15 用stata实现聚类的代码
  • ¥15 请问paddlehub能支持移动端开发吗?在Android studio上该如何部署?
  • ¥170 如图所示配置eNSP
  • ¥20 docker里部署springboot项目,访问不到扬声器
  • ¥15 netty整合springboot之后自动重连失效
  • ¥15 悬赏!微信开发者工具报错,求帮改
  • ¥20 wireshark抓不到vlan
  • ¥20 关于#stm32#的问题:需要指导自动酸碱滴定仪的原理图程序代码及仿真
  • ¥20 设计一款异域新娘的视频相亲软件需要哪些技术支持
  • ¥15 stata安慰剂检验作图但是真实值不出现在图上