qq_42840701
空空12315
采纳率100%
2019-03-15 10:56

爬虫失败 - 京东搜索结果

5
已采纳

导出的结果是一张只有表头没有数据的空表。

# Information collected: product name, price, comment count, shop name, etc.
# Flat script: walks search-result pages 1-3, gathers four parallel lists per
# page, turns each page into a DataFrame and writes everything to one Excel file.
import requests
from lxml import etree
from pandas import DataFrame
import pandas as pd

# Accumulator for the rows of all pages.
jdInfoAll=DataFrame()
for i in range(1,4):
    # The page number is appended as the final query parameter.
    url="https://search.jd.com/Search?keyword=bosch&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&suggest=1.his.0.0&ev=exbrand_%E5%8D%9A%E4%B8%96%EF%BC%88BOSCH%EF%BC%89%5E&page="+str(i)
    # NOTE(review): no request headers are sent. If the site answers with
    # something other than the result list (e.g. a login redirect), the XPath
    # queries below return empty lists -- print res.text to confirm.
    res=requests.get(url)
    res.encoding='utf-8'
    root=etree.HTML(res.text)
    # Product name: second text node inside each item's <em> element.
    name=root.xpath('//*[@id="J_goodsList"]/ul/li[@class="gl-item"]/div/div[@class="p-name p-name-type-2"]/a/em/text()[2]')
    # Strip all whitespace from every name in place.
    # NOTE(review): this loop reuses the outer loop variable name `i`.
    for i in range(0,len(name)):
        # NOTE(review): `re` is used here but is not among the imports above.
        name[i]=re.sub('\s','',name[i])
        print(i)

    #sku
    sku=root.xpath('//*[@id="J_goodsList"]/ul/li/@data-sku')
    print(sku)

    # Price and comment count: one extra request each per SKU.
    price=[]
    comment=[]
    for i in range(0,len(sku)):
        thissku=sku[i]
        priceurl="https://p.3.cn/prices/mgets?callback=jQuery6775278&skuids=J_"+str(thissku)
        pricedata=requests.get(priceurl)
        # Capture the value of the "p" field from the response text.
        pricepat='"p":"(.*?)"}'
        thisprice=re.compile(pricepat).findall(pricedata.text)   
        price=price+thisprice

        commenturl="https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds="+str(thissku)
        commentdata=requests.get(commenturl)
        # Capture the value of the "CommentCount" field from the response text.
        commentpat='"CommentCount":(.*?),"'
        thiscomment=re.compile(commentpat).findall(commentdata.text)
        comment=comment+thiscomment

    # Shop name
    shopname=root.xpath('//*[@id="J_goodsList"]/ul/li[@class="gl-item"]/div/div[@class="p-shop"]/span/a/@title')
    print(shopname)

    # The four lists are the rows of the frame; transposing turns them into
    # the four columns named below.
    jdInfo=DataFrame([name,price,shopname,comment]).T
    jdInfo.columns=['产品名称','价格','商家名称','评论数']
    jdInfoAll=pd.concat([jdInfoAll,jdInfo])
jdInfoAll.to_excel('jdInfoAll.xls')
  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享
  • 邀请回答

2条回答

  • u013887652 人间再无张居正 2年前

    由于我的版本是Python3.6,从lxml导入etree的方法有所改变,所以我下面的代码和你的有些不同,你可以根据自己的环境进行修改,相信你也看得懂。主要改动有三处:补上了 import re;请求时加上了浏览器的 User-Agent 请求头;并设置了 allow_redirects=False。

    # Scrape pages 1-3 of the JD search results for Bosch products and export
    # product name, price, shop name and comment count to an Excel file.
    import re

    import requests
    # from lxml import etree  (kept for reference; etree is taken from lxml.html below)
    from lxml import html
    from pandas import DataFrame
    import pandas as pd

    etree = html.etree

    SEARCH_URL = "https://search.jd.com/Search?keyword=bosch&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&suggest=1.his.0.0&ev=exbrand_%E5%8D%9A%E4%B8%96%EF%BC%88BOSCH%EF%BC%89%5E&page="
    PRICE_URL = "https://p.3.cn/prices/mgets?callback=jQuery6775278&skuids=J_"
    COMMENT_URL = "https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds="

    # The same browser-like User-Agent is sent with every request.
    HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # requests waits indefinitely unless a timeout is given; bound every call
    # so a single stalled connection cannot hang the whole run.
    REQUEST_TIMEOUT = 10  # seconds

    # Patterns are compiled once instead of once per SKU; raw strings avoid
    # the invalid-escape warning that a plain '\s' literal triggers.
    WHITESPACE_RE = re.compile(r'\s')
    PRICE_RE = re.compile(r'"p":"(.*?)"}')
    COMMENT_RE = re.compile(r'"CommentCount":(.*?),"')


    def fetch(url):
        """Return the response for *url* using the shared headers and timeout.

        Redirects are not followed, matching the original request settings.
        """
        return requests.get(url, headers=HEADERS, allow_redirects=False,
                            timeout=REQUEST_TIMEOUT)


    page_frames = []
    for page in range(1, 4):
        res = fetch(SEARCH_URL + str(page))
        res.encoding = 'utf-8'
        root = etree.HTML(res.text)

        # Product name: second text node of each item's <em>, whitespace removed.
        raw_names = root.xpath('//*[@id="J_goodsList"]/ul/li[@class="gl-item"]/div/div[@class="p-name p-name-type-2"]/a/em/text()[2]')
        name = [WHITESPACE_RE.sub('', text) for text in raw_names]

        # sku
        sku = root.xpath('//*[@id="J_goodsList"]/ul/li/@data-sku')
        print(sku)

        # Price and comment count need one extra request each per SKU.
        price = []
        comment = []
        for thissku in sku:
            pricedata = fetch(PRICE_URL + str(thissku))
            price.extend(PRICE_RE.findall(pricedata.text))

            commentdata = fetch(COMMENT_URL + str(thissku))
            comment.extend(COMMENT_RE.findall(commentdata.text))

        # Shop name
        shopname = root.xpath('//*[@id="J_goodsList"]/ul/li[@class="gl-item"]/div/div[@class="p-shop"]/span/a/@title')
        print(shopname)

        # The four lists are the rows of the frame; transposing makes them the
        # four columns. Lists of unequal length are padded with missing values.
        jdInfo = DataFrame([name, price, shopname, comment]).T
        jdInfo.columns = ['产品名称','价格','商家名称','评论数']
        page_frames.append(jdInfo)

    # Concatenate once at the end instead of growing a frame page by page.
    jdInfoAll = pd.concat(page_frames)
    # NOTE(review): recent pandas releases may no longer ship a writer for the
    # legacy '.xls' format; if to_excel rejects this name, use '.xlsx' -- confirm
    # against the installed pandas version.
    jdInfoAll.to_excel('jdInfoAll.xls')
    
    

    运行结果展示

    图片说明

    图片说明

    如果可以希望采纳

    点赞 评论 复制链接分享
  • qq_35081747 元气皮皮 2年前

    你可以打印一下你的 res.text,里面的内容是 window.location.href='https://passport.jd.com/uc/login',也就是把你导向了登录界面,所以你能获取到数据才有鬼了。

    点赞 评论 复制链接分享