爬虫失败 - 京东搜索结果

导出的结果是一张只有表头没有数据的空表。

#信息采集:名称、价格、评论数、商家名称等
import requests
from lxml import etree
from pandas import DataFrame
import pandas as pd

jdInfoAll=DataFrame()
for i in range(1,4):
    url="https://search.jd.com/Search?keyword=bosch&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&suggest=1.his.0.0&ev=exbrand_%E5%8D%9A%E4%B8%96%EF%BC%88BOSCH%EF%BC%89%5E&page="+str(i)
    res=requests.get(url)
    res.encoding='utf-8'
    root=etree.HTML(res.text)
    name=root.xpath('//*[@id="J_goodsList"]/ul/li[@class="gl-item"]/div/div[@class="p-name p-name-type-2"]/a/em/text()[2]')
    for i in range(0,len(name)):
        name[i]=re.sub('\s','',name[i])
        print(i)

    #sku
    sku=root.xpath('//*[@id="J_goodsList"]/ul/li/@data-sku')
    print(sku)

    #价格
    price=[]
    comment=[]
    for i in range(0,len(sku)):
        thissku=sku[i]
        priceurl="https://p.3.cn/prices/mgets?callback=jQuery6775278&skuids=J_"+str(thissku)
        pricedata=requests.get(priceurl)
        pricepat='"p":"(.*?)"}'
        thisprice=re.compile(pricepat).findall(pricedata.text)   
        price=price+thisprice

        commenturl="https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds="+str(thissku)
        commentdata=requests.get(commenturl)
        commentpat='"CommentCount":(.*?),"'
        thiscomment=re.compile(commentpat).findall(commentdata.text)
        comment=comment+thiscomment

    #商家名称
    shopname=root.xpath('//*[@id="J_goodsList"]/ul/li[@class="gl-item"]/div/div[@class="p-shop"]/span/a/@title')
    print(shopname)

    jdInfo=DataFrame([name,price,shopname,comment]).T
    jdInfo.columns=['产品名称','价格','商家名称','评论数']
    jdInfoAll=pd.concat([jdInfoAll,jdInfo])
jdInfoAll.to_excel('jdInfoAll.xls')

2个回答

由于我的版本是Python3.6,从lxml导入etree的方法有所改变,所以我下面的代码和你有些不同,你可以根据自己的环境进行修改,相信你也看的懂

import requests
# from lxml import etree
from lxml import html
from pandas import DataFrame
import pandas as pd
import re
jdInfoAll=DataFrame()
etree = html.etree
for i in range(1,4):
    url="https://search.jd.com/Search?keyword=bosch&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&suggest=1.his.0.0&ev=exbrand_%E5%8D%9A%E4%B8%96%EF%BC%88BOSCH%EF%BC%89%5E&page="+str(i)
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res=requests.get(url, headers=headers, allow_redirects=False)
    res.encoding='utf-8'
    root=etree.HTML(res.text)
    name=root.xpath('//*[@id="J_goodsList"]/ul/li[@class="gl-item"]/div/div[@class="p-name p-name-type-2"]/a/em/text()[2]')
    for i in range(0,len(name)):
        name[i]=re.sub('\s','',name[i])
        print(i)

    #sku
    sku=root.xpath('//*[@id="J_goodsList"]/ul/li/@data-sku')
    print(sku)

    #价格
    price=[]
    comment=[]
    for i in range(0, len(sku)):
        thissku=sku[i]
        priceurl="https://p.3.cn/prices/mgets?callback=jQuery6775278&skuids=J_"+str(thissku)
        pricedata=requests.get(priceurl, headers=headers, allow_redirects=False)
        pricepat='"p":"(.*?)"}'
        thisprice=re.compile(pricepat).findall(pricedata.text)
        price=price+thisprice

        commenturl="https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds="+str(thissku)
        commentdata=requests.get(commenturl, headers=headers, allow_redirects=False)
        commentpat='"CommentCount":(.*?),"'
        thiscomment=re.compile(commentpat).findall(commentdata.text)
        comment=comment+thiscomment

    #商家名称
    shopname=root.xpath('//*[@id="J_goodsList"]/ul/li[@class="gl-item"]/div/div[@class="p-shop"]/span/a/@title')
    print(shopname)

    jdInfo=DataFrame([name,price,shopname,comment]).T
    jdInfo.columns=['产品名称','价格','商家名称','评论数']
    jdInfoAll=pd.concat([jdInfoAll,jdInfo])
jdInfoAll.to_excel('jdInfoAll.xls')

运行结果展示

图片说明

图片说明

如果可以希望采纳

qq_42840701
空空12315 验证了,是可行的,谢谢您嘞~
10 个月之前 回复

你可以打印一下你的res.text里面的内容是window.location.href=&#39;<a href="https://passport.jd.com/uc/login">https://passport.jd.com/uc/login</a>&#39;,把你导向登录界面,所以你能获取到数据都有鬼了。。

Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问