python爬取某宝产生的问题 5C

爬取当前页与第二页的数据相同

import requests
import re
import os

#获取商品页面
def requestUrl(url):
 try:
   # 设置userAgent
   headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36","Cookie":"xx"}
   response = requests.get(url, headers=headers,timeout=5)
   # 查看返回状态
   response.raise_for_status()
   response.encoding = response.apparent_encoding
   htmlText = response.text
   return htmlText
 except:
   print("error")
#查找关键词
def getTheText(text,ulist):
  if(len(ulist)!=0):
     ulist=[]
  jiaGeRepx=re.compile(r'\"view_price\"\:\"[\d\.]*\"')
  nameRepx=re.compile(r'\"raw_title\"\:\".*?\"')
  jiaGeList = jiaGeRepx.findall(text)
  nameList = nameRepx.findall(text)

  for i in range(len(nameList)):
      try:
          name = eval(nameList[i].split(":")[1])
          jiaGe = eval(jiaGeList[i].split(":")[1])
          ulist.append([name,jiaGe])
      except:
          name = nameList[i].split(":")[1]
          jiaGe = jiaGeList[i].split(":")[1]
          ulist.append([name, jiaGe])
count=0
##将结果输出到文件
def formatText(ulist,shenDu):
   jieGuo=""
   biaoDaShi = "{0:^9}\t{1:^50}\t{2:^9}\r\n"
   if (shenDu==0):
       xuHao = "序号"
       shangPin = "商品"
       jiaGe="价格"
       jieGuo = biaoDaShi.format(xuHao,shangPin,jiaGe)
   for tag in ulist :
       global count
       count+=1
       xuHao = str(count)
       shangPin = tag[0]
       jiaGe = str(tag[1])
       jieGuo += biaoDaShi.format(xuHao,shangPin,jiaGe)
   dstUrl = 'D://商品.txt'
   with open(dstUrl, 'a') as f:
    f.write(jieGuo)
    f.close


def main():
   uinfo = []
   shangPin = "书包"
   shenDu=2
   for i in range(shenDu):
        htmlText = requestUrl("https://s.某宝.com/search?q="+shangPin+"&s="+str(44*i))
        getTheText(htmlText,uinfo)
        formatText(uinfo,i)



if __name__ == '__main__':
    main()

url中的&s=44代表第二页

https://s.某宝.com/search?q=书包=3&ntoffset=3&p4ppushleft=1%2C48&s=44

结果
文件重复写入page1

2个回答

不要想着爬某宝了 ,他们的反爬相当的牛比

即使程序开发完了,爬几下就会弹出复杂的验证,就爬不下去了

tmlText = requestUrl("https://s.某宝.com/search?q="+shangPin+"&s="+str(44*i))
你这个s=0 s=44
for i in range(shenDu): 修改为
for i in range(44):
tmlText = requestUrl("https://s.某宝.com/search?q="+shangPin+"&s="+str(i))
看看

Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问