爬取当前页与第二页的数据相同
import requests
import re
import os
#获取商品页面
def requestUrl(url):
try:
# 设置userAgent
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36","Cookie":"xx"}
response = requests.get(url, headers=headers,timeout=5)
# 查看返回状态
response.raise_for_status()
response.encoding = response.apparent_encoding
htmlText = response.text
return htmlText
except:
print("error")
#查找关键词
def getTheText(text,ulist):
if(len(ulist)!=0):
ulist=[]
jiaGeRepx=re.compile(r'\"view_price\"\:\"[\d\.]*\"')
nameRepx=re.compile(r'\"raw_title\"\:\".*?\"')
jiaGeList = jiaGeRepx.findall(text)
nameList = nameRepx.findall(text)
for i in range(len(nameList)):
try:
name = eval(nameList[i].split(":")[1])
jiaGe = eval(jiaGeList[i].split(":")[1])
ulist.append([name,jiaGe])
except:
name = nameList[i].split(":")[1]
jiaGe = jiaGeList[i].split(":")[1]
ulist.append([name, jiaGe])
count=0
##将结果输出到文件
def formatText(ulist,shenDu):
jieGuo=""
biaoDaShi = "{0:^9}\t{1:^50}\t{2:^9}\r\n"
if (shenDu==0):
xuHao = "序号"
shangPin = "商品"
jiaGe="价格"
jieGuo = biaoDaShi.format(xuHao,shangPin,jiaGe)
for tag in ulist :
global count
count+=1
xuHao = str(count)
shangPin = tag[0]
jiaGe = str(tag[1])
jieGuo += biaoDaShi.format(xuHao,shangPin,jiaGe)
dstUrl = 'D://商品.txt'
with open(dstUrl, 'a') as f:
f.write(jieGuo)
f.close
def main():
uinfo = []
shangPin = "书包"
shenDu=2
for i in range(shenDu):
htmlText = requestUrl("https://s.某宝.com/search?q="+shangPin+"&s="+str(44*i))
getTheText(htmlText,uinfo)
formatText(uinfo,i)
if __name__ == '__main__':
main()
url中的&s=44代表第二页
https://s.某宝.com/search?q=书包=3&ntoffset=3&p4ppushleft=1%2C48&s=44
结果
文件重复写入page1