公司的任务,爬取一个网站上的数据,用的是python。通过requests抓包,得到了基本信息,包含个数、最大页数、每页行数等。前几页一切正常,但每次到最后一页的时候,requests返回的最大页数就和在网页上开启开发者模式得到不一样了,并且没有任何规律
# Endpoint that serves the paginated "company achievement" list as JSON.
url = 'https://glxy.mot.gov.cn/company/getCompanyAchieveList.do'

def get_the_max(comid, infopage):
    """Request page 1 for one company and read the server-reported page count.

    comid:    company id string (example: 'fe620882312c467fb935f5c5a8ebaa35')
    infopage: value forwarded as the 'sourceInfo' query parameter
    """
    query = {
        "companyId": comid,
        "type": 11,
        "page": 1,
        "rows": 15,
        "sourceInfo": infopage,
    }
    # `headers` comes from the surrounding script (not shown in this excerpt).
    res = requests.post(url, headers=headers, params=query)
    pagemax = res.json()["pageObj"]["maxPage"]
获得结果pagemax = 4
前三页一切正常,但是翻页到第4页也就是最后一页时,从res得到的pagemax就变成了6,但用chrome开启开发者模式去查找pagemax还是4
我尝试在请求最后一页的res时将pagemax也就是page的请求修改为最初的4,但是最后得到的数据依旧有错误
想正常爬取所有页的数据
要获取的字段内容是["rows"]中的全部内容
附上全部代码
因为数据量很大,这里只以一个comid为例
并且希望可以帮忙优化一下写入excel的方法,提高速度,公司要的很着急,我一边看视频一边写程序,很垃圾,勿喷
可以加钱!!
import openpyxl
import requests
import pandas as pd
# POST endpoint returning the paginated company-achievement list as JSON.
url = 'https://glxy.mot.gov.cn/company/getCompanyAchieveList.do'
def get_the_max(infopage):
    """Fetch page 1 for the hard-coded company and return (maxPage, countSize).

    infopage: value forwarded as the 'sourceInfo' query parameter.
    Returns the server-reported total page count and total row count.
    """
    params = {
        "companyId": 'fe620882312c467fb935f5c5a8ebaa35',
        "type": 11,
        "page": 1,
        "rows": 15,
        "sourceInfo": infopage,
    }
    # BUG FIX: the header key was misspelled 'user-agnet', so the server
    # received python-requests' default User-Agent instead of this one —
    # a plausible trigger for the site's anti-scraping behaviour (the
    # randomized maxPage on the last page).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    res = requests.post(url, headers=headers, params=params)
    payload = res.json()  # parse the response body once, not once per field
    page_obj = payload["pageObj"]
    pagemax = page_obj["maxPage"]
    countsize = page_obj["countSize"]
    print(params.get("companyId"), infopage, pagemax, countsize)
    return pagemax, countsize
'''爬取信息'''
def req_the_info(page, infopage, startrow, pagemax, row=15):
    """Fetch one result page and append its rows to the shared Excel writer.

    page:     1-based page number to request
    infopage: 'sourceInfo' query parameter
    startrow: first free row in the output sheet
    pagemax:  page count captured up front by get_the_max()
    row:      rows expected on this page (15, or the remainder on the last page)
    Returns the next free startrow.

    If the server reports a maxPage different from the one captured up
    front (the site randomizes it when the last page is requested), the
    response is untrusted and write_the_wrong() re-requests the last page.
    """
    params = {
        "companyId": 'fe620882312c467fb935f5c5a8ebaa35',
        "type": 11,
        "page": page,
        "rows": row,
        "sourceInfo": infopage,
    }
    # BUG FIX: header key was misspelled 'user-agnet' (UA never sent as intended).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    res = requests.post(url, headers=headers, params=params)
    # Parse the body once; the original re-parsed res.json() twice per row.
    payload = res.json()
    qq = payload["pageObj"]["maxPage"]
    print(infopage, page, qq, row)
    if qq != pagemax:
        # The reported page count is loop-invariant for one response, so the
        # original per-row check always fired on the first row with an empty
        # list — check it once up front instead.
        return write_the_wrong(lst=[], pagemax=pagemax, row=row,
                               infopage=infopage, startrow=startrow)
    # Slice defensively: the server may return fewer rows than requested.
    page_rows = payload["rows"][:row]
    pf = pd.DataFrame(page_rows)
    # NOTE: the `encoding=` kwarg was removed from to_excel in pandas 1.5+.
    pf.to_excel(file_path, startrow=startrow, index=False, header=False)
    return startrow + len(page_rows)
def write_the_wrong(lst, pagemax, row, infopage, startrow):
    """Re-request the last page using the originally captured pagemax.

    Called when a response reports a maxPage that contradicts the value
    captured up front; requests page `pagemax` explicitly and writes its
    rows (appended to any rows already in `lst`) to the shared writer.
    Returns the next free startrow.
    """
    params = {
        "companyId": 'fe620882312c467fb935f5c5a8ebaa35',
        "type": 11,
        "page": pagemax,
        "rows": row,
        "sourceInfo": infopage,
    }
    # BUG FIX: header key was misspelled 'user-agnet' (UA never sent as intended).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    res = requests.post(url, headers=headers, params=params)
    # Parse once (original re-parsed res.json() inside the loop) and avoid
    # the original print's reliance on the *global* `page` variable.
    payload = res.json()
    page_rows = payload["rows"][:row]  # defensive: server may send fewer rows
    print(infopage, pagemax, len(page_rows), row)
    lst.extend(page_rows)
    pff = pd.DataFrame(lst)
    # NOTE: the `encoding=` kwarg was removed from to_excel in pandas 1.5+.
    pff.to_excel(file_path, startrow=startrow, index=False, header=False)
    return startrow + len(lst)
'''main'''
# One workbook is shared by every request; each page is appended below the
# previous one via the running start_row offset.
file_path = pd.ExcelWriter('/Users/finnleen/Desktop/content.xlsx', engine='openpyxl')
start_row = 0
# NOTE(review): list_the_id is defined elsewhere (not shown here); it is
# presumed to expose the iterable of company ids as `.ret` — confirm.
for comid in list_the_id.ret:
    for infopage in range(1, 4):
        # BUG FIX: get_the_max() accepts only `infopage` (the company id is
        # hard-coded inside it), so passing comid= raised a TypeError.
        pagemax, countsize = get_the_max(infopage=infopage)
        if countsize == 0:
            continue  # nothing to fetch for this sourceInfo
        n = countsize % 15  # rows on the final, partial page (0 if it is full)
        for page in range(1, pagemax + 1):
            if page == pagemax and n != 0:
                # Last page is partial: request exactly the remaining rows.
                start_row = req_the_info(page=page, infopage=infopage, row=n,
                                         startrow=start_row, pagemax=pagemax)
            else:
                start_row = req_the_info(page=page, infopage=infopage,
                                         startrow=start_row, pagemax=pagemax)
# Flush and close the workbook once all pages are written.
file_path.close()
再附一些comid,以供测试
5dadcd6f90d5461fa2d2c37dcefe6426
affc37528e3242339f24ae92a0dc164c
cd0d6fe58f384b169b5ebf059c94e69e