请有识之士帮忙看下,好人一生平安:
1 . 爬该网站:网址 https://www.cqggzy.com/xxhz/014001/014001001/zbggjyxx-page.html?keyword=%E6%95%B0%E6%8D%AE,,解密后网址:https://www.cqggzy.com/xxhz/014001/014001001/zbggjyxx-page.html?keyword=数据
2.目标:搜索出‘数据’相关的信息,爬出的搜索信息部分被JS封装,
3.代码如下:
import ssl # 防止验证报错
ssl._create_default_https_context = ssl._create_unverified_context
import urllib.request as ur
import urllib.parse as up
import requests
import re,json
import lxml.etree as le
# import xpath_tool
import pymongo
kw = '数据'
data = {
'keyword': kw
}
data_url = up.urlencode(data) # encode
url = 'https://www.cqggzy.com/xxhz/014001/014001001/zbggjyxx-page.html?'+data_url
headers = {'User-Agent':'Mozilla/5.0.html (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.html.1271.64 Safari/537.11'}
req = ur.Request(
url = url,
headers = headers
)
content = ur.urlopen(req).read().decode('utf-8', 'ignore')
with open('数据.html','w',encoding='utf-8') as f:
f.write(content)
新增用request.post方法 尝试失败:
url_js = 'https://www.cqggzy.com/interface/rest/inteligentSearch/getFullTextData'
headers_js = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7",
"Connection": "keep-alive",
"Content-Length": "615",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Cookie": "JSESSIONID=139064B5D7DE210BE4B58893B41C2C7F; __jsluid_s=55e35824b3517a59c43c1b750043c288; cookie_www=19398923; Hm_lvt_3b83938a8721dadef0b185225769572a=1614754494,1614777887; Hm_lpvt_3b83938a8721dadef0b185225769572a=1614781676",
"Host": "www.cqggzy.com",
"Origin": "https://www.cqggzy.com",
"Referer": "https://www.cqggzy.com/xxhz/014001/014001001/zbggjyxx-page.html?keyword=%E6%95%B0%E6%8D%AE",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
"X-Requested-With": "XMLHttpRequest"
}
data_js = {
"token": "",
"pn": 0,
"rn": 18,
"sdt": "",
"edt": "",
"wd": " ",
"inc_wd": "",
"exc_wd": "",
"fields": "title",
"cnum": "001",
"sort": {"istop":0,"ordernum":0,"webdate":0,"rowid":0},
"ssort": "title",
"cl": 200,
"terminal": "",
"condition": [
{"fieldName": "categorynum", "equal": "014001001", "notEqual": None, "equalList": None, "notEqualList": None,
"isLike": True, "likeType": 2},
{"fieldName": "titlenew", "equal": "数据", "notEqual": None, "equalList": None, "notEqualList": None,
"isLike": True,
"likeType": 0}],
"time": None,
"highlights": "title",
"statistics": None,
"unionCondition": [],
"accuracy": "",
"noParticiple": "0",
"searchRange": None,
"isBusiness": "1"
}
res=requests.post(url=url_js,headers=headers_js,data=data_js)
print('res:', res)
print('res',res.text)
F12->network->XHR->F5,获得的headers作为headers和form data 作为data
打印结果一个返回500,一个空白
res: <Response [500]>
res: