爬取某网站搜索结果,爬取结果:分 一级页面为搜索文章标题 和 一级页面各搜索结果对应的二级页面文章内容。
处理过程:一级页面成功获取;二级页面有一些不能成功获取的,但是对不能成功获取的二级页面文章,将其二级页面的url放入一个list以备后续研究。通过 try
获取二级页面url
except HTTPError
另一种方法获取二级页面url
except Exception
失败的url放入list
问题:明明已经用 except HTTPError 声明了处理流程,但程序仍然因 HTTPError 崩溃。原因在于:第二次 urlopen 写在 except HTTPError 块内部,而在 except 处理块中抛出的异常不会被同一个 try 的其他 except 子句(如后面的 except Exception)捕获,需要把重试逻辑再包一层 try/except。
请各路好汉指导,好人一生平安!
import ssl  # needed to work around certificate-verification errors on this site
# WARNING: this globally disables HTTPS certificate verification for the whole
# process. Acceptable for a one-off scrape; never do this in production code.
ssl._create_default_https_context = ssl._create_unverified_context
import urllib.request as ur
import urllib.parse as up
import requests
import re,json
import lxml.etree as le
import urllib
from urllib.error import HTTPError
import pymongo
# Session cookies copied from a logged-in browser session on www.cqggzy.com.
# NOTE(review): these values expire; refresh them from the browser's DevTools
# (Network tab -> request headers) if the POST below starts failing.
cookies = {
    'JSESSIONID': '849E8C6BD918EA04C6143AA6C8E344FD',
    '__jsluid_s': '55e35824b3517a59c43c1b750043c288',
    'cookie_www': '36802747',
    'Hm_lvt_3b83938a8721dadef0b185225769572a': '1614777887,1614821020,1614873247,1614910666',
    'Hm_lpvt_3b83938a8721dadef0b185225769572a': '1614934564',
}
# Request headers captured from the browser for the search-API call.
# The Referer/Origin/User-Agent combination mimics a real browser request;
# the API presumably rejects requests without them — TODO confirm which are required.
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://www.cqggzy.com',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://www.cqggzy.com/xxhz/014001/014001001/zbggjyxx-page.html?keyword=%E6%95%B0%E6%8D%AE',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7',
}
# Raw JSON payload for the full-text-search API, kept as the exact byte string
# the browser sent. Notable fields: pn=36 (result offset), rn=18 (page size),
# "\u6570\u636E" is the unicode-escaped search keyword ("数据" / "data").
# Do NOT reformat this string — the server receives it verbatim.
data = '{"token":"","pn":36,"rn":18,"sdt":"","edt":"","wd":" ","inc_wd":"","exc_wd":"","fields":"title","cnum":"001","sort":"{\\"istop\\":0,\\"ordernum\\":0,\\"webdate\\":0,\\"rowid\\":0}","ssort":"title","cl":200,"terminal":"","condition":[{"fieldName":"categorynum","equal":"014001001","notEqual":null,"equalList":null,"notEqualList":null,"isLike":true,"likeType":2},{"fieldName":"titlenew","equal":"\u6570\u636E","notEqual":null,"equalList":null,"notEqualList":null,"isLike":true,"likeType":0}],"time":null,"highlights":"title","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"0","searchRange":null,"isBusiness":"1"}'.encode('utf-8')
# POST data + cookies + headers to the search endpoint to fetch the
# first-level result page (a JSON document of search hits).
response = requests.post('https://www.cqggzy.com/interface/rest/inteligentSearch/getFullTextData', headers=headers, cookies=cookies, data=data)
# FIX: the original called response.json() twice (once to print, once to keep),
# decoding the same body twice. Parse once and reuse.
title_results = response.json()  # reused below to build the detail-page URLs
print(title_results)             # sanity check: confirms the first page was fetched
http_exists = True               # flag: did the current detail-page fetch succeed?
error_html = []                  # detail pages that failed both attempts, kept for later study
# search_data = {}
# client = pymongo.MongoClient()
# db = client.get_database('public_resource') # db = client.db3
# c = db.get_collection('search_data')
# For each first-level search hit, build the URL of its detail page and
# scrape the article text.
for piece in title_results['result']['records']:
    print(piece['title'])  # show which record is being processed
    # FIX: reset the flag at the TOP of every iteration. The original only
    # reset it in the success branch, so after one failure every later
    # record was silently skipped.
    http_exists = True
    content1 = None
    href = None
    try:
        # First attempt: build the detail URL from 'pubinwebdate' (first 10
        # chars = YYYY-MM-DD, dashes stripped to YYYYMMDD).
        url_date = re.findall('.{10}', piece['pubinwebdate'])[0]
        url_date_revised = url_date.replace('-', '')
        print(url_date_revised)
        href = 'https://www.cqggzy.com/xxhz/014001/014001001/' + piece['categorynum'] + '/' + url_date_revised + '/' + piece['infoid'] + '.html'
        print(href)
        req = ur.Request(
            url=href,
            headers={
                'User-Agent': 'Mozilla/5.0.html (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.html.2171.95 Safari/537.36 OPR/26.0.html.1656.60',
            },
        )
        content1 = ur.urlopen(req).read()
    except HTTPError:
        # Second attempt: rebuild the URL from 'infodate' instead.
        # BUG FIX (this was the reported problem): the retry MUST sit in its
        # own try/except. An exception raised *inside* an except handler is
        # NOT caught by the sibling `except Exception` of the outer try —
        # it propagates — which is why the script still crashed with
        # HTTPError despite the handler below.
        try:
            url_date = re.findall('.{10}', piece['infodate'])[0]
            url_date_revised = url_date.replace('-', '')
            print(url_date_revised)
            href = 'https://www.cqggzy.com/xxhz/014001/014001001/' + piece['categorynum'] + '/' + url_date_revised + '/' + piece['infoid'] + '.html'
            print(href)
            req = ur.Request(
                url=href,
                headers={
                    'User-Agent': 'Mozilla/5.0.html (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.html.2171.95 Safari/537.36 OPR/26.0.html.1656.60',
                },
            )
            content1 = ur.urlopen(req).read()
        except Exception:
            # FIX: record the failed URL (per the stated intent) rather than
            # the title; fall back to the title if no URL was built.
            error_html.append(href if href is not None else piece['title'])
            http_exists = False
    except Exception:
        # Building/fetching the first URL failed with a non-HTTP error
        # (e.g. a missing field) — no retry makes sense, record and move on.
        error_html.append(href if href is not None else piece['title'])
        http_exists = False
    if http_exists:
        # Parse the detail page and collapse the article text to one line.
        contentx = le.HTML(content1)
        content = contentx.xpath("//div[contains(@class,'epoint-article-content')]//text()")
        content = " ".join(content)
        content = content.replace('\t', '')
        content = content.replace('\r', '')
        content = content.replace('\n', '')
        content = content.replace(' ', '')
        # c.insert_one(search_data)
        # search_data[piece['title']] = content
print(error_html)  # URLs of detail pages that could not be fetched