llovespain
jose@huang
采纳率100%
2021-03-07 01:59

HTTPError已经通过try...except...进行异常处理,仍报HTTPError

爬取某网站搜索结果,爬取结果:分 一级页面为搜索文章标题 和 一级页面各搜索结果对应的二级页面文章内容

处理过程:一级页面成功获取;二级页面有一些不能成功获取的,但是对不能成功获取的二级页面文章,将其二级页面的url放入一个list以备后续研究。通过 try

                             获取二级页面url

                     except HTTPError

                             另一种方法获取二级页面url

                    except Exception

                             失败的url放入list

问题:明明已经声明了在HTTPError的情况的处理流程,但是仍然报错HTTPError

请各路好汉指导,好人一生平安!

import ssl # 防止验证报错
ssl._create_default_https_context = ssl._create_unverified_context
import urllib.request as ur
import urllib.parse as up
import requests
import re,json
import lxml.etree as le
import urllib
from urllib.error import HTTPError
import pymongo

cookies = {
    'JSESSIONID': '849E8C6BD918EA04C6143AA6C8E344FD',
    '__jsluid_s': '55e35824b3517a59c43c1b750043c288',
    'cookie_www': '36802747',
    'Hm_lvt_3b83938a8721dadef0b185225769572a': '1614777887,1614821020,1614873247,1614910666',
    'Hm_lpvt_3b83938a8721dadef0b185225769572a': '1614934564',
}

headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://www.cqggzy.com',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://www.cqggzy.com/xxhz/014001/014001001/zbggjyxx-page.html?keyword=%E6%95%B0%E6%8D%AE',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7',
}

data = '{"token":"","pn":36,"rn":18,"sdt":"","edt":"","wd":" ","inc_wd":"","exc_wd":"","fields":"title","cnum":"001","sort":"{\\"istop\\":0,\\"ordernum\\":0,\\"webdate\\":0,\\"rowid\\":0}","ssort":"title","cl":200,"terminal":"","condition":[{"fieldName":"categorynum","equal":"014001001","notEqual":null,"equalList":null,"notEqualList":null,"isLike":true,"likeType":2},{"fieldName":"titlenew","equal":"\u6570\u636E","notEqual":null,"equalList":null,"notEqualList":null,"isLike":true,"likeType":0}],"time":null,"highlights":"title","statistics":null,"unionCondition":[],"accuracy":"","noParticiple":"0","searchRange":null,"isBusiness":"1"}'.encode('utf-8')

# 把data, cookies, headers, url放入request.post() 以获取一级页面的response
response = requests.post('https://www.cqggzy.com/interface/rest/inteligentSearch/getFullTextData', headers=headers, cookies=cookies, data=data)
print(response.json()) # 打印查看获取内容;根据结果,此处成功获取
title_results = response.json() # 把response.json复制给变量以方便后面爬取二级页面使用

http_exists = True # 设置一个bool, 以方便判断二级页面的url是否成功获取
error_html = [] # 如果二级页面的url两次都没成功获取,则把第二次打开失败的url放进一个list,以便后续研究
# search_data = {}
# client = pymongo.MongoClient()
# db = client.get_database('public_resource')  # db = client.db3
# c = db.get_collection('search_data')
for piece in title_results['result']['records']: # 对一级页面获取的结果,遍历每个结果,提取信息以组建每个结果对应的二级页面的url
    # search_data = {}
    print(piece['title']) # 每次遍历查看一级页面的标题
    try:
        url_date = re.findall('.{10}',piece['pubinwebdate'])[0]
        url_date_revised = url_date.replace('-','')
        print(url_date_revised)
        href = 'https://www.cqggzy.com/xxhz/014001/014001001/'+piece['categorynum']+'/'+url_date_revised+'/'+piece['infoid']+'.html'
        print(href)
        req = ur.Request(
            url = href,
            headers = {
                'User-Agent': 'Mozilla/5.0.html (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.html.2171.95 Safari/537.36 OPR/26.0.html.1656.60',
            }
        )
        content1 = ur.urlopen(req).read() # 打开组建的url
    except urllib.error.HTTPError: # 如果组建的url不能成功打开,换一种方式组建url
        url_date = re.findall('.{10}',piece['infodate'])[0]
        url_date_revised = url_date.replace('-','')
        print(url_date_revised)
        href = 'https://www.cqggzy.com/xxhz/014001/014001001/'+piece['categorynum']+'/'+url_date_revised+'/'+piece['infoid']+'.html'
        print(href)
        req = ur.Request(
            url = href,
            headers = {
                'User-Agent': 'Mozilla/5.0.html (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.html.2171.95 Safari/537.36 OPR/26.0.html.1656.60',
            }
        )
        content1 = ur.urlopen(req).read()
    except Exception: # 如果组建的url还不能成功打开,把此时的url放进一个列表
        error_html.append(piece['title'])
        http_exists = False # 用bool变量表示url获取失败

    if http_exists == True: # url获取成功,才进一步获取二级页面的内容
        contentx = le.HTML(content1)
        content = contentx.xpath("//div[contains(@class,'epoint-article-content')]//text()")
        content = " ".join(content)
        content = content.replace('\t','')
        content = content.replace('\r', '')
        content = content.replace('\n', '')
        content = content.replace('                   ','')
        # c.insert_one(search_data)
        # search_data[piece['title']] = content
    http_exists = True # 每次遍历最后把 url变量设置为True
print(error_html) # 查看失败的urlr的列表
  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享
  • 邀请回答

1条回答

  • funny123 coagenth 1月前

    一般是try...except...finally...异常处理。你将第一个和第二个except合并,用if ... else...试试看。

    点赞 评论 复制链接分享

为你推荐