有些时候会报“no such file”，终端有些时候还会提示“Some characters could not be decoded”。

对了，当时为了实验验收，把 urlretrieve 可能会报的 no such file 给 except 掉了；大佬们可以在 urlretrieve 这行底下的那个 except 右边加上 (error.URLError, TimeoutError)。


import http.client
import json
import os
import re
import string
import time
from queue import Queue, Empty
from threading import Thread, Lock
from urllib import request, parse, robotparser, error

from bs4 import BeautifulSoup

# Lock taken by the crawler code around the shared counters and result list.
lock = Lock()

# Crawl entry point, and the hosts whose links may be followed.
start_url = 'http://today.hit.edu.cn/'
legal_urls = ['today.hit.edu.cn']

# HTTP headers attached to page requests.
header = {}
header['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
header['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

url_queue = Queue()  # URLs waiting to be processed
craw_res = []        # one result dict per recorded page
all_files = set()    # resource files already downloaded during this run

# Running counters; they change as pages are crawled.
url_num = 0
url_with_file = 0

# Fixed targets required by the assignment.
num = 1300
file_url_num = 100

# Politeness settings: robots.txt rule set and delay between requests.
rps = []
craw_delay = 0


def get_urls(k=1.2):
    """Collect page URLs by following links breadth-first from ``start_url``.

    Pages are fetched in discovery order. Every ``<a href>`` that is an
    absolute ``http://`` link to a host listed in ``legal_urls`` and has not
    been seen before is appended to the frontier. Collection stops once the
    frontier holds at least ``k * num`` URLs or every collected page has been
    visited. A progress line is printed before each fetch.

    NOTE: robots.txt is not consulted here; only the host filter applies.

    Args:
        k: Over-collection factor; the target size is ``k * num``.

    Returns:
        The list of collected URLs in discovery order, starting with
        ``start_url``. A list is returned even when the target is not reached.
    """
    target = k * num
    url2visit = [start_url]
    seen = {start_url}  # O(1) duplicate check; the list keeps the order

    # The list is extended while it is being iterated, so it doubles as the
    # breadth-first work queue.
    for url in url2visit:
        if len(url2visit) >= target:
            break
        print('抓取进度：%.2f%%' % (float(len(url2visit)) / target * 100))
        try:
            with request.urlopen(request.Request(url, headers=header)) as f:
                charset = f.headers.get_content_charset()
                html = f.read()
        except (OSError, http.client.HTTPException):
            # URLError and timeouts are OSError subclasses; malformed
            # responses or URLs raise http.client.HTTPException.
            continue

        # Prefer the charset declared by the server; fall back to gb18030.
        bs = BeautifulSoup(html, 'html.parser',
                           from_encoding=charset or 'gb18030')
        for tag in bs.find_all('a'):
            href = tag.get('href')
            if not href or not href.startswith('http://'):
                continue
            if href.split('/')[2] not in legal_urls:
                continue
            # Percent-encode non-ASCII (e.g. Chinese) characters.
            href = parse.quote(href, safe=string.printable)
            if href not in seen:
                seen.add(href)
                url2visit.append(href)
    return url2visit


# Attachment links embedded in a page (compiled once, dots escaped).
_ATTACHMENT_URL_PATTERN = re.compile(
    r"http://today\.hit\.edu\.cn/sites/today1\.prod1\.dpweb1\.hit\.edu\.cn/[%\w/]*\.[\w]+"
)


def craw_url(url, img_dir='./信息检索实验1抓取/附件/'):
    """Fetch one page, download its attachments and record the result.

    The page title, the non-empty ``<p>`` texts and the names of the
    successfully downloaded attachments are appended to ``craw_res`` as one
    dict. ``url_num`` and ``url_with_file`` are updated under ``lock``.
    Nothing is recorded when the page has no ``<title>`` or when both crawl
    targets (``num`` and ``file_url_num``) have already been reached.

    Args:
        url: Address of the page to process.
        img_dir: Directory prefix for downloaded attachments; it is created
            on demand. File paths are built as ``img_dir + file_name``.

    Returns:
        ``False`` if the page could not be fetched, otherwise ``None``.
    """
    global url_num, url_with_file, craw_res
    try:
        with request.urlopen(request.Request(url, headers=header)) as f:
            charset = f.headers.get_content_charset()
            html = f.read()
    except (OSError, http.client.HTTPException):
        # URLError and timeouts are OSError subclasses; malformed responses
        # or URLs raise http.client.HTTPException.
        return False

    # Prefer the charset declared by the server; fall back to gb18030.
    bs = BeautifulSoup(html, 'html.parser', from_encoding=charset or 'gb18030')

    title_tag = bs.find('title')
    if not title_tag:
        return None
    title = title_tag.getText()
    para = [
        text
        for text in (tag.text.replace('\n', '').replace(' ', '')
                     for tag in bs.select('p'))
        if text
    ]

    files = set()  # attachment names downloaded for this page
    for text_url in _ATTACHMENT_URL_PATTERN.findall(str(bs)):
        file_name = text_url.split('/')[-1]
        if file_name in files:
            continue
        try:
            # A missing target directory makes urlretrieve fail with
            # "No such file or directory", so create it first.
            if img_dir:
                os.makedirs(img_dir, exist_ok=True)
            request.urlretrieve(text_url, img_dir + file_name)
        except (OSError, http.client.HTTPException):
            # Best effort: skip attachments that cannot be downloaded.
            continue
        files.add(file_name)

    # `with` releases the lock even if something inside raises.
    with lock:
        if url_num >= num and url_with_file >= file_url_num:
            return None
        url_num += 1
        if files:
            url_with_file += 1
        craw_res.append({'url': url, 'title': title, 'paragraphs': ' '.join(para), 'file_name': list(files)})
        print('处理进度：%.2f%%' % (float(url_num) / num * 100))
    return None


def output(craw_file='./信息检索实验1抓取/列表.json') -> list:
    """Append the crawl results to ``craw_file``, one JSON object per line.

    The parent directory is created if it does not exist. Every record ends
    with a newline, so records appended by a later call start on their own
    line.

    Args:
        craw_file: Path of the output file; it is opened in append mode.

    Returns:
        The list of JSON strings that were written, one per ``craw_res`` item.
    """
    res = [json.dumps(item, ensure_ascii=False) for item in craw_res]
    parent = os.path.dirname(craw_file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(craw_file, 'a', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in res)
    return res


class MyThread(Thread):
    """Worker thread that takes URLs from ``url_queue`` and crawls them.

    The worker stops when both crawl targets (``num`` pages and
    ``file_url_num`` pages with attachments) are reached or the queue is
    empty.
    """

    def run(self) -> None:
        """Process queued URLs until the targets are met or the queue is empty."""
        while True:
            # Hold the lock only to read the counters, so it is always
            # released before leaving the loop.
            with lock:
                finished = url_num >= num and url_with_file >= file_url_num
            if finished:
                break
            try:
                # Queue is thread-safe; no extra locking is needed.
                url = url_queue.get_nowait()
            except Empty:
                break
            try:
                craw_url(url)
            except Exception as exc:
                # One bad page must not stop this worker: report and go on.
                print('处理失败，已跳过：%s (%r)' % (url, exc))


def main(thread_num=15):
    """Run the whole crawl: collect URLs, process them, export the results.

    URLs returned by ``get_urls`` are put on ``url_queue``, ``thread_num``
    ``MyThread`` workers are started and joined, and the results are written
    with ``output``. Progress and timing are printed along the way.

    Args:
        thread_num: Number of worker threads to start.
    """
    # NOTE: loading robots.txt is currently disabled. To enable it, fill the
    # global `rps` with one `robotparser.RobotFileParser(<host>/robots.txt)`
    # per entry of `legal_urls` and call `.read()` on each. The leading
    # '获取完毕.' in the message below belongs to that disabled step.

    # Step 1: collect page URLs.
    print('获取完毕.\n' + '*' * 100 + '\n开始获取网页地址...')
    time0 = time.time()
    # `or ()` keeps this loop safe if get_urls() yields nothing.
    for url in get_urls() or ():
        url_queue.put(url)
    time1 = time.time()
    print('获取完毕，用时：' + str(time1 - time0) + '秒\n' + '*' * 100 + '\n开始处理网页内容...')

    # Step 2: process the pages with a pool of worker threads.
    threads = [MyThread(name='Thread' + str(idx)) for idx in range(thread_num)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    time2 = time.time()
    print('处理完成，用时：' + str(time2 - time1), '秒\n' + '*' * 100 + '\n开始导出结果...')

    # Step 3: export the results.
    output()
    print('导出完成。\n总用时%d秒。' % (time.time() - time0))


# Script entry point: start the crawl only when this file is run directly.
if __name__ == "__main__":
    main()

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除
收藏举报

报告相同问题？

关注问题

python错误：ValueError: No JSON object could be decoded json python
2019-03-14 10:35

回答 1 已采纳代码里有输入错误，而且你要获取的url地址，取不到东西
请问这是为什么FileNotFoundError: [Errno 2] No such file or directory: 'img_path' python
2022-07-25 23:16

回答 4 已采纳这里应该是引用的变量img_path把r’‘去掉
Python代码在自带的idle中可以运行，但是在vscode中无法运行。 python
2022-01-21 19:20

回答 2 已采纳是不是你没有给vs code配置python啊？vs code不是直接就能用的
python解释语言
2021-12-19 23:57

python无人驾驶医学芯片的博客 A class method can be called either on the class (such as C.f()) or on an instance (such as C().f()). The instance is ignored except for its class. If a class method is called for a derived class, ...
Python到Laravel上传mp4文件 json laravel php python
2018-02-15 00:33

回答 1 已采纳 I'd look to this stackoverflow post on how to send a file using request post: file_ = {'file': ('
解析json_decoded php数组的最佳方法 php
2018-11-13 21:14

回答 1 已采纳 You can convert your array in a query url and parse it ... like this <?php $inputArray = arra
提交到SQS时将消息自动编码为base64的规则 java python
2015-10-08 15:00

回答 1 已采纳 You probably want to encode your messages as something because SQS does not accept every possible
pandas io tools（使用python处理数据时候经常用到）读csv，TXT
2016-10-18 11:04

mishidemudong的博客 CSV & Text files The two workhorse functions for reading text files (a.k.a. flat files) are read_csv() and read_table(). They both use the same parsing code to intelligently convert tabular data...
根据数组的索引显示不同的HTML元素 html php python
2018-09-09 02:25

回答 1 已采纳 Your code is close. The idea is to formulate the 1d array in terms of rows and columns somehow. My
如何在Perl中迭代多维json_decoded对象来创建一个新的json对象？ json perl php
2013-07-18 03:48

回答 1 已采纳 Well, you aren't really going wrong; it is just the JSON format that is being as stupid as possibl
我怎样才能让PHP忽略扩展，就像PHPUnit在Windows命令提示符中的做法一样？ php
2016-08-24 23:02

回答 1 已采纳 This has nothing to do with Windows' command prompt, and nothing to do with ignoring extensions.
【Python学习笔记】内置函数（官网）
2021-12-09 18:11

nnecdz的博客 The Python interpreter has a number of functions and types built into it that are always available. They are listed here in alphabetical order. Built-in Functions A abs() aiter...
使用PHP编辑会出现语法错误 php
2017-04-19 03:50

回答 1 已采纳 Problem is here $line = '$top' Here you are using single quotes which defines the value of $line a
Python标准库(非常经典的各种模块介绍)
2018-08-06 08:43

qq_41804164的博客 Python标准库(非常经典的各种模块介绍) 2017年08月05日 16:43:52 阅读数：10141 06/07 20:10:08 编译 0.1. 关于本书 0.2. 代码约定 0.3. 关于例子 0.4. 如何联系我们核心模块 1.1. 介绍 1.2. _ _builtin...
《Deep Learning With Python second edition》英文版读书笔记：第十一章DL for text: NLP、Transformer、Seq2Seq
2022-03-18 20:16

阿正的梦工坊的博客 python深度学习第二版读书笔记
Java文件– java.nio.file.Files类
2020-07-13 13:04

cunchi4221的博客 Java Files class was introduced in Java 1.7 and is a part of java.nio.file package. Java Files类是Java 1.7中引入的，是java.nio.file包的一部分。 Java文件类 (Java Files Class) Java Files class ...
pandas io tools（使用python处理数据时候经常用到）
2013-05-31 17:25

overstack的博客 CSV & Text files The two workhorse functions for reading text files (a.k.a. flat files) are read_csv() and read_table(). They both use the same parsing code to intelligently convert tabular data...
PYTHON2.7 中文手册翻译：unicode HOWTO（未完成）
2018-03-01 23:01

sskywatcher的博客本HOWTO手册讨论了Python 2.x’s 对Unicode的支持情况，并且解释了在liyongunicode工作室可能会遇到的一系列问题.对于 Python 3 version，请参考 <https://docs.python.org/3/howto/unicode.html>。 ...
python标准库模块
2018-10-22 14:12

ROES的博客 "Since the functions in the C runtime library are not part of the Win32 API, we believe the number of applications that will be affected by this bug to be very limited." - Microsoft, January 1999 ...
Python 3.x 的新特征
2018-06-28 12:25

junchengberry的博客 What’s New In Python 3.0¶ Author:Guido van Rossu(https://docs.python.org/3/whatsnew/3.0.html) This article explains the new features in Python 3.0, compared to 2.6. Python 3.0, also known as ...
没有解决我的问题, 去提问

问题事件

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
系统已结题 5月20日
关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
修改了问题 5月12日
关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
创建了问题 5月12日

悬赏问题

¥50 comfyui下连接animatediff节点生成视频质量非常差的原因
¥20 有关区间dp的问题求解
¥15 多电路系统共用电源的串扰问题
¥15 slam rangenet++配置
¥15 有没有研究水声通信方面的帮我改俩matlab代码
¥15 ubuntu子系统密码忘记
¥15 信号傅里叶变换在matlab上遇到的小问题请求帮助
¥15 保护模式-系统加载-段寄存器
¥15 电脑桌面设定一个区域禁止鼠标操作
¥15 求NPF226060磁芯的详细资料

有些时候会报“no such file”，终端有些时候还会提示“Some characters could not be decoded”。

0条回答 默认 最新

问题事件

悬赏问题

0条回答默认最新