python爬取番茄小说数据库保存为空

为什么不报错还保存不了数据，数据库和json都是空的


from lxml import etree
import requests
import json
import threading
from queue import Queue
import pymysql


class Tomato:
    def __init__(self):
        self.headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36"
        }
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_url_queue(self):
        url_temp = "https://fanqienovel.com/library/all/page_{}?sort=hottes"
        url_list = [url_temp.format(i) for i in range(1, 9)]
        for url in url_list:
            self.url_queue.put(url)

    def get_html_queue(self):
        while True:
            url = self.url_queue.get()
            html_source_page = requests.get(url, headers=self.headers).text
            self.html_queue.put(html_source_page)
            self.url_queue.task_done()

    def parse_html(self):
        while True:
            content_list = []
            html = self.html_queue.get()
            html_str = etree.HTML(html)
            node_list = html_str.xpath("//div[@class='book-item-text']")

            title_num = 0
            for node in node_list:
                title = node.xpath('./div/a/text()')[0]
                url = node.xpath('./div/a/@href')[0]
                author = node.xpath('./div[@class="book-item-desc"][1]/span/text()')[0]
                status = node.xpath('./div[@class="book-item-desc"][2]/span/text()')[0]
                intro = node.xpath('./div[contains(@class,"book-item-abstract")]/text()')[0]
                time = node.xpath("./div[@class='book-item-footer']/span/text()")[0].replace('@', '').strip()

                # 构建JSON格式的字符串
                item = {
                    "小说名称": title,
                    "小说链接": url,
                    "小说作者": author,
                    "状态":status,
                    "小说简介":intro,
                    '发布时间': time,
                }
                content_list.append(item)
                title_num += 1
            self.content_queue.put(content_list)
            self.saveData(content_list)
            self.html_queue.task_done()

    def save(self):
        while True:
            content_list = self.content_queue.get()
            with open("tomato.json", mode="a+", encoding='utf-8') as f:
                f.write(json.dumps(content_list, ensure_ascii=False, indent=2))
            self.content_queue.task_done()

    def saveData(self, content_list):
        db = pymysql.connect(host='localhost', user='root', password='root', database='py_tomato')
        cursor = db.cursor()

        for i in range(len(content_list)):
            item = content_list[i]
            article = item.get("小说名称")
            url = item.get("小说链接")
            author = item.get("小说作者")
            status = item.get("状态")
            intro = item.get("小说简介")
            publish_time = item.get("发布时间")
            sql = "insert into tomato (article,url,author,status,intro,time) values(%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (article, url, author,status,intro, publish_time))  # 传入正确的参数
                db.commit()
                print("数据库插入成功！")
            except Exception as e:
                print("插入数据失败", e)
                db.rollback()
        cursor.close()
        db.close()

    def run(self):
        thread_list = []
        t_url = threading.Thread(target=self.get_url_queue)
        thread_list.append(t_url)
        for page in range(5):
            t_content = threading.Thread(target=self.get_html_queue)
            thread_list.append(t_content)
        for j in range(5):
            t_content = threading.Thread(target=self.parse_html)
            thread_list.append(t_content)
        t_save = threading.Thread(target=self.save)
        thread_list.append(t_save)
        for t in thread_list:
            t.setDaemon(True)
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()
        print("打印完成")


if __name__ == '__main__':
    tomato = Tomato()
    tomato.run()

写回答
好问题 0 提建议
关注问题
分享
邀请回答
编辑收藏删除结题
收藏举报

2条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
专家-赤兔[在线] 优质创作者: 编程框架技术领域 2024-06-18 11:36
关注
引自免费微信小程序：皆我百晓生

从你提供的信息来看，代码在运行时没有报错，但是数据既没有被保存到数据库也没有被写入到JSON文件中。这可能由几个原因造成，我们逐一排查并解决。

首先，确保你的数据库连接信息是正确的，包括主机名、用户名、密码以及数据库名。在你提供的代码中，这些信息看起来是硬编码的，确保'localhost'、'root'、'root'以及'py_tomato'是准确无误的。

接下来，让我们检查一下数据库插入的部分。你使用了pymysql模块来与MySQL数据库交互，这是一个常见的库用于Python和MySQL之间的连接。在你的saveData方法中，你尝试将数据插入数据库，但你提到的数据库和JSON文件都为空，这可能是因为数据插入失败或者在某些地方中断了数据流。

为了验证这一点，我们可以做一些修改：

增加异常处理的详细性：当execute方法抛出异常时，你应该捕获具体的异常类型，并输出更详细的错误信息。这可以帮助定位问题所在。
检查JSON文件的写入模式：你使用了"a+"模式打开文件，这意味着每次写入都会追加到文件末尾。但是，如果你的JSON文件在每次运行程序后都需要包含完整的列表，那么你可能需要在第一次写入前清空文件，或者确保每个写入操作都包含了完整的JSON数组结构。

下面是针对这些问题的修改建议：

对于数据库插入部分，你已经在代码中加入了异常处理，但是可以增强错误信息的输出，例如：

except Exception as e: print(f"插入数据失败，错误详情: {e}") db.rollback()

对于JSON文件写入，你需要确保文件的结构是正确的。在第一次写入时，你可能需要先写入一个空的数组，然后在后续的写入操作中添加元素。或者，你可以在每次写入前检查文件是否为空，如果是，则写入完整的数组结构。下面是一个示例：

def save(self): while True: content_list = self.content_queue.get() if not os.path.exists("tomato.json"): with open("tomato.json", mode="w", encoding='utf-8') as f: json.dump([], f, ensure_ascii=False, indent=2) with open("tomato.json", mode="r+", encoding='utf-8') as f: file_data = json.load(f) file_data.extend(content_list) f.seek(0) json.dump(file_data, f, ensure_ascii=False, indent=2) self.content_queue.task_done()

以上代码首先检查文件是否存在，如果不存在则创建一个空的JSON数组。之后，它读取现有的文件数据，将新的内容列表追加到现有数据上，然后重新写入整个文件。这样可以确保JSON文件总是有效的JSON数组。

最后，检查你的线程同步和任务队列是否正确工作。确保所有线程都正确地完成了它们的任务，特别是self.url_queue.task_done()、self.html_queue.task_done()和self.content_queue.task_done()的调用，这些调用确保了主线程等待所有子线程完成任务。

如果以上步骤都不能解决问题，你可能需要更详细的日志记录或使用调试工具逐步执行代码，以便更精确地定位问题所在。
解决 1
无用
评论打赏
分享
举报编辑记录

评论

按下Enter换行，Ctrl+Enter发表内容

报告相同问题？

关注问题

问题事件

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
创建了问题 6月18日

python爬取番茄小说数据库保存为空

2条回答 默认 最新

问题事件

2条回答默认最新