The goal is to scrape fanqienovel.com (番茄小说网) for each novel's title, author, link, status, synopsis, and publish date. The database connection succeeds, yet no data is ever written to the database, and the saved JSON file is empty as well: it contains nothing but square brackets. The whole program runs without reporting any error. The code is as follows:
from lxml import etree
import requests
import json
import threading
from queue import Queue
import time
import pymysql
class Tomato:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36"
        }
        self.url_queue = Queue()      # listing-page URLs to fetch
        self.html_queue = Queue()     # raw HTML of fetched pages
        self.content_queue = Queue()  # parsed records, one list per page
    def get_url_queue(self):
        # Enqueue the eight listing-page URLs.
        url_temp = "https://fanqienovel.com/library/all/page_{}?sort=hottes"
        url_list = [url_temp.format(i) for i in range(1, 9)]
        for url in url_list:
            self.url_queue.put(url)
    def get_html_queue(self):
        # Download each page and hand the HTML to the parser threads.
        while True:
            url = self.url_queue.get()
            html_source_page = requests.get(url, headers=self.headers).text
            self.html_queue.put(html_source_page)
            self.url_queue.task_done()
    def parse_html(self):
        # Extract one record per book from each page's HTML.
        while True:
            content_list = []
            html = self.html_queue.get()
            html_str = etree.HTML(html)
            node_list = html_str.xpath("//div[@class='book-item-text']")
            for node in node_list:
                title = node.xpath('./div/a/text()')[0]
                url = node.xpath('./div/a/@href')[0]
                author = node.xpath('./div[@class="book-item-desc"][1]/span/text()')[0]
                status = node.xpath('./div[@class="book-item-desc"][2]/span/text()')[0]
                intro = node.xpath('./div[contains(@class,"book-item-abstract")]/text()')[0]
                # Named pub_time so it does not shadow the imported time module.
                pub_time = node.xpath("./div[@class='book-item-footer']/span/text()")[0].replace('@', '').strip()
                # Build the record as a dict; save() serializes it to JSON.
                item = {
                    "小说名称": title,
                    "小说链接": url,
                    "小说作者": author,
                    "状态": status,
                    "小说简介": intro,
                    "发布时间": pub_time,
                }
                content_list.append(item)
            self.content_queue.put(content_list)
            self.saveData(content_list)
            self.html_queue.task_done()
    def save(self):
        # Append each page's records to the JSON file.
        while True:
            content_list = self.content_queue.get()
            with open("tomato.json", mode="a+", encoding="utf-8") as f:
                f.write(json.dumps(content_list, ensure_ascii=False, indent=2))
            self.content_queue.task_done()
    def saveData(self, content_list):
        # Insert each record into MySQL.
        db = pymysql.connect(host='localhost', user='root', password='root', database='py_tomato')
        cursor = db.cursor()
        for item in content_list:
            article = item.get("小说名称")
            url = item.get("小说链接")
            author = item.get("小说作者")
            status = item.get("状态")
            intro = item.get("小说简介")
            publish_time = item.get("发布时间")
            sql = "insert into heima (article,url,author,status,intro,time) values(%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (article, url, author, status, intro, publish_time))
                db.commit()
            except Exception as e:
                print("Insert failed:", e)
                db.rollback()
        cursor.close()
        db.close()
    def run(self):
        thread_list = []
        t_url = threading.Thread(target=self.get_url_queue)
        thread_list.append(t_url)
        for _ in range(3):
            t_fetch = threading.Thread(target=self.get_html_queue)
            thread_list.append(t_fetch)
        for _ in range(3):
            t_parse = threading.Thread(target=self.parse_html)
            thread_list.append(t_parse)
        t_save = threading.Thread(target=self.save)
        thread_list.append(t_save)
        for t in thread_list:
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()
        print("Finished")
if __name__ == '__main__':
    tomato = Tomato()
    tomato.run()
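
Since nothing errors but both outputs stay empty, the parse step is presumably producing empty lists, i.e. the XPath matches no nodes. Below is a minimal diagnostic sketch, assuming the same URL and headers as above (note that fanqienovel.com may render the book list with JavaScript, in which case requests only receives a page shell with no book markup):

# Minimal diagnostic sketch: fetch one listing page and check whether the
# XPath used above matches anything. A count of 0 would explain both the
# empty JSON file and the missing database rows.
from lxml import etree
import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36"}
resp = requests.get("https://fanqienovel.com/library/all/page_1?sort=hottes", headers=headers)
print(resp.status_code)            # confirm the request itself succeeds
tree = etree.HTML(resp.text)
nodes = tree.xpath("//div[@class='book-item-text']")
print(len(nodes))                  # 0 means the parser has nothing to save
print(resp.text[:300])             # inspect the raw HTML the server actually returned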