import re
import time
import os
import requests
from lxml import etree
from threading import Thread
import threading
from queue import Queue
# HTTP headers sent with every request; the User-Agent string identifies the
# client as mobile Chrome on Android.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Mobile Safari/537.36'}

# URL of the index (chapter list) page of the novel to crawl -- any novel
# page on the Biquge site.  Chapter hrefs are appended directly to this value.
# TODO: fill in the real URL before running; an empty value makes the request
# below fail with requests' "missing schema" error.
url = ''

# Fetch the index page once; undecodable bytes are dropped rather than raising.
# The timeout keeps a stalled connection from hanging the script forever.
resp = requests.get(url=url, headers=headers, timeout=30)
contentpage = resp.content.decode('utf-8', 'ignore')
html = etree.HTML(contentpage)
print(resp.status_code)

# Bounded queues shared by the producer and consumer threads: one holds
# chapter titles, the other the matching chapter URLs.
titles_quenue = Queue(50000)
text_urls = Queue(50000)
# Guards the run-once state below: several threads may target producer().
_producer_lock = threading.Lock()
_producer_done = False


def producer():
    """Enqueue every chapter title and chapter URL found on the index page.

    Reads the module-level ``html`` tree and ``url`` and fills the
    module-level ``titles_quenue`` / ``text_urls`` queues.  The work is done
    exactly once per process: the first call enqueues the chapters, any later
    or concurrent call returns immediately.  Looping forever (as the earlier
    version did) only re-enqueued the same chapters over and over.

    Titles and URLs are enqueued as adjacent pairs, so both queues keep the
    same order; entries without a counterpart in the other list are dropped.
    """
    global _producer_done
    with _producer_lock:
        if _producer_done:
            return
        _producer_done = True

        # NOTE(review): the leading '/n' step only matches a document root
        # element named 'n'; lxml's HTML parser normally yields an <html>
        # root, so confirm this is the intended expression.
        titles = html.xpath('/n//dd/a/text()')
        hrefs = html.xpath('/n//dd/a/@href')

        for title, href in zip(titles, hrefs):
            titles_quenue.put(title)
            # Chapter links are resolved by plain concatenation onto the
            # index page URL.
            text_urls.put(url + href)
# Serialises the paired reads from the two queues: without it two consumer
# threads can interleave their get() calls and pair a title with the URL of
# a different chapter.
_consumer_pair_lock = threading.Lock()

# Root folder under which one sub-folder per novel is created.  Raw string so
# the backslashes are not parsed as (invalid) escape sequences.
_SAVE_ROOT = r'D:\AAAA桃花青帝\Python文件\爬虫\爬取小说'


def _safe_filename(text):
    """Return *text* with characters Windows forbids in file names replaced by '_'."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '_', text)


def _fetch_chapter_text(chapter_url, attempts=5, delay=1.0):
    """Download *chapter_url* and return the text inside ``div#content``.

    Tries up to *attempts* times, sleeping *delay* seconds between tries,
    whenever the request fails, the page cannot be parsed, or the content
    div yields no text.  Returns ``''`` if every attempt comes back empty.
    """
    for attempt in range(attempts):
        if attempt:
            # Pause before a retry instead of hammering the server.
            time.sleep(delay)
        try:
            resp = requests.get(url=chapter_url, headers=headers, timeout=30)
            # 'ignore' matches how the index page is decoded at module level
            # and keeps one bad byte from killing the worker thread.
            page = etree.HTML(resp.content.decode('utf-8', 'ignore'))
        except (requests.RequestException, etree.LxmlError):
            continue
        if page is None:
            continue
        text = "".join(page.xpath('//div[@id="content"]/text()'))
        if text:
            return text
    return ''


def consumer():
    """Download chapters taken from the queues and save each one to a text file.

    The novel name is the first ``<h1>`` of the module-level ``contentpage``
    (raises ``IndexError`` if the page has none).  The function then loops
    forever: it takes one title and one URL from ``titles_quenue`` /
    ``text_urls``, downloads the chapter, and appends its text to
    ``<save root>/<novel name>/<title>.txt``.  Chapters whose text cannot be
    fetched after a bounded number of retries are reported and skipped.
    The loop has no exit condition, so this function never returns.
    """
    name = re.findall('<h1>(.+?)</h1>', contentpage, re.DOTALL)[0]
    book_dir = os.path.join(_SAVE_ROOT, _safe_filename(name))

    while True:
        # Take the title and its URL as one atomic step.
        with _consumer_pair_lock:
            title = titles_quenue.get()
            text_url = text_urls.get()
        print(text_url)

        text = _fetch_chapter_text(text_url)
        if not text:
            # Message reads "download failed, skipped".
            print(title + '下载失败,已跳过')
            continue

        # exist_ok avoids the exists()/makedirs() race between consumer threads.
        os.makedirs(book_dir, exist_ok=True)
        chapter_file = os.path.join(book_dir, _safe_filename(title) + '.txt')
        # Append mode is kept so chapters sharing a title do not overwrite
        # each other.
        with open(chapter_file, 'a', encoding='utf-8') as f:
            f.write(text)
        print(title + '下载完成!!!!')
def multi(producers=1, consumers=5):
    """Start the producer and consumer threads.

    Args:
        producers: number of threads running ``producer``.  Every producer
            reads the same already-downloaded index page, so one thread is
            enough; more threads only repeat the same work.
        consumers: number of threads running ``consumer`` (chapter downloads).

    The threads are started and not joined; the function returns as soon as
    all of them have been launched.
    """
    # Start the producers first so the queues begin filling before the
    # consumers ask for work.
    for _ in range(producers):
        threading.Thread(target=producer).start()
    for _ in range(consumers):
        threading.Thread(target=consumer).start()


multi()