import json
import re
import requests
import os
import sys
import traceback
sys.tracebacklimit=0
url='https://www.qb5200.la/book/116524/'
ajax_url='https://pagead2.googlesyndication.com/getconfig/sodar?sv=200&tid=gda&tv=r20230718&st=env'
headers={
':authority: pagead2.googlesyndication.com',
':method: GET',
':path: /getconfig/sodar?sv=200&tid=gda&tv=r20230718&st=env',
':scheme: https',
'accept: */*',
'accept-encoding: gzip, deflate, br',
'accept-language: zh-CN,zh;q=0.9',
'origin: https://www.qb5200.la',
'referer: https://www.qb5200.la/',
'sec-ch-ua: ";Not A Brand";v="99", "Chromium";v="94"',
'sec-ch-ua-mobile: ?0',
'sec-ch-ua-platform: "Windows"',
'sec-fetch-dest: empt',
'sec-fetch-mode: cors',
'sec-fetch-site: cross-site',
'user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
}
start_url=requests.get(url,headers=headers).content.decode('gbk','ignore')
ajax_urlz=requests.get(ajax_url,headers=headers).content.decode('gbk','ignore')
def get_toc(html):
toc_url_list=[]
toc_block=re.findall('<dl class="zjlist>(.*?)</dl>',html,re.S)[0]
toc_url=re.findall('href="(.*?)"',toc_block,re.S)
for url in toc_url:
toc_url_list.append(start_url+url)
return toc_url_list
def get_article(html):
chapter_name=re.search('<div class="border">(.*?)</div>',html,re.S).group(1)
chapter_namez=chapter_name.select('h1:nth-of-type(1)')
text_block=re.search('<div id="content">(.*?)</div>',html,re.S).group(1)
text_block=text_block.replace('<br>','')
return chapter_namez,text_block
def save(chapter_namez,text_block):
os.makedirs('星门',exist_ok=True)
with open(os.path.join('星门',chapter_namez+'.txt'),'w',encoding='gbk')as f:
f.write(text_block)
修改后还是爬不出来
import json
import re
import requests
import os
import sys
import traceback
sys.tracebacklimit = 0
url = 'https://www.qb5200.la/book/116524/'
ajax_url = 'https://pagead2.googlesyndication.com/getconfig/sodar?sv=200&tid=gda&tv=r20230718&st=env'
headers = {
'authority': 'pagead2.googlesyndication.com',
'method': 'GET',
'path': '/getconfig/sodar?sv=200&tid=gda&tv=r20230718&st=env',
'scheme': 'https',
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'origin': 'https://www.qb5200.la',
'referer': 'https://www.qb5200.la/',
'sec-ch-ua': '";Not A Brand";v="99", "Chromium";v="94"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'cross-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
}
start_url = requests.get(url, headers=headers).content.decode('gbk', 'ignore')
ajax_urlz = requests.get(ajax_url, headers=headers).content.decode('gbk', 'ignore')
def get_toc(html):
toc_url_list = []
toc_block = re.findall('<dl class="zjlist>(.*?)</dl>', html, re.S)[0]
toc_url = re.findall('href="(.*?)"', toc_block, re.S)
for url in toc_url:
toc_url_list.append(start_url + url)
return toc_url_list
def get_article(html):
chapter_name = re.search('<div class="border">(.*?)</div>', html, re.S).group(1)
chapter_name = chapter_name.select('h1:nth-of-type(1)')
text_block = re.search('<div id="content">(.*?)</div>', html, re.S).group(1)
text_block = text_block.replace('<br>', '')
return chapter_name, text_block
def save(chapter_namez, text_block):
os.makedirs('星门', exist_ok=True)
i = 0;
while i < 627 in chapter_namez:
i += 1;
chapter_name = chapter_namez[i]
if chapter_name:
break
else:
'Unknown_Chapter_Name'
with open(os.path.join('星门', chapter_namez + '.txt'), 'w', encoding='gbk') as f:
f.write(text_block)