import requests
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup
import sys
def getHTMLText(url, timeout=10, encoding="gbk"):
    """Download *url* and return the response body as text.

    The request is sent with a desktop-browser User-Agent header and with
    TLS certificate verification disabled (``verify=False``).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.
        encoding: Character set assigned to the response before its body
            is decoded.

    Returns:
        The decoded page text, or ``""`` when the request fails or the
        server answers with an HTTP error status.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    try:
        response = requests.get(
            url, headers=headers, verify=False, timeout=timeout
        )
        # Turn 4xx/5xx answers into an exception so they are handled below.
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network and HTTP failures yield an empty page.
        # Only requests' own errors are caught, so KeyboardInterrupt and
        # programming mistakes are no longer silently swallowed.
        return ""
    response.encoding = encoding
    return response.text
# Page to download.
url = 'http://www.shicimingju.com/book/hongloumeng/1.html'

# Fetch the page (getHTMLText returns "" on failure) and parse it.
page_html = getHTMLText(url)
soup = BeautifulSoup(page_html, "html.parser")

# Save the parsed document, serialised back to a string, as UTF-8 text.
with open('第一章.txt', 'w', encoding='UTF-8', errors='ignore') as chapter_file:
    chapter_file.write(str(soup))
# 运行结果(部分)是: (partial output of a run follows)