爬小说。只爬了10章就报错了。
import requests
from bs4 import BeautifulSoup
#获得章节链接、标题
def get_novel_chaters():
root_url = "http://www.qixivur.com/news/ts48.html"
r = requests.get(root_url)
r.encoding="utf-8"
soup = BeautifulSoup(r.text,"html.parser")
data = []
for dd in soup.find_all("dd"):
link = dd.find("a")
if not link:
continue
data.append(("http://www.qixivur.com%s"%link['href'],link.get_text()))
# print(link)
return data
#获得链接内容
def get_chapter_content(url):
r = requests.get(url)
r.encoding='utf-8'
soup = BeautifulSoup(r.text, "html.parser")
return soup.find('div',id="TextContent").get_text()
novel_chapters = get_novel_chaters()
total_cnt = len(novel_chapters)
idx = 0
for chapter in get_novel_chaters():
# print(chapter)
idx+=1
print(idx,total_cnt)
url,title = chapter
with open("%s.txt"%title,"w",encoding="utf-8") as fout:
fout.write(get_chapter_content(url))
1 1102
2 1102
3 1102
4 1102
5 1102
6 1102
7 1102
8 1102
9 1102
10 1102
Traceback (most recent call last):
File "D:\Professional_documents\pythonProject\web_crawler\venv\lib\site-packages\requests\models.py", line 434, in prepare_url
scheme, auth, host, port, path, query, fragment = parse_url(url)
File "D:\Professional_documents\pythonProject\web_crawler\venv\lib\site-packages\urllib3\util\url.py", line 397, in parse_url
return six.raise_from(LocationParseError(source_url), None)
File "<string>", line 3, in raise_from
urllib3.exceptions.LocationParseError: Failed to parse: http://www.qixivur.comjavascript:;
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/Professional_documents/pythonProject/web_crawler/爬小说/main.py", line 36, in <module>
fout.write(get_chapter_content(url))
File "D:/Professional_documents/pythonProject/web_crawler/爬小说/main.py", line 21, in get_chapter_content
r = requests.get(url)
File "D:\Professional_documents\pythonProject\web_crawler\venv\lib\site-packages\requests\api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "D:\Professional_documents\pythonProject\web_crawler\venv\lib\site-packages\requests\api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
File "D:\Professional_documents\pythonProject\web_crawler\venv\lib\site-packages\requests\sessions.py", line 573, in request
prep = self.prepare_request(req)
File "D:\Professional_documents\pythonProject\web_crawler\venv\lib\site-packages\requests\sessions.py", line 496, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "D:\Professional_documents\pythonProject\web_crawler\venv\lib\site-packages\requests\models.py", line 368, in prepare
self.prepare_url(url, params)
File "D:\Professional_documents\pythonProject\web_crawler\venv\lib\site-packages\requests\models.py", line 436, in prepare_url
raise InvalidURL(*e.args)
requests.exceptions.InvalidURL: Failed to parse: http://www.qixivur.comjavascript:;
Process finished with exit code 1