爬取写下的文件全是同一个内容,
以下为代码:
#coding=GB18030
import urllib.request
from bs4 import BeautifulSoup
import re
import os
urls = ["https://www.bilibili.com/read/cv13853928?from=category_0","https://www.bilibili.com/read/cv13900955?from=category_0","https://www.bilibili.com/read/cv14392664?from=category_0","https://www.bilibili.com/read/cv14290608?from=category_0","https://www.bilibili.com/read/cv14269554?from=category_0","https://www.bilibili.com/read/cv14023818?from=category_0","https://www.bilibili.com/read/cv14367119?from=category_0","https://www.bilibili.com/read/cv14310331?from=category_0","https://www.bilibili.com/read/cv14312166?from=category_0","https://www.bilibili.com/read/cv14395382?from=category_0","https://www.bilibili.com/read/cv14340236?from=category_0","https://www.bilibili.com/read/cv14312107?from=category_0","https://www.bilibili.com/read/cv14381493?from=category_0","https://www.bilibili.com/read/cv14312157?from=category_0","https://www.bilibili.com/read/cv14342795?from=category_0","https://www.bilibili.com/read/cv14319354?from=category_0","https://www.bilibili.com/read/cv14381629?from=category_0","https://www.bilibili.com/read/cv14353230?from=category_0","https://www.bilibili.com/read/cv14309947?from=category_0","https://www.bilibili.com/read/cv14369822?from=category_0","https://www.bilibili.com/read/cv14394980?from=category_0","https://www.bilibili.com/read/cv14337802?from=category_0","https://www.bilibili.com/read/cv14365402?from=category_0","https://www.bilibili.com/read/cv14361551?from=category_0","https://www.bilibili.com/read/cv14346357?from=category_0","https://www.bilibili.com/read/cv14398923?from=category_0","https://www.bilibili.com/read/cv14314809?from=category_0","https://www.bilibili.com/read/cv14315884?from=category_0","https://www.bilibili.com/read/cv14361893?from=category_0","https://www.bilibili.com/read/cv14395601?from=category_0","https://www.bilibili.com/read/cv14326983?from=category_0","https://www.bilibili.com/read/cv14324884?from=category_0","https://www.bilibili.com/read/cv14327098?from=category_0","https://www.bilibili.com/read/cv14371294?from=category_0","https://www.bilibili.com/read/cv14350914?from=category_0","https://www.bilibili.com/read/cv14354339?from=category_0"]
def text_create(name, msg):
desktop_path = "C:\\txt\\"
full_path = desktop_path + name
file = open(full_path, 'w',encoding="utf-8")
file.write(msg)
# file.close()
filePrefix = 'text' #文件前缀
fileSuffix = '.txt' #文件后缀
fileNum = 31 #文件个数
for i in range(1, fileNum):
fileName = filePrefix + str(i) + fileSuffix
for i in range(1,fileNum):
i=i+1
url=urls[i]
a=urllib.request.urlopen(url)
htmlstr=a.read().decode('UTF-8')
soup=BeautifulSoup(htmlstr,'html.parser')
y=re.compile(r'<p>([\s\S]*?)</p>')
text=y.findall(str(soup))
x=''
for i in range(0,len(text)):
x=x+text[i]
text1=re.sub("</?\w+[^>]*>",'',x)
text2=text1.replace("。",'。\n\n\0\0')
text_create(fileName, text2)