import os
import re

import requests
from bs4 import BeautifulSoup
# Base site URL used to resolve the relative article hrefs found on
# listing pages.
# FIX: the original value had a stray leading space (' http://www.juda.cn'),
# which corrupted every link built as `url1 + href`.
url1 = 'http://www.juda.cn'
def request(url):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    FIXES over the original:
    - a 30s timeout so a dead server cannot hang the crawler forever;
    - `raise_for_status()` so a 404/500 error page is not silently parsed
      as if it were a real article.

    Raises:
        requests.exceptions.RequestException on network/timeout/HTTP errors.
    """
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    # Pass raw bytes (resp.content) so lxml can honour the page's own
    # declared encoding instead of requests' guess.
    return BeautifulSoup(resp.content, "lxml")
def main(url):
    """Crawl one search-result page and save every linked article to disk.

    For each article listed on *url*: follow its link (resolved against the
    module-level ``url1`` base), extract the title and body paragraphs, and
    write them to ``<cwd>/爬取内容1/<title>.text`` as UTF-8 text.

    Args:
        url: full URL of one search-result listing page.
    """
    soup = request(url)
    listing = soup.find('div', class_="news_list").find_all('div', class_="text")
    # Directory where scraped articles are stored.
    path = os.path.join(os.getcwd(), u'爬取内容1')
    # FIX: makedirs(exist_ok=True) instead of mkdir() — no crash when the
    # directory already exists from a previous page/run.
    os.makedirs(path, exist_ok=True)
    for entry in listing:
        # Each listing entry may carry one or more anchors to article pages.
        anchors = entry.find('div', class_="a_title").find_all('a', class_="size4-6p")
        for anchor in anchors:
            link = url1 + anchor.get('href')
            # Fetch the article page itself.
            article = request(link)
            title = article.find('section', class_="title_part").find('h1', class_="size1-9p").get_text()
            paragraphs = article.find('div', class_="all col-md-12").find_all('p')
            content = ''.join(p.text + '\n' for p in paragraphs)
            # FIX: strip characters that are illegal in file names
            # (e.g. '/', ':', '?') so an unusual title cannot crash the save.
            safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
            totlename = os.path.join(path, safe_title + '.text')
            print(totlename)
            # FIX: 'with' guarantees the file is closed even if a write
            # raises (the original also shadowed `paper` with the handle).
            with open(totlename, 'w', encoding='utf-8') as out:
                out.write('<<' + title + '>>\n\n')
                out.write(content)
if __name__ == '__main__':
    # Search query: "中国铁塔 梯次电池" (URL-encoded), news results only.
    base_url = ('http://www.juda.cn/search/index.html'
                '?keyword=%E4%B8%AD%E5%9B%BD%E9%93%81%E5%A1%94'
                '+%E6%A2%AF%E6%AC%A1%E7%94%B5%E6%B1%A0&type=news')
    # Crawl result pages 1 through 6.
    for page in range(1, 7):
        main(base_url + "&page=" + str(page))