import bs4
import requests
from bs4 import BeautifulSoup
def getHtmlText(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The decoded page text, or an empty string if the request fails
        (connection error, timeout, invalid URL, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Use the encoding detected from the body; the header-declared
        # encoding is frequently missing or wrong on the scraped pages.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "nothing to parse".
        return ""
def getheadersList(slist, url):
    """Collect article links from an index page and print each article.

    Every ``<a href>`` found inside a ``<ul class="idx_cm_list idx_cm_list_h">``
    on the page at *url* is appended to *slist*. Each URL in *slist* is then
    downloaded, and the article's ``<h1>`` title and body text are printed,
    followed by a separator line.

    Args:
        slist: List that receives the discovered article URLs (mutated in
            place). Any URLs already present are fetched as well.
        url: Address of the index page listing the articles.
    """
    html = getHtmlText(url)
    soup = BeautifulSoup(html, "html.parser")
    link_lists = soup.find_all('ul', attrs={"class": "idx_cm_list idx_cm_list_h"})
    for ul in link_lists:
        # find_all returns a ResultSet, so each anchor is queried on its own.
        for anchor in ul.find_all('a'):
            href = anchor.get('href')
            if href:
                slist.append(href)

    for surl in slist:
        nhtml = getHtmlText(surl)
        if not nhtml:
            # Download failed; there is nothing to parse for this link.
            continue
        page = BeautifulSoup(nhtml, "html.parser")
        nheader = page.find('div', attrs={"class": "post_content_main"})
        content = page.find('div', attrs={"class": "post_text"})
        if nheader is None or content is None:
            # Page does not contain the expected article containers.
            continue
        heading = nheader.find('h1')
        # get_text() also works when the <h1> contains nested tags, where
        # .string would be None.
        print(heading.get_text(strip=True) if heading is not None else "")
        print(content.get_text())
        print('-------------------------分割线-----------------------')
def main():
    """Scrape the NetEase domestic-news index page and print its articles."""
    url = "http://news.163.com/domestic/"
    slist = []
    # Run the scraper; slist is filled with the article URLs it discovers.
    getheadersList(slist, url)


# Only scrape when executed as a script, so importing this module stays
# free of network side effects.
if __name__ == "__main__":
    main()