# -*- coding: GBK -*-
from bs4 import BeautifulSoup
class Website:
def __init__(self,name,url,targetPattern,absoluteUrl,
titleTag,bodyTag):
self.name = name
self.url = url
self.targetPattren = targetPattern
self.absoluteUrl = absoluteUrl
self.titleTag = titleTag
self.bodyTag = bodyTag
class Content:
def __init__(self,url,title,body):
self.url = url
self.title = title
self.body = body
def print(self):
print("URL: {}".format(self.url))
print("TITLE: {}".format(self.title))
print("BODY: {}".format(self.body))
import re
import requests
class Crawler:
def __init__(self,site):
self.site = site
self.visited = []
def getPage(self,url):
try:
req = requests.get(url)
except requests.exceptions.RequestException:
return None
return BeautifulSoup(req.text, 'html.parser')
def safeGet(self,pageObj,selector):
selectedElems = pageObj.select(selector)
if selectedElems is not None and len(selectedElems) > 0:
return '\n'.join([elem.get_text() for elem in selectedElems])
return ''
def parse(self,url):
bs = self.getPage(url)
if bs is not None:
title = self.safeGet(bs,self.site.titleTag)
body = self.safeGet(bs,self.site.bodyTag)
if title != '' and body != '':
content = Content(url,title,body)
content.print()
def crawl(self):
"""获取网站主页的页面链接"""
bs = self.getPage(self.site.url)
targetPages = bs.findALL('a',href=re.compile(self.site.targetPattern))
for targetPage in targetPages:
targetPage = targetPate.attrs['href']
if targetPage not in self.visited:
self.visited.append(targetPage)
if not self.site.absolutedUrl:
targetPage = '{}{}'.format(self.site.url,targetPage)
self.parse(targetPage)
reuters = Website('Reuters', 'https://www.reuters.com', '^(/artilce/)', False,
'h1', 'div.StandardArticleBody_body_1gnLA')
crawler = Crawler(reuters)
crawler.crawl()
代码如上,按照书上打的,运行后是这样的:
findALL是书上这么写的,我也试过改成find_all,findall,但都没用,还是报一样的错误