Ding_HHD
Ding_HHD
2017-11-27 13:29

python HTMLParser问题

  • 爬虫
  • python
  • web

在看python核心编程第三版的时候遇到的问题

```import io
import formatter
from html.parser import HTMLParser
import http
import os
import sys
import urllib.request

from urllib.parse import urlparse,urljoin

class Retriever(object):
slots=('url','file')
def init(self,url):
self.url,self.file=self.get_file(url)

def get_file(self,url,default='index.html'):
    'Create usable local filename from URL'
    parsed=urlparse(url)
    host=parsed.netloc.split('@')[-1].split(':')[0]
    filepath='%s%s' % (host,parsed.path)
    if not os.path.splitext(parsed.path)[1]:
        filepath=os.path.join(filepath,default)
    linkdir=os.path.dirname(filepath)
    if not os.path.isdir(linkdir):
        if os.path.exists(linkdir):
            os.unlink(linkdir)
        os.makedirs(linkdir)
    return url,filepath
def download(self):
    'Download URL to specific named file'
    try:
        retval=urllib.request.urlretrieve(self.url,self.file)
    except (IOError,httplib.InvalidURL) as e:
        retval=(('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
    return retval
def parse_links(self):
    'Parse out the links found in download HTML file'
    f=open(self.file,'rb')
    data=f.read()
    f.close()
    parser=HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist

class Crawler(object):
count=0

def __init__(self, url):
    self.q=[url]
    self.seen=set()
    parsed=urlparse(url)
    host=parsed.netloc.split('@')[-1].split(':')[0]
    self.dom='.'.join(host.split('.')[-2:])

def get_page(self, url, media=False):
    'Download page & parse links, add to queue if nec'
    r=Retriever(url)
    fname=r.download()[0]
    if fname[0] == '*':
        print(fname,'...skipping parse')
        return
    Crawler.count += 1
    print('\n(',Crawler.count,')')
    print('URL:',url)
    print('FILE:',fname)
    self.seen.add(url)
    ftype=os.path.splitext(fname)[1]
    if ftype not in ('.htm', '.html'):
        return

    for link in r.parse_links():
        if link.startswith('mailto:'):
            print('...discarded, mailto link')
            continue
        if not media:
            ftype=os.path.splitext(link)[1]
            if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                print('...discarded, media file')
                continue
        if not link.startswith('http://'):
            link=urljoin(url,link)
        print('*',link)
        if link not in self.seen:
            if self.dom not in link:
                print('...discarded, not in domain')
            else:
                if link not in self.q:
                    self.q.append(link)
                    print('...new, added to Q')
                else:
                    print('...discarded, already in Q')
        else:
            print('...discarded, already processed')
def go(self,media=False):
    'Process next page in queue (if any)'
    while self.q:
        url=self.q.pop()
        self.get_page(url,media)

def main():
if len(sys.argv) > 1:
url=sys.argv[1]
else:
try:
url=input('Enter starting URL: ')
except (KeyboardInterrupt, EOFError):
url=''
if not url:
return
if not url.startswith('http://') and not url.startswith('ftp://'):
url='http://%s/' % url
robot=Crawler(url)
robot.go()

if name=='__main__':
main()

这里报错HTMLParser的__init__()需要一个位置参数而给了它两个
看了下文档,好像这个类只有一个关键字参数,给的例子也是先自定义一个子类再实例化,那么书上这么写是什么意思?要怎么改?求解答
  • 点赞
  • 回答
  • 收藏
  • 复制链接分享

1条回答