Ding_HHD 2017-11-27 13:29 采纳率: 0%
浏览 1495

python HTMLParser问题

在看python核心编程第三版的时候遇到的问题

```import io
import formatter
from html.parser import HTMLParser
import http
import os
import sys
import urllib.request

from urllib.parse import urlparse,urljoin

class Retriever(object):
slots=('url','file')
def init(self,url):
self.url,self.file=self.get_file(url)

def get_file(self,url,default='index.html'):
    'Create usable local filename from URL'
    parsed=urlparse(url)
    host=parsed.netloc.split('@')[-1].split(':')[0]
    filepath='%s%s' % (host,parsed.path)
    if not os.path.splitext(parsed.path)[1]:
        filepath=os.path.join(filepath,default)
    linkdir=os.path.dirname(filepath)
    if not os.path.isdir(linkdir):
        if os.path.exists(linkdir):
            os.unlink(linkdir)
        os.makedirs(linkdir)
    return url,filepath
def download(self):
    'Download URL to specific named file'
    try:
        retval=urllib.request.urlretrieve(self.url,self.file)
    except (IOError,httplib.InvalidURL) as e:
        retval=(('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
    return retval
def parse_links(self):
    'Parse out the links found in download HTML file'
    f=open(self.file,'rb')
    data=f.read()
    f.close()
    parser=HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist

class Crawler(object):
count=0

def __init__(self, url):
    self.q=[url]
    self.seen=set()
    parsed=urlparse(url)
    host=parsed.netloc.split('@')[-1].split(':')[0]
    self.dom='.'.join(host.split('.')[-2:])

def get_page(self, url, media=False):
    'Download page & parse links, add to queue if nec'
    r=Retriever(url)
    fname=r.download()[0]
    if fname[0] == '*':
        print(fname,'...skipping parse')
        return
    Crawler.count += 1
    print('\n(',Crawler.count,')')
    print('URL:',url)
    print('FILE:',fname)
    self.seen.add(url)
    ftype=os.path.splitext(fname)[1]
    if ftype not in ('.htm', '.html'):
        return

    for link in r.parse_links():
        if link.startswith('mailto:'):
            print('...discarded, mailto link')
            continue
        if not media:
            ftype=os.path.splitext(link)[1]
            if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                print('...discarded, media file')
                continue
        if not link.startswith('http://'):
            link=urljoin(url,link)
        print('*',link)
        if link not in self.seen:
            if self.dom not in link:
                print('...discarded, not in domain')
            else:
                if link not in self.q:
                    self.q.append(link)
                    print('...new, added to Q')
                else:
                    print('...discarded, already in Q')
        else:
            print('...discarded, already processed')
def go(self,media=False):
    'Process next page in queue (if any)'
    while self.q:
        url=self.q.pop()
        self.get_page(url,media)

def main():
if len(sys.argv) > 1:
url=sys.argv[1]
else:
try:
url=input('Enter starting URL: ')
except (KeyboardInterrupt, EOFError):
url=''
if not url:
return
if not url.startswith('http://') and not url.startswith('ftp://'):
url='http://%s/' % url
robot=Crawler(url)
robot.go()

if name=='__main__':
main()

这里报错HTMLParser的__init__()需要一个位置参数而给了它两个
看了下文档,好像这个类只有一个关键字参数,给的例子也是先自定义一个子类再实例化,那么书上这么写是什么意思?要怎么改?求解答
  • 写回答

1条回答

报告相同问题?

悬赏问题

  • ¥15 drone 推送镜像时候 purge: true 推送完毕后没有删除对应的镜像,手动拷贝到服务器执行结果正确在样才能让指令自动执行成功删除对应镜像,如何解决?
  • ¥15 求daily translation(DT)偏差订正方法的代码
  • ¥15 js调用html页面需要隐藏某个按钮
  • ¥15 ads仿真结果在圆图上是怎么读数的
  • ¥20 Cotex M3的调试和程序执行方式是什么样的?
  • ¥20 java项目连接sqlserver时报ssl相关错误
  • ¥15 一道python难题3
  • ¥15 牛顿斯科特系数表表示
  • ¥15 arduino 步进电机
  • ¥20 程序进入HardFault_Handler