import requests
import xml.etree.ElementTree as ET
from xml.parsers.expat import ParserCreate
class DefaultSaxHandler(object):
def __init__(self, provinces):
self.provinces = provinces
def start_element(self, name, attrs):
pass
def end_element(self, name):
pass
def char_data(self, text):
if text!="a":
text=text
self.provinces.append(text)
def get_province_entry(url):
content = requests.get(url).content.decode('gb2312')
start = content.find('<table height="22" cellSpacing="0" cellPadding="0" width="710" border="0">')
end = content.find('<hr size="1" width="520">')
content = content[start:end].strip()
provinces=[]
handler = DefaultSaxHandler(provinces)
parser = ParserCreate()
parser.StartElementHandler = handler.start_element
parser.EndElementHandler = handler.end_element
parser.CharacterDataHandler = handler.char_data
parser.Parse(content)
return provinces
provinces = get_province_entry('http://www.ip138.com/post/')
print(provinces)
我的目标是爬取选中区域的文本值,但是出现错误
ExpatError Traceback (most recent call last)
<ipython-input-20-14237f21aa15> in <module>()
40 return provinces
41
---> 42 provinces = get_province_entry('http://www.ip138.com/post/')
43 print(provinces)
<ipython-input-20-14237f21aa15> in get_province_entry(url)
36 parser.EndElementHandler = handler.end_element
37 parser.CharacterDataHandler = handler.char_data
---> 38 parser.Parse(content)
39
40 return provinces
ExpatError: undefined entity: line 6, column 55
不知道哪里出错了