用 Python 的 sax 模块解析 XML 时,遇到非法字符解析就直接停止了。百度搜到有大神说,可以用回调函数处理当前的非法字符,或者跳过它继续解析后面的内容。不过具体应该怎么实现呢?求大神赐教。下面是我的代码,非法字符出现在其中很多个 tagname="url" 的元素内容里,能在原来的基础上改就更好了,再次感谢。
import xml.sax
class XmlHandler(xml.sax.ContentHandler):
    """SAX content handler that prints selected fields of each ``entry``.

    The text of ``url``, ``phish_id`` and ``phish_detail_url`` elements is
    collected into ``self.url``, ``self.id`` and ``self.detail`` and printed
    when the element ends.
    """

    # Element name -> name of the attribute that accumulates its text.
    _FIELDS = {"url": "url", "phish_id": "id", "phish_detail_url": "detail"}

    def __init__(self):
        super().__init__()
        self.CurrentData = ""
        self.url = ""
        self.id = ""
        self.detail = ""

    def startElement(self, name, attr):
        """Handle an opening tag: remember its name and reset its buffer."""
        self.CurrentData = name
        field = self._FIELDS.get(name)
        if field is not None:
            # Start from an empty buffer so an empty element does not
            # print the text left over from the previous one.
            setattr(self, field, "")
        if name == "entry":
            print("*****Entry*****")

    def endElement(self, name):
        """Handle a closing tag: print the field that was being collected."""
        if self.CurrentData == "url":
            print("url:", self.url)
        elif self.CurrentData == "phish_id":
            print("phish_id:", self.id)
        elif self.CurrentData == "phish_detail_url":
            print("phish_detail_url:", self.detail)
        self.CurrentData = ""

    def characters(self, content):
        """Handle character data by appending it to the current field.

        A SAX parser may report one run of text through several calls
        (for example around entity references such as ``&amp;``), so the
        chunks are concatenated rather than overwritten.
        """
        field = self._FIELDS.get(self.CurrentData)
        if field is not None:
            setattr(self, field, getattr(self, field) + content)
# Code points that may not appear in an XML 1.0 document: C0 controls other
# than tab/LF/CR, surrogates, and U+FFFE/U+FFFF.  Mapped to None so that
# str.translate() deletes them.
_ILLEGAL_XML_CHARS = dict.fromkeys(
    list(range(0x00, 0x09))
    + [0x0B, 0x0C]
    + list(range(0x0E, 0x20))
    + list(range(0xD800, 0xE000))
    + [0xFFFE, 0xFFFF]
)


def strip_illegal_xml_chars(text):
    """Return *text* with characters not allowed in XML 1.0 removed."""
    return text.translate(_ILLEGAL_XML_CHARS)


def parse_file(path, handler, encoding="utf-8", chunk_size=64 * 1024):
    """Parse the XML file at *path* with *handler*, tolerating bad characters.

    The underlying parser treats an illegal character as a fatal error and
    cannot resume after it, so the input is cleaned before it reaches the
    parser: undecodable bytes become U+FFFD and illegal code points are
    dropped.  Other well-formedness errors (e.g. a bare ``&``) still raise
    ``xml.sax.SAXParseException``.

    Args:
        path: Path of the XML file.
        handler: ``xml.sax.ContentHandler`` receiving the events.
        encoding: Text encoding of the file; assumed UTF-8 by default.
        chunk_size: Number of characters read and fed per step.
    """
    # Create an XMLReader.
    parser = xml.sax.make_parser()
    # Turn off namespace processing.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    # Install the content handler.
    parser.setContentHandler(handler)
    with open(path, encoding=encoding, errors="replace") as stream:
        # Feed incrementally so large files are never held in memory whole.
        for chunk in iter(lambda: stream.read(chunk_size), ""):
            parser.feed(strip_illegal_xml_chars(chunk))
    parser.close()


if __name__ == "__main__":
    parse_file("online-valid.xml", XmlHandler())