I am using SAX to parse XML that has numeric html entities I need to convert and feed to JavaScript as part of a CGI. I can get the characters to print correctly, but not without being surrounded by linebreaks:
from xml.sax import make_parser from xml.sax.handler import ContentHandler import htmlentitydefs, re def unescape_charref(ref): name = ref[2:-1] base = 10 if name.startswith("x"): name = name[1:] base = 16 return unichr(int(name, base)) def replace_entities(match): ent = match.group() if ent[1] == "#": return unescape_charref(ent) repl = htmlentitydefs.name2codepoint.get(ent[1:-1]) if repl is not None: repl = unichr(repl) else: repl = ent return repl def unescape(data): return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data) class newsHandler(ContentHandler): def __init__(self): self.isNews = 0 def startElement(self, name, attrs): if name == 'title': self.isNews = 1 def characters(self, ch): if self.isNews: ch = unescape(ch) print ch def endElement(self, name): if name == 'title': self.isNews = 0 parser = make_parser() parser.setContentHandler(newsHandler()) parser.parse('http://www.some.com/rss/rss.xml') For a line like 'Mark à Capbreton' my results print as: 'Mark à Capbreton' Is this another SAX quirk? I've already had to hack my way around SAX not being able to split results on a colon. No matter if I try strip, etc the results are always the same: newlines surrounding the html entities. I'm using version 2.3.5 and need to stick to the standard libraries. Thanks. -- http://mail.python.org/mailman/listinfo/python-list