I *just* wrote something that does this. It uses the htmldata module, which you can find on PyPI. It only allows a specific set of HTML tags and attempts to close any tags that were left unclosed:
from htmldata import tagextract, tagjoin

# Default whitelist of tag names that htmlfilter lets through.  'br/' is
# listed separately from 'br' because the self-closing spelling is matched
# under its own name.
allowed_tags = ['br', 'b', 'strong', 'em', 'i', 'u', 'tt', 'a', 'big',
                'small', 'h2', 'h3', 'h4', 'strike', 'sub', 'sup', 'samp',
                's', 'code', 'ins', 'br/',
                ]

# Tags that never take a closing tag.  They are not tracked as "open" and,
# when banned, are dropped on their own instead of starting a skipped region.
void_tags = ('br', 'hr', 'img')


def htmlfilter(intext, allowed_tags=allowed_tags, void_tags=void_tags):
    """Return ``intext`` with every tag not in ``allowed_tags`` removed.

    The text is split with ``htmldata.tagextract`` and reassembled with
    ``htmldata.tagjoin``.  Each parsed entry is either a string (plain text)
    or a sequence whose first item is the tag name (closing tags start with
    '/'); the second item is presumably the attribute dict -- confirm
    against the htmldata documentation.

    Filtering rules:

    * Text and allowed tags are copied to the output.
    * A banned opening tag drops everything up to and including its
      matching closing tag.
    * A banned childless tag (in ``void_tags`` or a self-closing name
      ending in '/') is dropped alone.
    * A closing tag with no matching open tag is dropped.
    * Allowed tags still open at the end of the input are closed,
      innermost first.

    ``allowed_tags`` is the collection of permitted tag names.
    ``void_tags`` is the collection of tag names that have no closing tag.

    NOTE(review): allowed tags are emitted with their attributes untouched,
    so values such as ``href="javascript:..."`` or ``onclick`` handlers are
    not filtered here.  Sanitise attributes separately if the input is
    untrusted.
    """
    html = tagextract(intext)
    out_html = []
    skip = None      # name of the banned tag whose content is being dropped
    unclosed = []    # allowed tags opened but not yet closed, oldest first
    for entry in html:
        if isinstance(entry, basestring):
            # Plain text: keep it unless we are inside a banned element.
            if skip is None:
                out_html.append(entry)
            continue

        tag = entry[0]
        if skip is not None:
            # Inside a banned element: only its closing tag ends the skip.
            if tag.startswith('/') and tag[1:] == skip:
                skip = None
            continue

        closing = tag.startswith('/')
        if closing:
            otag = tag[1:]
        else:
            otag = tag
        # Childless tags have no content and no separate closing tag.
        childless = otag in void_tags or otag.endswith('/')

        if otag not in allowed_tags:
            # Only a banned tag that can hold content starts a skipped
            # region; a banned childless tag would otherwise swallow the
            # rest of the input, because its closing tag never arrives.
            if not closing and not childless:
                skip = tag
            continue

        if closing:
            if otag not in unclosed:
                # Stray closing tag (bad html): drop it.
                continue
            # Close the most recently opened matching tag so that nesting
            # order is preserved for the final auto-close pass.
            for index in range(len(unclosed) - 1, -1, -1):
                if unclosed[index] == otag:
                    del unclosed[index]
                    break
        elif not childless:
            unclosed.append(tag)
        out_html.append(entry)

    # Close any tags left open, innermost first, to keep the output nested.
    for tag in reversed(unclosed):
        out_html.append(('/%s' % (tag,), {}))
    return tagjoin(out_html)

###############
# I've used it to allow a few html tags to appear in my guestbook entries.
# It's not very sophisticated because complex tags like 'div' and tables
# aren't allowed.
#
# Best regards,
#
# Fuzzy
# http://www.voidspace.org.uk/python
# --
# http://mail.python.org/mailman/listinfo/python-list