Hi, I've created a custom filter based on HTMLParser, with the following source:
class Filter(HTMLParser): def __init__(self, keyfile): HTMLParser.__init__(self) mykwfile = open(keyfile, 'r') self._keywords = [] for kw in mykwfile.read().split('\n'): self._keywords.append(kw) print kw mykwfile.close() self._toProcess = False self.stack = [] def handle_starttag(self, tag, attrs): if 'a' != tag: self.stack.append(self.__html_start_tag(tag, attrs)) return attrs = dict(attrs) self._toProcess = True for key in self._keywords: if 'a' == tag: p = re.compile(key, re.IGNORECASE) if 'href' in attrs: attrs['href'] = p.sub(r'XXXXX',attrs['href']) self.stack.append(self.__html_start_tag(tag, attrs)) def handle_startendtag(self, tag, attrs): if 'img' != tag and 'meta' != tag: self.stack.append(self.__html_startend_tag(tag, attrs)) return attrs = dict(attrs) self._toProcess = True for key in self._keywords: p = re.compile(key, re.IGNORECASE) if 'img' == tag: if 'src' in attrs: attrs['src'] = p.sub(r'XXXXX',attrs['src']) if 'alt' in attrs: attrs['alt'] = p.sub(r'XXXXX',attrs['alt']) if 'meta' == tag: if 'description' in attrs: attrs['description'] = p.sub(r'XXXXX',attrs['description']) if 'content' in attrs: attrs['content'] = p.sub(r'XXXXX',attrs['content']) if 'meta' == tag or 'img' == tag: self._toProcess = False self.stack.append(self.__html_startend_tag(tag, attrs)) def handle_endtag(self, tag): self.stack.append(self.__html_end_tag(tag)) if self._toProcess: self._toProcess = False def handle_data(self, data): if self._toProcess: for key in self._keywords: p = re.compile(key,re.IGNORECASE) data = p.sub(r'XXXXX',data) self.stack.append(data) def __html_start_tag(self, tag, attrs): return '<%s%s>' % (tag, self.__html_attrs(attrs)) def __html_startend_tag(self, tag, attrs): return '<%s%s/>' % (tag, self.__html_attrs(attrs)) def __html_end_tag(self, tag): return '</%s>' % (tag) def __html_attrs(self, attrs): _attrs = '' if attrs: _attrs = ' %s' % (' '.join([('%s="%s"' % (k,v)) for k,v in attrs.iteritems()])) return _attrs But when I use it, it gives me the following error message: ERROR Processor exception: AttributeError: 'list' object has no attribute 'it eritems' Traceback (most recent call last): File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 87, in Process p.feed(document.GetValue("data")) File "HTMLParser.py", line 108, in feed File "HTMLParser.py", line 148, in goahead File "HTMLParser.py", line 281, in parse_starttag File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 121, in handle_startt ag self.stack.append(self.__html_start_tag(tag, attrs)) File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 167, in __html_start_ tag return '<%s%s>' % (tag, self.__html_attrs(attrs)) File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 178, in __html_attrs _attrs = ' %s' % (' '.join([('%s="%s"' % (k,v)) for k,v in attrs.iteritems() ])) Anybody knows why it says attrs is not a list element? Thanks, Rubén -- http://mail.python.org/mailman/listinfo/python-list