> This is a good test for Python implementation bottlenecks. Run that tokenizer on HTML, and see where the time goes.
I looked at it with cProfile, and the top function that comes up for a larger document (52k) is ...validator.HTMLConformanceChecker.__iter__. This method dispatches the various validation routines, and it computes the method names from the input over and over again, doing lots of redundant string concatenations. It also capitalizes the element names, even though the spelling in the original document is probably not capitalized (but either upper-case or lower-case). In my patch below, I create a dictionary of bound methods, indexed by (syntax) type and name, following the logic of falling back to just type-based validation if no type/name routine exists. In addition, in order to reduce the number of dictionary lookups, the patch caches type/name pairs (both in the original spelling and in the capitalized spelling), so that subsequent occurrences of the same element will hit the method cache. With this simple optimization, I get a 20% speedup on my test case. In my document, there are no attributes; the same changes should be made to the attribute validation routines. I don't think this has anything to do with the case statement. Regards, Martin
diff -r 30ba63d28b1b python/src/html5lib/filters/validator.py --- a/python/src/html5lib/filters/validator.py Fri Jul 03 17:47:34 2009 +0300 +++ b/python/src/html5lib/filters/validator.py Sun Jul 05 21:10:06 2009 +0200 @@ -18,6 +18,7 @@ # Import from the sets module for python 2.3 from sets import Set as set from sets import ImmutableSet as frozenset +import re import _base import iso639codes import rfc3987 @@ -265,19 +266,45 @@ self.thingsThatDefineAnID = [] self.thingsThatPointToAnID = [] self.IDsWeHaveKnownAndLoved = [] + self.validate_type = {} + self.validate_type_name = {} + r = re.compile("^validate([A-Z][^A-Z]+)([A-Z][^A-Z]+)?$") + for name in dir(self): + m = r.match(name) + if not m: continue + method = getattr(self, name) + if m.group(2): + d = self.validate_type_name.setdefault(m.group(1), {}) + d[m.group(2)] = method + else: + self.validate_type[m.group(1)] = method def __iter__(self): - types = dict((v,k) for k,v in tokenTypes.iteritems()) for token in _base.Filter.__iter__(self): - fakeToken = {"type": types.get(token.get("type", "-"), "-"), - "name": token.get("name", "-").capitalize()} - method = getattr(self, "validate%(type)s%(name)s" % fakeToken, None) + t = token.get("type", "-") + n = token.get("name", "-") + try: + # try original name spelling + method = self.validate_type_name[t][n] + except KeyError: + # try capitalization + cn = n.capitalize() + try: + method = self.validate_type_name[t][cn] + # also cache original spelling + self.validate_type_name[t][n] = method + except KeyError: + # No name-specific validateion, try type-specific one + try: + method = self.validate_type[t] + # cache as name-specific as well + self.validate_type_name[t][cn] = method + self.validate_type_name[t][n] = method + except KeyError: + # no validation available + method = None if method: for t in method(token) or []: yield t - else: - method = getattr(self, "validate%(type)s" % fakeToken, None) - if method: - for t in method(token) or []: yield t yield token for 
t in self.eof() or []: yield t
-- http://mail.python.org/mailman/listinfo/python-list