Fredrik Lundh wrote: > Lonnie Princehouse wrote: > > >>"[a-z0-9_]" means "match a single character from the set {a through z, >>0 through 9, underscore}". > > > "\w" should be a bit faster; it's equivalent to "[a-zA-Z0-9_]" (unless you > specify otherwise using the locale or unicode flags), but is handled more > efficiently by the RE engine. > > (you can use \w in sets too, so you can do e.g. "[EMAIL PROTECTED]") > > </F>
The \w does make a small difference, but not as much as I expected. Surprisingly a standard Python word iterator works just as well, and is easier to understand than the re version. Which one is faster depends on the average word length and number of ignored characters. Cheers, Ron Character count: 100000 Word count: 16477 Average word size: 6.06906597075 word_counter: 0.06820057 (best of 3) count_words: 0.07333837 (best of 3) ######### import string import re import time import random # Create a really ugly n length string to test with. n = 100000 random.seed(1) lines = ''.join([ random.choice(string.ascii_letters * 2 + '[EMAIL PROTECTED]&*()#/<>' + ' \n' * 6) for x in range(n) ]) print 'Character count:', n print 'Word count:', len(lines.split()) print 'Average word size:', float(n)/len(lines.split()) letters = string.lowercase + string.digits + '_@' def word_iter(text, letters=letters): ltrs=letters.lower() wd = '' for c in text + ' ': if c in ltrs: wd += c elif wd != '': yield wd wd = '' def word_counter(text): txt = text.lower() countDict={} for wd in word_iter(txt): if wd in countDict: countDict[wd] += 1 else: countDict[wd] = 1 return countDict word_finder = re.compile('[EMAIL PROTECTED]', re.I) def count_words(string, word_finder = word_finder): # avoid global lookups countDict = {} for match in word_finder.finditer(string.lower()): word = match.group(0) countDict[word] = countDict.get(word,0) + 1 return countDict foos = [word_counter, count_words] r1 = r2 = None for foo in foos: best_time = 0 for n in range(3): t = time.clock() for line in lines.splitlines(): countDict = foo(line) tt = time.clock()-t if tt > best_time: best_time = tt r1 = r2 r2 = countDict if r1 != None: # change to 1 if assert fails to find problem if 0: for k in r1.keys(): if r1[k] != r2[k]: print k,r1[k],r2[k] assert r1 == r2 print '%s: %.8f (best of %d)' \ % (foo.__name__, best_time, n+1) -- http://mail.python.org/mailman/listinfo/python-list