chris wrote: > Hi, > > I would like to parse many thousands of files and aggregate the counts for > the field entries related to every id. > > extract_field greps the identifier for the fields with a regex. > > result = [ { extract_field("id", line) : [extract_field("field1", > line),extract_field("field2", line)]} for line in FILE ] > > result gives me: > {'a': ['0', '84']}, > {'a': ['0', '84']}, > {'b': ['1000', '83']}, > {'b': ['0', '84']}, > > I would like to aggregate them for every line (or maybe per file) so that, after > the complete parsing procedure, I can > count the number of ids having > 0 entries in > '83'. > > {'a': {'0':2, '84':2}} > {'b': {'1000':1,'83':1,'84':1} } > > My current solution with MySQL is really slow.
>>> def rows(lines): ... for line in lines: ... yield extract_field("id", line), [extract_field(name, line) for name in "field1", "field2"] ... >>> for row in rows(lines): ... print row ... ('a', ['0', '84']) ('b', ['1000', '83']) ('a', ['0', '84']) ('b', ['0', '84']) >>> from collections import defaultdict >>> class defaultdict(defaultdict): # omit that in your real code ... def __repr__(self): return repr(dict(self)) ... >>> outer = defaultdict(lambda: defaultdict(int)) >>> for key, values in rows(lines): ... inner = outer[key] ... for v in values: ... inner[v] += 1 ... >>> outer {'a': {'0': 2, '84': 2}, 'b': {'83': 1, '1000': 1, '84': 1, '0': 1}} -- http://mail.python.org/mailman/listinfo/python-list