MRAB wrote: > The .index method does a linear search, checking on average 1/2 of the > items in the list. That's why it's so slow. > > In order to avoid that you could build a dict of each value in > dimension_values[col_idx] and its index in a single pass so that it > becomes a quick lookup.
For example: from itertools import count, izip def iterator_factory(): # columns = ["color", "size", "weight", "value"] rows = [ ["Yellow", "Big", 2, 4], ["Blue", "Big", 3, -4], ["Blue", "Small", 10, 55], #... ] return iter(rows) class Indexer(object): def __init__(self): self.lookup = {} self.counter = count() def __call__(self, value): try: return self.lookup[value] except KeyError: result = self.lookup[value] = next(self.counter) return result def to_list(self): d = self.lookup reverse = dict(izip(d.itervalues(), d.iterkeys())) assert len(reverse) == len(d) return [reverse[i] for i in xrange(len(reverse))] def pairs(rows, dimension_cols, measure_cols, indexers): for row in rows: dv = [indexer(row[col]) for indexer, col in izip(indexers, dimension_cols)] mv = [row[col] for col in measure_cols] yield dv, mv def main(): # dimension_names = ["color", "size"] dimension_cols = [0, 1] # measure_names = ["weight", "value"] measure_cols = [2, 3] indexers = [Indexer() for _ in dimension_cols] facts = pairs(iterator_factory(), dimension_cols, measure_cols, indexers) facts = list(facts) print facts for i, indexer in enumerate(indexers): print "%d: %s" % (i, indexer.to_list()) if __name__ == "__main__": main() -- http://mail.python.org/mailman/listinfo/python-list