Oh dear god, I implemented this and it overall killed performance by about 50% - 100%. The same script (entering 3000 items) takes between 88 - 109s (it was running in 55s earlier).
Here is the new Set implementation: class SeaSet(set): __slots__ = ['master', 'added', 'deleted'] def __init__(self, s=None): if s is None: s = [] self.master = set(s) self.added = set() self.deleted = set() def add(self, l): if l not in self.master: self.added.add(l) try: self.deleted.remove(l) except KeyError: pass def remove(self, l): try: self.master.remove(l) self.deleted.add(l) except KeyError: try: self.added.remove(l) except KeyError: pass def __contains__(self, l): if l in self.deleted: return False elif l in self.added: return True else: return l in self.master def __len__(self): return len(self.master) + len(self.added) def __iter__(self): for x in self.master: yield x for x in self.added: yield x def __or__(self, s): return self.union(s) def __and__(self, s): return self.intersection(s) def __sub__(self, s): return self.difference(s) def union(self, s): """docstring for union""" if isinstance(s, (set, frozenset)): return s | self.master | self.added elif isinstance(s, SeaSet): return self.master | self.added | s.master | s.added else: raise TypeError def intersection(self, s): """docstring for intersection""" if isinstance(s, (set, frozenset)): return s & self.master & self.added elif isinstance(s, SeaSet): return self.master & self.added & s.master & s.added else: raise TypeError def difference(self, s): """docstring for difference""" if isinstance(s, (set, frozenset)): self.deleted |= (self.master - s) self.master -= s self.added -= s elif isinstance(s, SeaSet): t = (s.master | s.deleted) self.deleted |= self.master - t self.master -= t self.added -= t else: raise TypeError The surprising thing is that commits *ARE* running about 50% faster (according to the time column in the hotshot profiler). But, now, the longest running operations seem to be the I/O operations which are taking 10 times longer! (even if they're only reading or writing a few bytes. Could this have something to do with the set implementation being in Python as opposed to C? For instance, this method: def __readTableHeader(self, f): hdr = f.read(sz__TABLE_HEADER_FORMAT__) if len(hdr) < sz__TABLE_HEADER_FORMAT__: raise EOFError t = THF_U(hdr) #t = unpack(__TABLE_HEADER_FORMAT__, hdr) return t is now taking > 13s when it was taking less than 0.8s before! (same number of calls, nothing changed except the set implementation) sz__TABLE_HEADER_FORMAT__ is a constant = struct.calcsize("<II37s") = 45 THF_U is a reference to the struct.unpack function for the relevant Struct object What the heck happenned?! Prateek -- http://mail.python.org/mailman/listinfo/python-list