Terry Hancock wrote: > Nick Vatamaniuc wrote: >> The original post just said that he wanted to index some values by >> their urls and didn't really care about the urls themselves, so I >> suggested that he just use the hash of the key as the key. The >> dictionary will then take a hash of that [note that: >> hash(string)=hash(hash(string)) ] then the dictionary will not keep >> the reference to the urls and GC will collect them. So instead of >> dic[urlstring']=value he will do dic[hash(urlstring)]=value. But then >> to retrieve he will have to do old_value=dic[hash(urlstring]]. > > Note that it is trivial to catch collisions on entry and correct them: > > key = hash(url_string) > i = 0 > if d.has_key(key): > while d.has_key(key): > i += 1 > d[key + repr(i)] = val > else: > d[key] = val > > or something along those lines. No need to keep the original string.
How do you intend to get the value back out of the dictionary? candidates = [] for key in xrange(hash(url)): try: candidates.append(d[key]) except KeyError: break Instead, I would put all values with the same hash into a container, along the lines of d.setdefault(hash(url), []).append(value)) Here is a simple implementation that can be used to put the idea to a test. It keeps all but the first url for a cluster with identical hashes. class Cluster(object): """A cluster of dictionary values with the same hash(key). Helper for Dict. """ def __init__(self, default): self.pairs = {} self.default = default def __setitem__(self, key, value): assert key not in self.pairs self.pairs[key] = value def __getitem__(self, key): return self.pairs.get(key, self.default) def __repr__(self): return "Cluster(default=%r, pairs=%r)" % (self.default, self.pairs) def itervalues(self): yield self.default for value in self.pairs.itervalues(): yield value class Dict(object): """A dictionary that stores hashes instead of keys as long as there are no collisions. The value entered first becomes the default for a cluster of values with the same hash. It follows that you cannot implement a reliable containment test and must make sure by exterior means that only existing keys are looked up. """ def __init__(self): self.dict = {} def __setitem__(self, key, value): short_key = hash(key) try: cluster = self.dict[short_key] except KeyError: self.dict[short_key] = value else: if isinstance(cluster, Cluster): cluster[key] = value else: cluster = Cluster(cluster) cluster[key] = value self.dict[short_key] = cluster def __getitem__(self, key): cluster = self.dict[hash(key)] if not isinstance(cluster, Cluster): return cluster return cluster[key] def itervalues(self): for value in self.dict.itervalues(): if isinstance(value, Cluster): for v in value.itervalues(): yield v else: yield value def __repr__(self): return "Dict(%s)" % repr(self.dict)[1:-1] if __name__ == "__main__": class BadHash(object): """Key with user-defined hash. Allows enforcing hash collisions. """ def __init__(self, value, hash): self.value = value self.hash = hash def __hash__(self): return hash(self.hash) def __eq__(self, other): return self.value == other.value def __repr__(self): return "BadHash(value=%r, hash=%r)" % (self.value, self.hash) d = Dict() for i, v in enumerate("alpha beta gamma delta".split()): d[BadHash(v, i)] = v for v in "sigma tau omega".split(): d[BadHash(v, 42)] = v print d assert d[BadHash("gamma", 2)] == "gamma" assert d[BadHash("sigma", 42)] == "sigma" assert d[BadHash("tau", 42)] == "tau" # but be warned that assert d[BadHash("whatever", 42)] == "sigma" expected = sorted("alpha beta gamma delta sigma tau omega".split()) assert list(sorted(d.itervalues())) == expected I fear that the memory footprint of a url is not large enough to warrant the approach, though. Peter -- http://mail.python.org/mailman/listinfo/python-list