Hello, (first i'm sorry for my bad english...)
for a program, i need to have some kind of dictionary which will contain a huge amount of keys and values. i have tried 'to do it simple' using a normal dict but when available memory was exhausted, my computer started to be really slow as it swap-ed. so i wondered if i can not use some kind of cache, i googled and found information on LRU, LFU, and LFU interested me : keep only most frequently used items inside dict (and memory) and writing others on disk for a possible future reuse. here is a silly attempt to do it, it still missed some features but before to continue i would like to have your opinions. i'm interested in any propositions or refactorisations :) and btw i may have missed others already/tuned/better solutions... best. --- import fileinput, pickle, random, shutil, sys class sLFU: TMP = """%s.bak""" """ a simple kind of Least Frequently Used container""" def __init__(self, size, ratio, filename = None): """ filename = a temporary file which will could be deleted""" self.d = {} # container self.size, self.ratio = size, ratio self.freqs, self.fmin = {}, 0 self.filename = filename if self.filename: open(self.filename, 'w') def __setitem__(self, obj, v): self.d[obj] = v self.freqs[obj] = self.freqs.get(obj, self.fmin) + 1 if len(self.d) > self.size: """ lfu = { (size / ratio) least frequently used objects... }""" freqs = [(f, obj) for (obj, f) in self.freqs.items()] # maybe should i use a generator () like # ((f, obj) for (obj, f) in self.freqs.items()) freqs.sort() lfu = {} # and here enumerate(sorted(freqs)) for i, (f, obj) in enumerate(freqs): if i > self.size / self.ratio: break lfu[obj] = self.d[obj] """ next minimal frequency will be max frequency of the lfu (maybe it would be better to substract this from others in self.freqs)""" self.fmin = f """ and now delete theses objects...""" for obj in lfu: del self.freqs[obj] del self.d[obj] if self.filename: """ now must save lfu to disk... as i do not managed to do otherwise, copy file to a tmp file and read it line by line, updating when necessary... there must be plenty rooms for improvement here :(""" shutil.copy(self.filename, self.TMP % self.filename) fhs = open(self.TMP % self.filename) fh = open(self.filename, 'w') """ flag to ignore 'value' line""" ignoreNext = False for i, line in enumerate(fhs): """ first copy old lfu which were already set, updating them if necessary...""" if ignoreNext: ignoreNext = False continue line = line.rstrip() if i % 2 == 0: obj = self.loads(line) if obj not in lfu and obj in self.d: """ obj available from memory, do not to keep it on disk...""" ignoreNext = True continue elif obj in lfu: """ obj was available from memory, but as a lfu, was removed and must be updated on disk""" fh.write("%s\n" % line) fh.write("%s\n" % self.dumps(lfu[obj])) del lfu[obj] ignoreNext = True continue """ default behaviour : copy line...""" fh.write("%s\n" % line) """ from now lfu should contain only unseen lfu objects, add them to file...""" fh = open(self.filename, 'a') for obj in lfu: fh.write("%s\n" % self.dumps(obj)) fh.write("%s\n" % self.dumps(lfu[obj])) def __getitem__(self, obj): if obj in self.d: """ from memory :)""" return self.d[obj] if self.filename: """ if a filename was provided, search inside file if such an obj is present, then return it ; else raise IndexErrror.""" fh = open(self.filename) found = False for i, line in enumerate(fh): line = line.rstrip() if found: v = self.loads(line) self.d[obj] = v self.freqs[obj] = self.fmin + 1 return v if obj == self.loads(line): found = True raise KeyError(obj) """ maybe class methods would have been better here, actually i would have liked static + class... totally arbitrary format, haved choosed to use pickle, odd line contain 'obj' (next) even line contain 'value' """ def dumps(self, s): return pickle.dumps(s).replace("\n", "\t") staticmethod(dumps) def loads(self, s): return pickle.loads(s.replace("\t", "\n")) staticmethod(loads) lfu = sLFU(2, 2, 'slfu.txt') lfu[1] 8 = {'1': 1fd2 'a'} lfu[2] = [] lfu[3] = '3' lfu[2] = [1,] lfu[2] = [1,2] lfu[4] = 4 lfu[5] = '5' print "#d", len(lfu.d) print lfu.d print "".join(open(lfu.filename).readlines()) -- http://mail.python.org/mailman/listinfo/python-list