Hi, I am trying to load files into a dictionary for analysis. the size of the dictionary will grow quite large (several million entries) and as inserting into a dictionary is roughly O(n) I figured if I loaded each file into it's own dictionary it would speed things up. However it did not.
So I decided to write a small test program (attached) As you can see I am inserting one million entries a time into a map. I ran the tests where I put all three million entries into one map and one where I put one million each into it's own map. What I would have expected is that if I insert one million into it's own map the time to do that would be roughly constant for each map. Interestingly it is not. It's about the same as if I load everything into one map. Oh and I have 4G of RAM and the test consumes about 40% at it's max. I even run the test on one of our servers with 64G of RAM, so I can rule out swapping as the issue. Can anyone explain this oddity? Any insight is highly appreciated. Here's the output of the test runs: $ ./dicttest.py -t 0 Inserting into one map Inserting 1000000 keys lasted 0:00:26 (38019 1/s) len(map) 1000000 Inserting 1000000 keys lasted 0:01:17 (12831 1/s) len(map) 2000000 Inserting 1000000 keys lasted 0:02:23 (6972 1/s) len(map) 3000000 total 3000000 $ ./dicttest.py -t 1 Inserting into three maps Inserting 1000000 keys lasted 0:00:32 (30726 1/s) len(map) 1000000 Inserting 1000000 keys lasted 0:01:29 (11181 1/s) len(map) 1000000 Inserting 1000000 keys lasted 0:02:23 (6957 1/s) len(map) 1000000 total 3000000 Thanks, Lothar ,----[ /home/lothar/tmp/dicttest.py ] | #!/usr/bin/python | # -*- coding: utf-8 -*- | | import datetime | import optparse | import sys | import time | | | | | def fillDict(map, nop, num, guid): | before = time.time() | | for i in xrange(0, num): | key = (i, guid) | if not nop: | map[key] = ([], {}) | | after = time.time() | elapsed = (after - before) | if elapsed <= 0: | divide = 1.0 | else: | divide = elapsed | elapsedTime = datetime.timedelta(seconds=int(elapsed)) | print("Inserting %d keys lasted %s (%d 1/s)" % (num, elapsedTime, (num / | divide))) print("len(map) %d" % (len(map))) | | | def test0(nop, num): | print("Inserting into one map") | map = {} | fillDict(map, nop, num, "0561c83c-9675-4e6f-bedc-86bcb6acfd71") | fillDict(map, nop, num, "0561c83c-9675-4e6f-bedc-86bcb6acfd72") | fillDict(map, nop, num, "0561c83c-9675-4e6f-bedc-86bcb6acfd73") | print("total %d" % (len(map))) | | | def test1(nop, num): | print("Inserting into three maps") | map1 = {} | map2 = {} | map3 = {} | fillDict(map1, nop, num, "0561c83c-9675-4e6f-bedc-86bcb6acfd71") | fillDict(map2, nop, num, "0561c83c-9675-4e6f-bedc-86bcb6acfd72") | fillDict(map3, nop, num, "0561c83c-9675-4e6f-bedc-86bcb6acfd73") | total = 0 | for map in [map1, map2, map3]: | total += len(map) | print("total %d" % (total)) | | | | if __name__ == "__main__": | usage = "USAGE: %prog [options]" | description="test" | version="%prog 1.0" | | parser = optparse.OptionParser(usage=usage, version=version, | description=description) parser.add_option( | "-t", | "--testnum", | action="store", | dest="testnum", | help="the number of the test to execute", | type="int", | default=1 | ) | parser.add_option( | "-i", | "--iterations", | action="store", | dest="iterations", | help="the number of iterations", | type="int", | default=1000000 | ) | parser.add_option( | "-n", | "--nop", | action="store_true", | dest="nop", | help="don't store in the dictionary only load and parse", | default=False | ) | | (options, args) = parser.parse_args() | | testmap = { | 0:test0, | 1:test1, | } | | test = testmap[options.testnum] | | test(options.nop, options.iterations) `---- -- http://mail.python.org/mailman/listinfo/python-list