Hello group, I've come from C/C++ and am now trying to code some Python because I absolutely love the language. However, I still have trouble getting Python code to run efficiently. Right now I have an easy task: get a file, split it up into a million chunks, count the most prominent character in each chunk, and output that value into a file. In other words: say we have a 2 GB file; we evaluate which character is most prominent in file positions [0, 2048[ - say it's an "A" - and then put a 65 in there (ord("A")).
I've first tried Python. Please don't beat me, it's slow as hell and probably a horrible solution: #!/usr/bin/python import sys import os f = open(sys.argv[1], "r") filesize = os.stat(sys.argv[1])[6] width = 1024 height = 1024 pixels = width * height blocksize = filesize / width / height print("Filesize : %d" % (filesize)) print("Image size : %dx%d" % (width, height)) print("Bytes per Pixel: %d" % (blocksize)) picture = { } havepixels = 0 while True: data = f.read(blocksize) if len(data) <= 0: break datamap = { } for i in range(len(data)): datamap[ord(data[i])] = datamap.get(data[i], 0) + 1 maxchr = None maxcnt = None for (char, count) in datamap.items(): if (maxcnt is None) or (count > maxcnt): maxcnt = count maxchr = char most = maxchr posx = havepixels % width posy = havepixels / width havepixels += 1 if (havepixels % 1024) == 0: print("Progresss %s: %.1f%%" % (sys.argv[1], 100.0 * havepixels / pixels)) picture[(posx, posy)] = most pic = open(sys.argv[1] + ".pgm", "w") pic.write("P2\n") pic.write("# CREATOR: Crappyass Python Script\n") pic.write("%d %d\n" % (width, height)) pic.write("255\n") for y in range(height): for x in range(width): pos = (x, y) most = picture.get(pos, -1) pic.write("%d\n" % (most)) As this was horribly slow (20 Minutes for a 2GB file) I coded the whole thing in C also: #include <stdio.h> #include <errno.h> #include <string.h> #include <stdlib.h> #define BLOCKSIZE 2048 int main(int argc, char **argv) { unsigned int count[256]; int width, height; FILE *f; FILE *in; width = 1024; height = 1024; char temp[2048]; if (argc != 2) { fprintf(stderr, "Argument?\n"); exit(2); } in = fopen(argv[1], "r"); if (!in) { perror("fopen"); exit(1); } snprintf(temp, 255, "%s.pgm", argv[1]); f = fopen(temp, "w"); if (!f) { perror("fopen"); exit(1); } fprintf(f, "P2\n"); fprintf(f, "# CREATOR: C\n"); fprintf(f, "%d %d\n", width, height); fprintf(f, "255\n"); width = 1024; height = 1024; while (fread(temp, 1, sizeof(temp), in) == sizeof(temp)) { int i; 
memset(count, 0, sizeof(count)); for (i = 0; i < sizeof(temp); i++) { count[(int)temp[i]]++; } int greatest; int maxcount; greatest = 0; maxcount = count[0]; for (i = 1; i < 256; i++) { if (count[i] > maxcount) { maxcount = count[i]; greatest = i; } } fprintf(f, "%d\n", greatest); } fclose(f); fclose(in); return 0; } Which takes about 40 seconds. I want the niceness of Python but a little more speed than I'm getting (I'd settle for factor 2 or 3 slower, but factor 30 is just too much). Can anyone point out how to solve this efficiently in Python? Kind regards, Johannes -- "Meine Gegenklage gegen dich lautet dann auf bewusste Verlogenheit, verlästerung von Gott, Bibel und mir und bewusster Blasphemie." -- Prophet und Visionär Hans Joss aka HJP in de.sci.physik <48d8bf1d$0$7510$54022...@news.sunrise.ch> -- http://mail.python.org/mailman/listinfo/python-list