Hi!, El Mon, Jan 04, 2016 at 06:33:59PM -0800, Ganesh Ajjanagadde escribio: > This exploits an approach based on the sieve of Eratosthenes, a popular > method for generating prime numbers. > > Tables are identical to previous ones. > > Tested with FATE with/without --enable-hardcoded-tables. > > Sample benchmark (Haswell, GNU/Linux+gcc): > prev: > 7860100 decicycles in cbrt_tableinit, 1 runs, 0 skips > 7777490 decicycles in cbrt_tableinit, 2 runs, 0 skips > [...] > 7582339 decicycles in cbrt_tableinit, 256 runs, 0 skips > 7563556 decicycles in cbrt_tableinit, 512 runs, 0 skips > > new: > 2099480 decicycles in cbrt_tableinit, 1 runs, 0 skips > 2044470 decicycles in cbrt_tableinit, 2 runs, 0 skips > [...] > 1796544 decicycles in cbrt_tableinit, 256 runs, 0 skips > 1791631 decicycles in cbrt_tableinit, 512 runs, 0 skips >
See attached code, function "test1", based on an approximation of: (i+1)^(1/3) ~= i^(1/3) * ( 1 + 1/(3i) - 1/(9i) + 5/(81i) - .... ) Generated values are the same as original floats (max error in double is < 4*10^-10), it is faster (and I think, simpler) than your version. Perhaps altering the constants it could be made faster still, but it is currently dominated by de division in the main loop. Daniel.
#include <math.h> #include <stdlib.h> #include <stdio.h> #include <sys/time.h> void test1(float *out) { int i; double cb; // Really small values are filled with original function: for(i=0; i<64; i++) out[i] = cbrt(i) * i; // "cb" is the cube-root approximation to "i" cb = 4; for(i=64; i<343; i++) { double s, r, t; out[i] = cb * i; // For small values, we use 5 terms: r = 1.0/(3*i); // 0 A , 1 M , 1 D t = r; // s = 1.0 + r; // 1 A , 1 M , 1 D r = r * r; // 1 A , 2 M , 1 D s = s - r; // 2 A , 2 M , 1 D r = r * t; // 2 A , 3 M , 1 D s = s + 1.6666155 *r; // 3 A , 4 M , 1 D r = r * t; // 3 A , 5 M , 1 D s = s - 3.289935 *r; // 4 A , 6 M , 1 D cb = cb * s; // 4 A , 7 M , 1 D } cb = 7; for(i=343; i<8192; i++) { double s, r, t; out[i] = cb * i; // For big values, we use 4 terms: r = 1.0/(3*i); // 0 A , 1 M , 1 D t = r; // s = 1.0 + r; // 1 A , 1 M , 1 D r = r * r; // 1 A , 2 M , 1 D s = s - r; // 2 A , 2 M , 1 D r = r * t; // 2 A , 3 M , 1 D s = s + 1.6644 *r; // 3 A , 4 M , 1 D cb = cb * s; // 3 A , 5 M , 1 D } } void test2(float *out) { int i; for(i=0; i<8192; i++) out[i] = cbrt(i) * i; } static double get_time(void) { struct timeval t1; gettimeofday(&t1,0); return t1.tv_sec * 1000000.0 + t1.tv_usec; } int main() { int i; double t1, t2, t3; char *orig, *fast; orig = malloc(sizeof(float) * 8192); fast = malloc(sizeof(float) * 8192); // Repeat 200 times for(i=0; i<200; i++) { t1 = get_time(); test1((float *)fast); t2 = get_time(); test2((float *)orig); t3 = get_time(); printf("Orig: %f\tFast: %f\n", (t3-t2), (t2-t1)); } // Compare for(i=0; i<sizeof(float)*8192; i++) { if( orig[i] != fast[i] ) printf("error at byte %d (word %d) [%d != %d]\n", i, (int)(i/sizeof(float)), orig[i], fast[i]); } return 0; }
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel