Hi!,

El Mon, Jan 04, 2016 at 06:33:59PM -0800, Ganesh Ajjanagadde escribio:
> This exploits an approach based on the sieve of Eratosthenes, a popular
> method for generating prime numbers.
> 
> Tables are identical to previous ones.
> 
> Tested with FATE with/without --enable-hardcoded-tables.
> 
> Sample benchmark (Haswell, GNU/Linux+gcc):
> prev:
> 7860100 decicycles in cbrt_tableinit,       1 runs,      0 skips
> 7777490 decicycles in cbrt_tableinit,       2 runs,      0 skips
> [...]
> 7582339 decicycles in cbrt_tableinit,     256 runs,      0 skips
> 7563556 decicycles in cbrt_tableinit,     512 runs,      0 skips
> 
> new:
> 2099480 decicycles in cbrt_tableinit,       1 runs,      0 skips
> 2044470 decicycles in cbrt_tableinit,       2 runs,      0 skips
> [...]
> 1796544 decicycles in cbrt_tableinit,     256 runs,      0 skips
> 1791631 decicycles in cbrt_tableinit,     512 runs,      0 skips
>

See attached code, function "test1", based on an approximation of:

  (i+1)^(1/3) ~= i^(1/3) * ( 1 + 1/(3i) - 1/(9i) + 5/(81i) - .... )

Generated values are the same as original floats (max error in double
is < 4*10^-10), it is faster (and I think, simpler) than your version.

Perhaps altering the constants it could be made faster still, but it is
currently dominated by de division in the main loop.

    Daniel.
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>

void test1(float *out)
{
    int i;
    double cb;

    // Really small values are filled with original function:
    for(i=0; i<64; i++)
        out[i] = cbrt(i) * i;

    // "cb" is the cube-root approximation to "i"
    cb = 4;
    for(i=64; i<343; i++)
    {
        double s, r, t;
        out[i] = cb * i;
        // For small values, we use 5 terms:
        r = 1.0/(3*i);        // 0 A , 1 M , 1 D
        t = r;                //
        s = 1.0 + r;          // 1 A , 1 M , 1 D
        r = r * r;            // 1 A , 2 M , 1 D
        s = s - r;            // 2 A , 2 M , 1 D
        r = r * t;            // 2 A , 3 M , 1 D
        s = s + 1.6666155 *r; // 3 A , 4 M , 1 D
        r = r * t;            // 3 A , 5 M , 1 D
        s = s - 3.289935 *r;  // 4 A , 6 M , 1 D
        cb = cb * s;          // 4 A , 7 M , 1 D
    }

    cb = 7;
    for(i=343; i<8192; i++)
    {
        double s, r, t;
        out[i] = cb * i;
        // For big values, we use 4 terms:
        r = 1.0/(3*i);        // 0 A , 1 M , 1 D
        t = r;                //
        s = 1.0 + r;          // 1 A , 1 M , 1 D
        r = r * r;            // 1 A , 2 M , 1 D
        s = s - r;            // 2 A , 2 M , 1 D
        r = r * t;            // 2 A , 3 M , 1 D
        s = s + 1.6644 *r;    // 3 A , 4 M , 1 D
        cb = cb * s;          // 3 A , 5 M , 1 D
    }
}

void test2(float *out)
{
    int i;

    for(i=0; i<8192; i++)
        out[i] = cbrt(i) * i;
}

static double get_time(void)
{
    struct timeval t1;
    gettimeofday(&t1,0);
    return t1.tv_sec * 1000000.0 + t1.tv_usec;
}

int main()
{
    int i;
    double t1, t2, t3;
    char *orig, *fast;
    orig = malloc(sizeof(float) * 8192);
    fast = malloc(sizeof(float) * 8192);

    // Repeat 200 times
    for(i=0; i<200; i++)
    {
        t1 = get_time();
        test1((float *)fast);
        t2 = get_time();
        test2((float *)orig);
        t3 = get_time();

        printf("Orig: %f\tFast: %f\n", (t3-t2), (t2-t1));
    }

    // Compare
    for(i=0; i<sizeof(float)*8192; i++)
    {
        if( orig[i] != fast[i] )
            printf("error at byte %d (word %d) [%d != %d]\n", i, (int)(i/sizeof(float)),
                   orig[i], fast[i]);
    }

    return 0;
}
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to