Bench result; original: -> FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 27.768 sec MFLOPS: 38.65 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 28.359 sec MFLOPS: 37.84
soft-hard-float: GCC version: 4.3.3 Ops count: 1073217024 Time spent: 14.874 sec MFLOPS: 72.15 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 14.249 sec MFLOPS: 75.32 direct-hard-float: -> FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 13.021 sec MFLOPS: 82.42 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 12.472 sec MFLOPS: 86.05 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 11.803 sec MFLOPS: 90.93 FLOPS 3.00 GCC version: 4.3.3 Ops count: 1073217024 Time spent: 11.945 sec MFLOPS: 89.85 bench program: ``` #include <stdio.h> #include <stdlib.h> #ifdef __vxworks #include <sys/resource.h> #include <vxworks.h> #include <timers.h> #include <time.h> #elif defined(_MSC_VER) #include <Windows.h> #include <time.h> #else #include <time.h> #endif /* cl -O2 test_flops.c gcc -O2 test_flops.c -o test_flops */ #ifndef DIM #define DIM 1024 const long long int nop = 1073217024; #else #define COUNT long long int nop = 0; #endif void printm(double A[DIM][DIM]) { int i,j; for (i=0; i<DIM; i++) { for (j=0; j<DIM; j++) printf("%6.3f", A[i][j]); printf("\n"); } } void initm(double A[DIM][DIM]) { int i,j; srand(38741); for (i = 0; i < DIM; i++) for (j = 0; j < DIM; j++) A[i][j] = (double)rand() / (double)RAND_MAX - 0.5; } void dge(double A[DIM][DIM]) { int i, j, k; double c; for (k = 1; k < DIM; k++) { for (i = k; i < DIM; i++) { c = A[i][k-1] / A[k-1][k-1]; #ifdef COUNT nop += 1; #endif for (j = 0; j < DIM; j++) { A[i][j] -= c * A[k-1][j]; #ifdef COUNT nop += 2; #endif } } } } double X[DIM][DIM]; /* * return a timestamp with sub-second precision * QueryPerformanceCounter and clock_gettime have an undefined starting point (null/zero) * and can wrap around, i.e. be nulled again. 
*/ double get_seconds() { #ifdef _MSC_VER static LARGE_INTEGER frequency; if (frequency.QuadPart == 0) QueryPerformanceFrequency(&frequency); LARGE_INTEGER now; QueryPerformanceCounter(&now); return (now.QuadPart * 1.0) / frequency.QuadPart; #else struct timespec now; clock_gettime(CLOCK_REALTIME, &now); return now.tv_sec + now.tv_nsec * 1e-9; #endif } int main (int argc, char **argv) { double a = 1.0; double b = 2.0; double c = a + b; double t; int count = 1; int i; printf("FLOPS %.2lf\n", c); #ifdef _MSC_VER printf("MSC_VER version: %d\n", _MSC_VER); #else printf("GCC version: " __VERSION__ "\n"); #endif initm(X); t = get_seconds(); #ifndef __vxworks if (argc > 1) { sscanf(argv[1], "%d", &count); } #endif for (i = 0; i < count; i += 1) { dge(X); } t = get_seconds() - t; printf("Ops count: %llu\n", nop * count); printf("Time spent: %.3lf sec\n", t); printf("MFLOPS: %.2f\n", 1e-6 * nop * count / t ); #ifdef PRINTM printm(X); #endif return 0; } ``` On Tue, May 5, 2020 at 3:30 AM <luoyongg...@gmail.com> wrote: > From: Yonggang Luo <luoyongg...@gmail.com> > > Just post as an idea to improve PPC fp performance. > With this idea, we have no need to adjust the helper orders. > > Signed-off-by: Yonggang Luo <luoyongg...@gmail.com> > --- > target/ppc/fpu_helper.c | 44 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 44 insertions(+) > > diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c > index 2bd49a2cdf..79051e4540 100644 > --- a/target/ppc/fpu_helper.c > +++ b/target/ppc/fpu_helper.c > @@ -926,6 +926,17 @@ static void float_invalid_op_addsub(CPUPPCState *env, > bool set_fpcc, > /* fadd - fadd. 
*/ > float64 helper_fadd(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd + u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_add(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -941,6 +952,17 @@ float64 helper_fadd(CPUPPCState *env, float64 arg1, > float64 arg2) > /* fsub - fsub. */ > float64 helper_fsub(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd - u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_sub(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -967,6 +989,17 @@ static void float_invalid_op_mul(CPUPPCState *env, > bool set_fprc, > /* fmul - fmul. */ > float64 helper_fmul(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd * u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_mul(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > @@ -997,6 +1030,17 @@ static void float_invalid_op_div(CPUPPCState *env, > bool set_fprc, > /* fdiv - fdiv. 
*/ > float64 helper_fdiv(CPUPPCState *env, float64 arg1, float64 arg2) > { > + CPU_DoubleU u1, u2; > + > + u1.d = arg1; > + u2.d = arg2; > + CPU_DoubleU retDouble; > + retDouble.nd = u1.nd / u2.nd; > + if (likely(float64_is_zero_or_normal(retDouble.d))) > + { > + /* TODO: Handling inexact */ > + return retDouble.d; > + } > float64 ret = float64_div(arg1, arg2, &env->fp_status); > int status = get_float_exception_flags(&env->fp_status); > > -- > 2.23.0.windows.1 > > -- 此致 礼 罗勇刚 Yours sincerely, Yonggang Luo