--- Comment #10 from Jan Hubicka <hubicka at gcc dot> ---
This is a benchmarkable version of the simplified testcase:

jan@localhost:/tmp> cat t.c
#define N 10000000
struct rgb {unsigned char r,g,b;} rgbs[N];
int *addr;
struct drgb {double r,g,b;
#ifdef OPACITY
             double o;

struct drgb sum(double w)
        struct drgb r;
        for (int i = 0; i < N; i++)
          r.r += rgbs[i].r * w;
          r.g += rgbs[i].g * w;
          r.b += rgbs[i].b * w;
        return r;
jan@localhost:/tmp> cat q.c
struct drgb {double r,g,b;
#ifdef OPACITY
             double o;
struct drgb sum(double w);
        for (int i = 0; i < 1000; i++)

jan@localhost:/tmp> gcc t.c q.c -march=native -O3 -g ; objdump -d a.out | grep vfmadd231pd  ; perf stat ./a.out
  40119d:       c4 e2 d9 b8 d1          vfmadd231pd %xmm1,%xmm4,%xmm2

 Performance counter stats for './a.out':

         12,148.04 msec task-clock:u                     #    1.000 CPUs
                 0      context-switches:u               #    0.000 /sec        
                 0      cpu-migrations:u                 #    0.000 /sec        
               736      page-faults:u                    #   60.586 /sec        
    50,018,421,148      cycles:u                         #    4.117 GHz         
           220,502      stalled-cycles-frontend:u        #    0.00% frontend cycles idle
    39,950,154,369      stalled-cycles-backend:u         #   79.87% backend cycles idle
   120,000,191,713      instructions:u                   #    2.40  insn per cycle
                                                  #    0.33  stalled cycles per insn
    10,000,048,918      branches:u                       #  823.182 M/sec
             7,959      branch-misses:u                  #    0.00% of all branches

      12.149466078 seconds time elapsed

      12.149084000 seconds user
       0.000000000 seconds sys

jan@localhost:/tmp> gcc t.c q.c -march=native -O3 -g -DOPACITY ; objdump -d a.out | grep vfmadd231pd  ; perf stat ./a.out

 Performance counter stats for './a.out':

         12,141.11 msec task-clock:u                     #    1.000 CPUs
                 0      context-switches:u               #    0.000 /sec        
                 0      cpu-migrations:u                 #    0.000 /sec        
               735      page-faults:u                    #   60.538 /sec        
    50,018,839,129      cycles:u                         #    4.120 GHz         
           185,034      stalled-cycles-frontend:u        #    0.00% frontend cycles idle
    29,963,999,798      stalled-cycles-backend:u         #   59.91% backend cycles idle
   120,000,191,729      instructions:u                   #    2.40  insn per cycle
                                                  #    0.25  stalled cycles per insn
    10,000,048,913      branches:u                       #  823.652 M/sec
             7,311      branch-misses:u                  #    0.00% of all branches

      12.142252354 seconds time elapsed

      12.138237000 seconds user
       0.004000000 seconds sys

So on zen2 hardware I get the same performance on both.  It may be interesting to
test it on Raptor Lake.

Reply via email to