http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57162

             Bug #: 57162
           Summary: Ofast does not make use of avx while O3 does
    Classification: Unclassified
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: vincenzo.innoce...@cern.ch

In a trivial 4x4 matmul, -Ofast generates worse-looking code than -O3 for AVX: -O3 vectorizes the whole product in 256-bit ymm registers, while -Ofast stays in 128-bit xmm operations.

cat matmul.cc
alignas(32) float a[4][4];
alignas(32) float b[4][4];
alignas(32) float c[4][4];

void matmul() {
  for (int i=0;i!=4;++i)
    for (int j=0;j!=4;++j) {
      float sum=0;
      for (int k=0;k!=4;++k)
        sum += a[i][k]*b[k][j];
      c[i][j]=sum;
    }
}

c++ -O3 -march=corei7-avx -mavx2 -std=c++11 -S matmul.cc
    .text
    .align 4,0x90
    .globl __Z6matmulv
__Z6matmulv:
LFB0:
    vmovss    8+_b(%rip), %xmm4
    vmovss    _b(%rip), %xmm7
    vinsertps    $0x10, 12+_b(%rip), %xmm4, %xmm0
    vmovss    24+_b(%rip), %xmm1
    vmovss    16+_b(%rip), %xmm4
    vinsertps    $0x10, 4+_b(%rip), %xmm7, %xmm5
    vmovlhps    %xmm0, %xmm5, %xmm5
    vmovss    40+_b(%rip), %xmm7
    vinsertf128    $1, %xmm5, %ymm5, %ymm5
    vinsertps    $0x10, 28+_b(%rip), %xmm1, %xmm0
    vinsertps    $0x10, 20+_b(%rip), %xmm4, %xmm3
    vmovss    32+_b(%rip), %xmm1
    vmovlhps    %xmm0, %xmm3, %xmm3
    vmovss    56+_b(%rip), %xmm4
    vinsertf128    $1, %xmm3, %ymm3, %ymm3
    vinsertps    $0x10, 44+_b(%rip), %xmm7, %xmm0
    vmovss    48+_b(%rip), %xmm6
    vinsertps    $0x10, 36+_b(%rip), %xmm1, %xmm2
    vmovlhps    %xmm0, %xmm2, %xmm2
    vinsertps    $0x10, 60+_b(%rip), %xmm4, %xmm0
    vxorps    %xmm4, %xmm4, %xmm4
    vinsertf128    $1, %xmm2, %ymm2, %ymm2
    vinsertps    $0x10, 52+_b(%rip), %xmm6, %xmm1
    vmovlhps    %xmm0, %xmm1, %xmm1
    vmovaps    _a(%rip), %ymm0
    vinsertf128    $1, %xmm1, %ymm1, %ymm1
    vpermilps    $0, %ymm0, %ymm7
    vmulps    %ymm5, %ymm7, %ymm7
    vaddps    %ymm4, %ymm7, %ymm7
    vpermilps    $85, %ymm0, %ymm6
    vmulps    %ymm3, %ymm6, %ymm6
    vaddps    %ymm6, %ymm7, %ymm7
    vpermilps    $170, %ymm0, %ymm6
    vmulps    %ymm2, %ymm6, %ymm6
    vpermilps    $255, %ymm0, %ymm0
    vmulps    %ymm1, %ymm0, %ymm0
    vaddps    %ymm6, %ymm7, %ymm6
    vaddps    %ymm0, %ymm6, %ymm0
    vmovaps    %ymm0, _c(%rip)
    vmovaps    32+_a(%rip), %ymm0
    vpermilps    $0, %ymm0, %ymm6
    vmulps    %ymm5, %ymm6, %ymm5
    vaddps    %ymm4, %ymm5, %ymm4
    vpermilps    $85, %ymm0, %ymm5
    vmulps    %ymm3, %ymm5, %ymm3
    vaddps    %ymm3, %ymm4, %ymm3
    vpermilps    $170, %ymm0, %ymm4
    vmulps    %ymm2, %ymm4, %ymm2
    vpermilps    $255, %ymm0, %ymm0
    vmulps    %ymm1, %ymm0, %ymm1
    vaddps    %ymm2, %ymm3, %ymm2
    vaddps    %ymm1, %ymm2, %ymm0
    vmovaps    %ymm0, 32+_c(%rip)
    vzeroupper
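
My reading of the -O3 code above, for reference: each row of b is duplicated into both 128-bit lanes of a ymm register (the vinsertf128 $1 instructions), a single 256-bit load covers two rows of a, and vpermilps $0/$85/$170/$255 broadcasts one element of a per lane, so each vmovaps to _c stores two rows of the result at once. A standalone intrinsics sketch of that shape (matmul_avx256 is my name for it, not anything GCC emits; it assumes the same globals as matmul.cc):

#include <immintrin.h>

alignas(32) float a[4][4], b[4][4], c[4][4];

void matmul_avx256() {
  __m256 bk[4];
  for (int k = 0; k != 4; ++k) {
    // vinsertf128 $1: duplicate row k of b into both 128-bit lanes
    __m128 row = _mm_load_ps(b[k]);
    bk[k] = _mm256_insertf128_ps(_mm256_castps128_ps256(row), row, 1);
  }
  for (int i = 0; i != 4; i += 2) {
    __m256 ai = _mm256_load_ps(a[i]);   // rows i and i+1 of a in one ymm
    __m256 acc = _mm256_setzero_ps();
    // vpermilps $0/$85/$170/$255: broadcast a[i][k] (low lane) and
    // a[i+1][k] (high lane), multiply by row k of b, accumulate
    acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_permute_ps(ai, 0x00), bk[0]));
    acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_permute_ps(ai, 0x55), bk[1]));
    acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_permute_ps(ai, 0xAA), bk[2]));
    acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_permute_ps(ai, 0xFF), bk[3]));
    _mm256_store_ps(c[i], acc);         // rows i and i+1 of c in one store
  }
}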



and
c++ -Ofast -march=corei7-avx -mavx2 -std=c++11 -S matmul.cc

Vincenzos-MacBook-Pro:vectorize innocent$ cat matmul.s

    .text
    .align 4,0x90
    .globl __Z6matmulv
__Z6matmulv:
LFB0:
    vmovaps    16+_a(%rip), %xmm1
    vmovaps    48+_a(%rip), %xmm0
    vmovaps    _a(%rip), %xmm4
    vmovaps    32+_a(%rip), %xmm2
    vbroadcastss    32+_b(%rip), %xmm6
    vshufps    $136, %xmm1, %xmm4, %xmm3
    vshufps    $221, %xmm1, %xmm4, %xmm4
    vbroadcastss    36+_b(%rip), %xmm5
    vshufps    $136, %xmm0, %xmm2, %xmm1
    vshufps    $221, %xmm0, %xmm2, %xmm2
    vbroadcastss    40+_b(%rip), %xmm7
    vshufps    $136, %xmm1, %xmm3, %xmm0
    vshufps    $221, %xmm1, %xmm3, %xmm3
    vshufps    $136, %xmm2, %xmm4, %xmm1
    vshufps    $221, %xmm2, %xmm4, %xmm2
    vmulps    %xmm6, %xmm3, %xmm6
    vbroadcastss    48+_b(%rip), %xmm4
    vmulps    %xmm5, %xmm3, %xmm5
    vmulps    %xmm7, %xmm3, %xmm7
    vmulps    %xmm4, %xmm2, %xmm4
    vaddps    %xmm4, %xmm6, %xmm6
    vbroadcastss    16+_b(%rip), %xmm4
    vmulps    %xmm4, %xmm1, %xmm4
    vaddps    %xmm4, %xmm6, %xmm6
    vbroadcastss    _b(%rip), %xmm4
    vmulps    %xmm4, %xmm0, %xmm4
    vaddps    %xmm4, %xmm6, %xmm6
    vbroadcastss    52+_b(%rip), %xmm4
    vmulps    %xmm4, %xmm2, %xmm4
    vaddps    %xmm4, %xmm5, %xmm5
    vbroadcastss    20+_b(%rip), %xmm4
    vmulps    %xmm4, %xmm1, %xmm4
    vaddps    %xmm4, %xmm5, %xmm5
    vbroadcastss    4+_b(%rip), %xmm4
    vmulps    %xmm4, %xmm0, %xmm4
    vaddps    %xmm4, %xmm5, %xmm4
    vbroadcastss    56+_b(%rip), %xmm5
    vmulps    %xmm5, %xmm2, %xmm5
    vaddps    %xmm5, %xmm7, %xmm7
    vbroadcastss    24+_b(%rip), %xmm5
    vmulps    %xmm5, %xmm1, %xmm5
    vaddps    %xmm5, %xmm7, %xmm7
    vbroadcastss    8+_b(%rip), %xmm5
    vmulps    %xmm5, %xmm0, %xmm5
    vaddps    %xmm5, %xmm7, %xmm5
    vbroadcastss    44+_b(%rip), %xmm7
    vmulps    %xmm7, %xmm3, %xmm3
    vbroadcastss    60+_b(%rip), %xmm7
    vmulps    %xmm7, %xmm2, %xmm2
    vaddps    %xmm2, %xmm3, %xmm2
    vbroadcastss    28+_b(%rip), %xmm3
    vmulps    %xmm3, %xmm1, %xmm1
    vunpcklps    %xmm5, %xmm6, %xmm3
    vaddps    %xmm1, %xmm2, %xmm1
    vbroadcastss    12+_b(%rip), %xmm2
    vmulps    %xmm2, %xmm0, %xmm0
    vaddps    %xmm0, %xmm1, %xmm0
    vunpckhps    %xmm5, %xmm6, %xmm1
    vunpcklps    %xmm0, %xmm4, %xmm2
    vunpckhps    %xmm0, %xmm4, %xmm0
    vunpcklps    %xmm2, %xmm3, %xmm4
    vunpckhps    %xmm2, %xmm3, %xmm2
    vmovaps    %xmm4, _c(%rip)
    vmovaps    %xmm2, 16+_c(%rip)
    vunpcklps    %xmm0, %xmm1, %xmm2
    vunpckhps    %xmm0, %xmm1, %xmm0
    vmovaps    %xmm2, 32+_c(%rip)
    vmovaps    %xmm0, 48+_c(%rip)
    ret
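
And my reading of the -Ofast code: everything stays in 128-bit xmm registers. The vshufps sequence transposes a so each register holds one of its columns, the sixteen vbroadcastss loads splat the scalars b[k][j], one column of c is accumulated per register, and the vunpck{l,h}ps sequence transposes the columns back into rows before the stores. A standalone sketch of that shape (matmul_avx128 is again my name; same globals as matmul.cc):

#include <immintrin.h>

alignas(32) float a[4][4], b[4][4], c[4][4];

void matmul_avx128() {
  // vshufps sequence: transpose a so at[0..3] hold its columns
  __m128 at0 = _mm_load_ps(a[0]), at1 = _mm_load_ps(a[1]);
  __m128 at2 = _mm_load_ps(a[2]), at3 = _mm_load_ps(a[3]);
  _MM_TRANSPOSE4_PS(at0, at1, at2, at3);
  __m128 at[4] = {at0, at1, at2, at3};

  __m128 col[4];
  for (int j = 0; j != 4; ++j) {
    col[j] = _mm_setzero_ps();
    for (int k = 0; k != 4; ++k)
      // vbroadcastss: splat the scalar b[k][j] across a register
      col[j] = _mm_add_ps(col[j],
                          _mm_mul_ps(at[k], _mm_broadcast_ss(&b[k][j])));
  }
  // vunpck{l,h}ps sequence: transpose the columns back into rows of c
  _MM_TRANSPOSE4_PS(col[0], col[1], col[2], col[3]);
  for (int i = 0; i != 4; ++i)
    _mm_store_ps(c[i], col[i]);
}

The two extra transposes and the sixteen scalar broadcasts are what make this look worse than the ymm version; presumably the reassociation freedom that -Ofast adds via -ffast-math is what steers the vectorizer onto this different strategy.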
