https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88476

            Bug ID: 88476
           Summary: Optimize expressions which uses vector, mask and
                    general purpose registers
           Product: gcc
           Version: 9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: bugzi...@poradnik-webmastera.com
  Target Milestone: ---

I was playing with Compiler Explorer to see how compilers can optimize various
pieces of code. I found that the upcoming version of clang (version 8.0.0
(trunk 348905)) can optimize expressions which use vector, mask and
general-purpose registers together. Such an approach opens new optimization
possibilities. Here are two example functions which demonstrate this:

[code]
#include <immintrin.h>

/*
 * Reproducer, variant 1: the two lane masks are combined with the plain C
 * integer operators | and & applied to __mmask8 values.
 *
 * data1, data2: each is read as one 128-bit vector with _mm_load_si128
 *               (an aligned load, so the pointers are expected to be
 *               16-byte aligned). The result is stored back through data2.
 */
void test1(void* data1, void* data2)
{
    __m128i v1 = _mm_load_si128((__m128i const*)data1);
    __m128i v2 = _mm_load_si128((__m128i const*)data2);
    // testn(v, v): mask bit i is set when 16-bit lane i of v is zero.
    __mmask8 m1 = _mm_testn_epi16_mask(v1, v1);
    __mmask8 m2 = _mm_testn_epi16_mask(v2, v2);
    // Algebraically equal to (m1 & m2) | 3, i.e. lanes 0 and 1 are always
    // enabled and the remaining lanes only where both inputs are zero.
    __mmask8 m = (m1 | 3) & (m2 | 3);
    // Zero-masking add: lanes whose mask bit is clear are written as 0.
    v1 = _mm_maskz_add_epi16(m, v1, v2);
    _mm_store_si128((__m128i*)data2, v1);
}

/*
 * Reproducer, variant 2: computes the same mask expression as
 * (m1 | 3) & (m2 | 3), but spells the OR/AND steps with the mask-register
 * intrinsics _kor_mask8 and _kand_mask8 instead of C integer operators.
 *
 * data1, data2: each is read as one 128-bit vector with _mm_load_si128
 *               (an aligned load, so the pointers are expected to be
 *               16-byte aligned). The result is stored back through data2.
 */
void test2(void* data1, void* data2)
{
    __m128i v1 = _mm_load_si128((__m128i const*)data1);
    __m128i v2 = _mm_load_si128((__m128i const*)data2);
    // testn(v, v): mask bit i is set when 16-bit lane i of v is zero.
    __mmask8 m1 = _mm_testn_epi16_mask(v1, v1);
    __mmask8 m2 = _mm_testn_epi16_mask(v2, v2);
    // Force bits 0 and 1 on in each mask; the constant 3 is passed as a mask.
    m1 = _kor_mask8(m1, 3);
    m2 = _kor_mask8(m2, 3);
    // Keep only the lanes enabled in both masks.
    __mmask8 m = _kand_mask8(m1, m2);
    // Zero-masking add: lanes whose mask bit is clear are written as 0.
    v1 = _mm_maskz_add_epi16(m, v1, v2);
    _mm_store_si128((__m128i*)data2, v1);
}
[/code]

When compiled using clang with -O3 -march=skylake-avx512, both are optimized to
the same code:

[asm]
test(void*, void*): # @test(void*, void*)
  vmovdqa xmm0, xmmword ptr [rdi]
  vmovdqa xmm1, xmmword ptr [rsi]
  vpor xmm2, xmm1, xmm0
  vptestnmw k0, xmm2, xmm2
  mov al, 3
  kmovd k1, eax
  korb k1, k0, k1
  vpaddw xmm0 {k1} {z}, xmm1, xmm0
  vmovdqa xmmword ptr [rsi], xmm0
  ret
[/asm]

gcc 9.0.0 20181211 (experimental) produces this:

[asm]
test1(void*, void*):
  vmovdqa64 xmm1, XMMWORD PTR [rsi]
  vmovdqa64 xmm0, XMMWORD PTR [rdi]
  vptestnmw k1, xmm1, xmm1
  vptestnmw k2{k1}, xmm0, xmm0
  kmovb eax, k2
  or eax, 3
  kmovb k3, eax
  vpaddw xmm0{k3}{z}, xmm0, xmm1
  vmovaps XMMWORD PTR [rsi], xmm0
  ret
test2(void*, void*):
  vmovdqa64 xmm0, XMMWORD PTR [rdi]
  vmovdqa64 xmm1, XMMWORD PTR [rsi]
  vptestnmw k1, xmm0, xmm0
  vptestnmw k3, xmm1, xmm1
  mov eax, 3
  kmovb k2, eax
  korb k1, k1, k2
  korb k0, k3, k2
  kandb k1, k1, k0
  vpaddw xmm0{k1}{z}, xmm0, xmm1
  vmovaps XMMWORD PTR [rsi], xmm0
  ret
[/asm]

Reply via email to