https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88476
Bug ID: 88476 Summary: Optimize expressions which use vector, mask and general purpose registers Product: gcc Version: 9.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: bugzi...@poradnik-webmastera.com Target Milestone: --- I was playing with Compiler Explorer to see how compilers can optimize various pieces of code. I found that the next version of clang (version 8.0.0 (trunk 348905)) can optimize expressions which use vector, mask and general purpose registers. Such an approach opens new optimization possibilities. Here are two example functions which demonstrate this: [code] #include <immintrin.h> void test1(void* data1, void* data2) { __m128i v1 = _mm_load_si128((__m128i const*)data1); __m128i v2 = _mm_load_si128((__m128i const*)data2); __mmask8 m1 = _mm_testn_epi16_mask(v1, v1); __mmask8 m2 = _mm_testn_epi16_mask(v2, v2); __mmask8 m = (m1 | 3) & (m2 | 3); v1 = _mm_maskz_add_epi16(m, v1, v2); _mm_store_si128((__m128i*)data2, v1); } void test2(void* data1, void* data2) { __m128i v1 = _mm_load_si128((__m128i const*)data1); __m128i v2 = _mm_load_si128((__m128i const*)data2); __mmask8 m1 = _mm_testn_epi16_mask(v1, v1); __mmask8 m2 = _mm_testn_epi16_mask(v2, v2); m1 = _kor_mask8(m1, 3); m2 = _kor_mask8(m2, 3); __mmask8 m = _kand_mask8(m1, m2); v1 = _mm_maskz_add_epi16(m, v1, v2); _mm_store_si128((__m128i*)data2, v1); } [/code] When compiled using clang with -O3 -march=skylake-avx512, both are optimized to the same code: [asm] test(void*, void*): # @test(void*, void*) vmovdqa xmm0, xmmword ptr [rdi] vmovdqa xmm1, xmmword ptr [rsi] vpor xmm2, xmm1, xmm0 vptestnmw k0, xmm2, xmm2 mov al, 3 kmovd k1, eax korb k1, k0, k1 vpaddw xmm0 {k1} {z}, xmm1, xmm0 vmovdqa xmmword ptr [rsi], xmm0 ret [/asm] gcc 9.0.0 20181211 (experimental) produces this: [asm] test1(void*, void*): vmovdqa64 xmm1, XMMWORD PTR [rsi] vmovdqa64 xmm0, XMMWORD PTR [rdi] vptestnmw k1, xmm1, xmm1 vptestnmw k2{k1}, xmm0, xmm0 kmovb eax, k2 or eax, 3 kmovb k3, eax vpaddw xmm0{k3}{z}, xmm0, xmm1 vmovaps XMMWORD PTR [rsi], xmm0 ret test2(void*, void*): vmovdqa64 xmm0, XMMWORD PTR [rdi] vmovdqa64 xmm1, XMMWORD PTR [rsi] vptestnmw k1, xmm0, xmm0 vptestnmw k3, xmm1, xmm1 mov eax, 3 kmovb k2, eax korb k1, k1, k2 korb k0, k3, k2 kandb k1, k1, k0 vpaddw xmm0{k1}{z}, xmm0, xmm1 vmovaps XMMWORD PTR [rsi], xmm0 ret [/asm]