The purpose of those define_insn_and_split: 1. Combine vpcmpuw and zero_extend into vpcmpuw. 2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov 3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov.
It should partially fix the issue in PR. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/103750 * config/i386/sse.md (*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>): New define_insn_and_split. (*<avx512>_cmp<mode>3): Ditto. (*<avx512>_cmp<mode>3_zero_extenddi): New define_insn. (*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>): New define_insn_and_split. (*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>): Ditto. (*<avx512>_ucmp<mode>3): Ditto. (*<avx512>_ucmp<mode>3_zero_extenddi): New define_insn. (*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>): New define_insn_and_split. gcc/testsuite/ChangeLog: * gcc.target/i386/bitwise_mask_op-3.c: Adjust test/ * g++.target/i386/pr103750-1.C: New test. --- gcc/config/i386/sse.md | 267 ++++++++++++++++++ gcc/testsuite/g++.target/i386/pr103750-1.C | 50 ++++ .../gcc.target/i386/bitwise_mask_op-3.c | 6 +- 3 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5196149ee32..fb885d58272 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3702,6 +3702,75 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom +;; duplicated vpcmpuw to vpcmpuw and kmov +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg. +(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec:<V48H_AVX512VL:avx512fmaskmode> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<V48H_AVX512VL:cmp_imm_predicate>" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& <SWI248x:MODE>mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<V48H_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")]) + +(define_insn_and_split "*<avx512>_cmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<MODE>mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < 64" + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn_and_split "*<avx512>_cmp<mode>3" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (not:<avx512fmaskmode> @@ -3735,6 +3804,72 @@ (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<VI12_AVX512VL:cmp_imm_predicate>" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& <SWI248x:MODE>mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) + +(define_insn_and_split "*<avx512>_cmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<avx512>_cmp<mode>3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_int_iterator UNSPEC_PCMP_ITER [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP]) @@ -3771,6 +3906,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& <SWI248x:MODE>mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<VI12_AVX512VL:sseinsnmode>")]) + +(define_insn_and_split "*<avx512>_ucmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_UNSIGNED_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < 64" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> @@ -3785,6 +3986,72 @@ (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec:<VI48_AVX512VL:avx512fmaskmode> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& <SWI248x:MODE>mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<VI48_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <SWI248x:MODE>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<VI48_AVX512VL:sseinsnmode>")]) + +(define_insn_and_split "*<avx512>_ucmp<mode>3" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (unspec:<avx512fmaskmode> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_UNSIGNED_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], <avx512fmaskmode>mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*<avx512>_ucmp<mode>3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec:<avx512fmaskmode> + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "<cmp_imm_predicate>" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < 64" + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn_and_split "*<avx512>_ucmp<mode>3" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (not:<avx512fmaskmode> diff --git a/gcc/testsuite/g++.target/i386/pr103750-1.C b/gcc/testsuite/g++.target/i386/pr103750-1.C new file mode 100644 index 00000000000..83f471331b3 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr103750-1.C @@ -0,0 +1,50 @@ +/* PR target/103750 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=cannonlake -maes -std=c++1y" } */ +/* { dg-final { scan-assembler-times "kmovw" 2 } } */ +/* { dg-final { scan-assembler-times "kmovd" 2 } } */ +/* There shouldn't be any kmovw/kmovd inside the loop. */ +#include <immintrin.h> + +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept +{ + __m256i mch256 = _mm256_set1_epi16(c); + for ( ; n < e; n += 32) { + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1); + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); + if (_kortestz_mask16_u8(mask1, mask2)) + continue; + + unsigned idx = _tzcnt_u32(mask1); + if (mask1 == 0) { + idx = __tzcnt_u16(mask2); + n += 16; + } + return n + idx; + } + return e; +} + +const char16_t *qustrchr1(char16_t *n, char16_t *e, char16_t c) noexcept +{ + __m256i mch256 = _mm256_set1_epi16(c); + for ( ; n < e; n += 32) { + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1); + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); + if (_kortestz_mask32_u8(mask1, mask2)) + continue; + + unsigned idx = _tzcnt_u32(mask1); + if (mask1 == 0) { + idx = __tzcnt_u16(mask2); + n += 16; + } + return n + idx; + } + return e; +} + diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c index 352c49d6c6b..82bb99e30af 100644 --- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c @@ -12,7 +12,7 @@ foo_orb (__m512i a, __m512i b) foo = m1 | m2; } -/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */ void foo_xorb (__m512i a, __m512i b) @@ -22,7 +22,7 @@ foo_xorb (__m512i a, __m512i b) foo = m1 ^ m2; } -/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" { xfail { *-*-* && { ! ia32 } } } } } */ void foo_andb (__m512i a, __m512i b) @@ -40,4 +40,4 @@ foo_andnb (__m512i a, __m512i b) foo = m1 & ~m2; } -/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4" { xfail { *-*-* && { ! ia32 } } } } } */ -- 2.18.1