With respective two-operand bitwise operations now expressable by a single VPTERNLOG, add splitters to also deal with ior and xor counterparts of the original and-only case. Note that the splitters need to be separate, as the placement of "not" differs in the final insns (*iornot<mode>3, *xnor<mode>3) which are intended to pick up one half of the result.
gcc/ * config/i386/sse.md: New splitters to simplify not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}. gcc/testsuite/ * gcc.target/i386/pr100711-4.c: New test. * gcc.target/i386/pr100711-5.c: New test. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17366,6 +17366,36 @@ (match_dup 2)))] "operands[3] = gen_reg_rtx (<MODE>mode);") +(define_split + [(set (match_operand:VI 0 "register_operand") + (ior:VI + (vec_duplicate:VI + (not:<ssescalarmode> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) + (match_operand:VI 2 "vector_operand")))] + "<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" + [(set (match_dup 3) + (vec_duplicate:VI (match_dup 1))) + (set (match_dup 0) + (ior:VI (not:VI (match_dup 3)) (match_dup 2)))] + "operands[3] = gen_reg_rtx (<MODE>mode);") + +(define_split + [(set (match_operand:VI 0 "register_operand") + (xor:VI + (vec_duplicate:VI + (not:<ssescalarmode> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))) + (match_operand:VI 2 "vector_operand")))] + "<MODE_SIZE> == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" + [(set (match_dup 3) + (vec_duplicate:VI (match_dup 1))) + (set (match_dup 0) + (not:VI (xor:VI (match_dup 3) (match_dup 2))))] + "operands[3] = gen_reg_rtx (<MODE>mode);") + (define_insn "*andnot<mode>3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100711-4.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */ + +typedef char v64qi __attribute__ ((vector_size (64))); +typedef short v32hi __attribute__ ((vector_size (64))); +typedef int v16si __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size (64))); + +v64qi foo_v64qi (char a, v64qi b) +{ + return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +v32hi foo_v32hi (short a, v32hi b) +{ + return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +v16si foo_v16si (int a, v16si b) +{ + return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +v8di foo_v8di (long long a, v8di b) +{ + return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xdd" 2 { target { ia32 } } } } */ --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100711-5.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */ + +typedef char v64qi __attribute__ ((vector_size (64))); +typedef short v32hi __attribute__ ((vector_size (64))); +typedef int v16si __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size (64))); + +v64qi foo_v64qi (char a, v64qi b) +{ + return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +v32hi foo_v32hi (short a, v32hi b) +{ + return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +v16si foo_v16si (int a, v16si b) +{ + return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +v8di foo_v8di (long long a, v8di b) +{ + return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0x99" 4 } } */