On Wed, Dec 30, 2020 at 10:23 AM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > The following patch adds combine splitters to optimize: > - vpcmpeqd %ymm1, %ymm1, %ymm1 > - vpandn %ymm1, %ymm0, %ymm0 > vpmovmskb %ymm0, %eax > + notl %eax > etc. (for vectors with less than 32 elements with xorl instead of notl). > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2020-12-30 Jakub Jelinek <ja...@redhat.com> > > PR target/98461 > * config/i386/sse.md (<sse2_avx2>_pmovmskb): Add splitters > for pmovmskb of NOT vector. > > * gcc.target/i386/sse2-pr98461.c: New test. > * gcc.target/i386/avx2-pr98461.c: New test.
OK. Thanks, Uros. > > --- gcc/config/i386/sse.md.jj 2020-12-28 12:27:32.318754687 +0100 > +++ gcc/config/i386/sse.md 2020-12-29 14:15:45.898508216 +0100 > @@ -16099,6 +16099,53 @@ (define_insn "*sse2_pmovmskb_ext" > (set_attr "prefix" "maybe_vex") > (set_attr "mode" "SI")]) > > +(define_split > + [(set (match_operand:SI 0 "register_operand") > + (unspec:SI > + [(not:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand"))] > + UNSPEC_MOVMSK))] > + "TARGET_SSE2" > + [(set (match_dup 2) > + (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)) > + (set (match_dup 0) (match_dup 3))] > +{ > + operands[2] = gen_reg_rtx (SImode); > + if (GET_MODE_NUNITS (<MODE>mode) == 32) > + operands[3] = gen_rtx_NOT (SImode, operands[2]); > + else > + { > + operands[3] > + = gen_int_mode ((HOST_WIDE_INT_1 << GET_MODE_NUNITS (<MODE>mode)) - 1, > + SImode); > + operands[3] = gen_rtx_XOR (SImode, operands[2], operands[3]); > + } > +}) > + > +(define_split > + [(set (match_operand:SI 0 "register_operand") > + (unspec:SI > + [(subreg:VI1_AVX2 (not (match_operand 1 "register_operand")) 0)] > + UNSPEC_MOVMSK))] > + "TARGET_SSE2 > + && GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_INT > + && GET_MODE_SIZE (GET_MODE (operands[1])) == <MODE_SIZE>" > + [(set (match_dup 2) > + (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)) > + (set (match_dup 0) (match_dup 3))] > +{ > + operands[2] = gen_reg_rtx (SImode); > + operands[1] = gen_lowpart (<MODE>mode, operands[1]); > + if (GET_MODE_NUNITS (<MODE>mode) == 32) > + operands[3] = gen_rtx_NOT (SImode, operands[2]); > + else > + { > + operands[3] > + = gen_int_mode ((HOST_WIDE_INT_1 << GET_MODE_NUNITS (<MODE>mode)) - 1, > + SImode); > + operands[3] = gen_rtx_XOR (SImode, operands[2], operands[3]); > + } > +}) > + > (define_insn_and_split "*<sse2_avx2>_pmovmskb_lt" > [(set (match_operand:SI 0 "register_operand" "=r") > (unspec:SI > --- gcc/testsuite/gcc.target/i386/sse2-pr98461.c.jj 2020-12-29 > 14:20:44.258146127 +0100 > +++ 
gcc/testsuite/gcc.target/i386/sse2-pr98461.c 2020-12-29 14:23:11.462490600 +0100 > @@ -0,0 +1,50 @@ > +/* PR target/98461 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -mno-sse3 -masm=att" } */ > +/* { dg-final { scan-assembler-times "\tpmovmskb\t" 6 } } */ > +/* { dg-final { scan-assembler-times "\txorl\t" 6 } } */ > +/* { dg-final { scan-assembler-not "\tpcmpeq" } } */ > +/* { dg-final { scan-assembler-not "\tpxor" } } */ > +/* { dg-final { scan-assembler-not "\tpandn" } } */ > + > +#include <x86intrin.h> > + > +int > +f1 (__m128i x) > +{ > + return _mm_movemask_epi8 (x) ^ 65535; > +} > + > +int > +f2 (__m128i x) > +{ > + return _mm_movemask_epi8 (_mm_andnot_si128 (x, _mm_set1_epi8 (255))); > +} > + > +int > +f3 (__v16qi x) > +{ > + x ^= (__v16qi) { -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1 }; > + return _mm_movemask_epi8 ((__m128i) x); > +} > + > +long > +f4 (__m128i x) > +{ > + return (unsigned) (_mm_movemask_epi8 (x) ^ 65535); > +} > + > +long > +f5 (__m128i x) > +{ > + return (unsigned) _mm_movemask_epi8 (_mm_andnot_si128 (x, _mm_set1_epi8 (255))); > +} > + > +long > +f6 (__v16qi x) > +{ > + x ^= (__v16qi) { -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1 }; > + return (unsigned) _mm_movemask_epi8 ((__m128i) x); > +} > --- gcc/testsuite/gcc.target/i386/avx2-pr98461.c.jj 2020-12-29 14:20:27.429335767 +0100 > +++ gcc/testsuite/gcc.target/i386/avx2-pr98461.c 2020-12-29 14:19:50.944746895 +0100 > @@ -0,0 +1,54 @@ > +/* PR target/98461 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2 -masm=att" } */ > +/* { dg-final { scan-assembler-times "\tvpmovmskb\t" 6 } } */ > +/* { dg-final { scan-assembler-times "\tnotl\t" 6 } } */ > +/* { dg-final { scan-assembler-not "\tvpcmpeq" } } */ > +/* { dg-final { scan-assembler-not "\tvpxor" } } */ > +/* { dg-final { scan-assembler-not "\tvpandn" } } */ > + > +#include <x86intrin.h> > + > +int > +f1 (__m256i x) > +{ > + return ~_mm256_movemask_epi8 (x); 
> +} > + > +int > +f2 (__m256i x) > +{ > + return _mm256_movemask_epi8 (_mm256_andnot_si256 (x, _mm256_set1_epi8 (255))); > +} > + > +int > +f3 (__v32qi x) > +{ > + x ^= (__v32qi) { -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1 }; > + return _mm256_movemask_epi8 ((__m256i) x); > +} > + > +long > +f4 (__m256i x) > +{ > + return (unsigned) ~_mm256_movemask_epi8 (x); > +} > + > +long > +f5 (__m256i x) > +{ > + return (unsigned) _mm256_movemask_epi8 (_mm256_andnot_si256 (x, _mm256_set1_epi8 (255))); > +} > + > +long > +f6 (__v32qi x) > +{ > + x ^= (__v32qi) { -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1, > + -1, -1, -1, -1, -1, -1, -1, -1 }; > + return (unsigned) _mm256_movemask_epi8 ((__m256i) x); > +} > > Jakub >