On Wed, Jun 2, 2021 at 7:41 AM liuhongt via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > For i386, it will enable the optimization below > > from > notl %edi > vpbroadcastd %edi, %xmm0 > vpand %xmm1, %xmm0, %xmm0 > to > vpbroadcastd %edi, %xmm0 > vpandn %xmm1, %xmm0, %xmm0
There will be cases where (vec_duplicate (not A)) is better than (not (vec_duplicate A)), so I'm not sure it is a good idea to forcefully canonicalize unary operations. I suppose the simplification happens inside combine. Doesn't combine already have code to try variants of an expression, and isn't this a good candidate to add there, avoiding the canonicalization? Richard. > gcc/ChangeLog: > > PR target/100711 > * simplify-rtx.c (simplify_unary_operation_1): > Canonicalize (vec_duplicate (not A)) to > (not (vec_duplicate A)). > * doc/md.texi (Insn Canonicalizations): Document > canonicalization of vec_duplicate. > > gcc/testsuite/ChangeLog: > > PR target/100711 > * gcc.target/i386/avx2-pr100711.c: New test. > * gcc.target/i386/avx512bw-pr100711.c: New test. > --- > gcc/doc/md.texi | 5 ++ > gcc/simplify-rtx.c | 6 ++ > gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++ > .../gcc.target/i386/avx512bw-pr100711.c | 48 ++++++++++++ > 4 files changed, 132 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c > create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi > index 0e65b3ae663..06b42901413 100644 > --- a/gcc/doc/md.texi > +++ b/gcc/doc/md.texi > @@ -8297,6 +8297,11 @@ operand of @code{mult} is also a shift, then that is > extended also. > This transformation is only applied when it can be proven that the > original operation had sufficient precision to prevent overflow. > > +@cindex @code{vec_duplicate}, canonicalization of > +@item > +@code{(vec_duplicate (not @var{a}))} is converted to > +@code{(not (vec_duplicate @var{a}))}. 
> + > @end itemize > > Further canonicalization rules are defined in the function > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c > index 04423bbd195..171fc447d50 100644 > --- a/gcc/simplify-rtx.c > +++ b/gcc/simplify-rtx.c > @@ -1708,6 +1708,12 @@ simplify_context::simplify_unary_operation_1 (rtx_code > code, machine_mode mode, > #endif > break; > > + /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)). */ > + case VEC_DUPLICATE: > + if (GET_CODE (op) == NOT) > + return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0))); > + break; > + > default: > break; > } > diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c > b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c > new file mode 100644 > index 00000000000..5b144623873 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c > @@ -0,0 +1,73 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512bw -O2" } */ > +/* { dg-final { scan-assembler-times "pandn" 8 } } */ > +/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */ > +typedef char v16qi __attribute__((vector_size(16))); > +typedef char v32qi __attribute__((vector_size(32))); > +typedef short v8hi __attribute__((vector_size(16))); > +typedef short v16hi __attribute__((vector_size(32))); > +typedef int v4si __attribute__((vector_size(16))); > +typedef int v8si __attribute__((vector_size(32))); > +typedef long long v2di __attribute__((vector_size(16))); > +typedef long long v4di __attribute__((vector_size(32))); > + > +v16qi > +f1 (char a, v16qi c) > +{ > + char b = ~a; > + return (__extension__(v16qi) {b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b}) & c; > +} > + > +v32qi > +f2 (char a, v32qi c) > +{ > + char b = ~a; > + return (__extension__(v32qi) {b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b}) & c; > +} > + > +v8hi > +f3 (short a, v8hi c) > +{ > + short b = ~a; > + return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c; > +} > + > 
+v16hi > +f4 (short a, v16hi c) > +{ > + short b = ~a; > + return (__extension__(v16hi) {b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b}) & c; > +} > + > +v4si > +f5 (int a, v4si c) > +{ > + int b = ~a; > + return (__extension__(v4si) {b, b, b, b}) & c; > +} > + > +v8si > +f6 (int a, v8si c) > +{ > + int b = ~a; > + return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c; > +} > + > +v2di > +f7 (long long a, v2di c) > +{ > + long long b = ~a; > + return (__extension__(v2di) {b, b}) & c; > +} > + > +v4di > +f8 (long long a, v4di c) > +{ > + long long b = ~a; > + return (__extension__(v4di) {b, b, b, b}) & c; > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c > b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c > new file mode 100644 > index 00000000000..f0a103d0bc2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c > @@ -0,0 +1,48 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512bw -O2" } */ > +/* { dg-final { scan-assembler-times "pandn" 4 } } */ > +/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */ > + > +typedef char v64qi __attribute__((vector_size(64))); > +typedef short v32hi __attribute__((vector_size(64))); > +typedef int v16si __attribute__((vector_size(64))); > +typedef long long v8di __attribute__((vector_size(64))); > + > +v64qi > +f1 (char a, v64qi c) > +{ > + char b = ~a; > + return (__extension__(v64qi) {b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b}) & c; > +} > + > +v32hi > +f2 (short a, v32hi c) > +{ > + short b = ~a; > + return (__extension__(v32hi) {b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b}) & c; > +} > + > +v16si > +f3 (int a, v16si c) > +{ > + int b = ~a; > + return (__extension__(v16si) {b, b, b, b, b, b, b, b, > + b, b, b, b, b, b, b, b}) & c; > 
+} > + > +v8di > +f4 (long long a, v8di c) > +{ > + long long b = ~a; > + return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c; > +} > -- > 2.18.1 >