On Wed, Jun 2, 2021 at 7:41 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> For i386, it will enable the optimization below:
>
> from
>         notl    %edi
>         vpbroadcastd    %edi, %xmm0
>         vpand   %xmm1, %xmm0, %xmm0
> to
>         vpbroadcastd    %edi, %xmm0
>         vpandn   %xmm1, %xmm0, %xmm0

There will be cases where (vec_duplicate (not A)) is better
than (not (vec_duplicate A)), so I'm not sure it is a good idea
to forcefully canonicalize unary operations.  I suppose the
simplification happens inside combine.  Doesn't combine
already have code to try variants of an expression, and isn't
this a good candidate to be added there, avoiding
the canonicalization?

Richard.

> gcc/ChangeLog:
>
>         PR target/100711
>         * simplify-rtx.c (simplify_unary_operation_1):
>         Canonicalize (vec_duplicate (not A)) to
>         (not (vec_duplicate A)).
>         * doc/md.texi (Insn Canonicalizations): Document
>         canonicalization of vec_duplicate.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/100711
>         * gcc.target/i386/avx2-pr100711.c: New test.
>         * gcc.target/i386/avx512bw-pr100711.c: New test.
> ---
>  gcc/doc/md.texi                               |  5 ++
>  gcc/simplify-rtx.c                            |  6 ++
>  gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++
>  .../gcc.target/i386/avx512bw-pr100711.c       | 48 ++++++++++++
>  4 files changed, 132 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 0e65b3ae663..06b42901413 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -8297,6 +8297,11 @@ operand of @code{mult} is also a shift, then that is extended also.
>  This transformation is only applied when it can be proven that the
>  original operation had sufficient precision to prevent overflow.
>
> +@cindex @code{vec_duplicate}, canonicalization of
> +@item
> +@code{(vec_duplicate (not @var{a}))} is converted to
> +@code{(not (vec_duplicate @var{a}))}.
> +
>  @end itemize
>
>  Further canonicalization rules are defined in the function
> diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> index 04423bbd195..171fc447d50 100644
> --- a/gcc/simplify-rtx.c
> +++ b/gcc/simplify-rtx.c
> @@ -1708,6 +1708,12 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
>  #endif
>        break;
>
> +      /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).  */
> +    case VEC_DUPLICATE:
> +      if (GET_CODE (op) == NOT)
> +       return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0)));
> +      break;
> +
>      default:
>        break;
>      }
> diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
> new file mode 100644
> index 00000000000..5b144623873
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
> @@ -0,0 +1,73 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -O2" } */
> +/* { dg-final { scan-assembler-times "pandn" 8 } } */
> +/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
> +typedef char v16qi __attribute__((vector_size(16)));
> +typedef char v32qi __attribute__((vector_size(32)));
> +typedef short v8hi __attribute__((vector_size(16)));
> +typedef short v16hi __attribute__((vector_size(32)));
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef int v8si __attribute__((vector_size(32)));
> +typedef long long v2di __attribute__((vector_size(16)));
> +typedef long long v4di __attribute__((vector_size(32)));
> +
> +v16qi
> +f1 (char a, v16qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v16qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v32qi
> +f2 (char a, v32qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v32qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v8hi
> +f3 (short a, v8hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v16hi
> +f4 (short a, v16hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v16hi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v4si
> +f5 (int a, v4si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v4si) {b, b, b, b}) & c;
> +}
> +
> +v8si
> +f6 (int a, v8si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v2di
> +f7 (long long a, v2di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v2di) {b, b}) & c;
> +}
> +
> +v4di
> +f8 (long long a, v4di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v4di) {b, b, b, b}) & c;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
> new file mode 100644
> index 00000000000..f0a103d0bc2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
> @@ -0,0 +1,48 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -O2" } */
> +/* { dg-final { scan-assembler-times "pandn" 4 } } */
> +/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
> +
> +typedef char v64qi __attribute__((vector_size(64)));
> +typedef short v32hi __attribute__((vector_size(64)));
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef long long v8di __attribute__((vector_size(64)));
> +
> +v64qi
> +f1 (char a, v64qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v64qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v32hi
> +f2 (short a, v32hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v32hi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v16si
> +f3 (int a, v16si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v16si) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v8di
> +f4 (long long a, v8di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c;
> +}
> --
> 2.18.1
>

Reply via email to