Kyrill Tkachov <kyrylo.tkac...@foss.arm.com> writes:
> @@ -764,6 +780,13 @@ (define_insn "aarch64_<sur>adalp<mode>_3"
>  ;; UABAL     tmp.8h, op1.16b, op2.16b
>  ;; UADALP    op3.4s, tmp.8h
>  ;; MOV               op0, op3 // should be eliminated in later passes.
> +;;
> +;; For TARGET_DOTPROD we do:
> +;; MOV       tmp1.16b, #1 // Can be CSE'd and hoisted out of loops.
> +;; UABD      tmp2.16b, op1.16b, op2.16b
> +;; UDOT      op3.4s, tmp2.16b, tmp1.16b
> +;; MOV       op0, op3 // RA will tie the operands of UDOT appropriately.
> +;;
>  ;; The signed version just uses the signed variants of the above 
> instructions.

It looks like the code does what the comment says, and uses SDOT for the
signed optab.  Doesn't it need to be UDOT for both?  The signedness of the
optab applies to the inputs (and so to SABD vs. UABD), but the absolute
difference is always unsigned.

>  
>  (define_expand "<sur>sadv16qi"
> @@ -773,6 +796,18 @@ (define_expand "<sur>sadv16qi"
>     (use (match_operand:V4SI 3 "register_operand"))]
>    "TARGET_SIMD"
>    {
> +    if (TARGET_DOTPROD)
> +      {
> +     rtx ones = gen_reg_rtx (V16QImode);
> +     emit_move_insn (ones,
> +                     aarch64_simd_gen_const_vector_dup (V16QImode,
> +                                                         HOST_WIDE_INT_1));

Easier as:

  rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));

> +     rtx abd = gen_reg_rtx (V16QImode);
> +     emit_insn (gen_<sur>abdv16qi_3 (abd, operands[1], operands[2]));
> +     emit_insn (gen_aarch64_<sur>dotv16qi (operands[0], operands[3],
> +                                            abd, ones));

Nit: indented too far.

Thanks,
Richard

> +     DONE;
> +      }
>      rtx reduc = gen_reg_rtx (V8HImode);
>      emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1],
>                                              operands[2]));
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 
> 16e4dbda73ab928054590c47a4398408162c0332..5afb692493c6e9fa31355693e7843e4f0b1b281c
>  100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1059,6 +1059,9 @@ (define_code_attr f16mac [(plus "a") (minus "s")])
>  ;; Map smax to smin and umax to umin.
>  (define_code_attr max_opp [(smax "smin") (umax "umin")])
>  
> +;; Same as above, but louder.
> +(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")])
> +
>  ;; The number of subvectors in an SVE_STRUCT.
>  (define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
>                               (VNx8SI  "2") (VNx4DI  "2")
> diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c 
> b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..e08c33785303e86815554e67a300189a67dfc1da
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +/* { dg-additional-options "-O3" } */
> +
> +#pragma GCC target "+nosve"
> +
> +#define N 1024
> +
> +signed char pix1[N], pix2[N];
> +
> +int foo (void)
> +{
> +  int i_sum = 0;
> +  int i;
> +
> +  for (i = 0; i < N; i++)
> +    i_sum += __builtin_abs (pix1[i] - pix2[i]);
> +
> +  return i_sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {\tsshll\t} } } */
> +/* { dg-final { scan-assembler-not {\tsshll2\t} } } */
> +/* { dg-final { scan-assembler-not {\tssubl\t} } } */
> +/* { dg-final { scan-assembler-not {\tssubl2\t} } } */
> +/* { dg-final { scan-assembler-not {\tabs\t} } } */
> +
> +/* { dg-final { scan-assembler {\tsabd\t} } } */
> +/* { dg-final { scan-assembler {\tsdot\t} } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c 
> b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
> index 
> 40b28843616e84df137210b45ec16abed2a37c75..85a867a113013f560bfd0a3142805b9c95ad8c5a
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O3" } */
>  
> -#pragma GCC target "+nosve"
> +#pragma GCC target "+nosve+nodotprod"
>  
>  #define N 1024
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c 
> b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..ea8de4d69758bd6bc9af9e33e1498f838b706949
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
> +/* { dg-add-options arm_v8_2a_dotprod_neon }  */
> +/* { dg-additional-options "-O3" } */
> +
> +#pragma GCC target "+nosve"
> +
> +#define N 1024
> +
> +unsigned char pix1[N], pix2[N];
> +
> +int foo (void)
> +{
> +  int i_sum = 0;
> +  int i;
> +
> +  for (i = 0; i < N; i++)
> +    i_sum += __builtin_abs (pix1[i] - pix2[i]);
> +
> +  return i_sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {\tushll\t} } } */
> +/* { dg-final { scan-assembler-not {\tushll2\t} } } */
> +/* { dg-final { scan-assembler-not {\tusubl\t} } } */
> +/* { dg-final { scan-assembler-not {\tusubl2\t} } } */
> +/* { dg-final { scan-assembler-not {\tabs\t} } } */
> +
> +/* { dg-final { scan-assembler {\tuabd\t} } } */
> +/* { dg-final { scan-assembler {\tudot\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c 
> b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
> index 
> 69ceaf4259ea43e95078ce900d2498c3a2291369..a66e1209662cefaa95c90d8d2694f9c7c0de4152
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
> +++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O3" } */
>  
> -#pragma GCC target "+nosve"
> +#pragma GCC target "+nosve+nodotprod"
>  
>  #define N 1024
>  

Reply via email to