Thanks for doing this.  The patch looks good once one very minor nit is fixed:

Jonathan Wright <jonathan.wri...@arm.com> writes:
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index f7efee61de4c5268acf446555af4a93fece6b169..da696d9fee2ffbabc9d89f2e9299fbde086cfee1 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -7294,72 +7294,48 @@ __extension__ extern __inline int8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
>  {
> -  int8x8_t __result;
> -  __asm__ ("mla %0.8b, %2.8b, %3.8b"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav8qi(__a, __b, __c);

GNU style (followed in the header file) is to insert a space between
the function name and the arguments.  Same for the other functions.
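
For example, just to illustrate the spacing, the first return in the
hunk above would become:

  return __builtin_aarch64_mlav8qi (__a, __b, __c);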

Since other patches like this are on their way, would you mind
going through the process on https://gcc.gnu.org/gitwrite.html
to get commit access?  (I'll sponsor.)

Once you've got access, the patch is OK to commit with the change above.

A nice follow-on would be to lower the mla intrinsics to IFN_FMA.
See aarch64_general_gimple_fold_builtin, which does something similar
for IFN_REDUC_PLUS etc.
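Very roughly, and entirely untested, the fold could look something
like the sketch below.  The case label is just a placeholder (the real
one would come from the BUILTIN_* iterator macros, as for
reduc_plus_scal_); the surrounding stmt/new_stmt plumbing is the same
as for the existing IFN_REDUC_PLUS handling.  Note the operand order:
vmla_s8 (__a, __b, __c) computes __a + __b * __c, whereas
IFN_FMA (x, y, z) computes x * y + z, so the addend moves last.

    /* Placeholder case label -- use the BUILTIN_* macro scheme here.  */
    case AARCH64_SIMD_BUILTIN_TERNOP_mlav8qi:
      /* __a + __b * __c  ->  FMA (__b, __c, __a).  */
      new_stmt = gimple_build_call_internal (IFN_FMA, 3,
					     gimple_call_arg (stmt, 1),
					     gimple_call_arg (stmt, 2),
					     gimple_call_arg (stmt, 0));
      gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
      break;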

Thanks,
Richard

>  }
>  
>  __extension__ extern __inline int16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
>  {
> -  int16x4_t __result;
> -  __asm__ ("mla %0.4h, %2.4h, %3.4h"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav4hi(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline int32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
>  {
> -  int32x2_t __result;
> -  __asm__ ("mla %0.2s, %2.2s, %3.2s"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav2si(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline uint8x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
>  {
> -  uint8x8_t __result;
> -  __asm__ ("mla %0.8b, %2.8b, %3.8b"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint8x8_t) __builtin_aarch64_mlav8qi((int8x8_t) __a,
> +                                               (int8x8_t) __b,
> +                                               (int8x8_t) __c);
>  }
>  
>  __extension__ extern __inline uint16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
>  {
> -  uint16x4_t __result;
> -  __asm__ ("mla %0.4h, %2.4h, %3.4h"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint16x4_t) __builtin_aarch64_mlav4hi((int16x4_t) __a,
> +                                                (int16x4_t) __b,
> +                                                (int16x4_t) __c);
>  }
>  
>  __extension__ extern __inline uint32x2_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
>  {
> -  uint32x2_t __result;
> -  __asm__ ("mla %0.2s, %2.2s, %3.2s"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint32x2_t) __builtin_aarch64_mlav2si((int32x2_t) __a,
> +                                                (int32x2_t) __b,
> +                                                (int32x2_t) __c);
>  }
>  
>  #define vmlal_high_lane_s16(a, b, c, d)                                 \
> @@ -7835,72 +7811,48 @@ __extension__ extern __inline int8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
>  {
> -  int8x16_t __result;
> -  __asm__ ("mla %0.16b, %2.16b, %3.16b"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav16qi(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline int16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
>  {
> -  int16x8_t __result;
> -  __asm__ ("mla %0.8h, %2.8h, %3.8h"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav8hi(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline int32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
>  {
> -  int32x4_t __result;
> -  __asm__ ("mla %0.4s, %2.4s, %3.4s"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return __builtin_aarch64_mlav4si(__a, __b, __c);
>  }
>  
>  __extension__ extern __inline uint8x16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
>  {
> -  uint8x16_t __result;
> -  __asm__ ("mla %0.16b, %2.16b, %3.16b"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint8x16_t) __builtin_aarch64_mlav16qi((int8x16_t) __a,
> +                                                 (int8x16_t) __b,
> +                                                 (int8x16_t) __c);
>  }
>  
>  __extension__ extern __inline uint16x8_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
>  {
> -  uint16x8_t __result;
> -  __asm__ ("mla %0.8h, %2.8h, %3.8h"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint16x8_t) __builtin_aarch64_mlav8hi((int16x8_t) __a,
> +                                                (int16x8_t) __b,
> +                                                (int16x8_t) __c);
>  }
>  
>  __extension__ extern __inline uint32x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
>  {
> -  uint32x4_t __result;
> -  __asm__ ("mla %0.4s, %2.4s, %3.4s"
> -           : "=w"(__result)
> -           : "0"(__a), "w"(__b), "w"(__c)
> -           : /* No clobbers */);
> -  return __result;
> +  return (uint32x4_t) __builtin_aarch64_mlav4si((int32x4_t) __a,
> +                                                (int32x4_t) __b,
> +                                                (int32x4_t) __c);
>  }
>  
>  __extension__ extern __inline float32x2_t
