On 11/05/18 14:29, Kyrill Tkachov wrote:
> Hi all,
> 
> When the AESE, AESD and AESMC, AESIMC instructions are generated through
> the appropriate arm_neon.h intrinsics
> we really want to keep them together when the AESE feeds into an AESMC
> and fusion is supported by the target CPU.
> We have macro-fusion hooks and scheduling model forwarding paths defined
> to facilitate that.
> It is, however, not always enough.
> 
> This patch adds another mechanism for doing that.
> When we can detect during combine that the required dependency exists
> (AESE -> AESMC, AESD -> AESIMC)
> just keep them together with a combine pattern throughout the rest of
> compilation.
> We won't ever want to split them.
> 
> The testcases generate 4 AESE(D) instructions in a block followed by 4
> AES(I)MC instructions that
> consume the corresponding results and it also adds a bunch of
> computations in-between so that the
> AESE and AESMC instructions are not trivially back-to-back, thus
> exercising the compiler's ability
> to bring them together.
> 
> With this patch all 4 pairs are fused whereas before a couple of fusions
> would be missed due to intervening
> arithmetic and memory instructions.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?
> 
> Thanks,
> Kyrill
> 
> 2018-05-11  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>
> 
>     * config/aarch64/aarch64-simd.md (*aarch64_crypto_aese_fused):
>     New pattern.
>     (*aarch64_crypto_aesd_fused): Likewise.
> 
> 2018-05-11  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>
> 
>     * gcc.target/aarch64/crypto-fuse-1.c: New test.
>     * gcc.target/aarch64/crypto-fuse-2.c: Likewise.

Your testcases are missing a newline at the end of each file.  Otherwise OK.

R.

> 
> fuse-combine.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> 7c166b6c8ec40475d1e01561b613b590b6690ad5..9a6ed304432af0ca23ec7d3797783a3128776a6e
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -5790,6 +5790,44 @@ (define_insn "aarch64_crypto_aes<aesmc_op>v16qi"
>        (const_string "yes")])]
>  )
>  
> +;; When AESE/AESMC fusion is enabled we really want to keep the two together
> +;; and enforce the register dependency without scheduling or register
> +;; allocation messing up the order or introducing moves inbetween.
> +;;  Mash the two together during combine.
> +
> +(define_insn "*aarch64_crypto_aese_fused"
> +  [(set (match_operand:V16QI 0 "register_operand" "=&w")
> +     (unspec:V16QI
> +       [(unspec:V16QI
> +         [(match_operand:V16QI 1 "register_operand" "0")
> +          (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESE)
> +       ] UNSPEC_AESMC))]
> +  "TARGET_SIMD && TARGET_AES
> +   && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)"
> +  "aese\\t%0.16b, %2.16b\;aesmc\\t%0.16b, %0.16b"
> +  [(set_attr "type" "crypto_aese")
> +   (set_attr "length" "8")]
> +)
> +
> +;; When AESD/AESIMC fusion is enabled we really want to keep the two together
> +;; and enforce the register dependency without scheduling or register
> +;; allocation messing up the order or introducing moves inbetween.
> +;;  Mash the two together during combine.
> +
> +(define_insn "*aarch64_crypto_aesd_fused"
> +  [(set (match_operand:V16QI 0 "register_operand" "=&w")
> +     (unspec:V16QI
> +       [(unspec:V16QI
> +         [(match_operand:V16QI 1 "register_operand" "0")
> +          (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESD)
> +       ] UNSPEC_AESIMC))]
> +  "TARGET_SIMD && TARGET_AES
> +   && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)"
> +  "aesd\\t%0.16b, %2.16b\;aesimc\\t%0.16b, %0.16b"
> +  [(set_attr "type" "crypto_aese")
> +   (set_attr "length" "8")]
> +)
> +
>  ;; sha1
>  
>  (define_insn "aarch64_crypto_sha1hsi"
> diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c 
> b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..79fd6011ed946d746ed5f03d26c7fe661f3f8154
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c
> @@ -0,0 +1,44 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */
> +
> +#include <arm_neon.h>
> +
> +#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
> +#define AESMC(r, i) (r = vaesmcq_u8 (i))
> +
> +uint8x16_t dummy;
> +uint8x16_t a;
> +uint8x16_t b;
> +uint8x16_t c;
> +uint8x16_t d;
> +uint8x16_t e;
> +
> +void
> +foo (void)
> +{
> +  AESE (a, a, e);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESE (b, b, e);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESE (c, c, e);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESE (d, d, e);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +
> +  AESMC (a, a);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESMC (b, b);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESMC (c, c);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESMC (d, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "crypto_aese_fused" 4 } } */
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c 
> b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..ed9eb69e803b24ec16a72075c46a9b6e6898c2fe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c
> @@ -0,0 +1,44 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */
> +
> +#include <arm_neon.h>
> +
> +#define AESE(r, v, key) (r = vaesdq_u8 ((v), (key)));
> +#define AESMC(r, i) (r = vaesimcq_u8 (i))
> +
> +uint8x16_t dummy;
> +uint8x16_t a;
> +uint8x16_t b;
> +uint8x16_t c;
> +uint8x16_t d;
> +uint8x16_t e;
> +
> +void
> +foo (void)
> +{
> +  AESE (a, a, e);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESE (b, b, e);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESE (c, c, e);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESE (d, d, e);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +
> +  AESMC (a, a);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESMC (b, b);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESMC (c, c);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  dummy = vaddq_u8 (dummy, dummy);
> +  AESMC (d, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "crypto_aesd_fused" 4 } } */
> \ No newline at end of file
> 

Reply via email to