On 11/05/18 14:29, Kyrill Tkachov wrote: > Hi all, > > When the AESE, AESD and AESMC, AESIMC instructions are generated through > the appropriate arm_neon.h intrinsics > we really want to keep them together when the AESE feeds into an AESMC > and fusion is supported by the target CPU. > We have macro-fusion hooks and scheduling model forwarding paths defined > to facilitate that. > It is, however, not always enough. > > This patch adds another mechanism for doing that. > When we can detect during combine that the required dependency exists > (AESE -> AESMC, AESD -> AESIMC) > just keep them together with a combine pattern throughout the rest of > compilation. > We won't ever want to split them. > > The testcases generate 4 AESE(D) instructions in a block followed by 4 > AES(I)MC instructions that > consume the corresponding results and it also adds a bunch of > computations in-between so that the > AESE and AESMC instructions are not trivially back-to-back, thus > exercising the compiler's ability > to bring them together. > > With this patch all 4 pairs are fused whereas before a couple of fusions > would be missed due to intervening > arithmetic and memory instructions. > > Bootstrapped and tested on aarch64-none-linux-gnu. > > Ok for trunk? > > Thanks, > Kyrill > > 2018-05-11 Kyrylo Tkachov <kyrylo.tkac...@arm.com> > > * config/aarch64/aarch64-simd.md (*aarch64_crypto_aese_fused): > New pattern. > (*aarch64_crypto_aesd_fused): Likewise. > > 2018-05-11 Kyrylo Tkachov <kyrylo.tkac...@arm.com> > > * gcc.target/aarch64/crypto-fuse-1.c: New test. > * gcc.target/aarch64/crypto-fuse-2.c: Likewise.
Your testcases are missing a newline at the end of each file. Otherwise OK. R. > > fuse-combine.patch > > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > 7c166b6c8ec40475d1e01561b613b590b6690ad5..9a6ed304432af0ca23ec7d3797783a3128776a6e > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -5790,6 +5790,44 @@ (define_insn "aarch64_crypto_aes<aesmc_op>v16qi" > (const_string "yes")])] > ) > > +;; When AESE/AESMC fusion is enabled we really want to keep the two together > +;; and enforce the register dependency without scheduling or register > +;; allocation messing up the order or introducing moves inbetween. > +;; Mash the two together during combine. > + > +(define_insn "*aarch64_crypto_aese_fused" > + [(set (match_operand:V16QI 0 "register_operand" "=&w") > + (unspec:V16QI > + [(unspec:V16QI > + [(match_operand:V16QI 1 "register_operand" "0") > + (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESE) > + ] UNSPEC_AESMC))] > + "TARGET_SIMD && TARGET_AES > + && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)" > + "aese\\t%0.16b, %2.16b\;aesmc\\t%0.16b, %0.16b" > + [(set_attr "type" "crypto_aese") > + (set_attr "length" "8")] > +) > + > +;; When AESD/AESIMC fusion is enabled we really want to keep the two together > +;; and enforce the register dependency without scheduling or register > +;; allocation messing up the order or introducing moves inbetween. > +;; Mash the two together during combine. 
> + > +(define_insn "*aarch64_crypto_aesd_fused" > + [(set (match_operand:V16QI 0 "register_operand" "=&w") > + (unspec:V16QI > + [(unspec:V16QI > + [(match_operand:V16QI 1 "register_operand" "0") > + (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESD) > + ] UNSPEC_AESIMC))] > + "TARGET_SIMD && TARGET_AES > + && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)" > + "aesd\\t%0.16b, %2.16b\;aesimc\\t%0.16b, %0.16b" > + [(set_attr "type" "crypto_aese") > + (set_attr "length" "8")] > +) > + > ;; sha1 > > (define_insn "aarch64_crypto_sha1hsi" > diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c > b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..79fd6011ed946d746ed5f03d26c7fe661f3f8154 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c > @@ -0,0 +1,44 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */ > + > +#include <arm_neon.h> > + > +#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key))); > +#define AESMC(r, i) (r = vaesmcq_u8 (i)) > + > +uint8x16_t dummy; > +uint8x16_t a; > +uint8x16_t b; > +uint8x16_t c; > +uint8x16_t d; > +uint8x16_t e; > + > +void > +foo (void) > +{ > + AESE (a, a, e); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESE (b, b, e); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESE (c, c, e); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESE (d, d, e); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + > + AESMC (a, a); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESMC (b, b); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESMC (c, c); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESMC (d, d); > +} > + > +/* { dg-final { scan-assembler-times "crypto_aese_fused" 4 } } */ > \ No 
newline at end of file > diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c > b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..ed9eb69e803b24ec16a72075c46a9b6e6898c2fe > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c > @@ -0,0 +1,44 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */ > + > +#include <arm_neon.h> > + > +#define AESE(r, v, key) (r = vaesdq_u8 ((v), (key))); > +#define AESMC(r, i) (r = vaesimcq_u8 (i)) > + > +uint8x16_t dummy; > +uint8x16_t a; > +uint8x16_t b; > +uint8x16_t c; > +uint8x16_t d; > +uint8x16_t e; > + > +void > +foo (void) > +{ > + AESE (a, a, e); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESE (b, b, e); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESE (c, c, e); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESE (d, d, e); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + > + AESMC (a, a); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESMC (b, b); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESMC (c, c); > + dummy = vaddq_u8 (dummy, dummy); > + dummy = vaddq_u8 (dummy, dummy); > + AESMC (d, d); > +} > + > +/* { dg-final { scan-assembler-times "crypto_aesd_fused" 4 } } */ > \ No newline at end of file >