Hi All,

I have been looking into a class of problems where GCC does not recognize that
extracting lane 0 (using little-endian as an example) of a vector register is
equivalent to a subreg of that register, and so fails to pass the register
directly to the instruction that consumes the lane.

As an example consider

poly64_t
testcase (uint8x16_t input, poly64x2_t mask)
{
    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)input, 0),
                               vgetq_lane_p64(mask, 0));
    poly64_t prodH = vmull_high_p64((poly64x2_t)input, mask);
    return prodL + prodH;
}

Where we generate

testcase:
        dup     d2, v0.d[0]
        dup     d3, v1.d[0]
        pmull2  v0.1q, v0.2d, v1.2d
        pmull   v2.1q, v2.1d, v3.1d
        add     d0, d2, d0
        fmov    x0, d0
        ret

whereas it should have been the following, which is what clang generates:

testcase:
        pmull   v2.1q, v0.1d, v1.1d
        pmull2  v0.1q, v0.2d, v1.2d
        add     v0.2d, v0.2d, v2.2d
        fmov    x0, d0
        ret

Now this can be naively solved by just adding RTL patterns for the
vec_selects, as in the example in the patch, but this doesn't solve the overall
problem and I am wondering how best to do this.

One approach would be to extend combine's noop detection in noop_move_p to
recognize these cases.

The downside here is that the conversion becomes implicit in the rtl. i.e.
you'll see a SET of a V2DI but a use of DI for that same register.  I'm not sure
the semantics of RTL allow such implicit uses?

The second approach I can think of is to extend reload to recognize these
no-ops, assign the extract the same register as its source, and mark the
extract as unused so that DSE cleans it up.

But there's probably a better approach I didn't think of :)

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md
        (*aarch64_crypto_pmullv2di): Example RTL.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/pmull_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
05d18f8bd3ac09c56c82dc73cff855315eb302b7..7bdb93869dbbedc786575b5f89f39c4c6d0d76d0
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7231,6 +7231,20 @@ (define_insn "aarch64_crypto_pmulldi"
   [(set_attr "type" "crypto_pmull")]
 )
 
+(define_insn "*aarch64_crypto_pmullv2di"
+  [(set (match_operand:TI 0 "register_operand" "=w")
+        (unspec:TI  [(vec_select:DI
+                       (match_operand:V2DI 1 "register_operand" "w")
+                       (parallel [
+                         (match_operand:SI 2 "const_int_operand" "Z")]))
+                    (match_operand:DI 3 "register_operand" "w")]
+                   UNSPEC_PMULL))]
+ "TARGET_SIMD && TARGET_AES"
+ "pmull\\t%0.1q, %1.1d, %3.1d"
+  [(set_attr "type" "crypto_pmull")]
+)
+
+
 (define_insn "aarch64_crypto_pmullv2di"
  [(set (match_operand:TI 0 "register_operand" "=w")
        (unspec:TI [(match_operand:V2DI 1 "register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/aarch64/pmull_2.c 
b/gcc/testsuite/gcc.target/aarch64/pmull_2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..d9d47518fab2b582329b6332e3a9c7d97c148192
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pmull_2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8-a+crypto -O3" } */
+
+#include "arm_neon.h"
+
+poly64_t
+testcase (uint8x16_t input, poly64x2_t mask)
+{
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)input, 0),
+                              vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)input, mask);
+    return prodL + prodH;
+}
+
+/* { dg-final { scan-assembler-times "pmull\\tv" 1 } } */


-- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 05d18f8bd3ac09c56c82dc73cff855315eb302b7..7bdb93869dbbedc786575b5f89f39c4c6d0d76d0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7231,6 +7231,20 @@ (define_insn "aarch64_crypto_pmulldi"
   [(set_attr "type" "crypto_pmull")]
 )
 
+(define_insn "*aarch64_crypto_pmullv2di"
+  [(set (match_operand:TI 0 "register_operand" "=w")
+        (unspec:TI  [(vec_select:DI
+			(match_operand:V2DI 1 "register_operand" "w")
+			(parallel [
+			  (match_operand:SI 2 "const_int_operand" "Z")]))
+		     (match_operand:DI 3 "register_operand" "w")]
+		    UNSPEC_PMULL))]
+ "TARGET_SIMD && TARGET_AES"
+ "pmull\\t%0.1q, %1.1d, %3.1d"
+  [(set_attr "type" "crypto_pmull")]
+)
+
+
 (define_insn "aarch64_crypto_pmullv2di"
  [(set (match_operand:TI 0 "register_operand" "=w")
        (unspec:TI [(match_operand:V2DI 1 "register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/aarch64/pmull_2.c b/gcc/testsuite/gcc.target/aarch64/pmull_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..d9d47518fab2b582329b6332e3a9c7d97c148192
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pmull_2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8-a+crypto -O3" } */
+
+#include "arm_neon.h"
+
+poly64_t
+testcase (uint8x16_t input, poly64x2_t mask)
+{
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)input, 0),
+			       vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)input, mask);
+    return prodL + prodH;
+}
+
+/* { dg-final { scan-assembler-times "pmull\\tv" 1 } } */

Reply via email to