Hi All,

Consider the following case

#include <arm_neon.h>

uint64_t
test4 (uint8x16_t input)
{
    uint8x16_t bool_input = vshrq_n_u8(input, 7);
    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
                                vgetq_lane_p64(mask, 0));
    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
    return vget_lane_u16((uint16x4_t)res, 3);
}

which generates (after my CSE patches):

test4:
        ushr    v0.16b, v0.16b, 7
        mov     x0, 16512
        movk    x0, 0x1020, lsl 16
        movk    x0, 0x408, lsl 32
        movk    x0, 0x102, lsl 48
        fmov    d1, x0
        pmull   v2.1q, v0.1d, v1.1d
        dup     v1.2d, v1.d[0]
        pmull2  v0.1q, v0.2d, v1.2d
        trn2    v2.8b, v2.8b, v0.8b
        umov    w0, v2.h[3]
        ret

which is suboptimal since the constant is never needed on the genreg side and
should have been materialized on the SIMD side, as the constant is so big that
it otherwise requires 5 instructions to create: 4 mov/movk and one fmov.
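
For comparison, loading the constant from the literal pool straight into a SIMD
register would be roughly (a sketch only; .LC0 is a made-up pool label):

        adrp    x0, .LC0
        ldr     d1, [x0, #:lo12:.LC0]

i.e. two instructions instead of five.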

The problem is that the choice of which side to materialize the constant on can
only be made during reload.  We may need an extra register (to hold the
addressing), so it can't be done after reload.

I have tried to support this with a pattern during reload, but the problem is
that I can't seem to find a way to tell reload it should spill a constant under
condition x.  Instead I tried a split which reload selects when the condition
holds.

This has a couple of issues:

1. The pattern can be expanded late (could be fixed with !reload_completed).
2. Because it's split so late, we don't seem to be able to share the anchors for
   the ADRP.
3. Because it's split so late, reload basically doesn't know about the spill,
   and so the ADD lo12 isn't pushed into the addressing mode of the LDR (see
   the sketch below).
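
To illustrate point 3, the kind of sequence that comes out is along the lines of
(sketch only; .LC0 stands for whatever constant pool label is used):

        adrp    x1, .LC0
        add     x1, x1, :lo12:.LC0
        ldr     d1, [x1]

instead of the lo12 offset being folded into the load as in the two-instruction
sequence shown earlier.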

I don't know how to properly fix these since I think the only way is for reload
to do the spill properly itself, but in this case not having the pattern makes
it avoid the mem pattern and pick r <- n instead, followed by r -> w.

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w alternative.
        * config/aarch64/constraints.md (Dx): New.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,  r,  r, w,r,w, w")
-	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w  ,r  ,r,w, m,m,  r,  r, w,r,w,w")
+	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
   "@
@@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
    mov\\t%x0, %1
    mov\\t%w0, %1
    #
+   #
    * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
    ldr\\t%x0, %1
    ldr\\t%d0, %1
@@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
    fmov\\t%d0, %d1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
    "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+    && REG_P (operands[0])
+    && (GP_REGNUM_P (REGNO (operands[0]))
+	|| (can_create_pseudo_p ()
+	    && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
    [(const_int 0)]
    "{
-       aarch64_expand_mov_immediate (operands[0], operands[1]);
+       if (GP_REGNUM_P (REGNO (operands[0])))
+	 aarch64_expand_mov_immediate (operands[0], operands[1]);
+       else
+	 {
+	   rtx mem = force_const_mem (DImode, operands[1]);
+	   gcc_assert (mem);
+	   emit_move_insn (operands[0], mem);
+	 }
        DONE;
     }"
   ;; The "mov_imm" type for CNTD is just a placeholder.
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
 		     load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
 		     neon_move")
-   (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
+   (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
 )
 
 (define_insn "insv_imm<mode>"
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -474,6 +474,14 @@ (define_address_constraint "Dp"
 An address valid for a prefetch instruction."
 (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
 
+(define_constraint "Dx"
+  "@internal
+ A constraint that matches an integer immediate operand not valid\
+ for AdvSIMD scalar operations in DImode."
+ (and (match_code "const_int")
+      (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
+      (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
+
 (define_constraint "vgb"
   "@internal
    A constraint that matches an immediate offset valid for SVE LD1B
