Catching up on backlog, sorry for the very late response:

Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> Consider the following case
>
> #include <arm_neon.h>
>
> uint64_t
> test4 (uint8x16_t input)
> {
>     uint8x16_t bool_input = vshrq_n_u8(input, 7);
>     poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
>     poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
>                                vgetq_lane_p64(mask, 0));
>     poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
>     uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
>     return vget_lane_u16((uint16x4_t)res, 3);
> }
>
> which generates (after my CSE patches):
>
> test4:
>       ushr    v0.16b, v0.16b, 7
>       mov     x0, 16512
>       movk    x0, 0x1020, lsl 16
>       movk    x0, 0x408, lsl 32
>       movk    x0, 0x102, lsl 48
>       fmov    d1, x0
>       pmull   v2.1q, v0.1d, v1.1d
>       dup     v1.2d, v1.d[0]
>       pmull2  v0.1q, v0.2d, v1.2d
>       trn2    v2.8b, v2.8b, v0.8b
>       umov    w0, v2.h[3]
>       ret
>
> which is suboptimal since the constant is never needed on the genreg side and
> should have been materialized on the SIMD side, given that the constant is so
> big that it otherwise takes 5 instructions to create: 4 mov/movk and one fmov.
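>
> Ideally the constant would instead be loaded from the literal pool on the
> SIMD side, i.e. something like (a sketch, with .LC0 standing for the pool
> entry that holds 0x0102040810204080):
>
>       ushr    v0.16b, v0.16b, 7
>       adrp    x0, .LC0
>       ldr     d1, [x0, #:lo12:.LC0]
>       pmull   v2.1q, v0.1d, v1.1d
>       dup     v1.2d, v1.d[0]
>       pmull2  v0.1q, v0.2d, v1.2d
>       trn2    v2.8b, v2.8b, v0.8b
>       umov    w0, v2.h[3]
>       ret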
>
> The problem is that the choice of which side to materialize the constant on
> can only be made during reload.  We may need an extra register (to hold the
> address), so the decision can't be made after reload.
>
> I have tried to support this with a pattern during reload, but the problem is
> that I can't seem to find a way to tell reload it should spill a constant
> under condition x.  Instead I tried a split which reload selects when the
> condition holds.

If this is still an issue, one thing to try would be to put a "$" before
the "r" in the GPR alternative.  If that doesn't work then yeah,
I think we're out of luck describing this directly.  If "$" does work,
it'd be interesting to see whether "^" does too.
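
For concreteness, with your new pattern that would mean disparaging the "r"
destination of the "n" alternative, i.e. something like (untested):

  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,$r,w  ,r  ,r,w, m,m,  r,  r, w,r,w,w")
        (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M, n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]

"$" is like "!" in that it disparages the alternative severely, but only
when the operand it is attached to needs a reload, which seems to be what
we want here.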

Thanks,
Richard

>
> This has a couple of issues:
>
> 1. The pattern can be expanded late (could be fixed with !reload_completed).
> 2. Because it's split so late we don't seem to be able to share the anchors
>    for the ADRP.
> 3. Because it's split so late, reload doesn't know about the spill, and so
>    the ADD lo12 isn't pushed into the addressing mode of the LDR (see the
>    sketch below).
>
> I don't know how to properly fix these since I think the only way is for
> reload to do the spill properly itself, but in this case not having the
> pattern makes it avoid the mem pattern and pick r <- n instead, followed by
> r -> w.
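>
> I.e. instead of the anchor being folded into the load as in
>
>       adrp    x0, .LC0
>       ldr     d1, [x0, #:lo12:.LC0]
>
> we end up with something like (sketch):
>
>       adrp    x0, .LC0
>       add     x0, x0, :lo12:.LC0
>       ldr     d1, [x0]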
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w.
>       * config/aarch64/constraints.md (Dx): New.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 
> eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c
>  100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64"
>  )
>  
>  (define_insn_and_split "*movdi_aarch64"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,  r,  r, w,r,w, w")
> -     (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w  ,r  ,r,w, m,m,  r,  r, w,r,w,w")
> +     (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
>    "(register_operand (operands[0], DImode)
>      || aarch64_reg_or_zero (operands[1], DImode))"
>    "@
> @@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64"
>     mov\\t%x0, %1
>     mov\\t%w0, %1
>     #
> +   #
>     * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
>     ldr\\t%x0, %1
>     ldr\\t%d0, %1
> @@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64"
>     fmov\\t%d0, %d1
>     * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
>     "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
> DImode))
> -    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
> +    && REG_P (operands[0])
> +    && (GP_REGNUM_P (REGNO (operands[0]))
> +        || (can_create_pseudo_p ()
> +            && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))"
>     [(const_int 0)]
>     "{
> -       aarch64_expand_mov_immediate (operands[0], operands[1]);
> +       if (GP_REGNUM_P (REGNO (operands[0])))
> +         aarch64_expand_mov_immediate (operands[0], operands[1]);
> +       else
> +         {
> +           rtx mem = force_const_mem (DImode, operands[1]);
> +           gcc_assert (mem);
> +           emit_move_insn (operands[0], mem);
> +         }
>         DONE;
>      }"
>    ;; The "mov_imm" type for CNTD is just a placeholder.
> -  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
> +  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm,
>                    load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
>                    neon_move")
> -   (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
> +   (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
>  )
>  
>  (define_insn "insv_imm<mode>"
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -474,6 +474,14 @@ (define_address_constraint "Dp"
>   An address valid for a prefetch instruction."
>   (match_test "aarch64_address_valid_for_prefetch_p (op, true)"))
>  
> +(define_constraint "Dx"
> +  "@internal
> + A constraint that matches an integer immediate operand not valid\
> + for AdvSIMD scalar operations in DImode."
> + (and (match_code "const_int")
> +      (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)")
> +      (match_test "!aarch64_move_imm (INTVAL (op), DImode)")))
> +
>  (define_constraint "vgb"
>    "@internal
>     A constraint that matches an immediate offset valid for SVE LD1B
