Hi All, Consider the following case
#include <arm_neon.h> uint64_t test4 (uint8x16_t input) { uint8x16_t bool_input = vshrq_n_u8(input, 7); poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL); poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0), vgetq_lane_p64(mask, 0)); poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask); uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH); return vget_lane_u16((uint16x4_t)res, 3); } which generates (after my CSE patches): test4: ushr v0.16b, v0.16b, 7 mov x0, 16512 movk x0, 0x1020, lsl 16 movk x0, 0x408, lsl 32 movk x0, 0x102, lsl 48 fmov d1, x0 pmull v2.1q, v0.1d, v1.1d dup v1.2d, v1.d[0] pmull2 v0.1q, v0.2d, v1.2d trn2 v2.8b, v2.8b, v0.8b umov w0, v2.h[3] ret which is suboptimal since the constant is never needed on the genreg side and should have been materialized on the SIMD side, since the constant is so big that it requires 5 instructions to create otherwise: 4 mov/movk and one fmov. The problem is that the choice of which side to materialize the constant on can only be made during reload. We may need an extra register (to hold the addressing) and so this can't be done after reload. I have tried to support this with a pattern during reload, but the problem is I can't seem to find a way to tell reload it should spill a constant under condition x. Instead I tried with a split which reload selects when the condition holds. This has a few issues: 1. The pattern can be expanded late (could be fixed with !reload_completed). 2. Because it's split so late we can't seem to be able to share the anchors for the ADRP. 3. Because it's split so late, reload basically doesn't know about the spill and so the ADD lo12 isn't pushed into the addressing mode of the LDR. I don't know how to properly fix these since I think the only way is for reload to do the spill properly itself, but in this case not having the pattern makes it avoid the mem pattern and pick r <- n instead followed by r -> w. 
Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64.md (*movdi_aarch64): Add Dx -> w. * config/aarch64/constraints.md (Dx): New. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64" ) (define_insn_and_split "*movdi_aarch64" - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m, r, r, w,r,w, w") - (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))] + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w ,r ,r,w, m,m, r, r, w,r,w,w") + (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))] "(register_operand (operands[0], DImode) || aarch64_reg_or_zero (operands[1], DImode))" "@ @@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64" mov\\t%x0, %1 mov\\t%w0, %1 # + # * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]); ldr\\t%x0, %1 ldr\\t%d0, %1 @@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64" fmov\\t%d0, %d1 * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);" "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode)) - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + && REG_P (operands[0]) + && (GP_REGNUM_P (REGNO (operands[0])) + || (can_create_pseudo_p () + && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))" [(const_int 0)] "{ - aarch64_expand_mov_immediate (operands[0], operands[1]); + if (GP_REGNUM_P (REGNO (operands[0]))) + aarch64_expand_mov_immediate (operands[0], operands[1]); + else + { + rtx mem = force_const_mem (DImode, operands[1]); + gcc_assert (mem); + emit_move_insn (operands[0], mem); + } DONE; }" ;; The "mov_imm" type for CNTD is just a placeholder. 
- [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm, + [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm, load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov, neon_move") - (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")] + (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")] ) (define_insn "insv_imm<mode>" diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -474,6 +474,14 @@ (define_address_constraint "Dp" An address valid for a prefetch instruction." (match_test "aarch64_address_valid_for_prefetch_p (op, true)")) +(define_constraint "Dx" + "@internal + A constraint that matches an integer immediate operand not valid\ + for AdvSIMD scalar operations in DImode." + (and (match_code "const_int") + (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)") + (match_test "!aarch64_move_imm (INTVAL (op), DImode)"))) + (define_constraint "vgb" "@internal A constraint that matches an immediate offset valid for SVE LD1B --
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index eb8ccd4b97bbd4f0c3ff5791e48cfcfb42ec6c2e..a18886cb65c86daa16baa1691b1718f2d3a1be6c 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1298,8 +1298,8 @@ (define_insn_and_split "*movsi_aarch64" ) (define_insn_and_split "*movdi_aarch64" - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m, r, r, w,r,w, w") - (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))] + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w ,r ,r,w, m,m, r, r, w,r,w,w") + (match_operand:DI 1 "aarch64_mov_operand" " r,r,k,N,M,n,Dx,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))] "(register_operand (operands[0], DImode) || aarch64_reg_or_zero (operands[1], DImode))" "@ @@ -1309,6 +1309,7 @@ (define_insn_and_split "*movdi_aarch64" mov\\t%x0, %1 mov\\t%w0, %1 # + # * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]); ldr\\t%x0, %1 ldr\\t%d0, %1 @@ -1321,17 +1322,27 @@ (define_insn_and_split "*movdi_aarch64" fmov\\t%d0, %d1 * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);" "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode)) - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + && REG_P (operands[0]) + && (GP_REGNUM_P (REGNO (operands[0])) + || (can_create_pseudo_p () + && !aarch64_can_const_movi_rtx_p (operands[1], DImode)))" [(const_int 0)] "{ - aarch64_expand_mov_immediate (operands[0], operands[1]); + if (GP_REGNUM_P (REGNO (operands[0]))) + aarch64_expand_mov_immediate (operands[0], operands[1]); + else + { + rtx mem = force_const_mem (DImode, operands[1]); + gcc_assert (mem); + emit_move_insn (operands[0], mem); + } DONE; }" ;; The "mov_imm" type for CNTD is just a placeholder. 
- [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm, + [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,mov_imm, load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov, neon_move") - (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")] + (set_attr "arch" "*,*,*,*,*,*,simd,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")] ) (define_insn "insv_imm<mode>" diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index 3b49b452119c49320020fa9183314d9a25b92491..422d95b50a8e9608b57f0f39745c89d58ea1e8a4 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -474,6 +474,14 @@ (define_address_constraint "Dp" An address valid for a prefetch instruction." (match_test "aarch64_address_valid_for_prefetch_p (op, true)")) +(define_constraint "Dx" + "@internal + A constraint that matches an integer immediate operand not valid\ + for AdvSIMD scalar operations in DImode." + (and (match_code "const_int") + (match_test "!aarch64_can_const_movi_rtx_p (op, DImode)") + (match_test "!aarch64_move_imm (INTVAL (op), DImode)"))) + (define_constraint "vgb" "@internal A constraint that matches an immediate offset valid for SVE LD1B