Wilco Dijkstra <[email protected]> writes: > ping > > > From: Wilco Dijkstra > Sent: 04 June 2021 14:44 > To: Richard Sandiford <[email protected]> > Cc: Kyrylo Tkachov <[email protected]>; GCC Patches > <[email protected]> > Subject: [PATCH v3] AArch64: Improve GOT addressing > > Hi Richard, > > This merges the v1 and v2 patches and removes the spurious MEM from > ldr_got_small_si/di. This has been rebased after [1], and the performance > gain has now doubled. > > [1] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/571708.html > > Improve GOT addressing by treating the instructions as a pair. This reduces > register pressure and improves code quality significantly. SPECINT2017 > improves > by 0.6% with -fPIC and codesize is 0.73% smaller. Perlbench has 0.9% smaller > codesize, 1.5% fewer executed instructions and is 1.8% faster on Neoverse N1. > > Passes bootstrap and regress. OK for commit?
Looks like everyone agrees that https://github.com/ARM-software/abi-aa/pull/106 should go in in some form, so I think it's OK for GCC to keep the instructions together. Some comments on the implementation though. (We might have covered these earlier, sorry if so.) - Why do we rewrite the constant moves after reload into ldr_got_small_sidi and ldr_got_small_<mode>? Couldn't we just get the move patterns to output the sequence directly? - I think we should leave out the rtx_costs part and deal with that separately. This patch should just be about whether we emit two separate define_insns for the move or whether we keep a single one (to support relaxation). Thanks, Richard > > ChangeLog: > 2021-06-04 Wilco Dijkstra <[email protected]> > > * config/aarch64/aarch64.md (movsi): Split GOT accesses after reload. > (movdi): Likewise. > (ldr_got_small_<mode>): Remove MEM and LO_SUM, emit ADRP+LDR GOT > sequence. > (ldr_got_small_sidi): Likewise. > * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay > splitting of GOT accesses until after reload. Remove tmp_reg and MEM. > (aarch64_print_operand): Correctly print got_lo12 in L specifier. > (aarch64_rtx_costs): Set rematerialization cost for GOT accesses. > (aarch64_mov_operand_p): Make GOT accesses valid move operands. > > --- > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > 08245827daa3f8199b29031e754244c078f0f500..11ea33c70fb06194fadfe94322fdfa098e5320fc > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, > > case SYMBOL_SMALL_GOT_4G: > { > + /* Use movdi for GOT accesses until after reload - this improves > + CSE and rematerialization. */ > + if (!reload_completed) > + { > + emit_insn (gen_rtx_SET (dest, imm)); > + return; > + } > + > /* In ILP32, the mode of dest can be either SImode or DImode, > while the got entry is always of SImode size. 
The mode of > dest depends on how dest is used: if dest is assigned to a > @@ -3624,34 +3632,21 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, > patterns here (two patterns for ILP32). */ > > rtx insn; > - rtx mem; > - rtx tmp_reg = dest; > machine_mode mode = GET_MODE (dest); > > - if (can_create_pseudo_p ()) > - tmp_reg = gen_reg_rtx (mode); > - > - emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm)); > if (mode == ptr_mode) > { > if (mode == DImode) > - insn = gen_ldr_got_small_di (dest, tmp_reg, imm); > + insn = gen_ldr_got_small_di (dest, imm); > else > - insn = gen_ldr_got_small_si (dest, tmp_reg, imm); > - > - mem = XVECEXP (SET_SRC (insn), 0, 0); > + insn = gen_ldr_got_small_si (dest, imm); > } > else > { > gcc_assert (mode == Pmode); > - > - insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm); > - mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0); > + insn = gen_ldr_got_small_sidi (dest, imm); > } > > - gcc_assert (MEM_P (mem)); > - MEM_READONLY_P (mem) = 1; > - MEM_NOTRAP_P (mem) = 1; > emit_insn (insn); > return; > } > @@ -11019,7 +11014,7 @@ aarch64_print_operand (FILE *f, rtx x, int code) > switch (aarch64_classify_symbolic_expression (x)) > { > case SYMBOL_SMALL_GOT_4G: > - asm_fprintf (asm_out_file, ":lo12:"); > + asm_fprintf (asm_out_file, ":got_lo12:"); > break; > > case SYMBOL_SMALL_TLSGD: > @@ -13452,6 +13447,12 @@ cost_plus: > > case SYMBOL_REF: > *cost = 0; > + > + /* Use a separate rematerialization cost for GOT accesses. */ > + if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC > + && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G) > + *cost = COSTS_N_INSNS (1) / 2; > + > return true; > > case HIGH: > @@ -19907,6 +19908,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode) > return aarch64_simd_valid_immediate (x, NULL); > } > > + /* GOT accesses are valid moves until after regalloc. 
*/ > + if (SYMBOL_REF_P (x) > + && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G) > + return true; > + > x = strip_salt (x); > if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x)) > return true; > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index > abfd84526745d029ad4953eabad6dd17b159a218..30effca6f3562f6870a6cc8097750e63bb0d424d > 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64" > fmov\\t%w0, %s1 > fmov\\t%s0, %s1 > * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);" > - "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), > SImode) > - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" > + "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), > SImode) > + && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))) > + || (reload_completed > + && (aarch64_classify_symbolic_expression (operands[1]) > + == SYMBOL_SMALL_GOT_4G))" > [(const_int 0)] > "{ > aarch64_expand_mov_immediate (operands[0], operands[1]); > @@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64" > fmov\\t%x0, %d1 > fmov\\t%d0, %d1 > * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);" > - "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), > DImode)) > - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" > + "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), > DImode) > + && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))) > + || (reload_completed > + && (aarch64_classify_symbolic_expression (operands[1]) > + == SYMBOL_SMALL_GOT_4G))" > [(const_int 0)] > "{ > aarch64_expand_mov_immediate (operands[0], operands[1]); > @@ -6703,27 +6709,26 @@ (define_insn "add_losym_<mode>" > [(set_attr "type" "alu_imm")] > ) > > +;;; GOT accesses use a single insn so linkers can relax GOT 
relocations. > (define_insn "ldr_got_small_<mode>" > [(set (match_operand:PTR 0 "register_operand" "=r") > - (unspec:PTR [(mem:PTR (lo_sum:PTR > - (match_operand:PTR 1 "register_operand" "r") > - (match_operand:PTR 2 "aarch64_valid_symref" > "S")))] > + (unspec:PTR [(match_operand:PTR 1 "aarch64_valid_symref" "S")] > UNSPEC_GOTSMALLPIC))] > "" > - "ldr\\t%<w>0, [%1, #:got_lo12:%c2]" > - [(set_attr "type" "load_<ldst_sz>")] > + "adrp\\t%0, %A1\;ldr\\t%<w>0, [%0, %L1]" > + [(set_attr "type" "load_<ldst_sz>") > + (set_attr "length" "8")] > ) > > (define_insn "ldr_got_small_sidi" > [(set (match_operand:DI 0 "register_operand" "=r") > (zero_extend:DI > - (unspec:SI [(mem:SI (lo_sum:DI > - (match_operand:DI 1 "register_operand" "r") > - (match_operand:DI 2 "aarch64_valid_symref" > "S")))] > + (unspec:SI [(match_operand:DI 1 "aarch64_valid_symref" "S")] > UNSPEC_GOTSMALLPIC)))] > "TARGET_ILP32" > - "ldr\\t%w0, [%1, #:got_lo12:%c2]" > - [(set_attr "type" "load_4")] > + "adrp\\t%0, %A1\;ldr\\t%w0, [%0, %L1]" > + [(set_attr "type" "load_4") > + (set_attr "length" "8")] > ) > > (define_insn "ldr_got_small_28k_<mode>"
