Hi Richard,

This merges the v1 and v2 patches and removes the spurious MEM from ldr_got_small_si/di. It has been rebased on top of [1], and the performance gain has now doubled.
[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/571708.html Improve GOT addressing by treating the instructions as a pair. This reduces register pressure and improves code quality significantly. SPECINT2017 improves by 0.6% with -fPIC and codesize is 0.73% smaller. Perlbench has 0.9% smaller codesize, 1.5% fewer executed instructions and is 1.8% faster on Neoverse N1. Passes bootstrap and regress. OK for commit? ChangeLog: 2021-06-04 Wilco Dijkstra <wdijk...@arm.com> * config/aarch64/aarch64.md (movsi): Split GOT accesses after reload. (movdi): Likewise. (ldr_got_small_<mode>): Remove MEM and LO_SUM, emit ADRP+LDR GOT sequence. (ldr_got_small_sidi): Likewise. * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay splitting of GOT accesses until after reload. Remove tmp_reg and MEM. (aarch64_print_operand): Correctly print got_lo12 in L specifier. (aarch64_rtx_costs): Set rematerialization cost for GOT accesses. (aarch64_mov_operand_p): Make GOT accesses valid move operands. --- diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 08245827daa3f8199b29031e754244c078f0f500..11ea33c70fb06194fadfe94322fdfa098e5320fc 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, case SYMBOL_SMALL_GOT_4G: { + /* Use movdi for GOT accesses until after reload - this improves + CSE and rematerialization. */ + if (!reload_completed) + { + emit_insn (gen_rtx_SET (dest, imm)); + return; + } + /* In ILP32, the mode of dest can be either SImode or DImode, while the got entry is always of SImode size. The mode of dest depends on how dest is used: if dest is assigned to a @@ -3624,34 +3632,21 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, patterns here (two patterns for ILP32). 
*/ rtx insn; - rtx mem; - rtx tmp_reg = dest; machine_mode mode = GET_MODE (dest); - if (can_create_pseudo_p ()) - tmp_reg = gen_reg_rtx (mode); - - emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm)); if (mode == ptr_mode) { if (mode == DImode) - insn = gen_ldr_got_small_di (dest, tmp_reg, imm); + insn = gen_ldr_got_small_di (dest, imm); else - insn = gen_ldr_got_small_si (dest, tmp_reg, imm); - - mem = XVECEXP (SET_SRC (insn), 0, 0); + insn = gen_ldr_got_small_si (dest, imm); } else { gcc_assert (mode == Pmode); - - insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm); - mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0); + insn = gen_ldr_got_small_sidi (dest, imm); } - gcc_assert (MEM_P (mem)); - MEM_READONLY_P (mem) = 1; - MEM_NOTRAP_P (mem) = 1; emit_insn (insn); return; } @@ -11019,7 +11014,7 @@ aarch64_print_operand (FILE *f, rtx x, int code) switch (aarch64_classify_symbolic_expression (x)) { case SYMBOL_SMALL_GOT_4G: - asm_fprintf (asm_out_file, ":lo12:"); + asm_fprintf (asm_out_file, ":got_lo12:"); break; case SYMBOL_SMALL_TLSGD: @@ -13452,6 +13447,12 @@ cost_plus: case SYMBOL_REF: *cost = 0; + + /* Use a separate rematerialization cost for GOT accesses. */ + if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC + && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G) + *cost = COSTS_N_INSNS (1) / 2; + return true; case HIGH: @@ -19907,6 +19908,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode) return aarch64_simd_valid_immediate (x, NULL); } + /* GOT accesses are valid moves until after regalloc. 
*/ + if (SYMBOL_REF_P (x) + && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G) + return true; + x = strip_salt (x); if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x)) return true; diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index abfd84526745d029ad4953eabad6dd17b159a218..30effca6f3562f6870a6cc8097750e63bb0d424d 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64" fmov\\t%w0, %s1 fmov\\t%s0, %s1 * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);" - "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode) - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode) + && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))) + || (reload_completed + && (aarch64_classify_symbolic_expression (operands[1]) + == SYMBOL_SMALL_GOT_4G))" [(const_int 0)] "{ aarch64_expand_mov_immediate (operands[0], operands[1]); @@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64" fmov\\t%x0, %d1 fmov\\t%d0, %d1 * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);" - "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode)) - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode) + && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))) + || (reload_completed + && (aarch64_classify_symbolic_expression (operands[1]) + == SYMBOL_SMALL_GOT_4G))" [(const_int 0)] "{ aarch64_expand_mov_immediate (operands[0], operands[1]); @@ -6703,27 +6709,26 @@ (define_insn "add_losym_<mode>" [(set_attr "type" "alu_imm")] ) +;;; GOT accesses use a single insn so linkers can relax GOT relocations. 
(define_insn "ldr_got_small_<mode>" [(set (match_operand:PTR 0 "register_operand" "=r") - (unspec:PTR [(mem:PTR (lo_sum:PTR - (match_operand:PTR 1 "register_operand" "r") - (match_operand:PTR 2 "aarch64_valid_symref" "S")))] + (unspec:PTR [(match_operand:PTR 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLPIC))] "" - "ldr\\t%<w>0, [%1, #:got_lo12:%c2]" - [(set_attr "type" "load_<ldst_sz>")] + "adrp\\t%0, %A1\;ldr\\t%<w>0, [%0, %L1]" + [(set_attr "type" "load_<ldst_sz>") + (set_attr "length" "8")] ) (define_insn "ldr_got_small_sidi" [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI - (unspec:SI [(mem:SI (lo_sum:DI - (match_operand:DI 1 "register_operand" "r") - (match_operand:DI 2 "aarch64_valid_symref" "S")))] + (unspec:SI [(match_operand:DI 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLPIC)))] "TARGET_ILP32" - "ldr\\t%w0, [%1, #:got_lo12:%c2]" - [(set_attr "type" "load_4")] + "adrp\\t%0, %A1\;ldr\\t%w0, [%0, %L1]" + [(set_attr "type" "load_4") + (set_attr "length" "8")] ) (define_insn "ldr_got_small_28k_<mode>"