https://gcc.gnu.org/g:873cffc79209119a65aa657b0d427345e52b75c3
commit r15-5569-g873cffc79209119a65aa657b0d427345e52b75c3 Author: Georg-Johann Lay <a...@gjlay.de> Date: Wed Nov 20 12:25:18 2024 +0100 AVR: target/117726 - Better optimizations of ASHIFT:SI insns. This patch improves the 4-byte ASHIFT insns. 1) It adds a "r,r,C15" alternative for improved long << 15. 2) It adds 3-operand alternatives (depending on options) and splits them after peephole2 / before avr-fuse-move into a 3-operand byte shift and a 2-operand residual bit shift. For better control, it introduces new option -msplit-bit-shift that's activated at -O2 and higher by default. 2) is even performed with -Os, but not with -Oz. PR target/117726 gcc/ * config/avr/avr.opt (-msplit-bit-shift): Add new optimization option. * common/config/avr/avr-common.cc (avr_option_optimization_table) [OPT_LEVELS_2_PLUS]: Turn on -msplit-bit-shift. * config/avr/avr.h (machine_function.n_avr_fuse_add_executed): New int component. * config/avr/avr.md (attr "isa") <2op, 3op>: Add new values. (attr "enabled"): Handle them. (ashlsi3, *ashlsi3, *ashlsi3_const): Add "r,r,C15" alternative. Add "r,0,C4l" and "r,r,C4l" alternatives (depending on 2op / 3op). (define_split) [avr_split_bit_shift]: Add 2 new ashift:ALL4 splitters. (define_peephole2) [ashift:ALL4]: Add (match_dup 3) so that the scratch won't overlap with the output operand of the matched insn. (*ashl<mode>3_const_split): Remove unused ashift:ALL4 splitter. * config/avr/avr-passes.cc (emit_valid_insn) (emit_valid_move_clobbercc): Move out of anonymous namespace. (make_avr_pass_fuse_add) <gate>: Don't override. <execute>: Set n_avr_fuse_add_executed according to func->machine->n_avr_fuse_add_executed. (pass_data avr_pass_data_split_after_peephole2): New object. (avr_pass_split_after_peephole2): New rtl_opt_pass. (avr_emit_shift): New static function. (avr_shift_is_3op, avr_split_shift_p, avr_split_shift) (make_avr_pass_split_after_peephole2): New functions. 
* config/avr/avr-passes.def (avr_pass_split_after_peephole2): Insert new pass after pass_peephole2. * config/avr/avr-protos.h (n_avr_fuse_add_executed, avr_shift_is_3op, avr_split_shift_p) (avr_split_shift, avr_optimize_size_level) (make_avr_pass_split_after_peephole2): New prototypes. * config/avr/avr.cc (n_avr_fuse_add_executed): New global variable. (avr_optimize_size_level): New function. (avr_set_current_function): Set n_avr_fuse_add_executed according to cfun->machine->n_avr_fuse_add_executed. (ashlsi3_out) [case 15]: Output optimized code for this offset. (avr_rtx_costs_1) [ASHIFT, SImode]: Adjust costs of offsets 15, 16. * config/avr/constraints.md (C4a, C4r, C4l): New constraints. * pass_manager.h (pass_manager): Adjust comments. Diff: --- gcc/common/config/avr/avr-common.cc | 1 + gcc/config/avr/avr-passes.cc | 214 +++++++++++++++++++++++++++++++----- gcc/config/avr/avr-passes.def | 7 ++ gcc/config/avr/avr-protos.h | 8 ++ gcc/config/avr/avr.cc | 56 +++++++++- gcc/config/avr/avr.h | 6 + gcc/config/avr/avr.md | 204 +++++++++++++++++++++------------- gcc/config/avr/avr.opt | 4 + gcc/config/avr/constraints.md | 16 +++ gcc/pass_manager.h | 2 +- 10 files changed, 407 insertions(+), 111 deletions(-) diff --git a/gcc/common/config/avr/avr-common.cc b/gcc/common/config/avr/avr-common.cc index 333f950c80e3..54c99bd0b4af 100644 --- a/gcc/common/config/avr/avr-common.cc +++ b/gcc/common/config/avr/avr-common.cc @@ -39,6 +39,7 @@ static const struct default_options avr_option_optimization_table[] = { OPT_LEVELS_2_PLUS, OPT_mfuse_add_, NULL, 2 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_mfuse_move_, NULL, 3 }, { OPT_LEVELS_2_PLUS, OPT_mfuse_move_, NULL, 23 }, + { OPT_LEVELS_2_PLUS, OPT_msplit_bit_shift, NULL, 1 }, // Stick to the "old" placement of the subreg lowering pass. { OPT_LEVELS_1_PLUS, OPT_fsplit_wide_types_early, NULL, 1 }, /* Allow optimizer to introduce store data races. 
This used to be the diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc index 49473efbb0d3..b854f186a7ac 100644 --- a/gcc/config/avr/avr-passes.cc +++ b/gcc/config/avr/avr-passes.cc @@ -49,6 +49,34 @@ #define FIRST_GPR (AVR_TINY ? REG_18 : REG_2) + +// Emit pattern PAT, and ICE when the insn is not valid / not recognized. + +static rtx_insn * +emit_valid_insn (rtx pat) +{ + rtx_insn *insn = emit_insn (pat); + + if (! valid_insn_p (insn)) // Also runs recog(). + fatal_insn ("emit unrecognizable insn", insn); + + return insn; +} + +// Emit a single_set with an optional scratch operand. This function +// asserts that the new insn is valid and recognized. + +static rtx_insn * +emit_valid_move_clobbercc (rtx dest, rtx src, rtx scratch = NULL_RTX) +{ + rtx pat = scratch + ? gen_gen_move_clobbercc_scratch (dest, src, scratch) + : gen_gen_move_clobbercc (dest, src); + + return emit_valid_insn (pat); +} + + namespace { @@ -116,31 +144,6 @@ single_set_with_scratch (rtx_insn *insn, int ®no_scratch) return single_set (insn); } -// Emit pattern PAT, and ICE when the insn is not valid / not recognized. - -static rtx_insn * -emit_valid_insn (rtx pat) -{ - rtx_insn *insn = emit_insn (pat); - - if (! valid_insn_p (insn)) // Also runs recog(). - fatal_insn ("emit unrecognizable insn", insn); - - return insn; -} - -// Emit a single_set with an optional scratch operand. This function -// asserts that the new insn is valid and recognized. - -static rtx_insn * -emit_valid_move_clobbercc (rtx dest, rtx src, rtx scratch = NULL_RTX) -{ - rtx pat = scratch - ? gen_gen_move_clobbercc_scratch (dest, src, scratch) - : gen_gen_move_clobbercc (dest, src); - - return emit_valid_insn (pat); -} // One bit for each GRP in REG_0 ... REG_31. 
using gprmask_t = uint32_t; @@ -4213,12 +4216,17 @@ public: return make_avr_pass_fuse_add (m_ctxt); } - bool gate (function *) final override + unsigned int execute (function *func) final override { - return optimize && avr_fuse_add > 0; + func->machine->n_avr_fuse_add_executed += 1; + n_avr_fuse_add_executed = func->machine->n_avr_fuse_add_executed; + + if (optimize && avr_fuse_add > 0) + return execute1 (func); + return 0; } - unsigned int execute (function *) final override; + unsigned int execute1 (function *); struct Some_Insn { @@ -4697,7 +4705,7 @@ avr_pass_fuse_add::fuse_mem_add (Mem_Insn &mem, Add_Insn &add) as PRE_DEC + PRE_DEC for two adjacent locations. */ unsigned int -avr_pass_fuse_add::execute (function *func) +avr_pass_fuse_add::execute1 (function *func) { df_note_add_problem (); df_analyze (); @@ -4769,6 +4777,146 @@ avr_pass_fuse_add::execute (function *func) } + +////////////////////////////////////////////////////////////////////////////// +// Split insns after peephole2 / befor avr-fuse-move. +static const pass_data avr_pass_data_split_after_peephole2 = +{ + RTL_PASS, // type + "", // name (will be patched) + OPTGROUP_NONE, // optinfo_flags + TV_DF_SCAN, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + 0 // todo_flags_finish +}; + +class avr_pass_split_after_peephole2 : public rtl_opt_pass +{ +public: + avr_pass_split_after_peephole2 (gcc::context *ctxt, const char *name) + : rtl_opt_pass (avr_pass_data_split_after_peephole2, ctxt) + { + this->name = name; + } + + unsigned int execute (function *) final override + { + if (avr_shift_is_3op ()) + split_all_insns (); + return 0; + } + +}; // avr_pass_split_after_peephole2 + +} // anonymous namespace + + +/* Whether some shift insn alternatives are a 3-operand insn or a + 2-operand insn. 
This 3op alternatives allow the source and the + destination register of the shift to be different right from the + start, because the splitter will split the 3op shift into a 3op byte + shift and a 2op residual bit shift. + (When the residual shift has an offset of one less than the bitsize, + then the residual shift is also a 3op insn. */ + +bool +avr_shift_is_3op () +{ + // Don't split for OPTIMIZE_SIZE_MAX (-Oz). + // For OPTIMIZE_SIZE_BALANCED (-Os), we still split because + // the size overhead (if exists at all) is marginal. + + return (avr_split_bit_shift + && optimize > 0 + && avr_optimize_size_level () < OPTIMIZE_SIZE_MAX); +} + + +/* Implement constraints `C4a', `C4l' and `C4r'. + Whether we split an N_BYTES shift of code CODE in { ASHIFTRT, + LSHIFTRT, ASHIFT } into a byte shift and a residual bit shift. */ + +bool +avr_split_shift_p (int n_bytes, int offset, rtx_code) +{ + gcc_assert (n_bytes == 4); + + return (avr_shift_is_3op () + && offset % 8 != 0 && IN_RANGE (offset, 17, 30)); +} + + +static void +avr_emit_shift (rtx_code code, rtx dest, rtx src, int off, rtx scratch) +{ + machine_mode mode = GET_MODE (dest); + rtx shift; + + if (off == GET_MODE_BITSIZE (mode) - 1) + { + shift = gen_rtx_fmt_ee (code, mode, src, GEN_INT (off)); + } + else + { + if (REGNO (dest) != REGNO (src)) + emit_valid_move_clobbercc (dest, src); + shift = gen_rtx_fmt_ee (code, mode, dest, GEN_INT (off)); + } + + emit_valid_move_clobbercc (dest, shift, scratch); +} + + +/* Worker for define_split that run when -msplit-bit-shift is on. + Split a shift of code CODE into a 3op byte shift and a residual bit shift. + Return 'true' when a split has been performed and insns have been emitted. + Otherwise, return 'false'. */ + +bool +avr_split_shift (rtx xop[], rtx scratch, rtx_code code) +{ + scratch = scratch && REG_P (scratch) ? 
scratch : NULL_RTX; + rtx dest = xop[0]; + rtx src = xop[1]; + int ioff = INTVAL (xop[2]); + + gcc_assert (GET_MODE_SIZE (GET_MODE (dest)) == 4); + + if (code == ASHIFT) + { + if (ioff >= 25) + { + rtx dst8 = avr_byte (dest, 3); + rtx src8 = avr_byte (src, 0); + avr_emit_shift (code, dst8, src8, ioff % 8, NULL_RTX); + emit_valid_move_clobbercc (avr_byte (dest, 2), const0_rtx); + emit_valid_move_clobbercc (avr_word (dest, 0), const0_rtx); + return true; + } + else if (ioff >= 17) + { + rtx dst16 = avr_word (dest, 2); + rtx src16 = avr_word (src, 0); + avr_emit_shift (code, dst16, src16, ioff % 16, scratch); + emit_valid_move_clobbercc (avr_word (dest, 0), const0_rtx); + return true; + } + else + gcc_unreachable (); + } + else + gcc_unreachable (); + + return false; +} + + +namespace +{ + ////////////////////////////////////////////////////////////////////////////// // Determine whether an ISR may use the __gcc_isr pseudo-instruction. @@ -5125,3 +5273,11 @@ make_avr_pass_fuse_move (gcc::context *ctxt) { return new avr_pass_fuse_move (ctxt, "avr-fuse-move"); } + +// Split insns after peephole2 / befor avr-fuse-move. + +rtl_opt_pass * +make_avr_pass_split_after_peephole2 (gcc::context *ctxt) +{ + return new avr_pass_split_after_peephole2 (ctxt, "avr-split-after-peephole2"); +} diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def index 857e6b521238..be8278370b01 100644 --- a/gcc/config/avr/avr-passes.def +++ b/gcc/config/avr/avr-passes.def @@ -104,3 +104,10 @@ INSERT_PASS_BEFORE (pass_split_after_reload, 1, avr_pass_ifelse); - The RTL peepholer may optimize insns involving lower registers. */ INSERT_PASS_AFTER (pass_peephole2, 1, avr_pass_fuse_move); + + /* Run an instance of post-reload split prior to avr-fuse-move. + Purpose is to split 3-operand shift insns into a 3-operand shift + with a byte offset, and a 2-operand residual shift after + RTL peepholes but prior to the avr-fuse-move pass. 
*/ + +INSERT_PASS_AFTER (pass_peephole2, 1, avr_pass_split_after_peephole2); diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h index d316e0182a23..c9bb5bc5139a 100644 --- a/gcc/config/avr/avr-protos.h +++ b/gcc/config/avr/avr-protos.h @@ -169,6 +169,13 @@ extern rtx cc_reg_rtx; extern rtx ccn_reg_rtx; extern rtx cczn_reg_rtx; +extern int n_avr_fuse_add_executed; +extern bool avr_shift_is_3op (); +extern bool avr_split_shift_p (int n_bytes, int offset, rtx_code); +extern bool avr_split_shift (rtx xop[], rtx xscratch, rtx_code); + +extern int avr_optimize_size_level (); + #endif /* RTX_CODE */ #ifdef REAL_VALUE_TYPE @@ -188,6 +195,7 @@ extern rtl_opt_pass *make_avr_pass_pre_proep (gcc::context *); extern rtl_opt_pass *make_avr_pass_recompute_notes (gcc::context *); extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *); extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *); +extern rtl_opt_pass *make_avr_pass_split_after_peephole2 (gcc::context *); #ifdef RTX_CODE extern bool avr_casei_sequence_check_operands (rtx *xop); extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands); diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc index 508e2d147bff..d74b20e798e5 100644 --- a/gcc/config/avr/avr.cc +++ b/gcc/config/avr/avr.cc @@ -229,6 +229,12 @@ bool avr_need_clear_bss_p = false; bool avr_need_copy_data_p = false; bool avr_has_rodata_p = false; +/* Counts how often pass avr-fuse-add has been executed. Is is kept in + sync with cfun->machine->n_avr_fuse_add_executed and serves as an + insn condition for shift insn splitters. */ +int n_avr_fuse_add_executed = 0; + + /* Transform UP into lowercase and write the result to LO. You must provide enough space for LO. Return LO. */ @@ -526,6 +532,14 @@ avr_option_override (void) } +int avr_optimize_size_level () +{ + return cfun && cfun->decl + ? opt_for_fn (cfun->decl, optimize_size) + : optimize_size; +} + + /* Implement `INIT_EXPANDERS'. 
*/ /* The function works like a singleton. */ @@ -823,8 +837,12 @@ avr_set_current_function (tree decl) if (decl == NULL_TREE || current_function_decl == NULL_TREE || current_function_decl == error_mark_node - || ! cfun->machine - || cfun->machine->attributes_checked_p) + || ! cfun->machine) + return; + + n_avr_fuse_add_executed = cfun->machine->n_avr_fuse_add_executed; + + if (cfun->machine->attributes_checked_p) return; location_t loc = DECL_SOURCE_LOCATION (decl); @@ -6590,7 +6608,7 @@ avr_out_cmp_ext (rtx xop[], rtx_code code, int *plen) /* Generate asm equivalent for various shifts. This only handles cases - that are not already carefully hand-optimized in ?sh??i3_out. + that are not already carefully hand-optimized in ?sh<mode>3_out. OPERANDS[0] resp. %0 in TEMPL is the operand to be shifted. OPERANDS[2] is the shift count as CONST_INT, MEM or REG. @@ -7042,6 +7060,7 @@ ashlsi3_out (rtx_insn *insn, rtx operands[], int *plen) { int reg0 = true_regnum (operands[0]); int reg1 = true_regnum (operands[1]); + bool reg1_unused_after_p = reg_unused_after (insn, operands[1]); if (plen) *plen = 0; @@ -7070,6 +7089,30 @@ ashlsi3_out (rtx_insn *insn, rtx operands[], int *plen) "mov %B0,%A1" CR_TAB "mov %C0,%B1" CR_TAB "mov %D0,%C1", operands, plen, 4); + case 15: + avr_asm_len (reg1_unused_after_p + ? "lsr %C1" + : "bst %C1,0", operands, plen, 1); + if (reg0 + 2 != reg1) + { + if (AVR_HAVE_MOVW) + avr_asm_len ("movw %C0,%A1", operands, plen, 1); + else + avr_asm_len ("mov %C0,%A1" CR_TAB + "mov %D0,%B1", operands, plen, 2); + } + return reg1_unused_after_p + ? 
avr_asm_len ("clr %A0" CR_TAB + "clr %B0" CR_TAB + "ror %D0" CR_TAB + "ror %C0" CR_TAB + "ror %B0", operands, plen, 5) + : avr_asm_len ("clr %A0" CR_TAB + "clr %B0" CR_TAB + "lsr %D0" CR_TAB + "ror %C0" CR_TAB + "ror %B0" CR_TAB + "bld %D0,7", operands, plen, 6); case 16: if (reg0 + 2 == reg1) return avr_asm_len ("clr %B0" CR_TAB @@ -12392,9 +12435,14 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code, break; case 1: case 8: - case 16: *total = COSTS_N_INSNS (4); break; + case 15: + *total = COSTS_N_INSNS (8 - AVR_HAVE_MOVW); + break; + case 16: + *total = COSTS_N_INSNS (4 - AVR_HAVE_MOVW); + break; case 31: *total = COSTS_N_INSNS (6); break; diff --git a/gcc/config/avr/avr.h b/gcc/config/avr/avr.h index df0462259db5..7d887a6579ce 100644 --- a/gcc/config/avr/avr.h +++ b/gcc/config/avr/avr.h @@ -610,6 +610,12 @@ struct GTY(()) machine_function /* 'true' if this function references .L__stack_usage like with __builtin_return_address. */ bool use_L__stack_usage; + + /* Counts how many times the execute() method of the avr-fuse-add + has been invoked. The count is even increased when the optimization + itself is not run. This purpose of this variable is to provide + information about where in the pass sequence we are. */ + int n_avr_fuse_add_executed; }; /* AVR does not round pushes, but the existence of this macro is diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md index 04d838ef8a72..8c1326fdf167 100644 --- a/gcc/config/avr/avr.md +++ b/gcc/config/avr/avr.md @@ -184,73 +184,75 @@ ;; no_xmega: non-XMEGA core xmega : XMEGA core ;; no_adiw: ISA has no ADIW, SBIW adiw : ISA has ADIW, SBIW +;; The following ISA attributes are actually not architecture specific, +;; but depend on (optimization) options. This is because the "enabled" +;; attribut can't depend on more than one other attribute. This means +;; that 2op and 3op must work for all ISAs, and hence a 'flat' attribue +;; scheme can be used (as opposed to a true cartesian product). 
+ +;; 2op : insn is a 2-operand insn 3op : insn is a 3-operand insn + (define_attr "isa" "mov,movw, rjmp,jmp, ijmp,eijmp, lpm,lpmx, elpm,elpmx, no_xmega,xmega, no_adiw,adiw, + 2op,3op, standard" (const_string "standard")) (define_attr "enabled" "" - (cond [(eq_attr "isa" "standard") - (const_int 1) + (if_then_else + (ior (eq_attr "isa" "standard") + + (and (eq_attr "isa" "mov") + (match_test "!AVR_HAVE_MOVW")) - (and (eq_attr "isa" "mov") - (match_test "!AVR_HAVE_MOVW")) - (const_int 1) + (and (eq_attr "isa" "movw") + (match_test "AVR_HAVE_MOVW")) - (and (eq_attr "isa" "movw") - (match_test "AVR_HAVE_MOVW")) - (const_int 1) + (and (eq_attr "isa" "rjmp") + (match_test "!AVR_HAVE_JMP_CALL")) - (and (eq_attr "isa" "rjmp") - (match_test "!AVR_HAVE_JMP_CALL")) - (const_int 1) + (and (eq_attr "isa" "jmp") + (match_test "AVR_HAVE_JMP_CALL")) - (and (eq_attr "isa" "jmp") - (match_test "AVR_HAVE_JMP_CALL")) - (const_int 1) + (and (eq_attr "isa" "ijmp") + (match_test "!AVR_HAVE_EIJMP_EICALL")) - (and (eq_attr "isa" "ijmp") - (match_test "!AVR_HAVE_EIJMP_EICALL")) - (const_int 1) + (and (eq_attr "isa" "eijmp") + (match_test "AVR_HAVE_EIJMP_EICALL")) - (and (eq_attr "isa" "eijmp") - (match_test "AVR_HAVE_EIJMP_EICALL")) - (const_int 1) + (and (eq_attr "isa" "lpm") + (match_test "!AVR_HAVE_LPMX")) - (and (eq_attr "isa" "lpm") - (match_test "!AVR_HAVE_LPMX")) - (const_int 1) + (and (eq_attr "isa" "lpmx") + (match_test "AVR_HAVE_LPMX")) - (and (eq_attr "isa" "lpmx") - (match_test "AVR_HAVE_LPMX")) - (const_int 1) + (and (eq_attr "isa" "elpm") + (match_test "AVR_HAVE_ELPM && !AVR_HAVE_ELPMX")) - (and (eq_attr "isa" "elpm") - (match_test "AVR_HAVE_ELPM && !AVR_HAVE_ELPMX")) - (const_int 1) + (and (eq_attr "isa" "elpmx") + (match_test "AVR_HAVE_ELPMX")) - (and (eq_attr "isa" "elpmx") - (match_test "AVR_HAVE_ELPMX")) - (const_int 1) + (and (eq_attr "isa" "xmega") + (match_test "AVR_XMEGA")) - (and (eq_attr "isa" "xmega") - (match_test "AVR_XMEGA")) - (const_int 1) + (and (eq_attr 
"isa" "no_xmega") + (match_test "!AVR_XMEGA")) - (and (eq_attr "isa" "no_xmega") - (match_test "!AVR_XMEGA")) - (const_int 1) + (and (eq_attr "isa" "adiw") + (match_test "AVR_HAVE_ADIW")) - (and (eq_attr "isa" "adiw") - (match_test "AVR_HAVE_ADIW")) - (const_int 1) + (and (eq_attr "isa" "no_adiw") + (match_test "!AVR_HAVE_ADIW")) - (and (eq_attr "isa" "no_adiw") - (match_test "!AVR_HAVE_ADIW")) - (const_int 1) + (and (eq_attr "isa" "2op") + (match_test "!avr_shift_is_3op ()")) - ] (const_int 0))) + (and (eq_attr "isa" "3op") + (match_test "avr_shift_is_3op ()")) + ) + (const_int 1) + (const_int 0))) ;; Define mode iterators @@ -5257,28 +5259,31 @@ ;; "ashlsq3" "ashlusq3" ;; "ashlsa3" "ashlusa3" (define_insn_and_split "ashl<mode>3" - [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r ,r,r,r") - (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,0,r ,0,0,0") - (match_operand:QI 2 "nop_general_operand" "r,L,P,O C31,K,n,Qm")))] + [(set (match_operand:ALL4 0 "register_operand" "=r,r ,r ,r ,r ,r,r") + (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0 ,r ,0 ,r ,0,0") + (match_operand:QI 2 "nop_general_operand" "r,LPK,O C15 C31,C4l,C4l,n,Qm")))] "" "#" "&& reload_completed" [(parallel [(set (match_dup 0) (ashift:ALL4 (match_dup 1) (match_dup 2))) - (clobber (reg:CC REG_CC))])]) + (clobber (reg:CC REG_CC))])] + "" + [(set_attr "isa" "*,*,*,2op,3op,*,*")]) (define_insn "*ashl<mode>3" - [(set (match_operand:ALL4 0 "register_operand" "=r,r,r,r ,r,r,r") - (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,0,r ,0,0,0") - (match_operand:QI 2 "nop_general_operand" "r,L,P,O C31,K,n,Qm"))) + [(set (match_operand:ALL4 0 "register_operand" "=r,r ,r ,r ,r ,r,r") + (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0 ,r ,0 ,r ,0,0") + (match_operand:QI 2 "nop_general_operand" "r,LPK,O C15 C31,C4l,C4l,n,Qm"))) (clobber (reg:CC REG_CC))] "reload_completed" { return ashlsi3_out (insn, operands, NULL); } - [(set_attr "length" "8,0,4,5,8,10,12") - 
(set_attr "adjust_len" "ashlsi")]) + [(set_attr "length" "12") + (set_attr "adjust_len" "ashlsi") + (set_attr "isa" "*,*,*,2op,3op,*,*")]) ;; Optimize if a scratch register from LD_REGS happens to be available. @@ -5380,12 +5385,72 @@ [(set_attr "length" "0,2,2,4,10") (set_attr "adjust_len" "ashlhi")]) + +;; Split shift into a byte shift and a residual bit shift (without scratch) +(define_split + [(parallel [(set (match_operand:ALL4 0 "register_operand") + (ashift:ALL4 (match_operand:ALL4 1 "register_operand") + (match_operand:QI 2 "const_int_operand"))) + (clobber (reg:CC REG_CC))])] + "avr_split_bit_shift + && n_avr_fuse_add_executed >= 1 + && satisfies_constraint_C4l (operands[2])" + [(parallel [(set (match_dup 0) + (ashift:ALL4 (match_dup 1) + (match_dup 3))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 0) + (ashift:ALL4 (match_dup 0) + (match_dup 4))) + (clobber (reg:CC REG_CC))])] + { + if (avr_split_shift (operands, NULL_RTX, ASHIFT)) + DONE; + else if (REGNO (operands[0]) == REGNO (operands[1])) + FAIL; + int offset = INTVAL (operands[2]); + operands[3] = GEN_INT (offset & ~7); + operands[4] = GEN_INT (offset & 7); + }) + +;; Split shift into a byte shift and a residual bit shift (with scratch) +(define_split + [(parallel [(set (match_operand:ALL4 0 "register_operand") + (ashift:ALL4 (match_operand:ALL4 1 "register_operand") + (match_operand:QI 2 "const_int_operand"))) + (clobber (match_operand:QI 3 "scratch_or_d_register_operand")) + (clobber (reg:CC REG_CC))])] + "avr_split_bit_shift + && n_avr_fuse_add_executed >= 1 + && satisfies_constraint_C4l (operands[2])" + [(parallel [(set (match_dup 0) + (ashift:ALL4 (match_dup 1) + (match_dup 4))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (match_dup 0) + (ashift:ALL4 (match_dup 0) + (match_dup 5))) + (clobber (match_dup 3)) + (clobber (reg:CC REG_CC))])] + { + if (avr_split_shift (operands, operands[3], ASHIFT)) + DONE; + else if (REGNO (operands[0]) == REGNO (operands[1])) + FAIL; + int 
offset = INTVAL (operands[2]); + operands[4] = GEN_INT (offset & ~7); + operands[5] = GEN_INT (offset & 7); + }) + + (define_peephole2 [(match_scratch:QI 3 "d") (parallel [(set (match_operand:ALL4 0 "register_operand" "") (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "") (match_operand:QI 2 "const_int_operand" ""))) - (clobber (reg:CC REG_CC))])] + (clobber (reg:CC REG_CC))]) + ;; $3 must not overlap with the output of the insn above. + (match_dup 3)] "" [(parallel [(set (match_dup 0) (ashift:ALL4 (match_dup 1) @@ -5393,35 +5458,20 @@ (clobber (match_dup 3)) (clobber (reg:CC REG_CC))])]) -;; "*ashlsi3_const" -;; "*ashlsq3_const" "*ashlusq3_const" -;; "*ashlsa3_const" "*ashlusa3_const" -(define_insn_and_split "*ashl<mode>3_const_split" - [(set (match_operand:ALL4 0 "register_operand" "=r,r,r ,r") - (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,r ,0") - (match_operand:QI 2 "const_int_operand" "L,P,O C31,n"))) - (clobber (match_scratch:QI 3 "=X,X,X ,&d"))] - "reload_completed" - "#" - "&& reload_completed" - [(parallel [(set (match_dup 0) - (ashift:ALL4 (match_dup 1) - (match_dup 2))) - (clobber (match_dup 3)) - (clobber (reg:CC REG_CC))])]) (define_insn "*ashl<mode>3_const" - [(set (match_operand:ALL4 0 "register_operand" "=r,r,r ,r") - (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0,0,r ,0") - (match_operand:QI 2 "const_int_operand" "L,P,O C31,n"))) - (clobber (match_scratch:QI 3 "=X,X,X ,&d")) + [(set (match_operand:ALL4 0 "register_operand" "=r ,r ,r ,r ,r") + (ashift:ALL4 (match_operand:ALL4 1 "register_operand" "0 ,r ,0 ,r ,0") + (match_operand:QI 2 "const_int_operand" "LP,O C15 C31,C4l,C4l,n"))) + (clobber (match_operand:QI 3 "scratch_or_d_register_operand" "=X ,X ,&d ,&d ,&d")) (clobber (reg:CC REG_CC))] "reload_completed" { return ashlsi3_out (insn, operands, NULL); } - [(set_attr "length" "0,4,5,10") - (set_attr "adjust_len" "ashlsi")]) + [(set_attr "length" "10") + (set_attr "adjust_len" "ashlsi") + (set_attr "isa" 
"*,*,2op,3op,*")]) (define_expand "ashlpsi3" [(parallel [(set (match_operand:PSI 0 "register_operand" "") diff --git a/gcc/config/avr/avr.opt b/gcc/config/avr/avr.opt index 5f2e52ccfc79..1b7e967dfc8e 100644 --- a/gcc/config/avr/avr.opt +++ b/gcc/config/avr/avr.opt @@ -94,6 +94,10 @@ maccumulate-args Target Mask(ACCUMULATE_OUTGOING_ARGS) Optimization Optimization. Accumulate outgoing function arguments and acquire/release the needed stack space for outgoing function arguments in function prologue/epilogue. Without this option, outgoing arguments are pushed before calling a function and popped afterwards. This option can lead to reduced code size for functions that call many functions that get their arguments on the stack like, for example printf. +msplit-bit-shift +Target Var(avr_split_bit_shift) Init(0) Optimization +Optimization. Split shifts of 4-byte values into a byte shift and a residual bit shift. + mstrict-X Target Var(avr_strict_X) Init(0) Optimization Optimization. When accessing RAM, use X as imposed by the hardware, i.e. just use pre-decrement, post-increment and indirect addressing with the X register. Without this option, the compiler may assume that there is an addressing mode X+const similar to Y+const and Z+const and emit instructions to emulate such an addressing mode for X. diff --git a/gcc/config/avr/constraints.md b/gcc/config/avr/constraints.md index ac64009b3c03..a362f31e30b8 100644 --- a/gcc/config/avr/constraints.md +++ b/gcc/config/avr/constraints.md @@ -263,6 +263,22 @@ (and (match_code "const_int,symbol_ref,const") (match_test "const_0mod256_operand (op, HImode)"))) +(define_constraint "C4a" + "A constant integer shift offset for a 4-byte ASHIFTRT that's opt to being split." + (and (match_code "const_int") + (match_test "avr_split_shift_p (4, ival, ASHIFTRT)"))) + +(define_constraint "C4r" + "A constant integer shift offset for a 4-byte LSHIFTRT that's opt to being split." 
+ (and (match_code "const_int") + (match_test "avr_split_shift_p (4, ival, LSHIFTRT)"))) + +(define_constraint "C4l" + "A constant integer shift offset for a 4-byte ASHIFT that's opt to being split." + (and (match_code "const_int") + (match_test "avr_split_shift_p (4, ival, ASHIFT)"))) + + ;; CONST_FIXED is no element of 'n' so cook our own. ;; "i" or "s" would match but because the insn uses iterators that cover ;; INT_MODE, "i" or "s" is not always possible. diff --git a/gcc/pass_manager.h b/gcc/pass_manager.h index 294cdd0b1f7f..1a1c83fc6565 100644 --- a/gcc/pass_manager.h +++ b/gcc/pass_manager.h @@ -65,7 +65,7 @@ public: void execute_early_local_passes (); unsigned int execute_pass_mode_switching (); - /* Various passes are manually cloned by epiphany. */ + /* Various passes are manually cloned by avr and epiphany. */ opt_pass *get_pass_split_all_insns () const { return pass_split_all_insns_1; }