On Sun, Apr 21, 2013 at 11:50 PM, Jeff Law <l...@redhat.com> wrote:
> The 60% number also tells me there'd be a lot to be gained by using the
> scheduler's dependency information to drive filling. We'd end up looking
> at far fewer insns.
FWIW I haven't had much time to hack on sched-dbr.c, but attached is the
latest copy of it.

Stats on my collection of cc1-i files:

$ for d in nodelay/ reorg/ dbr/ ; do grep -w nop ${d}/* | wc ; done
 383532  767091 10550413
 164616  329259  4185216
 179938  359903  4217429

where nodelay is -fno-delayed-branch, reorg is current reorg.c but with
annulling disabled, and dbr is this sched-dbr.c.  (Taking the first wc
column as the number of unfilled slots, reorg.c fills roughly 57% of
them (1 - 164616/383532) and this prototype already fills roughly 53%
(1 - 179938/383532).)

It's obviously far from complete, it hasn't even reached "toy"
maturity :-)  For one thing, annulling branches are not handled at all
yet.

Other things where it fails:

* memory ops in call delay slots:

@@ -7414,8 +7402,9 @@
        ldx     [%l4+%lo(valvar_pool)], %o0
 .L:
        stx     %g1, [%fp+2039]
+       stx     %g2, [%fp+2031]
        call    pool_alloc, 0
-        stx    %g2, [%fp+2031]
+        nop
        stb     %g0, [%o0+12]
        stx     %i4, [%o0]
        ldx     [%fp+2031], %g2
...
@@ -7579,8 +7567,9 @@
        ldx     [%fp+2039], %o1
        call    htab_find_with_hash, 0
         srl    %o2, 0, %o2
+       stx     %o0, [%fp+2031]
        brz,pn  %o0, .L
-        stx    %o0, [%fp+2031]
+        nop
 .L:
        ldx     [%fp+2039], %g1
        brz,pn  %g1, .L

* funny things with returns that look wrong to me but that I haven't
  had time to look into:

@@ -7950,9 +7939,8 @@
        ldx     [%i3], %o1
        ldx     [%g4+32], %o0
        call    notify_dependents_of_resolved_value.isra.46, 0
-        mov    %l7, %i0
-       return  %i7+8
-        nop
+       jmp     %i7+8
+        restore %g0, %l7, %o0
 .L:
        ldx     [%fp+2039], %g2
        ldx     [%g2+8], %g1

* picking insns from after the delay insn:

@@ -18,9 +18,9 @@
        sra     %i5, 0, %g1
        add     %g1, 2, %g1
        sllx    %g1, 3, %g1
-       ldx     [%i4+%g1], %g1
        call    %g1, 0
-        add    %i5, -1, %i5
+        ldx    [%i4+%g1], %g1
+       add     %i5, -1, %i5
        cmp     %i5, -1
        bne,pt  %icc, .L
         nop

OTOH it also already successfully finds opportunities that reorg.c
doesn't see, e.g.:

        sub     %i4, %i5, %i3             sub     %i4, %i5, %i3
        mov     %o0, %i1                  mov     %o0, %i1
        sub     %i2, %i3, %i5           <
        call    bar, 0                    call    bar, 0
        mov     %i3, %o0                  mov     %i3, %o0
        ba,pt   %xcc, .L6               | ba,pt   %xcc, .L2
        cmp     %i4, %i2                |  sub    %i2, %i3, %i5

reorg.c takes the cmp from the target but sched-dbr looks through the
call to find the sub.

Ciao!
Steven
/* Perform delay slot filling.
   Copyright (C) 2013 Free Software Foundation, Inc.
   Contributed by Steven Bosscher.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "target.h"
#include "insn-config.h"
#include "rtl.h"
#include "conditions.h"
#include "function.h"
#include "basic-block.h"
#include "df.h"
#include "recog.h"
#include "emit-rtl.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "vec.h"
#include "obstack.h"

#ifdef DELAY_SLOTS

/* ??? This pass requires sched-deps, which should be a simple
   dependency DAG builder but is badly intertwined with the Haifa
   scheduler.  */
#ifndef INSN_SCHEDULING
#error targets without scheduling support do not work with sched-dbr.c
#endif

#ifndef ANNUL_IFTRUE_SLOTS
#define eligible_for_annul_true(INSN, SLOTS, TRIAL, FLAGS) 0
#endif
#ifndef ANNUL_IFFALSE_SLOTS
#define eligible_for_annul_false(INSN, SLOTS, TRIAL, FLAGS) 0
#endif

/* Bah, sched-deps only works if the Haifa scheduler is set up also.
   That's a bug, so... FIXME.  */
static struct haifa_sched_info sched_dbr_info =
{
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  0, 0,
  NULL, NULL, NULL, NULL, NULL, NULL,
  0
};

static struct sched_deps_info_def dbr_sched_deps_info =
{
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  0, 0, 0, 0
};

static struct common_sched_info_def dbr_common_sched_info;

/* A record pairing an insn that has a delay slot with the insn chosen
   to fill that slot.  */
typedef struct delay_slot_fill_data_d
{
  rtx delay_insn, slot_fill_insn;
} *delay_slot_fill_data_t;

static vec<delay_slot_fill_data_t> delay_slot_fill_data;

static struct obstack sched_dbr_obstack;

/* Put SLOT_FILL_INSN into the delay slot of DELAY_INSN and replace them
   in the insns chain with a SEQUENCE.  Returns the SEQUENCE that
   replaces DELAY_INSN.  */

static rtx
emit_delay_sequence2 (rtx delay_insn, rtx slot_fill_insn)
{
  int slots_to_fill = num_delay_slots (delay_insn);
  gcc_assert (slots_to_fill == 1);

  /* Allocate the rtvec to hold the insns and the SEQUENCE.  */
  rtvec seqv = rtvec_alloc (slots_to_fill + 1);
  rtx seq = gen_rtx_SEQUENCE (VOIDmode, seqv);
  rtx seq_insn = make_insn_raw (seq);

  /* If DELAY_INSN has a location, use it for SEQ_INSN.  If DELAY_INSN
     does not have a location but the delay slot filling insn does,
     pick it up there.  Clear the location on the slot insn; the SPARC
     assembler, for instance, emits a warning when debug info is output
     in a delay slot.  */
  INSN_LOCATION (seq_insn) = INSN_LOCATION (delay_insn);
  if (INSN_LOCATION (slot_fill_insn) && !INSN_LOCATION (seq_insn))
    INSN_LOCATION (seq_insn) = INSN_LOCATION (slot_fill_insn);
  INSN_LOCATION (slot_fill_insn) = 0;

  /* Unlink DELAY_INSN from the insn chain, so that we can put it into
     the SEQUENCE.  Remember where we want to emit SEQUENCE in AFTER.  */
  rtx after = PREV_INSN (delay_insn);
  remove_insn (delay_insn);
  NEXT_INSN (delay_insn) = PREV_INSN (delay_insn) = NULL;

  /* Unlink SLOT_FILL_INSN from its original place.  Update AFTER if it
     is the delay slot fill insn.  */
  if (slot_fill_insn == after)
    after = PREV_INSN (slot_fill_insn);
  remove_insn (slot_fill_insn);
  NEXT_INSN (slot_fill_insn) = PREV_INSN (slot_fill_insn) = NULL;

  /* Remove any REG_DEAD notes because we can't rely on them now that
     the insn has been moved.  ??? This is too conservative, we can
     keep the note if SLOT_FILL_INSN was taken from the basic block
     holding DELAY_INSN.  */
  rtx note, next;
  for (note = REG_NOTES (slot_fill_insn); note; note = next)
    {
      next = XEXP (note, 1);
      if (REG_NOTE_KIND (note) == REG_DEAD)
	remove_note (slot_fill_insn, note);
    }

  /* Build our SEQUENCE and rebuild the insn chain.  Splice our SEQUENCE
     into the insn stream where DELAY_INSN used to be.  */
  start_sequence ();
  XVECEXP (seq, 0, 0) = emit_insn (delay_insn);
  XVECEXP (seq, 0, 1) = emit_insn (slot_fill_insn);
  end_sequence ();
  add_insn_after (seq_insn, after, NULL);

  return seq_insn;
}
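/* For illustration: a filled delay slot is represented from here on by
   a single insn whose pattern is a SEQUENCE, e.g.

       (insn (sequence [(call_insn ...) (insn ...)]))

   where element 0 is the branch or call and element 1 is the insn in
   its slot.  (Sketch only; the exact dump depends on the target and
   the insns involved.)  */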
/* Encode and return branch direction and prediction information for
   INSN assuming it will jump to LABEL.

   Unconditional branches return no direction information and are
   predicted as very likely taken.  */

static int
get_jump_flags (rtx insn, rtx label)
{
  int flags;

  /* get_jump_flags can be passed any insn with delay slots; these may
     be INSNs, CALL_INSNs, or JUMP_INSNs.  Only JUMP_INSNs have branch
     direction information, and only if they are conditional jumps.

     If LABEL is a return, then there is no way to determine the branch
     direction.  */
  if (JUMP_P (insn)
      && any_condjump_p (insn)
      && !ANY_RETURN_P (label))
    flags = (INSN_LUID (label) > INSN_LUID (insn))
	    ? ATTR_FLAG_forward : ATTR_FLAG_backward;
  /* No valid direction information.  */
  else
    flags = 0;

  return flags;
}
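/* Scan basic block BB for insns with a delay slot and try to fill each
   slot with an insn from earlier in the same block.  A candidate is
   accepted if the target allows it in the slot and all of its forward
   dependencies lie beyond the delay insn, so that moving it into the
   slot does not change what the intervening insns compute.  */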
static void
fill_delay_slots_from_bb (basic_block bb)
{
  rtx head, tail;
  struct deps_desc tmp_deps;

  /* Compute dependencies.  */
  get_ebb_head_tail (bb, bb, &head, &tail);
  init_deps_global ();
  init_deps (&tmp_deps, false);
  sched_analyze (&tmp_deps, head, tail);
  free_deps (&tmp_deps);
  finish_deps_global ();

  /* Print dependency information.  */
  if (dump_file && sched_dump)
    {
      fprintf (sched_dump,
	       ";; --------------- forward dependences: ------------ \n");
      fprintf (sched_dump, "\n;; --- DBR Dependences --- for bb%d \n",
	       bb->index);
      debug_dependencies (head, tail);
    }

  rtx delay_insn;
  FOR_BB_INSNS (bb, delay_insn)
    {
      if (! NONDEBUG_INSN_P (delay_insn)
	  || (GET_CODE (PATTERN (delay_insn)) == USE
	      || GET_CODE (PATTERN (delay_insn)) == CLOBBER))
	continue;

      INSN_FROM_TARGET_P (delay_insn) = 0;
      if (JUMP_P (delay_insn))
	INSN_ANNULLED_BRANCH_P (delay_insn) = 0;

      int slots_to_fill = num_delay_slots (delay_insn);
      if (slots_to_fill > 0)
	{
	  rtx slot_fill_insn = NULL;

	  /* We only handle insns with 1 delay slot.  */
	  gcc_assert (slots_to_fill == 1);

	  int flags;
	  if (JUMP_P (delay_insn))
	    flags = get_jump_flags (delay_insn, JUMP_LABEL (delay_insn));
	  else
	    flags = get_jump_flags (delay_insn, NULL_RTX);

	  for (rtx cand_insn = PREV_INSN (delay_insn);
	       cand_insn != BB_HEAD (bb);
	       cand_insn = PREV_INSN (cand_insn))
	    {
	      sd_iterator_def sd_it;
	      dep_t dep;
	      bool cand_insn_accepted = true;

	      /* CAND_INSN must be a real insn.  */
	      if (! NONDEBUG_INSN_P (cand_insn)
		  || (GET_CODE (PATTERN (cand_insn)) == USE
		      || GET_CODE (PATTERN (cand_insn)) == CLOBBER))
		continue;

	      /* It must not be a delay sequence.  It is possible in
		 principle to hoist insns across other insns with
		 delayed branches, but it's a little more difficult
		 because the DF_INSN_LUIDs are no longer valid.  */
	      if (GET_CODE (PATTERN (cand_insn)) == SEQUENCE)
		continue;

	      /* An 'asm' can't be in a delay slot.  */
	      if (GET_CODE (PATTERN (cand_insn)) == ASM_INPUT
		  || asm_noperands (PATTERN (cand_insn)) >= 0)
		continue;

	      if (! eligible_for_delay (delay_insn, 0, cand_insn, flags))
		continue;

	      if (SCHED_GROUP_P (cand_insn))
		continue;

	      rtx set = single_set (cand_insn);
	      if (! set || multiple_sets (cand_insn))
		continue;
	      rtx cand_set_dest = SET_DEST (set);

	      FOR_EACH_DEP (cand_insn, SD_LIST_FORW, sd_it, dep)
		{
		  rtx con = DEP_CON (dep);

		  /* sched-deps insists on adding anti-dependencies for
		     jumps.  We can ignore them.  */
		  if (JUMP_P (con) && DEP_TYPE (dep) == REG_DEP_ANTI)
		    continue;

		  /* If CAND_INSN sets a register that is also set by
		     the call, the dependency can be ignored.  */
		  if (CALL_P (delay_insn)
		      && delay_insn == con
		      && REG_P (cand_set_dest)
		      && ! multiple_sets (delay_insn))
		    {
		      /* TODO: Allow memory ops, but need to do something
			 about dependencies crossing DELAY_INSN.  */
		      rtx pat = PATTERN (delay_insn);
		      if (GET_CODE (pat) == PARALLEL)
			pat = XVECEXP (pat, 0, 0);
		      if (GET_CODE (pat) == SET)
			pat = SET_SRC (pat);
		      if (get_call_rtx_from (pat) != NULL_RTX)
			continue;
		    }

		  /* A bare USE before a return can be ignored, too.  */
		  if (GET_CODE (PATTERN (con)) == USE
		      && JUMP_P (delay_insn)
		      && ANY_RETURN_P (PATTERN (delay_insn)))
		    continue;

		  if (DF_INSN_LUID (delay_insn) < DF_INSN_LUID (con))
		    /* CAND_INSN has dependencies crossing over
		       DELAY_INSN.  */
		    ;
		  else
		    {
		      /* CAND_INSN is not a deps leaf.  */
		      cand_insn_accepted = false;
		      break;
		    }
		}

	      if (! cand_insn_accepted)
		continue;

	      /* Of all acceptable candidates, prefer the one with the
		 smallest default latency.  */
	      if (slot_fill_insn == NULL
		  || (insn_default_latency (cand_insn)
		      < insn_default_latency (slot_fill_insn)))
		slot_fill_insn = cand_insn;
	    }

	  if (slot_fill_insn != NULL)
	    {
	      delay_slot_fill_data_t dsf;
	      dsf = XOBNEW (&sched_dbr_obstack,
			    struct delay_slot_fill_data_d);
	      dsf->delay_insn = delay_insn;
	      dsf->slot_fill_insn = slot_fill_insn;
	      delay_slot_fill_data.safe_push (dsf);
	    }
	}
    }

  /* Free dependencies.  */
  sched_free_deps (head, tail, false);

  /* Transform the code.  */
  unsigned int ix;
  delay_slot_fill_data_t dsf;
  FOR_EACH_VEC_ELT (delay_slot_fill_data, ix, dsf)
    emit_delay_sequence2 (dsf->delay_insn, dsf->slot_fill_insn);
  delay_slot_fill_data.release ();
}

/* The main entry point in this file.  */

static void
fill_delay_slots (void)
{
  basic_block bb;

  gcc_obstack_init (&sched_dbr_obstack);

  /* Set up the various scheduler hooks.  */
  memcpy (&dbr_common_sched_info, &haifa_common_sched_info,
	  sizeof (dbr_common_sched_info));
  common_sched_info = &dbr_common_sched_info;
  sched_deps_info = &dbr_sched_deps_info;
  current_sched_info = &sched_dbr_info;

  compute_bb_for_insn ();
  df_analyze ();
  haifa_sched_init ();

  /* Try to fill delay slots with insns from the same basic block as
     the one that contains the insns with the delay slots.  */
  FOR_EACH_BB (bb)
    {
      if (bb->flags & BB_DISABLE_SCHEDULE)
	continue;
      fill_delay_slots_from_bb (bb);
      // if (single_succ_p (bb)
      //     && single_succ (bb) != EXIT_BLOCK_PTR
      //     && single_pred_p (single_succ (bb)))
      //   continue somehow (this happens a lot for basic blocks
      //   containing a single jump following a branch)
    }

  free_bb_for_insn ();
  haifa_sched_finish ();
  obstack_free (&sched_dbr_obstack, NULL);
}

#endif

static bool
gate_handle_delay_slots2 (void)
{
  /* At -O0 dataflow info isn't updated after RA.  */
  return optimize > 0 && flag_delayed_branch2; // && !crtl->dbr_scheduled_p;
}
/* Run delay slot optimization.  */

static unsigned int
rest_of_handle_delay_slots2 (void)
{
#ifdef DELAY_SLOTS
  fill_delay_slots ();
#endif
  return 0;
}

struct rtl_opt_pass pass_delay_slots2 =
{
 {
  RTL_PASS,
  "dbr2",                               /* name */
  OPTGROUP_NONE,                        /* optinfo_flags */
  gate_handle_delay_slots2,             /* gate */
  rest_of_handle_delay_slots2,          /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_DBR_SCHED,                         /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  0                                     /* todo_flags_finish */
 }
};
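/* Note: running this pass also needs a flag and a pass registration
   that are not part of this file.  A minimal sketch of the missing
   glue, where the option name and the pass placement are assumptions:

     common.opt:
       fdelayed-branch2
       Common Report Var(flag_delayed_branch2) Optimization
       Fill delay slots using the scheduler's dependency information.

     passes.c (e.g. next to the existing pass_delay_slots):
       NEXT_PASS (pass_delay_slots2);  */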