On Sun, Apr 21, 2013 at 11:50 PM, Jeff Law <l...@redhat.com> wrote:
> The 60% number also tells me there'd be a lot to be gained by using the
> scheduler's dependency information to drive filling. We'd end up looking
> at far fewer insns.
FWIW I haven't had much time to hack on sched-dbr.c, but attached is the
latest copy of it.

Stats on my collection of cc1-i files:

$ for d in nodelay/ reorg/ dbr/ ; do grep -w nop ${d}/* | wc ; done
 383532  767091 10550413
 164616  329259  4185216
 179938  359903  4217429

where nodelay is -fno-delayed-branch, reorg is current reorg.c but with
annulling disabled, and dbr is this sched-dbr.c.  (Taking the first wc
column as the number of unfilled slots, reorg.c fills roughly 57% of
them (1 - 164616/383532) and this prototype already fills roughly 53%
(1 - 179938/383532).)

It's obviously far from complete, it hasn't even reached "toy"
maturity :-)  For one thing, annulling branches are not handled at all
yet.

Other things where it fails:

* memory ops in call delay slots:

@@ -7414,8 +7402,9 @@
        ldx     [%l4+%lo(valvar_pool)], %o0
 .L:
        stx     %g1, [%fp+2039]
+       stx     %g2, [%fp+2031]
        call    pool_alloc, 0
-        stx    %g2, [%fp+2031]
+        nop
        stb     %g0, [%o0+12]
        stx     %i4, [%o0]
        ldx     [%fp+2031], %g2
...
@@ -7579,8 +7567,9 @@
        ldx     [%fp+2039], %o1
        call    htab_find_with_hash, 0
         srl    %o2, 0, %o2
+       stx     %o0, [%fp+2031]
        brz,pn  %o0, .L
-        stx    %o0, [%fp+2031]
+        nop
 .L:
        ldx     [%fp+2039], %g1
        brz,pn  %g1, .L

* funny things with returns that look wrong to me but that I haven't
  had time to look into:

@@ -7950,9 +7939,8 @@
        ldx     [%i3], %o1
        ldx     [%g4+32], %o0
        call    notify_dependents_of_resolved_value.isra.46, 0
-        mov    %l7, %i0
-       return  %i7+8
-        nop
+       jmp     %i7+8
+        restore %g0, %l7, %o0
 .L:
        ldx     [%fp+2039], %g2
        ldx     [%g2+8], %g1

* picking insns from after the delay insn:

@@ -18,9 +18,9 @@
        sra     %i5, 0, %g1
        add     %g1, 2, %g1
        sllx    %g1, 3, %g1
-       ldx     [%i4+%g1], %g1
        call    %g1, 0
-        add    %i5, -1, %i5
+        ldx    [%i4+%g1], %g1
+       add     %i5, -1, %i5
        cmp     %i5, -1
        bne,pt  %icc, .L
         nop

OTOH it also already successfully finds opportunities that reorg.c
doesn't see, e.g.:

        sub     %i4, %i5, %i3             sub     %i4, %i5, %i3
        mov     %o0, %i1                  mov     %o0, %i1
        sub     %i2, %i3, %i5           <
        call    bar, 0                    call    bar, 0
        mov     %i3, %o0                  mov     %i3, %o0
        ba,pt   %xcc, .L6               | ba,pt   %xcc, .L2
        cmp     %i4, %i2                |  sub    %i2, %i3, %i5

reorg.c takes the cmp from the target but sched-dbr looks through the
call to find the sub.

Ciao!
Steven
/* Perform delay slot filling.
   Copyright (C) 2013 Free Software Foundation, Inc.
   Contributed by Steven Bosscher.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "target.h"
#include "insn-config.h"
#include "rtl.h"
#include "conditions.h"
#include "function.h"
#include "basic-block.h"
#include "df.h"
#include "recog.h"
#include "emit-rtl.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "vec.h"
#include "obstack.h"

#ifdef DELAY_SLOTS

/* ??? This pass requires sched-deps, which should be a simple
   dependency DAG builder but is badly intertwined with the Haifa
   scheduler.  */
#ifndef INSN_SCHEDULING
#error targets without scheduling support do not work with sched-dbr.c
#endif

#ifndef ANNUL_IFTRUE_SLOTS
#define eligible_for_annul_true(INSN, SLOTS, TRIAL, FLAGS) 0
#endif
#ifndef ANNUL_IFFALSE_SLOTS
#define eligible_for_annul_false(INSN, SLOTS, TRIAL, FLAGS) 0
#endif

/* Bah, sched-deps only works if the Haifa scheduler is set up also.
   That's a bug, so... FIXME.  */
static struct haifa_sched_info sched_dbr_info =
{
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  0, 0,
  NULL, NULL, NULL, NULL, NULL, NULL,
  0
};

static struct sched_deps_info_def dbr_sched_deps_info =
{
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  0, 0, 0, 0
};

static struct common_sched_info_def dbr_common_sched_info;

/* A record pairing an insn that has a delay slot with the insn chosen
   to fill that slot.  */
typedef struct delay_slot_fill_data_d
{
  rtx delay_insn, slot_fill_insn;
} *delay_slot_fill_data_t;

static vec<delay_slot_fill_data_t> delay_slot_fill_data;

static struct obstack sched_dbr_obstack;

/* Put SLOT_FILL_INSN into the delay slot of DELAY_INSN and replace them
   in the insns chain with a SEQUENCE.  Returns the SEQUENCE that
   replaces DELAY_INSN.  */

static rtx
emit_delay_sequence2 (rtx delay_insn, rtx slot_fill_insn)
{
  int slots_to_fill = num_delay_slots (delay_insn);
  gcc_assert (slots_to_fill == 1);

  /* Allocate the rtvec to hold the insns and the SEQUENCE.  */
  rtvec seqv = rtvec_alloc (slots_to_fill + 1);
  rtx seq = gen_rtx_SEQUENCE (VOIDmode, seqv);
  rtx seq_insn = make_insn_raw (seq);

  /* If DELAY_INSN has a location, use it for SEQ_INSN.  If DELAY_INSN
     does not have a location but the delay slot filling insn does,
     pick it up there.  Clear the location on the slot insn; the SPARC
     assembler, for instance, emits a warning when debug info is output
     in a delay slot.  */
  INSN_LOCATION (seq_insn) = INSN_LOCATION (delay_insn);
  if (INSN_LOCATION (slot_fill_insn) && !INSN_LOCATION (seq_insn))
    INSN_LOCATION (seq_insn) = INSN_LOCATION (slot_fill_insn);
  INSN_LOCATION (slot_fill_insn) = 0;

  /* Unlink DELAY_INSN from the insn chain, so that we can put it into
     the SEQUENCE.  Remember where we want to emit SEQUENCE in AFTER.  */
  rtx after = PREV_INSN (delay_insn);
  remove_insn (delay_insn);
  NEXT_INSN (delay_insn) = PREV_INSN (delay_insn) = NULL;

  /* Unlink SLOT_FILL_INSN from its original place.  Update AFTER if it
     is the delay slot fill insn.  */
  if (slot_fill_insn == after)
    after = PREV_INSN (slot_fill_insn);
  remove_insn (slot_fill_insn);
  NEXT_INSN (slot_fill_insn) = PREV_INSN (slot_fill_insn) = NULL;

  /* Remove any REG_DEAD notes because we can't rely on them now that
     the insn has been moved.  ??? This is too conservative, we can
     keep the note if SLOT_FILL_INSN was taken from the basic block
     holding DELAY_INSN.  */
  rtx note, next;
  for (note = REG_NOTES (slot_fill_insn); note; note = next)
    {
      next = XEXP (note, 1);
      if (REG_NOTE_KIND (note) == REG_DEAD)
	remove_note (slot_fill_insn, note);
    }

  /* Build our SEQUENCE and rebuild the insn chain.  Splice our SEQUENCE
     into the insn stream where DELAY_INSN used to be.  */
  start_sequence ();
  XVECEXP (seq, 0, 0) = emit_insn (delay_insn);
  XVECEXP (seq, 0, 1) = emit_insn (slot_fill_insn);
  end_sequence ();
  add_insn_after (seq_insn, after, NULL);

  return seq_insn;
}
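/* For illustration: a filled delay slot is represented from here on by
   a single insn whose pattern is a SEQUENCE, e.g.

       (insn (sequence [(call_insn ...) (insn ...)]))

   where element 0 is the branch or call and element 1 is the insn in
   its slot.  (Sketch only; the exact dump depends on the target and
   the insns involved.)  */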
/* Encode and return branch direction and prediction information for
   INSN assuming it will jump to LABEL.

   Unconditional branches return no direction information and are
   predicted as very likely taken.  */

static int
get_jump_flags (rtx insn, rtx label)
{
  int flags;

  /* get_jump_flags can be passed any insn with delay slots; these may
     be INSNs, CALL_INSNs, or JUMP_INSNs.  Only JUMP_INSNs have branch
     direction information, and only if they are conditional jumps.

     If LABEL is a return, then there is no way to determine the branch
     direction.  */
  if (JUMP_P (insn)
      && any_condjump_p (insn)
      && !ANY_RETURN_P (label))
    flags = (INSN_LUID (label) > INSN_LUID (insn))
	    ? ATTR_FLAG_forward : ATTR_FLAG_backward;
  /* No valid direction information.  */
  else
    flags = 0;

  return flags;
}
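/* Scan basic block BB for insns with a delay slot and try to fill each
   slot with an insn from earlier in the same block.  A candidate is
   accepted if the target allows it in the slot and all of its forward
   dependencies lie beyond the delay insn, so that moving it into the
   slot does not change what the intervening insns compute.  */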
static void
fill_delay_slots_from_bb (basic_block bb)
{
  rtx head, tail;
  struct deps_desc tmp_deps;

  /* Compute dependencies.  */
  get_ebb_head_tail (bb, bb, &head, &tail);
  init_deps_global ();
  init_deps (&tmp_deps, false);
  sched_analyze (&tmp_deps, head, tail);
  free_deps (&tmp_deps);
  finish_deps_global ();

  /* Print dependency information.  */
  if (dump_file && sched_dump)
    {
      fprintf (sched_dump,
	       ";; --------------- forward dependences: ------------ \n");
      fprintf (sched_dump, "\n;; --- DBR Dependences --- for bb%d \n",
	       bb->index);
      debug_dependencies (head, tail);
    }

  rtx delay_insn;
  FOR_BB_INSNS (bb, delay_insn)
    {
      if (! NONDEBUG_INSN_P (delay_insn)
	  || (GET_CODE (PATTERN (delay_insn)) == USE
	      || GET_CODE (PATTERN (delay_insn)) == CLOBBER))
	continue;

      INSN_FROM_TARGET_P (delay_insn) = 0;
      if (JUMP_P (delay_insn))
	INSN_ANNULLED_BRANCH_P (delay_insn) = 0;

      int slots_to_fill = num_delay_slots (delay_insn);
      if (slots_to_fill > 0)
	{
	  rtx slot_fill_insn = NULL;

	  /* We only handle insns with 1 delay slot.  */
	  gcc_assert (slots_to_fill == 1);

	  int flags;
	  if (JUMP_P (delay_insn))
	    flags = get_jump_flags (delay_insn, JUMP_LABEL (delay_insn));
	  else
	    flags = get_jump_flags (delay_insn, NULL_RTX);

	  for (rtx cand_insn = PREV_INSN (delay_insn);
	       cand_insn != BB_HEAD (bb);
	       cand_insn = PREV_INSN (cand_insn))
	    {
	      sd_iterator_def sd_it;
	      dep_t dep;
	      bool cand_insn_accepted = true;

	      /* CAND_INSN must be a real insn.  */
	      if (! NONDEBUG_INSN_P (cand_insn)
		  || (GET_CODE (PATTERN (cand_insn)) == USE
		      || GET_CODE (PATTERN (cand_insn)) == CLOBBER))
		continue;

	      /* It must not be a delay sequence.  It is possible in
		 principle to hoist insns across other insns with
		 delayed branches, but it's a little more difficult
		 because the DF_INSN_LUIDs are no longer valid.  */
	      if (GET_CODE (PATTERN (cand_insn)) == SEQUENCE)
		continue;

	      /* An 'asm' can't be in a delay slot.  */
	      if (GET_CODE (PATTERN (cand_insn)) == ASM_INPUT
		  || asm_noperands (PATTERN (cand_insn)) >= 0)
		continue;

	      if (! eligible_for_delay (delay_insn, 0, cand_insn, flags))
		continue;

	      if (SCHED_GROUP_P (cand_insn))
		continue;

	      rtx set = single_set (cand_insn);
	      if (! set || multiple_sets (cand_insn))
		continue;
	      rtx cand_set_dest = SET_DEST (set);

	      FOR_EACH_DEP (cand_insn, SD_LIST_FORW, sd_it, dep)
		{
		  rtx con = DEP_CON (dep);

		  /* sched-deps insists on adding anti-dependencies for
		     jumps.  We can ignore them.  */
		  if (JUMP_P (con) && DEP_TYPE (dep) == REG_DEP_ANTI)
		    continue;

		  /* If CAND_INSN sets a register that is also set by
		     the call, the dependency can be ignored.  */
		  if (CALL_P (delay_insn)
		      && delay_insn == con
		      && REG_P (cand_set_dest)
		      && ! multiple_sets (delay_insn))
		    {
		      /* TODO: Allow memory ops, but need to do something
			 about dependencies crossing DELAY_INSN.  */
		      rtx pat = PATTERN (delay_insn);
		      if (GET_CODE (pat) == PARALLEL)
			pat = XVECEXP (pat, 0, 0);
		      if (GET_CODE (pat) == SET)
			pat = SET_SRC (pat);
		      if (get_call_rtx_from (pat) != NULL_RTX)
			continue;
		    }

		  /* A bare USE before a return can be ignored, too.  */
		  if (GET_CODE (PATTERN (con)) == USE
		      && JUMP_P (delay_insn)
		      && ANY_RETURN_P (PATTERN (delay_insn)))
		    continue;

		  if (DF_INSN_LUID (delay_insn) < DF_INSN_LUID (con))
		    /* CAND_INSN has dependencies crossing over
		       DELAY_INSN.  */
		    ;
		  else
		    {
		      /* CAND_INSN is not a deps leaf.  */
		      cand_insn_accepted = false;
		      break;
		    }
		}

	      if (! cand_insn_accepted)
		continue;

	      /* Of all acceptable candidates, prefer the one with the
		 smallest default latency.  */
	      if (slot_fill_insn == NULL
		  || (insn_default_latency (cand_insn)
		      < insn_default_latency (slot_fill_insn)))
		slot_fill_insn = cand_insn;
	    }

	  if (slot_fill_insn != NULL)
	    {
	      delay_slot_fill_data_t dsf;
	      dsf = XOBNEW (&sched_dbr_obstack,
			    struct delay_slot_fill_data_d);
	      dsf->delay_insn = delay_insn;
	      dsf->slot_fill_insn = slot_fill_insn;
	      delay_slot_fill_data.safe_push (dsf);
	    }
	}
    }

  /* Free dependencies.  */
  sched_free_deps (head, tail, false);

  /* Transform the code.  */
  unsigned int ix;
  delay_slot_fill_data_t dsf;
  FOR_EACH_VEC_ELT (delay_slot_fill_data, ix, dsf)
    emit_delay_sequence2 (dsf->delay_insn, dsf->slot_fill_insn);
  delay_slot_fill_data.release ();
}

/* The main entry point in this file.  */

static void
fill_delay_slots (void)
{
  basic_block bb;

  gcc_obstack_init (&sched_dbr_obstack);

  /* Set up the various scheduler hooks.  */
  memcpy (&dbr_common_sched_info, &haifa_common_sched_info,
	  sizeof (dbr_common_sched_info));
  common_sched_info = &dbr_common_sched_info;
  sched_deps_info = &dbr_sched_deps_info;
  current_sched_info = &sched_dbr_info;

  compute_bb_for_insn ();
  df_analyze ();
  haifa_sched_init ();

  /* Try to fill delay slots with insns from the same basic block as
     the one that contains the insns with the delay slots.  */
  FOR_EACH_BB (bb)
    {
      if (bb->flags & BB_DISABLE_SCHEDULE)
	continue;
      fill_delay_slots_from_bb (bb);
      // if (single_succ_p (bb)
      //     && single_succ (bb) != EXIT_BLOCK_PTR
      //     && single_pred_p (single_succ (bb)))
      //   continue somehow (this happens a lot for basic blocks
      //   containing a single jump following a branch)
    }

  free_bb_for_insn ();
  haifa_sched_finish ();
  obstack_free (&sched_dbr_obstack, NULL);
}

#endif

static bool
gate_handle_delay_slots2 (void)
{
  /* At -O0 dataflow info isn't updated after RA.  */
  return optimize > 0 && flag_delayed_branch2; // && !crtl->dbr_scheduled_p;
}
/* Run delay slot optimization.  */

static unsigned int
rest_of_handle_delay_slots2 (void)
{
#ifdef DELAY_SLOTS
  fill_delay_slots ();
#endif
  return 0;
}

struct rtl_opt_pass pass_delay_slots2 =
{
 {
  RTL_PASS,
  "dbr2",                               /* name */
  OPTGROUP_NONE,                        /* optinfo_flags */
  gate_handle_delay_slots2,             /* gate */
  rest_of_handle_delay_slots2,          /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_DBR_SCHED,                         /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  0                                     /* todo_flags_finish */
 }
};
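/* Note: running this pass also needs a flag and a pass registration
   that are not part of this file.  A minimal sketch of the missing
   glue, where the option name and the pass placement are assumptions:

     common.opt:
       fdelayed-branch2
       Common Report Var(flag_delayed_branch2) Optimization
       Fill delay slots using the scheduler's dependency information.

     passes.c (e.g. next to the existing pass_delay_slots):
       NEXT_PASS (pass_delay_slots2);  */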