I'm sitting here analyzing a regression with some pending jump threading changes, and I've stumbled upon a quirk in IVopts which, if nothing else, makes it very difficult to evaluate the jump threading changes.
Specifically, the set of IVs selected for a loop changes when the version #s of the SSA_NAMEs used within change -- even though there is no difference in the actual code within the loop. For example, we will get a different set of IVs for these two loops:

# BLOCK 1
# PRED: 3 [100.0%] (fallthru) 0 [100.0%] (fallthru,exec)
  # ivtmp.4_4 = PHI <ivtmp.4_3(3), 8(0)>;
  # TMT.2_2 = PHI <TMT.2_12(3), TMT.2_11(0)>;
  # i_1 = PHI <i_9(3), 0(0)>;
<L0>:;
  #   TMT.2_12 = V_MAY_DEF <TMT.2_2>;
  this_8->data[i_1] = d_7;
  i_9 = i_1 + 1;
  ivtmp.4_3 = ivtmp.4_4 - 1;
  if (ivtmp.4_3 != 0) goto <L7>; else goto <L3>;
# SUCC: 3 [87.5%] (dfs_back,true,exec) 2 [12.5%] (loop_exit,false,exec)

# BLOCK 3
# PRED: 1 [87.5%] (dfs_back,true,exec)
<L7>:;
  goto <bb 1> (<L0>);
# SUCC: 1 [100.0%] (fallthru)

# BLOCK 2
# PRED: 1 [12.5%] (loop_exit,false,exec)
<L3>:;
  return;
# SUCC: EXIT [100.0%]

# BLOCK 1
# PRED: 3 [100.0%] (fallthru) 0 [100.0%] (fallthru,exec)
  # ivtmp.4_16 = PHI <ivtmp.4_15(3), 8(0)>;
  # TMT.2_18 = PHI <TMT.2_12(3), TMT.2_11(0)>;
  # i_19 = PHI <i_9(3), 0(0)>;
<L0>:;
  #   TMT.2_12 = V_MAY_DEF <TMT.2_18>;
  this_8->data[i_19] = d_7;
  i_9 = i_19 + 1;
  ivtmp.4_15 = ivtmp.4_16 - 1;
  if (ivtmp.4_15 != 0) goto <L6>; else goto <L3>;
# SUCC: 3 [87.5%] (dfs_back,true,exec) 2 [12.5%] (loop_exit,false,exec)

# BLOCK 3
# PRED: 1 [87.5%] (dfs_back,true,exec)
<L6>:;
  goto <bb 1> (<L0>);
# SUCC: 1 [100.0%] (fallthru)

# BLOCK 2
# PRED: 1 [12.5%] (loop_exit,false,exec)
<L3>:;
  return;
# SUCC: EXIT [100.0%]

If you look closely, you'll find that these two loops are functionally equivalent -- all that has changed is the set of SSA_NAMEs in use. After IVopts the first loop looks like:

# BLOCK 1
# PRED: 3 [100.0%] (fallthru) 0 [100.0%] (fallthru,exec)
  # ivtmp.12_10 = PHI <ivtmp.12_13(3), this_8(0)>;
  # TMT.2_2 = PHI <TMT.2_12(3), TMT.2_11(0)>;
  # i_1 = PHI <i_9(3), 0(0)>;
<L0>:;
  D.1616_14 = (float *) ivtmp.12_10;
  this_15 = D.1616_14;
  #   TMT.2_12 = V_MAY_DEF <TMT.2_2>;
  *this_15 = d_7;
  i_9 = i_1 + 1;
  ivtmp.12_13 = ivtmp.12_10 + 4B;
  if (i_9 != 8) goto <L7>; else goto <L3>;
# SUCC: 3 [87.5%] (dfs_back,true,exec) 2 [12.5%] (loop_exit,false,exec)

# BLOCK 3
# PRED: 1 [87.5%] (dfs_back,true,exec)
<L7>:;
  goto <bb 1> (<L0>);
# SUCC: 1 [100.0%] (fallthru)

# BLOCK 2
# PRED: 1 [12.5%] (loop_exit,false,exec)
<L3>:;
  return;
# SUCC: EXIT [100.0%]

Whereas the second loop gets turned into:

# BLOCK 1
# PRED: 3 [100.0%] (fallthru) 0 [100.0%] (fallthru,exec)
  # ivtmp.12_23 = PHI <ivtmp.12_22(3), this_8(0)>;
  # ivtmp.4_16 = PHI <ivtmp.4_15(3), 8(0)>;
  # TMT.2_18 = PHI <TMT.2_12(3), TMT.2_11(0)>;
<L0>:;
  D.1615_6 = (float *) ivtmp.12_23;
  this_2 = D.1615_6;
  #   TMT.2_12 = V_MAY_DEF <TMT.2_18>;
  *this_2 = d_7;
  ivtmp.4_15 = ivtmp.4_16 - 1;
  ivtmp.12_22 = ivtmp.12_23 + 4B;
  if (ivtmp.4_15 != 0) goto <L6>; else goto <L3>;
# SUCC: 3 [87.5%] (dfs_back,true,exec) 2 [12.5%] (loop_exit,false,exec)

# BLOCK 3
# PRED: 1 [87.5%] (dfs_back,true,exec)
<L6>:;
  goto <bb 1> (<L0>);
# SUCC: 1 [100.0%] (fallthru)

# BLOCK 2
# PRED: 1 [12.5%] (loop_exit,false,exec)
<L3>:;
  return;
# SUCC: EXIT [100.0%]

Note carefully how the induction variables controlling the loops differ: the first loop ends up counting i_9 up from 0 and testing i_9 != 8, while the second keeps the down-counting ivtmp.4 and tests ivtmp.4_15 != 0.
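For reference, here's roughly the kind of source that produces loops of this shape. This is my reconstruction rather than the exact testcase (the this_8 in the dumps suggests the original was a C++ member function), so the names and types below are made up:

struct S
{
  float data[8];
};

void
set_all (struct S *obj, float d)
{
  int i;

  /* By the time IVopts runs, the dumps show two bivs: i counting up
     from 0, and a canonical ivtmp.4 counting down from 8 that was
     added by an earlier pass.  The address &obj->data[i] that the
     store advances through gives a third candidate.  */
  for (i = 0; i < 8; i++)
    obj->data[i] = d;
}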
What's particularly interesting is that the first loop results in the following assembly code:

.LFB3:
	pushl	%ebp
.LCFI0:
	xorl	%edx, %edx
	movl	%esp, %ebp
.LCFI1:
	movl	8(%ebp), %eax
	movl	12(%ebp), %ecx
	.p2align 4,,15
.L2:
	incl	%edx
	movl	%ecx, (%eax)
	addl	$4, %eax
	cmpl	$8, %edx
	jne	.L2
	popl	%ebp
	ret

Which actually runs faster than the code for the second loop (odd, since the second loop's code looks to my eye to be more efficient than the first):

.LFB3:
	pushl	%ebp
.LCFI0:
	movl	$8, %edx
	movl	%esp, %ebp
.LCFI1:
	movl	8(%ebp), %eax
	movl	12(%ebp), %ecx
	.p2align 4,,15
.L2:
	movl	%ecx, (%eax)
	addl	$4, %eax
	decl	%edx
	jne	.L2
	popl	%ebp
	ret

The problem appears to be related to how we add IV candidates:

/* Adds candidates based on the old induction variables.  */

static void
add_old_ivs_candidates (struct ivopts_data *data)
{
  unsigned i;
  struct iv *iv;
  bitmap_iterator bi;

  EXECUTE_IF_SET_IN_BITMAP (data->relevant, 0, i, bi)
    {
      iv = ver_info (data, i)->iv;
      if (iv && iv->biv_p && !zero_p (iv->step))
	add_old_iv_candidates (data, iv);
    }
}

Which, if I read it correctly, will add candidates to the IV array in SSA_NAME_VERSION order, since EXECUTE_IF_SET_IN_BITMAP walks the data->relevant bitmap from bit 0 upward and the bits are SSA name versions. Then we have this code:

  /* Try extending the set of induction variables by one.  */
  for (i = 0; i < n_iv_cands (data); i++)
    {
      cand = iv_cand (data, i);

      if (iv_ca_cand_used_p (ivs, cand))
	continue;

      acost = iv_ca_extend (data, ivs, cand, &act_delta, &n_ivs);
      if (!act_delta)
	continue;

      /* If we successfully added the candidate and the set is small
	 enough, try optimizing it by removing other candidates.  */
      if (n_ivs <= ALWAYS_PRUNE_CAND_SET_BOUND)
	{
	  iv_ca_delta_commit (data, ivs, act_delta, true);
	  acost = iv_ca_prune (data, ivs, cand, &tmp_delta);
	  iv_ca_delta_commit (data, ivs, act_delta, false);
	  act_delta = iv_ca_delta_join (act_delta, tmp_delta);
	}

      if (acost < best_cost)
	{
	  best_cost = acost;
	  iv_ca_delta_free (&best_delta);
	  best_delta = act_delta;
	}
      else
	iv_ca_delta_free (&act_delta);
    }

Which walks down the candidate array trying to choose a better IV set. Since the array is in SSA_NAME_VERSION order, and since the acost < best_cost test is strict -- so among equal-cost candidates, whichever one appears earlier in the array wins -- two loops which are equivalent in all ways except the SSA_NAME_VERSIONs they use can get different IV sets.

Anyway, the instability of the IVopts code when presented with loops that differ only in the version #s of the SSA_NAMEs they use is really getting in the way of understanding the performance impact of the new jump threading code. I would expect this instability to make analyzing IVopts changes in general difficult as well.

Thoughts?

Jeff
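P.S. To make the order-sensitivity concrete, here's a tiny standalone sketch (not GCC code; the cost function is obviously fake) of how a greedy walk with a strict comparison picks whichever of two equal-cost candidates it happens to visit first:

#include <stdio.h>

/* Pretend cost model: both candidates used below cost the same.  */
static int
cost (int cand)
{
  return cand % 2;
}

/* Greedy selection over CANDS, keeping only strict improvements,
   like the "if (acost < best_cost)" test above.  */
static int
pick_best (const int *cands, int n)
{
  int i, best = -1, best_cost = 1000;

  for (i = 0; i < n; i++)
    if (cost (cands[i]) < best_cost)
      {
	best_cost = cost (cands[i]);
	best = cands[i];
      }
  return best;
}

int
main (void)
{
  /* The same two candidates, visited in two different "version" orders.  */
  int order1[] = { 4, 16 };
  int order2[] = { 16, 4 };

  /* Prints "4 16": the winner depends only on visitation order.  */
  printf ("%d %d\n", pick_best (order1, 2), pick_best (order2, 2));
  return 0;
}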