I'm sitting here analyzing a regression with some pending jump threading changes, and I've stumbled upon a quirk in IVopts which, if nothing else, makes it very difficult to evaluate the jump threading changes.
Specifically, the set of IVs selected for a loop changes when the version #s of the SSA_NAMEs used within change -- even though there is no difference in the actual code within the loop. For example, we will get a different set of IVs for these two loops:

# BLOCK 1
# PRED: 3 [100.0%] (fallthru) 0 [100.0%] (fallthru,exec)
  # ivtmp.4_4 = PHI <ivtmp.4_3(3), 8(0)>;
  # TMT.2_2 = PHI <TMT.2_12(3), TMT.2_11(0)>;
  # i_1 = PHI <i_9(3), 0(0)>;
<L0>:;
  #   TMT.2_12 = V_MAY_DEF <TMT.2_2>;
  this_8->data[i_1] = d_7;
  i_9 = i_1 + 1;
  ivtmp.4_3 = ivtmp.4_4 - 1;
  if (ivtmp.4_3 != 0) goto <L7>; else goto <L3>;
# SUCC: 3 [87.5%] (dfs_back,true,exec) 2 [12.5%] (loop_exit,false,exec)

# BLOCK 3
# PRED: 1 [87.5%] (dfs_back,true,exec)
<L7>:;
  goto <bb 1> (<L0>);
# SUCC: 1 [100.0%] (fallthru)

# BLOCK 2
# PRED: 1 [12.5%] (loop_exit,false,exec)
<L3>:;
  return;
# SUCC: EXIT [100.0%]

# BLOCK 1
# PRED: 3 [100.0%] (fallthru) 0 [100.0%] (fallthru,exec)
  # ivtmp.4_16 = PHI <ivtmp.4_15(3), 8(0)>;
  # TMT.2_18 = PHI <TMT.2_12(3), TMT.2_11(0)>;
  # i_19 = PHI <i_9(3), 0(0)>;
<L0>:;
  #   TMT.2_12 = V_MAY_DEF <TMT.2_18>;
  this_8->data[i_19] = d_7;
  i_9 = i_19 + 1;
  ivtmp.4_15 = ivtmp.4_16 - 1;
  if (ivtmp.4_15 != 0) goto <L6>; else goto <L3>;
# SUCC: 3 [87.5%] (dfs_back,true,exec) 2 [12.5%] (loop_exit,false,exec)

# BLOCK 3
# PRED: 1 [87.5%] (dfs_back,true,exec)
<L6>:;
  goto <bb 1> (<L0>);
# SUCC: 1 [100.0%] (fallthru)

# BLOCK 2
# PRED: 1 [12.5%] (loop_exit,false,exec)
<L3>:;
  return;
# SUCC: EXIT [100.0%]

If you look closely, you'll find that these two loops are functionally equivalent -- all that has changed is the set of SSA_NAMEs in use. After IVopts the first loop looks like:

# BLOCK 1
# PRED: 3 [100.0%] (fallthru) 0 [100.0%] (fallthru,exec)
  # ivtmp.12_10 = PHI <ivtmp.12_13(3), this_8(0)>;
  # TMT.2_2 = PHI <TMT.2_12(3), TMT.2_11(0)>;
  # i_1 = PHI <i_9(3), 0(0)>;
<L0>:;
  D.1616_14 = (float *) ivtmp.12_10;
  this_15 = D.1616_14;
  #   TMT.2_12 = V_MAY_DEF <TMT.2_2>;
  *this_15 = d_7;
  i_9 = i_1 + 1;
  ivtmp.12_13 = ivtmp.12_10 + 4B;
  if (i_9 != 8) goto <L7>; else goto <L3>;
# SUCC: 3 [87.5%] (dfs_back,true,exec) 2 [12.5%] (loop_exit,false,exec)

# BLOCK 3
# PRED: 1 [87.5%] (dfs_back,true,exec)
<L7>:;
  goto <bb 1> (<L0>);
# SUCC: 1 [100.0%] (fallthru)

# BLOCK 2
# PRED: 1 [12.5%] (loop_exit,false,exec)
<L3>:;
  return;
# SUCC: EXIT [100.0%]

Whereas the second loop gets turned into:

# BLOCK 1
# PRED: 3 [100.0%] (fallthru) 0 [100.0%] (fallthru,exec)
  # ivtmp.12_23 = PHI <ivtmp.12_22(3), this_8(0)>;
  # ivtmp.4_16 = PHI <ivtmp.4_15(3), 8(0)>;
  # TMT.2_18 = PHI <TMT.2_12(3), TMT.2_11(0)>;
<L0>:;
  D.1615_6 = (float *) ivtmp.12_23;
  this_2 = D.1615_6;
  #   TMT.2_12 = V_MAY_DEF <TMT.2_18>;
  *this_2 = d_7;
  ivtmp.4_15 = ivtmp.4_16 - 1;
  ivtmp.12_22 = ivtmp.12_23 + 4B;
  if (ivtmp.4_15 != 0) goto <L6>; else goto <L3>;
# SUCC: 3 [87.5%] (dfs_back,true,exec) 2 [12.5%] (loop_exit,false,exec)

# BLOCK 3
# PRED: 1 [87.5%] (dfs_back,true,exec)
<L6>:;
  goto <bb 1> (<L0>);
# SUCC: 1 [100.0%] (fallthru)

# BLOCK 2
# PRED: 1 [12.5%] (loop_exit,false,exec)
<L3>:;
  return;
# SUCC: EXIT [100.0%]

Note carefully how the induction variables controlling the loops differ: the first loop ends up counting i_9 up from 0 and testing i_9 != 8, while the second keeps the down-counting ivtmp.4 and tests ivtmp.4_15 != 0.
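For reference, here's roughly the kind of source that produces loops of this shape. This is my reconstruction rather than the exact testcase (the this_8 in the dumps suggests the original was a C++ member function), so the names and types below are made up:

struct S
{
  float data[8];
};

void
set_all (struct S *obj, float d)
{
  int i;

  /* By the time IVopts runs, the dumps show two bivs: i counting up
     from 0, and a canonical ivtmp.4 counting down from 8 that was
     added by an earlier pass.  The address &obj->data[i] that the
     store advances through gives a third candidate.  */
  for (i = 0; i < 8; i++)
    obj->data[i] = d;
}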
What's particularly interesting is that the first loop results in the following assembly code:

.LFB3:
	pushl	%ebp
.LCFI0:
	xorl	%edx, %edx
	movl	%esp, %ebp
.LCFI1:
	movl	8(%ebp), %eax
	movl	12(%ebp), %ecx
	.p2align 4,,15
.L2:
	incl	%edx
	movl	%ecx, (%eax)
	addl	$4, %eax
	cmpl	$8, %edx
	jne	.L2
	popl	%ebp
	ret

Which actually runs faster than the code for the second loop (odd, since the second loop's code looks to my eye to be more efficient than the first):

.LFB3:
	pushl	%ebp
.LCFI0:
	movl	$8, %edx
	movl	%esp, %ebp
.LCFI1:
	movl	8(%ebp), %eax
	movl	12(%ebp), %ecx
	.p2align 4,,15
.L2:
	movl	%ecx, (%eax)
	addl	$4, %eax
	decl	%edx
	jne	.L2
	popl	%ebp
	ret

The problem appears to be related to how we add IV candidates:

/* Adds candidates based on the old induction variables.  */

static void
add_old_ivs_candidates (struct ivopts_data *data)
{
  unsigned i;
  struct iv *iv;
  bitmap_iterator bi;

  EXECUTE_IF_SET_IN_BITMAP (data->relevant, 0, i, bi)
    {
      iv = ver_info (data, i)->iv;
      if (iv && iv->biv_p && !zero_p (iv->step))
	add_old_iv_candidates (data, iv);
    }
}

Which, if I read it correctly, will add candidates to the IV array in SSA_NAME_VERSION order, since EXECUTE_IF_SET_IN_BITMAP walks the data->relevant bitmap from bit 0 upward and the bits are SSA name versions. Then we have this code:

  /* Try extending the set of induction variables by one.  */
  for (i = 0; i < n_iv_cands (data); i++)
    {
      cand = iv_cand (data, i);

      if (iv_ca_cand_used_p (ivs, cand))
	continue;

      acost = iv_ca_extend (data, ivs, cand, &act_delta, &n_ivs);
      if (!act_delta)
	continue;

      /* If we successfully added the candidate and the set is small
	 enough, try optimizing it by removing other candidates.  */
      if (n_ivs <= ALWAYS_PRUNE_CAND_SET_BOUND)
	{
	  iv_ca_delta_commit (data, ivs, act_delta, true);
	  acost = iv_ca_prune (data, ivs, cand, &tmp_delta);
	  iv_ca_delta_commit (data, ivs, act_delta, false);
	  act_delta = iv_ca_delta_join (act_delta, tmp_delta);
	}

      if (acost < best_cost)
	{
	  best_cost = acost;
	  iv_ca_delta_free (&best_delta);
	  best_delta = act_delta;
	}
      else
	iv_ca_delta_free (&act_delta);
    }

Which walks down the candidate array trying to choose a better IV set. Since the array is in SSA_NAME_VERSION order, and since the acost < best_cost test is strict -- so among equal-cost candidates, whichever one appears earlier in the array wins -- two loops which are equivalent in all ways except the SSA_NAME_VERSIONs they use can get different IV sets.

Anyway, the instability of the IVopts code when presented with loops that differ only in the version #s of the SSA_NAMEs they use is really getting in the way of understanding the performance impact of the new jump threading code. I would expect this instability to make analyzing IVopts changes in general difficult as well.

Thoughts?

Jeff
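P.S. To make the order-sensitivity concrete, here's a tiny standalone sketch (not GCC code; the cost function is obviously fake) of how a greedy walk with a strict comparison picks whichever of two equal-cost candidates it happens to visit first:

#include <stdio.h>

/* Pretend cost model: both candidates used below cost the same.  */
static int
cost (int cand)
{
  return cand % 2;
}

/* Greedy selection over CANDS, keeping only strict improvements,
   like the "if (acost < best_cost)" test above.  */
static int
pick_best (const int *cands, int n)
{
  int i, best = -1, best_cost = 1000;

  for (i = 0; i < n; i++)
    if (cost (cands[i]) < best_cost)
      {
	best_cost = cost (cands[i]);
	best = cands[i];
      }
  return best;
}

int
main (void)
{
  /* The same two candidates, visited in two different "version" orders.  */
  int order1[] = { 4, 16 };
  int order2[] = { 16, 4 };

  /* Prints "4 16": the winner depends only on visitation order.  */
  printf ("%d %d\n", pick_best (order1, 2), pick_best (order2, 2));
  return 0;
}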