Hi all,

The description of the relevant code at
https://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html is helpful
background for this.

The mult synthesis code assumes at some points that a shift operation can
execute in parallel with previous steps of the algorithm on superscalar
machines. However, that assumption is made in the wrong places, namely
the alg_add_factor and alg_sub_factor steps. These two steps shift the
previously accumulated total and then add (subtract) the unshifted total
to (from) the result. The comment says:

>> alg_add_factor total := total * coeff + total;
>> alg_sub_factor total := total * coeff - total;
This means there is a dependency, and the shift part cannot execute in
parallel with the calculation of the subtotal. Looking at the code, the
only place where a shift can execute in parallel with a subtotal
calculation is:

>> alg_add_t_m2 total := total + multiplicand * coeff;
>> alg_sub_t_m2 total := total - multiplicand * coeff;

But alg_add_t_m2 is only ever used with a shift of 0, so alg_sub_t_m2 is
the only place where it makes sense to make that assumption. This patch
fixes the cost calculations by moving the assumption that, in a shift+sub
operation, the shift can execute in parallel with the subtotal
calculation into the alg_sub_t_m2 case, and removing it from the
alg_add_factor and alg_sub_factor cases.

Is this a correct line of reasoning, or am I misunderstanding something
in the code?

Of course, the effect of this patch on codegen depends a lot on the rtx
costs in the backend. On aarch64 with -mcpu=cortex-a57 tuning I see the
cost limit being exceeded in more cases, with the expansion code choosing
instead a move-immediate and a mul instruction. No regressions on
SPEC2006 on a Cortex-A57.

For example, for this code:

long f0 (int x, int y) { return (long) x * 6L; }
int f1 (int x) { return x * 10; }
int f2 (int x) { return x * 100; }
int f3 (int x) { return x * 20; }
int f4 (int x) { return x * 25; }
int f5 (int x) { return x * 11; }

before this patch we generate:

f0:
        sxtw    x0, w0
        lsl     x1, x0, 3
        sub     x0, x1, x0, lsl 1
        ret
f1:
        lsl     w1, w0, 3
        add     w0, w1, w0, lsl 1
        ret
f2:
        mov     w1, 100
        mul     w0, w0, w1
        ret
f3:
        lsl     w1, w0, 4
        add     w0, w1, w0, lsl 2
        ret
f4:
        lsl     w1, w0, 5
        sub     w1, w1, w0, lsl 3
        add     w0, w1, w0
        ret
f5:
        lsl     w1, w0, 4
        sub     w1, w1, w0, lsl 2
        sub     w0, w1, w0
        ret

with this patch we generate:

f0:
        mov     w1, 6
        smull   x0, w0, w1
        ret
f1:
        add     w0, w0, w0, lsl 2
        lsl     w0, w0, 1
        ret
f2:
        mov     w1, 100
        mul     w0, w0, w1
        ret
f3:
        add     w0, w0, w0, lsl 2
        lsl     w0, w0, 2
        ret
f4:
        mov     w1, 25
        mul     w0, w0, w1
        ret
f5:
        mov     w1, 11
        mul     w0, w0, w1
        ret

Bootstrapped and tested on arm, aarch64 and x86_64-linux.
Ok for trunk?

Thanks,
Kyrill

2015-04-14  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * expmed.c (synth_mult): Only assume overlapping shift with
    previous steps in the alg_sub_t_m2 case.
commit dce3d3ba2e16a812348e4d8c4184c3779dc47c3d
Author: Kyrylo Tkachov <kyrylo.tkac...@arm.com>
Date:   Thu Mar 12 17:38:20 2015 +0000

    [expmed] Properly account for the cost and latency of shift+sub ops
    when synthesizing mults

diff --git a/gcc/expmed.c b/gcc/expmed.c
index 4fc35df..961b79e 100644
--- a/gcc/expmed.c
+++ b/gcc/expmed.c
@@ -2653,14 +2653,28 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
       m = exact_log2 (-orig_t + 1);
       if (m >= 0 && m < maxm)
 	{
-	  op_cost = shiftsub1_cost (speed, mode, m);
+	  op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
+	  /* If the target has a cheap shift-and-subtract insn use
+	     that in preference to a shift insn followed by a sub insn.
+	     Assume that the shift-and-sub is "atomic" with a latency
+	     equal to it's cost, otherwise assume that on superscalar
+	     hardware the shift may be executed concurrently with the
+	     earlier steps in the algorithm.  */
+	  if (shiftsub1_cost (speed, mode, m) <= op_cost)
+	    {
+	      op_cost = shiftsub1_cost (speed, mode, m);
+	      op_latency = op_cost;
+	    }
+	  else
+	    op_latency = add_cost (speed, mode);
+
 	  new_limit.cost = best_cost.cost - op_cost;
-	  new_limit.latency = best_cost.latency - op_cost;
+	  new_limit.latency = best_cost.latency - op_latency;
 	  synth_mult (alg_in, (unsigned HOST_WIDE_INT) (-orig_t + 1) >> m,
 		      &new_limit, mode);
 	  alg_in->cost.cost += op_cost;
-	  alg_in->cost.latency += op_cost;
+	  alg_in->cost.latency += op_latency;
 	  if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
 	    {
 	      best_cost = alg_in->cost;
@@ -2693,20 +2707,12 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
       if (t % d == 0 && t > d && m < maxm
 	  && (!cache_hit || cache_alg == alg_add_factor))
 	{
-	  /* If the target has a cheap shift-and-add instruction use
-	     that in preference to a shift insn followed by an add insn.
-	     Assume that the shift-and-add is "atomic" with a latency
-	     equal to its cost, otherwise assume that on superscalar
-	     hardware the shift may be executed concurrently with the
-	     earlier steps in the algorithm.  */
 	  op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
-	  if (shiftadd_cost (speed, mode, m) < op_cost)
-	    {
-	      op_cost = shiftadd_cost (speed, mode, m);
-	      op_latency = op_cost;
-	    }
-	  else
-	    op_latency = add_cost (speed, mode);
+	  if (shiftadd_cost (speed, mode, m) <= op_cost)
+	    op_cost = shiftadd_cost (speed, mode, m);
+
+	  op_latency = op_cost;
+
 	  new_limit.cost = best_cost.cost - op_cost;
 	  new_limit.latency = best_cost.latency - op_latency;
@@ -2731,20 +2737,11 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
       if (t % d == 0 && t > d && m < maxm
 	  && (!cache_hit || cache_alg == alg_sub_factor))
 	{
-	  /* If the target has a cheap shift-and-subtract insn use
-	     that in preference to a shift insn followed by a sub insn.
-	     Assume that the shift-and-sub is "atomic" with a latency
-	     equal to it's cost, otherwise assume that on superscalar
-	     hardware the shift may be executed concurrently with the
-	     earlier steps in the algorithm.  */
 	  op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
-	  if (shiftsub0_cost (speed, mode, m) < op_cost)
-	    {
-	      op_cost = shiftsub0_cost (speed, mode, m);
-	      op_latency = op_cost;
-	    }
-	  else
-	    op_latency = add_cost (speed, mode);
+	  if (shiftsub0_cost (speed, mode, m) <= op_cost)
+	    op_cost = shiftsub0_cost (speed, mode, m);
+
+	  op_latency = op_cost;
 	  new_limit.cost = best_cost.cost - op_cost;
 	  new_limit.latency = best_cost.latency - op_latency;