MIPS: 2'nd pass of ira, causes weird register allocation for 2-op mult

Klaus Pedersen Mon, 23 Apr 2012 08:30:27 -0700

The summary goes something like this:

It is possible for the second pass of ira to get confused and decide that
NO_REGS or a hard float register is a better choice for the result of the
2 operand mult, even though the first pass had already allocated it
optimally in GR_AND_MD1_REGS.


Two pass ira is enabled with "-fexpensive-optimizations".


Below is the code that will provoke the problem (pre-processed fixed point
function from libgcc)

-8<------------------------------------
typedef unsigned long size_t;
typedef int HItype __attribute__ ((mode (HI)));
typedef unsigned int UHItype __attribute__ ((mode (HI)));
typedef _Fract HQtype __attribute__ ((mode (HQ)));
typedef unsigned _Fract UHQtype __attribute__ ((mode (UHQ)));
typedef int SItype __attribute__ ((mode (SI)));
typedef unsigned int USItype __attribute__ ((mode (SI)));
extern void *memcpy (void *, const void *, size_t);
extern USItype __saturate1uhq (USItype);

UHQtype
__mulhelperuhq (UHQtype a, UHQtype b, int satp)
{
  UHQtype c;
  UHItype x, y, z;
  USItype dx, dy, dz;

  memcpy (&x, &a, 2);
  memcpy (&y, &b, 2);
  dx = (USItype) x;
  dy = (USItype) y;
  dz = dx * dy;
  dz += ((USItype) 1 << (16 - 1));
  dz = dz >> 16;
  if (satp)
    dz = __saturate1uhq (dz);
  z = (UHItype) dz;

  memcpy (&c, &z, 2);
  return c;
}
-8<------------------------------------

Compiling with -O1 gives pretty optimal code (check out that impressive
optimization of memcpy()):

-8<------------------------------------
        .file   1 "u1.c"
        .section .mdebug.abi32
        .previous
        .gnu_attribute 4, 3

 # -G value = 8, Arch = mips1, ISA = 1
 # GNU C version 4.7.0 (mips-sde-elf)
 #      compiled by GNU C version 4.6.3 20120306 (Red Hat 4.6.3-2), GMP
 # version 4.3.2, MPFR version 3.0.0, MPC version 0.9
 # GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
 # options passed:  u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all
 # -fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all -fverbose-asm
 # -frandom-seed=0 -O1 -msoft-float -fno-expensive-optimizations
...
__mulhelperuhq:
        .frame  $sp,24,$31              # vars= 0, regs= 1/0, args= 16, gp= 0
        .mask   0x80000000,-4
        .fmask  0x00000000,0
        .set    noreorder
        .set    nomacro
        andi    $5,$5,0xffff     # b, b
        andi    $4,$4,0xffff     # a, a
        mult    $5,$4    # b, a
        mflo    $2       # dz
        li      $3,32768                        # 0x8000         # tmp209,
        addu    $2,$2,$3         # dz, dz, tmp209
        beq     $6,$0,.L5        #, satp,,
        srl     $2,$2,16         # dz, dz,

        addiu   $sp,$sp,-24      #,,
        sw      $31,20($sp)      #,
        jal     __saturate1uhq   #
        move    $4,$2    #, dz

        lw      $31,20($sp)      #,
        addiu   $sp,$sp,24       #,,
.L5:
        j       $31
        nop
...
        .ident  "GCC: 4.7.0"
-8<------------------------------------

It looks as optimal as it gets...

Unfortunately, when enabling -fexpensive-optimizations the code gets really
bad:

-8<------------------------------------
...
 # options passed:  u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all
 # -fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all -fverbose-asm
 # -frandom-seed=0 -O1 -msoft-float -fexpensive-optimizations
...
__mulhelperuhq:
        .frame  $sp,32,$31              # vars= 8, regs= 1/0, args= 16, gp= 0
...
        addiu   $sp,$sp,-32      # <<< set up stack frame
        sw      $31,28($sp)      # <<< save link reg
        andi    $5,$5,0xffff     # b, b
        andi    $4,$4,0xffff     # a, a
        mult    $5,$4            # b, a
        mflo    $2               # <<< move from mdlo
        sw      $2,16($sp)       # <<< store mdlo on the stack
        li      $2,32768
        mflo    $3               # <<< move from mdlo again!
        addu    $2,$3,$2         # dz,, tmp209
        beq     $6,$0,.L2        #, satp,,
        srl     $2,$2,16         # dz, dz,

        jal     __saturate1uhq
        move    $4,$2    #, dz

.L2:
        lw      $31,28($sp)
        nop
        j       $31
        addiu   $sp,$sp,32
...
-8<------------------------------------

Here two additional instructions, to get mdlo and store it on the stack,
have been added. Notice how the valid mdlo value held in $2 is overwritten
and then immediately reloaded with a second mflo, and how 16($sp) is never
actually used:

        mflo    $2               # <<< move from mdlo
        sw      $2,16($sp)       # <<< store mdlo on the stack
        li      $2,32768
        mflo    $3               # <<< move from mdlo again!


The problem seems to originate from the ira pass find_costs_and_classes()
(ira-costs.c) when the second pass fails to find something better than pass
one.

One reason this happens could be the way mflo is penalized:

-8<------------------------------------
static int
mips_move_to_gpr_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
                       reg_class_t from)
{
  switch (from)
    {
    case GENERAL_REGS:
      /* A MIPS16 MOVE instruction, or a non-MIPS16 MOVE macro.  */
      return 2;

    case ACC_REGS:
      /* MFLO and MFHI.  */
      return 6;

    case FP_REGS:
      /* MFC1, etc.  */
      return 4;
-8<------------------------------------


ACC_REGS is always the target of the 2-op mult operation, so making mflo
more expensive than memory or FP doesn't make much sense as can be seen
below.

Here follows the results from the IRA pass for 3 different options,

   A: "no expensive opts"

   B: expensive opts, hard float

   C: expensive opts, soft float


The resulting code is the same in all cases - these are the few lines from
the mult onwards:

(insn 10 9 11 2 (set (reg/v:SI 199 [ dz ])
        (mult:SI (reg:SI 207 [ b+-2 ])
            (reg:SI 208 [ a+-2 ]))) u1.c:27 35 {mulsi3_internal}
     (expr_list:REG_DEAD (reg:SI 208 [ a+-2 ])
        (expr_list:REG_DEAD (reg:SI 207 [ b+-2 ])
            (nil))))

(insn 11 10 12 2 (set (reg:SI 209)
        (const_int 32768 [0x8000])) u1.c:28 280 {*movsi_internal}
     (expr_list:REG_EQUIV (const_int 32768 [0x8000])
        (nil)))

(insn 12 11 13 2 (set (reg/v:SI 200 [ dz ])
        (plus:SI (reg/v:SI 199 [ dz ])
            (reg:SI 209))) u1.c:28 10 {*addsi3}
     (expr_list:REG_DEAD (reg:SI 209)
        (expr_list:REG_DEAD (reg/v:SI 199 [ dz ])
            (nil))))


OPTION A
--------
$build/gcc/cc1 u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all
-fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all
-fverbose-asm -msoft-float -fno-expensive-optimizations


Result of IRA, (A) 'no expensive opts' (only one pass):
(Here r199 goes to GR_AND_MD1_REGS, which is a pretty good choice!)

Pass 0 for finding pseudo/allocno costs

...
    r199: preferred GR_AND_MD1_REGS, alternative NO_REGS, allocno
GR_AND_MD1_REGS
    a3 (r199,l0) best GR_AND_MD1_REGS, allocno GR_AND_MD1_REGS
...
  a3(r199,l0) costs: M16_REGS:6000,6000 T_REG:6000,6000
M16_T_REGS:6000,6000 PIC_FN_ADDR_REG:6000,6000 V1_REG:6000,6000
LEA_REGS:6000,6000 GR_REGS:6000,6000 MD1_REG:6000,6000
MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000
GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000
GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000
ALL_REGS:2000000,2000000 MEM:14000,14000
...
      Allocno a3r199 of GR_AND_MD1_REGS(27) has 25 avail. regs  2-5
7-25 30 65, node:  2-5 7-25 30 65 (confl regs =  0 1 6 26-29 31-64
66-186)
...
Disposition:
    0:r194 l0     2    3:r199 l0     2    1:r200 l0     2    5:r207 l0     5
    4:r208 l0     4    2:r209 l0     3
New iteration of spill/restore move
+++Costs: overall 1944, reg 1944, mem 0, ld 0, st 0, move 0



OPTION B
--------
$build/gcc/cc1 u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all
-fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all
-fverbose-asm -mhard-float -fexpensive-optimizations


Result of IRA, (B) 'expensive opts, hard float' (two passes):
(Here r199 is eventually allocated in FP_REGS - that is a pretty *bad*
choice!)

Pass 0 for finding pseudo/allocno costs

    a2 (r209,l0) best GR_REGS, allocno GR_REGS
    a4 (r208,l0) best GR_REGS, allocno GR_REGS
    a5 (r207,l0) best GR_REGS, allocno GR_REGS
    a1 (r200,l0) best GR_REGS, allocno GR_REGS
    a3 (r199,l0) best GR_AND_MD1_REGS, allocno GR_AND_MD1_REGS
    a0 (r194,l0) best GR_REGS, allocno GR_REGS

...
  a3(r199,l0) costs: M16_REGS:6000,6000 T_REG:6000,6000
M16_T_REGS:6000,6000 PIC_FN_ADDR_REG:6000,6000 V1_REG:6000,6000
LEA_REGS:6000,6000 GR_REGS:6000,6000 FP_REGS:14000,14000
MD1_REG:6000,6000 MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000
GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000
GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000
ALL_REGS:2000000,2000000 MEM:14000,14000
...

Pass 1 for finding pseudo/allocno costs

    r209: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS
    r208: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS
    r207: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS
    r200: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS
    r199: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS
    a3 (r199,l0) best FP_REGS, allocno FP_REGS
    r194: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS

...
  a3(r199,l0) costs: FP_REGS:14000,14000 MD_REGS:2000000,2000000
ACC_REGS:2000000,2000000 GR_AND_MD0_REGS:2000000,2000000
GR_AND_MD1_REGS:18000,18000 GR_AND_MD_REGS:2000000,2000000
GR_AND_ACC_REGS:2000000,2000000 ALL_REGS:2000000,2000000
MEM:14000,14000
...
      Allocno a3r199 of FP_REGS(32) has 32 avail. regs  32-63, node:
32-63 (confl regs =  0-31 64-186)
...
Disposition:
    0:r194 l0     2    3:r199 l0    32    1:r200 l0     2    5:r207 l0     5
    4:r208 l0     4    2:r209 l0     2
New iteration of spill/restore move
+++Costs: overall 9944, reg 9944, mem 0, ld 0, st 0, move 0



OPTION C
--------
$build/gcc/cc1 u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all
-fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all
-fverbose-asm -msoft-float -fexpensive-optimizations

It is hard to imagine it can get any worse than this. But sure enough,
it does if no hard FP regs are available. (Partly because the FP reg
in example B will be optimized out by later passes so it will never
end up in real code):

Here IRA comes to the conclusion that NO_REGS is the better choice!
    r199: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS
    a3 (r199,l0) best NO_REGS, allocno NO_REGS



Pass 0 for finding pseudo/allocno costs

    a2 (r209,l0) best GR_REGS, allocno GR_REGS
    a4 (r208,l0) best GR_REGS, allocno GR_REGS
    a5 (r207,l0) best GR_REGS, allocno GR_REGS
    a1 (r200,l0) best GR_REGS, allocno GR_REGS
    a3 (r199,l0) best GR_AND_MD1_REGS, allocno GR_AND_MD1_REGS
    a0 (r194,l0) best GR_REGS, allocno GR_REGS

...
  a3(r199,l0) costs: M16_REGS:6000,6000 T_REG:6000,6000
M16_T_REGS:6000,6000 PIC_FN_ADDR_REG:6000,6000 V1_REG:6000,6000
LEA_REGS:6000,6000 GR_REGS:6000,6000 MD1_REG:6000,6000
MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000
GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000
GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000
ALL_REGS:2000000,2000000 MEM:14000,14000
...

Pass 1 for finding pseudo/allocno costs

    r209: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS
    r208: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS
    r207: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS
    r200: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS
    r199: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS
    a3 (r199,l0) best NO_REGS, allocno NO_REGS
    r194: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS

...
  a3(r199,l0) costs: MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000
GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000
GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000
ALL_REGS:2000000,2000000 MEM:14000,14000
...
      Popping a4(r208,l0)  -- assign reg 4
Disposition:
    0:r194 l0     2    3:r199 l0   mem    1:r200 l0     2    5:r207 l0     5
    4:r208 l0     4    2:r209 l0     2
New iteration of spill/restore move
+++Costs: overall 9944, reg -4056, mem 14000, ld 0, st 0, move 0

MIPS: 2'nd pass of ira, causes weird register allocation for 2-op mult

Reply via email to