The summary goes something like this: It is possible for the second pass of IRA to get confused and decide that NO_REGS or a hard float register is a better choice for the result of the 2-operand mult, even though the first pass already allocated it optimally in GR_AND_MD1_REGS.
Two-pass IRA is enabled with "-fexpensive-optimizations". Below is the code that will provoke the problem (pre-processed fixed point function from libgcc) -8<------------------------------------ typedef unsigned long size_t; typedef int HItype __attribute__ ((mode (HI))); typedef unsigned int UHItype __attribute__ ((mode (HI))); typedef _Fract HQtype __attribute__ ((mode (HQ))); typedef unsigned _Fract UHQtype __attribute__ ((mode (UHQ))); typedef int SItype __attribute__ ((mode (SI))); typedef unsigned int USItype __attribute__ ((mode (SI))); extern void *memcpy (void *, const void *, size_t); extern USItype __saturate1uhq (USItype); UHQtype __mulhelperuhq (UHQtype a, UHQtype b, int satp) { UHQtype c; UHItype x, y, z; USItype dx, dy, dz; memcpy (&x, &a, 2); memcpy (&y, &b, 2); dx = (USItype) x; dy = (USItype) y; dz = dx * dy; dz += ((USItype) 1 << (16 - 1)); dz = dz >> 16; if (satp) dz = __saturate1uhq (dz); z = (UHItype) dz; memcpy (&c, &z, 2); return c; } -8<------------------------------------ Compiling with -O1 gives pretty optimal code (check out that impressive optimization of memcpy()): -8<------------------------------------ .file 1 "u1.c" .section .mdebug.abi32 .previous .gnu_attribute 4, 3 # -G value = 8, Arch = mips1, ISA = 1 # GNU C version 4.7.0 (mips-sde-elf) # compiled by GNU C version 4.6.3 20120306 (Red Hat 4.6.3-2), GMP version 4.3.2, MPFR version 3.0.0, MPC version 0.9 # GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 # options passed: u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all # -fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all -fverbose-asm # -frandom-seed=0 -O1 -msoft-float -fno-expensive-optimizations ... 
__mulhelperuhq: .frame $sp,24,$31 # vars= 0, regs= 1/0, args= 16, gp= 0 .mask 0x80000000,-4 .fmask 0x00000000,0 .set noreorder .set nomacro andi $5,$5,0xffff # b, b andi $4,$4,0xffff # a, a mult $5,$4 # b, a mflo $2 # dz li $3,32768 # 0x8000 # tmp209, addu $2,$2,$3 # dz, dz, tmp209 beq $6,$0,.L5 #, satp,, srl $2,$2,16 # dz, dz, addiu $sp,$sp,-24 #,, sw $31,20($sp) #, jal __saturate1uhq # move $4,$2 #, dz lw $31,20($sp) #, addiu $sp,$sp,24 #,, .L5: j $31 nop ... .ident "GCC: 4.7.0" -8<------------------------------------ It looks as optimal as it gets... Unfortunately, when enabling -fexpensive-optimizations the code gets really bad: -8<------------------------------------ ... # options passed: u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all # -fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all -fverbose-asm # -frandom-seed=0 -O1 -msoft-float -fexpensive-optimizations ... __mulhelperuhq: .frame $sp,32,$31 # vars= 8, regs= 1/0, args= 16, gp= 0 ... addiu $sp,$sp,-32 # <<< set up stack frame sw $31,28($sp) # <<< save link reg andi $5,$5,0xffff # b, b andi $4,$4,0xffff # a, a mult $5,$4 # b, a mflo $2 # <<< move from mdlo sw $2,16($sp) # <<< store mdlo on the stack li $2,32768 mflo $3 # <<< move from mdlo again! addu $2,$3,$2 # dz,, tmp209 beq $6,$0,.L2 #, satp,, srl $2,$2,16 # dz, dz, jal __saturate1uhq move $4,$2 #, dz .L2: lw $31,28($sp) nop j $31 addiu $sp,$sp,32 ... -8<------------------------------------ Here two additional instructions, to get mdlo and store it on the stack, have been added. Notice how the valid mdlo value is overwritten and then immediately reloaded and how 16($sp) is never actually used: mflo $2 # <<< move from mdlo sw $2,16($sp) # <<< store mdlo on the stack li $2,32768 mflo $3 # <<< move from mdlo again! The problem seems to originate from the IRA pass find_costs_and_classes() (ira-costs.c) when the second pass fails to find something better than pass one. 
One reason for this could be the way mflo is penalized: -8<------------------------------------ static int mips_move_to_gpr_cost (enum machine_mode mode ATTRIBUTE_UNUSED, reg_class_t from) { switch (from) { case GENERAL_REGS: /* A MIPS16 MOVE instruction, or a non-MIPS16 MOVE macro. */ return 2; case ACC_REGS: /* MFLO and MFHI. */ return 6; case FP_REGS: /* MFC1, etc. */ return 4; -8<------------------------------------ ACC_REGS is always the target of the 2-op mult operation, so making mflo more expensive than memory or FP doesn't make much sense, as can be seen below. Here follow the results from the IRA pass for 3 different options, A: "no expensive opts" B: expensive opts, hard float C: expensive opts, soft float The resulting code is the same in all cases - these are the few lines starting at the mult: (insn 10 9 11 2 (set (reg/v:SI 199 [ dz ]) (mult:SI (reg:SI 207 [ b+-2 ]) (reg:SI 208 [ a+-2 ]))) u1.c:27 35 {mulsi3_internal} (expr_list:REG_DEAD (reg:SI 208 [ a+-2 ]) (expr_list:REG_DEAD (reg:SI 207 [ b+-2 ]) (nil)))) (insn 11 10 12 2 (set (reg:SI 209) (const_int 32768 [0x8000])) u1.c:28 280 {*movsi_internal} (expr_list:REG_EQUIV (const_int 32768 [0x8000]) (nil))) (insn 12 11 13 2 (set (reg/v:SI 200 [ dz ]) (plus:SI (reg/v:SI 199 [ dz ]) (reg:SI 209))) u1.c:28 10 {*addsi3} (expr_list:REG_DEAD (reg:SI 209) (expr_list:REG_DEAD (reg/v:SI 199 [ dz ]) (nil)))) OPTION A -------- $build/gcc/cc1 u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all -fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all -fverbose-asm -msoft-float -fno-expensive-optimizations Result of IRA, (A) 'no expensive opts' (only one pass): (Here r199 goes to GR_AND_MD1_REGS, which is a pretty good choice!) Pass 0 for finding pseudo/allocno costs ... r199: preferred GR_AND_MD1_REGS, alternative NO_REGS, allocno GR_AND_MD1_REGS a3 (r199,l0) best GR_AND_MD1_REGS, allocno GR_AND_MD1_REGS ... 
a3(r199,l0) costs: M16_REGS:6000,6000 T_REG:6000,6000 M16_T_REGS:6000,6000 PIC_FN_ADDR_REG:6000,6000 V1_REG:6000,6000 LEA_REGS:6000,6000 GR_REGS:6000,6000 MD1_REG:6000,6000 MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000 GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000 GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000 ALL_REGS:2000000,2000000 MEM:14000,14000 ... Allocno a3r199 of GR_AND_MD1_REGS(27) has 25 avail. regs 2-5 7-25 30 65, node: 2-5 7-25 30 65 (confl regs = 0 1 6 26-29 31-64 66-186) ... Disposition: 0:r194 l0 2 3:r199 l0 2 1:r200 l0 2 5:r207 l0 5 4:r208 l0 4 2:r209 l0 3 New iteration of spill/restore move +++Costs: overall 1944, reg 1944, mem 0, ld 0, st 0, move 0 OPTION B -------- $build/gcc/cc1 u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all -fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all -fverbose-asm -mhard-float -fexpensive-optimizations Result of IRA, (B) 'expensive opts, hard float' (two pass): (Here r199 is eventually allocated in FP_REGS - that is a pretty *bad* choice!) Pass 0 for finding pseudo/allocno costs a2 (r209,l0) best GR_REGS, allocno GR_REGS a4 (r208,l0) best GR_REGS, allocno GR_REGS a5 (r207,l0) best GR_REGS, allocno GR_REGS a1 (r200,l0) best GR_REGS, allocno GR_REGS a3 (r199,l0) best GR_AND_MD1_REGS, allocno GR_AND_MD1_REGS a0 (r194,l0) best GR_REGS, allocno GR_REGS ... a3(r199,l0) costs: M16_REGS:6000,6000 T_REG:6000,6000 M16_T_REGS:6000,6000 PIC_FN_ADDR_REG:6000,6000 V1_REG:6000,6000 LEA_REGS:6000,6000 GR_REGS:6000,6000 FP_REGS:14000,14000 MD1_REG:6000,6000 MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000 GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000 GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000 ALL_REGS:2000000,2000000 MEM:14000,14000 ... 
Pass 1 for finding pseudo/allocno costs r209: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS r208: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS r207: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS r200: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS r199: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS a3 (r199,l0) best FP_REGS, allocno FP_REGS r194: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS ... a3(r199,l0) costs: FP_REGS:14000,14000 MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000 GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000 GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000 ALL_REGS:2000000,2000000 MEM:14000,14000 ... Allocno a3r199 of FP_REGS(32) has 32 avail. regs 32-63, node: 32-63 (confl regs = 0-31 64-186) ... Disposition: 0:r194 l0 2 3:r199 l0 32 1:r200 l0 2 5:r207 l0 5 4:r208 l0 4 2:r209 l0 2 New iteration of spill/restore move +++Costs: overall 9944, reg 9944, mem 0, ld 0, st 0, move 0 OPTION C -------- $build/gcc/cc1 u1.c -mno-mips16 -O1 -march=mips1 -fdump-tree-all -fdump-ipa-all -ftree-vectorizer-verbose=9 -fdump-rtl-all -fverbose-asm -msoft-float -fexpensive-optimizations It is hard to imagine it can be any worse than this. But sure enough, it does get worse if no hard FP regs are available. (Partly because the FP reg in example B will be optimized out by later passes so it will never end up in real code): Here IRA comes to the conclusion that NO_REGS is the better choice! r199: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS a3 (r199,l0) best NO_REGS, allocno NO_REGS Pass 0 for finding pseudo/allocno costs a2 (r209,l0) best GR_REGS, allocno GR_REGS a4 (r208,l0) best GR_REGS, allocno GR_REGS a5 (r207,l0) best GR_REGS, allocno GR_REGS a1 (r200,l0) best GR_REGS, allocno GR_REGS a3 (r199,l0) best GR_AND_MD1_REGS, allocno GR_AND_MD1_REGS a0 (r194,l0) best GR_REGS, allocno GR_REGS ... 
a3(r199,l0) costs: M16_REGS:6000,6000 T_REG:6000,6000 M16_T_REGS:6000,6000 PIC_FN_ADDR_REG:6000,6000 V1_REG:6000,6000 LEA_REGS:6000,6000 GR_REGS:6000,6000 MD1_REG:6000,6000 MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000 GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000 GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000 ALL_REGS:2000000,2000000 MEM:14000,14000 ... Pass 1 for finding pseudo/allocno costs r209: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS r208: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS r207: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS r200: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS r199: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS a3 (r199,l0) best NO_REGS, allocno NO_REGS r194: preferred GR_REGS, alternative NO_REGS, allocno GR_REGS ... a3(r199,l0) costs: MD_REGS:2000000,2000000 ACC_REGS:2000000,2000000 GR_AND_MD0_REGS:2000000,2000000 GR_AND_MD1_REGS:18000,18000 GR_AND_MD_REGS:2000000,2000000 GR_AND_ACC_REGS:2000000,2000000 ALL_REGS:2000000,2000000 MEM:14000,14000 ... Popping a4(r208,l0) -- assign reg 4 Disposition: 0:r194 l0 2 3:r199 l0 mem 1:r200 l0 2 5:r207 l0 5 4:r208 l0 4 2:r209 l0 2 New iteration of spill/restore move +++Costs: overall 9944, reg -4056, mem 14000, ld 0, st 0, move 0