Hi there,

I have implemented the move pattern for a V16SF type as shown below, because a V16SF value is held in four V4SF registers:

--- snip ---

;; Expander for V16SF moves.
;;
;; Operand 0 is the destination (any nonimmediate operand) and operand 1 is
;; the source (any general operand).
;;
;; While reload is neither in progress nor completed, a source that is not a
;; nonmemory operand is first forced into a register whenever the destination
;; is not a register, so that move_v16sf is not asked for such a move.
;;
;; The insns themselves are emitted by move_v16sf (); DONE then ends the
;; expansion, so the (set ...) template below is never emitted.
(define_expand "movv16sf"
[(set (match_operand:V16SF 0 "nonimmediate_operand" "")
(match_operand:V16SF 1 "general_operand" ""))]
""
" if ((reload_in_progress | reload_completed) == 0
&& !register_operand (operands[0], V16SFmode)
&& !nonmemory_operand (operands[1], V16SFmode))
operands[1] = force_reg (V16SFmode, operands[1]);

move_v16sf( operands );
DONE;
")

--- end snip ---


and in the config's .c file:


--- snip ---

void
move_v16sf (operands )
rtx operands[];
{
rtx op0 = operands[0];
rtx op1 = operands[1];
enum rtx_code code0 = GET_CODE (operands[0]);
enum rtx_code code1 = GET_CODE (operands[1]);
int subreg_offset0 = 0;
int subreg_offset1 = 0;
enum delay_type delay = DELAY_NONE;

if (code0 == REG)
{
int regno0 = REGNO (op0) + subreg_offset0;

if (code1 == REG)
{
int regno1 = REGNO (op1) + subreg_offset1;

/* Just in case, don't do anything for assigning a register
to itself, unless we are filling a delay slot. */
if (regno0 == regno1 && set_nomacro == 0) return;

emit_move_insn( gen_rtx_SUBREG (V4SFmode, op0, 0 ), gen_rtx_SUBREG( V4SFmode, op1, 0 ) );
emit_move_insn( gen_rtx_SUBREG (V4SFmode, op0, 16 ), gen_rtx_SUBREG( V4SFmode, op1, 16 ) );
emit_move_insn( gen_rtx_SUBREG (V4SFmode, op0, 32 ), gen_rtx_SUBREG( V4SFmode, op1, 32 ) );
emit_move_insn( gen_rtx_SUBREG (V4SFmode, op0, 48 ), gen_rtx_SUBREG( V4SFmode, op1, 48 ) );
}
else if (code1 == MEM)
{
rtx src_reg;

src_reg = copy_addr_to_reg ( XEXP (op1,0) );

emit_move_insn( gen_rtx_SUBREG (V4SFmode, op0, 0 ), gen_rtx_MEM( V4SFmode, src_reg ) );
emit_move_insn( gen_rtx_SUBREG (V4SFmode, op0, 16 ), gen_rtx_MEM( V4SFmode, plus_constant( src_reg, 16 ) ) );
emit_move_insn( gen_rtx_SUBREG (V4SFmode, op0, 32 ), gen_rtx_MEM( V4SFmode, plus_constant( src_reg, 32 ) ) );
emit_move_insn( gen_rtx_SUBREG (V4SFmode, op0, 48 ), gen_rtx_MEM( V4SFmode, plus_constant( src_reg, 48 ) ) );
}

}

else if (code0 == MEM)
{
if (code1 == REG)
{
rtx dest_reg;

dest_reg = copy_addr_to_reg ( XEXP (op0,0) );

emit_move_insn( gen_rtx_MEM( V4SFmode, dest_reg ), gen_rtx_SUBREG (V4SFmode, op1, 0 ) );
emit_move_insn( gen_rtx_MEM( V4SFmode, plus_constant( dest_reg, 16) ), gen_rtx_SUBREG (V4SFmode, op1, 16 ) );
emit_move_insn( gen_rtx_MEM( V4SFmode, plus_constant( dest_reg, 32) ), gen_rtx_SUBREG (V4SFmode, op1, 32 ) );
emit_move_insn( gen_rtx_MEM( V4SFmode, plus_constant( dest_reg, 48) ), gen_rtx_SUBREG (V4SFmode, op1, 48 ) );
}
}

}
--- end snip ---


This works OK, but it produces inefficient code. Here is some sample source code:

--- snip ---

/* v4 and m4 are given the machine modes V4SF and V16SF respectively through
   the `mode' attribute; the `int' base type is only a placeholder for the
   typedef syntax.  */
typedef int v4 __attribute__((mode(V4SF)));
typedef int m4 __attribute__((mode(V16SF)));

/* File-scope operands for the asm statements in main: two V4SF values and
   one V16SF value.  */
v4 vec1, vec2;
m4 frog;

/* Minimal reproducer: two asm statements chained through the local V16SF
   variable `blob'.  argc and argv are unused.  Always returns 0.  */
int main( int argc, char* argv[] )
{
m4 blob;

/* Writes blob (early-clobber output, "=&j"); reads vec1, vec2 and frog.  */
asm( "some_instruction %0,%1,%2,%3" : "=&j" (blob): "j" (vec1), "j" (vec2), "j" (frog) );
/* Writes frog (early-clobber output, "=&j"); reads blob.  */
asm( "some_instruction2 %0,%1" : "=&j" (frog) : "j" (blob) );

return 0;
}

--- end snip ---

where j is the register class for v4sf and v16sf types.
This produces a move of the V16SF value between the two asm instructions even though none is needed. Does anyone have any idea why this move isn't eliminated?

#APP
some_instruction r10,r22,r20,r00
#NO_APP
move r00,r10
move r01,r11
move r02,r12
move r03,r13
#APP
some_instruction2 r10, r00


r10 doesn't need to be preserved (it is never written out), but the compiler seems to be making a copy anyway. Worse, if "blob" is defined at global scope like "frog", the compiler also writes r10 out to memory when it shouldn't.


Any ideas appreciated.

Regards

Reply via email to