Hello! The testcase from the PR triggers a split in *movdf_internal_rex64 move pattern too late in the compilation process. Attached patch fixes this by emitting relevant moves directly, without splitting them to DImode moves at all [it looks to me, that _rex64 pattern was copied directly from 32bit *movdf_internal, since it doesn't take into account the fact, that we can move DFmode value with movq/movabsq insn].
The only complication is with DFmode immediates. A movabsq insn can copy the value into register directly, with a bit of trickery in ix86_print_operand. However, to encourage gcc to load FP constants from memory, "F -> r" case is penalized with "!". "F -> m" case is also penalized, since there is no direct DF/DImode move of immediate to memory, and this alternative still has to be split (this alternative is the same as in DImode case). As an added benefit, all new instructions (modulo F->m case) can be marked as "imove" type instead of "multi" type. 2011-03-23 Uros Bizjak <ubiz...@gmail.com> PR target/48237 * config/i386/i386.md (*movdf_internal_rex64): Do not split alternatives that can be handled with movq or movabsq insn. (*movdf_internal): Disable for !TARGET_64BIT. (*movdf_internal_nointeger): Ditto. * config/i386/i386.c (ix86_print_operand): Handle DFmode immediates. testsuite/ChangeLog: 2011-03-23 Uros Bizjak <ubiz...@gmail.com> PR target/48237 * gcc.target/i386/pr48237.c: New test. The newly added code in ix86_print_operand is in fact never triggered in the testsuite, and I was not able to construct a testcase that would exercise this part of the compiler, so in order to avoid nasty surprises, I would kindly ask other x86 maintainers to review newly added code, especially ix86_print_operand part (the approach was in fact copied from output_pic_addr_const function). Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {, -m32}. Thanks, Uros.
Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 171353) +++ config/i386/i386.md (working copy) @@ -2915,9 +2915,9 @@ (define_insn "*movdf_internal_rex64" [(set (match_operand:DF 0 "nonimmediate_operand" - "=f,m,f,r ,m ,Y2*x,Y2*x,Y2*x,m ,Yi,r ") + "=f,m,f,r ,m,!r,!m,Y2*x,Y2*x,Y2*x,m ,Yi,r ") (match_operand:DF 1 "general_operand" - "fm,f,G,rmF,Fr,C ,Y2*x,m ,Y2*x,r ,Yi"))] + "fm,f,G,rm,r,F ,F ,C ,Y2*x,m ,Y2*x,r ,Yi"))] "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1])) && (reload_in_progress || reload_completed || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) @@ -2938,9 +2938,15 @@ case 3: case 4: - return "#"; + return "mov{q}\t{%1, %0|%0, %1}"; case 5: + return "movabs{q}\t{%1, %0|%0, %1}"; + + case 6: + return "#"; + + case 7: switch (get_attr_mode (insn)) { case MODE_V4SF: @@ -2958,9 +2964,9 @@ default: gcc_unreachable (); } - case 6: - case 7: case 8: + case 9: + case 10: switch (get_attr_mode (insn)) { case MODE_V4SF: @@ -2995,17 +3001,27 @@ gcc_unreachable (); } - case 9: - case 10: + case 11: + case 12: return "%vmovd\t{%1, %0|%0, %1}"; default: gcc_unreachable(); } } - [(set_attr "type" "fmov,fmov,fmov,multi,multi,sselog1,ssemov,ssemov,ssemov,ssemov,ssemov") + [(set_attr "type" "fmov,fmov,fmov,imov,imov,imov,multi,sselog1,ssemov,ssemov,ssemov,ssemov,ssemov") + (set (attr "modrm") + (if_then_else + (and (eq_attr "alternative" "5") (eq_attr "type" "imov")) + (const_string "0") + (const_string "*"))) + (set (attr "length_immediate") + (if_then_else + (and (eq_attr "alternative" "5") (eq_attr "type" "imov")) + (const_string "8") + (const_string "*"))) (set (attr "prefix") - (if_then_else (eq_attr "alternative" "0,1,2,3,4") + (if_then_else (eq_attr "alternative" "0,1,2,3,4,5,6") (const_string "orig") (const_string "maybe_vex"))) (set (attr "prefix_data16") @@ -3015,18 +3031,18 @@ (set (attr "mode") (cond [(eq_attr "alternative" "0,1,2") (const_string "DF") - (eq_attr "alternative" "3,4,9,10") + (eq_attr "alternative" "3,4,5,6,11,12") (const_string "DI") /* For SSE1, we have many fewer alternatives. */ (eq (symbol_ref "TARGET_SSE2") (const_int 0)) - (cond [(eq_attr "alternative" "5,6") + (cond [(eq_attr "alternative" "7,8") (const_string "V4SF") ] (const_string "V2SF")) /* xorps is one byte shorter. */ - (eq_attr "alternative" "5") + (eq_attr "alternative" "7") (cond [(ne (symbol_ref "optimize_function_for_size_p (cfun)") (const_int 0)) (const_string "V4SF") @@ -3041,7 +3057,7 @@ chains, otherwise use short move to avoid extra work. movaps encodes one byte shorter. */ - (eq_attr "alternative" "6") + (eq_attr "alternative" "8") (cond [(ne (symbol_ref "optimize_function_for_size_p (cfun)") (const_int 0)) @@ -3054,7 +3070,7 @@ /* For architectures resolving dependencies on register parts we may avoid extra work to zero out upper part of register. */ - (eq_attr "alternative" "7") + (eq_attr "alternative" "9") (if_then_else (ne (symbol_ref "TARGET_SSE_SPLIT_REGS") (const_int 0)) @@ -3068,7 +3084,7 @@ "=f,m,f,r ,o ,Y2*x,Y2*x,Y2*x,m ") (match_operand:DF 1 "general_operand" "fm,f,G,roF,Fr,C ,Y2*x,m ,Y2*x"))] - "!(MEM_P (operands[0]) && MEM_P (operands[1])) + "!TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1])) && optimize_function_for_speed_p (cfun) && TARGET_INTEGER_DFMODE_MOVES && (reload_in_progress || reload_completed @@ -3208,9 +3224,9 @@ "=f,m,f,*r ,o ,Y2*x,Y2*x,Y2*x ,m ") (match_operand:DF 1 "general_operand" "fm,f,G,*roF,*Fr,C ,Y2*x,mY2*x,Y2*x"))] - "!(MEM_P (operands[0]) && MEM_P (operands[1])) - && ((optimize_function_for_size_p (cfun) - || !TARGET_INTEGER_DFMODE_MOVES) && !TARGET_64BIT) + "!TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1])) + && (optimize_function_for_size_p (cfun) + || !TARGET_INTEGER_DFMODE_MOVES) && (reload_in_progress || reload_completed || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) || (!(TARGET_SSE2 && TARGET_SSE_MATH) Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 171353) +++ config/i386/i386.c (working copy) @@ -14398,17 +14398,26 @@ ix86_print_operand (FILE *file, rtx x, i fprintf (file, "0x%08x", (unsigned int) l); } - /* These float cases don't actually occur as immediate operands. */ else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode) { - char dstr[30]; + REAL_VALUE_TYPE r; + long l[2]; - real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); - fputs (dstr, file); + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + REAL_VALUE_TO_TARGET_DOUBLE (r, l); + + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + /* We can use %d if the number is <32 bits and positive. */ + if (l[1] || l[0] < 0) + fprintf (file, "0x%lx%08lx", + (unsigned long) l[1], (unsigned long) l[0]); + else + fprintf (file, HOST_WIDE_INT_PRINT_DEC, l[0]); } - else if (GET_CODE (x) == CONST_DOUBLE - && GET_MODE (x) == XFmode) + /* These float cases don't actually occur as immediate operands. */ + else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode) { char dstr[30]; Index: testsuite/gcc.target/i386/pr48237.c =================================================================== --- testsuite/gcc.target/i386/pr48237.c (revision 0) +++ testsuite/gcc.target/i386/pr48237.c (revision 0) @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fcaller-saves -fschedule-insns2 -fselective-scheduling2 -mtune=core2" } */ + +union double_union +{ + double d; + int i[2]; +}; + +void bar (int, ...); + +void +foo (double d) +{ + union double_union du = { d }; + while (1) + { + du.i[1] -= 0x100000L; + bar (0, du.d); + du.d += d; + } +}