Hello! Attached patch avoids a deficiency in reload, where reload gives up on handling subregs of pseudos (please see the PR [1] for explanation by Ulrich). The patch simply avoids generating V4SF moves with V4SF subregs of V2DF values unless really necessary (i.e. moving SSE2 modes without SSE2 enabled, which shouldn't happen anyway). With patched gcc, expand pass emits (unaligned) moves in their original mode, and this mode is kept until asm is generated. The asm instruction is chosen according to the mode of insn pattern, and the mode is calculated using various influencing conditions.
2012-05-09 Uros Bizjak <ubiz...@gmail.com> PR target/44141 * config/i386/i386.c (ix86_expand_vector_move_misalign): Do not handle 128 bit vectors specially for TARGET_AVX. Emit sse2_movupd and sse_movups RTXes for TARGET_AVX, TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL or when optimizing for size. * config/i386/sse.md (*mov<mode>_internal): Remove TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling from asm output code. Calculate "mode" attribute according to optimize_function_for_size_p and TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL flag. (*<sse>_movu<ssemodesuffix><avxsizesuffix>): Choose asm template depending on the mode of the instruction. Calculate "mode" attribute according to optimize_function_for_size_p, TARGET_SSE_TYPELESS_STORES and TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL flags. (*<sse2>_movdqu<avxsizesuffix>): Ditto. Patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}. The patch also fixes the testcase from the PR. Patch will be committed to mainline SVN. [1] http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44141#c16 Uros.
Index: config/i386/sse.md =================================================================== --- config/i386/sse.md (revision 187286) +++ config/i386/sse.md (working copy) @@ -449,8 +449,6 @@ && (misaligned_operand (operands[0], <MODE>mode) || misaligned_operand (operands[1], <MODE>mode))) return "vmovupd\t{%1, %0|%0, %1}"; - else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - return "%vmovaps\t{%1, %0|%0, %1}"; else return "%vmovapd\t{%1, %0|%0, %1}"; @@ -460,8 +458,6 @@ && (misaligned_operand (operands[0], <MODE>mode) || misaligned_operand (operands[1], <MODE>mode))) return "vmovdqu\t{%1, %0|%0, %1}"; - else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - return "%vmovaps\t{%1, %0|%0, %1}"; else return "%vmovdqa\t{%1, %0|%0, %1}"; @@ -475,19 +471,21 @@ [(set_attr "type" "sselog1,ssemov,ssemov") (set_attr "prefix" "maybe_vex") (set (attr "mode") - (cond [(match_test "TARGET_AVX") + (cond [(and (eq_attr "alternative" "1,2") + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) + (if_then_else + (match_test "GET_MODE_SIZE (<MODE>mode) > 16") + (const_string "V8SF") + (const_string "V4SF")) + (match_test "TARGET_AVX") (const_string "<sseinsnmode>") - (ior (ior (match_test "optimize_function_for_size_p (cfun)") - (not (match_test "TARGET_SSE2"))) + (ior (and (eq_attr "alternative" "1,2") + (match_test "optimize_function_for_size_p (cfun)")) (and (eq_attr "alternative" "2") (match_test "TARGET_SSE_TYPELESS_STORES"))) (const_string "V4SF") - (eq (const_string "<MODE>mode") (const_string "V4SFmode")) - (const_string "V4SF") - (eq (const_string "<MODE>mode") (const_string "V2DFmode")) - (const_string "V2DF") ] - (const_string "TI")))]) + (const_string "<sseinsnmode>")))]) (define_insn "sse2_movq128" [(set (match_operand:V2DI 0 "register_operand" "=x") @@ -597,11 +595,33 @@ [(match_operand:VF 1 "nonimmediate_operand" "xm,x")] UNSPEC_MOVU))] "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))" - "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}" +{ + switch (get_attr_mode 
(insn)) + { + case MODE_V8SF: + case MODE_V4SF: + return "%vmovups\t{%1, %0|%0, %1}"; + default: + return "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"; + } +} [(set_attr "type" "ssemov") (set_attr "movu" "1") (set_attr "prefix" "maybe_vex") - (set_attr "mode" "<MODE>")]) + (set (attr "mode") + (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (if_then_else + (match_test "GET_MODE_SIZE (<MODE>mode) > 16") + (const_string "V8SF") + (const_string "V4SF")) + (match_test "TARGET_AVX") + (const_string "<MODE>") + (ior (match_test "optimize_function_for_size_p (cfun)") + (and (eq_attr "alternative" "1") + (match_test "TARGET_SSE_TYPELESS_STORES"))) + (const_string "V4SF") + ] + (const_string "<MODE>")))]) (define_expand "<sse2>_movdqu<avxsizesuffix>" [(set (match_operand:VI1 0 "nonimmediate_operand") @@ -618,7 +638,16 @@ (unspec:VI1 [(match_operand:VI1 1 "nonimmediate_operand" "xm,x")] UNSPEC_MOVU))] "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" - "%vmovdqu\t{%1, %0|%0, %1}" +{ + switch (get_attr_mode (insn)) + { + case MODE_V8SF: + case MODE_V4SF: + return "%vmovups\t{%1, %0|%0, %1}"; + default: + return "%vmovdqu\t{%1, %0|%0, %1}"; + } +} [(set_attr "type" "ssemov") (set_attr "movu" "1") (set (attr "prefix_data16") @@ -627,7 +656,20 @@ (const_string "*") (const_string "1"))) (set_attr "prefix" "maybe_vex") - (set_attr "mode" "<sseinsnmode>")]) + (set (attr "mode") + (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (if_then_else + (match_test "GET_MODE_SIZE (<MODE>mode) > 16") + (const_string "V8SF") + (const_string "V4SF")) + (match_test "TARGET_AVX") + (const_string "<sseinsnmode>") + (ior (match_test "optimize_function_for_size_p (cfun)") + (and (eq_attr "alternative" "1") + (match_test "TARGET_SSE_TYPELESS_STORES"))) + (const_string "V4SF") + ] + (const_string "<sseinsnmode>")))]) (define_insn "<sse3>_lddqu<avxsizesuffix>" [(set (match_operand:VI1 0 "register_operand" "=x") Index: config/i386/i386.c 
=================================================================== --- config/i386/i386.c (revision 187289) +++ config/i386/i386.c (working copy) @@ -15907,60 +15907,19 @@ ix86_expand_vector_move_misalign (enum machine_mod op0 = operands[0]; op1 = operands[1]; - if (TARGET_AVX) + if (TARGET_AVX + && GET_MODE_SIZE (mode) == 32) { switch (GET_MODE_CLASS (mode)) { case MODE_VECTOR_INT: case MODE_INT: - switch (GET_MODE_SIZE (mode)) - { - case 16: - if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - { - op0 = gen_lowpart (V4SFmode, op0); - op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); - } - else - { - op0 = gen_lowpart (V16QImode, op0); - op1 = gen_lowpart (V16QImode, op1); - emit_insn (gen_sse2_movdqu (op0, op1)); - } - break; - case 32: - op0 = gen_lowpart (V32QImode, op0); - op1 = gen_lowpart (V32QImode, op1); - ix86_avx256_split_vector_move_misalign (op0, op1); - break; - default: - gcc_unreachable (); - } - break; + op0 = gen_lowpart (V32QImode, op0); + op1 = gen_lowpart (V32QImode, op1); + /* FALLTHRU */ + case MODE_VECTOR_FLOAT: - switch (mode) - { - case V4SFmode: - emit_insn (gen_sse_movups (op0, op1)); - break; - case V2DFmode: - if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - { - op0 = gen_lowpart (V4SFmode, op0); - op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); - } - else - emit_insn (gen_sse2_movupd (op0, op1)); - break; - case V8SFmode: - case V4DFmode: - ix86_avx256_split_vector_move_misalign (op0, op1); - break; - default: - gcc_unreachable (); - } + ix86_avx256_split_vector_move_misalign (op0, op1); break; default: @@ -15972,16 +15931,6 @@ ix86_expand_vector_move_misalign (enum machine_mod if (MEM_P (op1)) { - /* If we're optimizing for size, movups is the smallest. */ - if (optimize_insn_for_size_p () - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - { - op0 = gen_lowpart (V4SFmode, op0); - op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); - return; - } - /* ??? 
If we have typed data, then it would appear that using movdqu is the only way to get unaligned data loaded with integer type. */ @@ -15989,16 +15938,19 @@ ix86_expand_vector_move_misalign (enum machine_mod { op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); + /* We will eventually emit movups based on insn attributes. */ emit_insn (gen_sse2_movdqu (op0, op1)); - return; } - - if (TARGET_SSE2 && mode == V2DFmode) + else if (TARGET_SSE2 && mode == V2DFmode) { rtx zero; - if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL) + if (TARGET_AVX + || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL + || optimize_function_for_size_p (cfun)) { + /* We will eventually emit movups based on insn attributes. */ emit_insn (gen_sse2_movupd (op0, op1)); return; } @@ -16030,7 +15982,10 @@ ix86_expand_vector_move_misalign (enum machine_mod } else { - if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL) + if (TARGET_AVX + || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL + || optimize_function_for_size_p (cfun)) { op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); @@ -16045,6 +16000,7 @@ ix86_expand_vector_move_misalign (enum machine_mod if (mode != V4SFmode) op0 = gen_lowpart (V4SFmode, op0); + m = adjust_address (op1, V2SFmode, 0); emit_insn (gen_sse_loadlps (op0, op0, m)); m = adjust_address (op1, V2SFmode, 8); @@ -16053,30 +16009,20 @@ ix86_expand_vector_move_misalign (enum machine_mod } else if (MEM_P (op0)) { - /* If we're optimizing for size, movups is the smallest. */ - if (optimize_insn_for_size_p () - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - { - op0 = gen_lowpart (V4SFmode, op0); - op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); - return; - } - - /* ??? Similar to above, only less clear - because of typeless stores. 
*/ - if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES - && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) { op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); + /* We will eventually emit movups based on insn attributes. */ emit_insn (gen_sse2_movdqu (op0, op1)); - return; } - - if (TARGET_SSE2 && mode == V2DFmode) + else if (TARGET_SSE2 && mode == V2DFmode) { - if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + if (TARGET_AVX + || TARGET_SSE_UNALIGNED_STORE_OPTIMAL + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL + || optimize_function_for_size_p (cfun)) + /* We will eventually emit movups based on insn attributes. */ emit_insn (gen_sse2_movupd (op0, op1)); else { @@ -16091,7 +16037,10 @@ ix86_expand_vector_move_misalign (enum machine_mod if (mode != V4SFmode) op1 = gen_lowpart (V4SFmode, op1); - if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + if (TARGET_AVX + || TARGET_SSE_UNALIGNED_STORE_OPTIMAL + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL + || optimize_function_for_size_p (cfun)) { op0 = gen_lowpart (V4SFmode, op0); emit_insn (gen_sse_movups (op0, op1));