Hi! On the #define N 4096 unsigned int b[N], d[N];
void foo (void) { int i; for (i = 0; i < N; i++) d[i] = b[i] / 3; } testcase I was looking at earlier today because of the XOP issues, I've noticed we generate unnecessary code: vmovdqa .LC0(%rip), %ymm2 ... vpsrlq $32, %ymm2, %ymm3 before the loop and in the loop: vmovdqa b(%rax), %ymm0 vpmuludq b(%rax), %ymm2, %ymm1 ... vpsrlq $32, %ymm0, %ymm0 vpmuludq %ymm3, %ymm0, %ymm0 ... .LC0: .long -1431655765 .long -1431655765 .long -1431655765 .long -1431655765 .long -1431655765 .long -1431655765 .long -1431655765 .long -1431655765 The first vpsrlq and the extra register live across the loop are not needed: if each pair of constants in the constant vector is equal, we can just use .LC0(%rip) (i.e. %ymm2 above) in both places. Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux. Apparently ix86_expand_binop_builtin wasn't prepared for NULL predicates (but generic code is); alternatively, perhaps I could add a predicate that would accept nonimmediate_operand or CONSTANT_VECTOR, if that is preferable. Also, not sure if force_reg or copy_to_mode_reg is preferable. 2013-04-26 Jakub Jelinek <ja...@redhat.com> * config/i386/i386.c (ix86_expand_binop_builtin): Allow NULL predicate. (const_vector_equal_evenodd_p): New function. (ix86_expand_mul_widen_evenodd): Force op1 resp. op2 into register if they aren't nonimmediate operands. If their original values satisfy const_vector_equal_evenodd_p, don't shift them. * config/i386/sse.md (mul<mode>3): Remove predicates. For the SSE4.1 case force operands[{1,2}] into registers if not nonimmediate_operand. (vec_widen_smult_even_v4si): Use nonimmediate_operand predicates instead of register_operand. (vec_widen_<s>mult_odd_<mode>): Remove predicates. 
--- gcc/config/i386/i386.c.jj 2013-04-26 15:11:37.000000000 +0200 +++ gcc/config/i386/i386.c 2013-04-26 19:03:54.777293448 +0200 @@ -30149,9 +30150,11 @@ ix86_expand_binop_builtin (enum insn_cod op1 = gen_lowpart (TImode, x); } - if (!insn_data[icode].operand[1].predicate (op0, mode0)) + if (insn_data[icode].operand[1].predicate + && !insn_data[icode].operand[1].predicate (op0, mode0)) op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[2].predicate (op1, mode1)) + if (insn_data[icode].operand[2].predicate + && !insn_data[icode].operand[2].predicate (op1, mode1)) op1 = copy_to_mode_reg (mode1, op1); pat = GEN_FCN (icode) (target, op0, op1); @@ -40826,6 +40829,24 @@ ix86_expand_vecop_qihi (enum rtx_code co gen_rtx_fmt_ee (code, qimode, op1, op2)); } +/* Helper function of ix86_expand_mul_widen_evenodd. Return true + if op is CONST_VECTOR with all odd elements equal to their + preceding element. */ + +static bool +const_vector_equal_evenodd_p (rtx op) +{ + enum machine_mode mode = GET_MODE (op); + int i, nunits = GET_MODE_NUNITS (mode); + if (GET_CODE (op) != CONST_VECTOR + || nunits != CONST_VECTOR_NUNITS (op)) + return false; + for (i = 0; i < nunits; i += 2) + if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) + return false; + return true; +} + void ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, bool uns_p, bool odd_p) @@ -40833,6 +40854,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, enum machine_mode mode = GET_MODE (op1); enum machine_mode wmode = GET_MODE (dest); rtx x; + rtx orig_op1 = op1, orig_op2 = op2; + + if (!nonimmediate_operand (op1, mode)) + op1 = force_reg (mode, op1); + if (!nonimmediate_operand (op2, mode)) + op2 = force_reg (mode, op2); /* We only play even/odd games with vectors of SImode. 
*/ gcc_assert (mode == V4SImode || mode == V8SImode); @@ -40849,10 +40876,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, } x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); - op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), - x, NULL, 1, OPTAB_DIRECT); - op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), - x, NULL, 1, OPTAB_DIRECT); + if (!const_vector_equal_evenodd_p (orig_op1)) + op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), + x, NULL, 1, OPTAB_DIRECT); + if (!const_vector_equal_evenodd_p (orig_op2)) + op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), + x, NULL, 1, OPTAB_DIRECT); op1 = gen_lowpart (mode, op1); op2 = gen_lowpart (mode, op2); } --- gcc/config/i386/sse.md.jj 2013-04-26 15:11:37.000000000 +0200 +++ gcc/config/i386/sse.md 2013-04-26 18:59:03.838753277 +0200 @@ -5631,14 +5631,16 @@ (define_insn "*sse2_pmaddwd" (define_expand "mul<mode>3" [(set (match_operand:VI4_AVX2 0 "register_operand") (mult:VI4_AVX2 - (match_operand:VI4_AVX2 1 "nonimmediate_operand") - (match_operand:VI4_AVX2 2 "nonimmediate_operand")))] + (match_operand:VI4_AVX2 1) + (match_operand:VI4_AVX2 2)))] "TARGET_SSE2" { if (TARGET_SSE4_1) { - if (CONSTANT_P (operands[2])) - operands[2] = force_const_mem (<MODE>mode, operands[2]); + if (!nonimmediate_operand (operands[1], <MODE>mode)) + operands[1] = force_reg (<MODE>mode, operands[1]); + if (!nonimmediate_operand (operands[2], <MODE>mode)) + operands[2] = force_reg (<MODE>mode, operands[2]); ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands); } else @@ -5702,8 +5704,8 @@ (define_expand "vec_widen_<s>mult_lo_<mo ;; named patterns, but signed V4SI needs special help for plain SSE2. 
(define_expand "vec_widen_smult_even_v4si" [(match_operand:V2DI 0 "register_operand") - (match_operand:V4SI 1 "register_operand") - (match_operand:V4SI 2 "register_operand")] + (match_operand:V4SI 1 "nonimmediate_operand") + (match_operand:V4SI 2 "nonimmediate_operand")] "TARGET_SSE2" { ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2], @@ -5714,8 +5716,8 @@ (define_expand "vec_widen_smult_even_v4s (define_expand "vec_widen_<s>mult_odd_<mode>" [(match_operand:<sseunpackmode> 0 "register_operand") (any_extend:<sseunpackmode> - (match_operand:VI4_AVX2 1 "register_operand")) - (match_operand:VI4_AVX2 2 "register_operand")] + (match_operand:VI4_AVX2 1)) + (match_operand:VI4_AVX2 2)] "TARGET_SSE2" { ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2], Jakub