	* config/i386/sse.md (mul<VI1_AVX2>3): Change from insn_and_split
	to pure expander; move expansion code ...
	* config/i386/i386.c (ix86_expand_vecop_qihi): ... here.  New function.
	* config/i386/i386-protos.h: Update.
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 431db6c..4e7469d 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -192,6 +192,8 @@ extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_trunc (rtx, rtx);
 extern void ix86_expand_truncdf_32 (rtx, rtx);
 
+extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
+
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
 #endif /* TREE_CODE */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 8167770..e23c418 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -38438,6 +38438,91 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   expand_vec_perm_even_odd_1 (&d, odd);
 }
 
+/* Expand a vector operation CODE for a V*QImode in terms of the
+   same operation on V*HImode.  */
+
+void
+ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  enum machine_mode qimode = GET_MODE (dest);
+  enum machine_mode himode;
+  rtx (*gen_il) (rtx, rtx, rtx);
+  rtx (*gen_ih) (rtx, rtx, rtx);
+  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+  struct expand_vec_perm_d d;
+  bool ok;
+  int i;
+
+  if (qimode == V16QImode)
+    {
+      himode = V8HImode;
+      gen_il = gen_vec_interleave_lowv16qi;
+      gen_ih = gen_vec_interleave_highv16qi;
+    }
+  else if (qimode == V32QImode)
+    {
+      himode = V16HImode;
+      gen_il = gen_avx2_interleave_lowv32qi;
+      gen_ih = gen_avx2_interleave_highv32qi;
+    }
+  else
+    gcc_unreachable ();
+
+  /* Unpack data such that we've got a source byte in each low byte of
+     each word.  We don't care what goes into the high byte of each word.
+     Rather than trying to get zero in there, most convenient is to let
+     it be a copy of the low byte.  */
+  op1_l = gen_reg_rtx (qimode);
+  op1_h = gen_reg_rtx (qimode);
+  emit_insn (gen_il (op1_l, op1, op1));
+  emit_insn (gen_ih (op1_h, op1, op1));
+
+  op2_l = gen_reg_rtx (qimode);
+  op2_h = gen_reg_rtx (qimode);
+  emit_insn (gen_il (op2_l, op2, op2));
+  emit_insn (gen_ih (op2_h, op2, op2));
+
+  /* Perform the operation.  */
+  res_l = expand_simple_binop (himode, code, gen_lowpart (himode, op1_l),
+                               gen_lowpart (himode, op2_l), NULL_RTX,
+                               1, OPTAB_DIRECT);
+  res_h = expand_simple_binop (himode, code, gen_lowpart (himode, op1_h),
+                               gen_lowpart (himode, op2_h), NULL_RTX,
+                               1, OPTAB_DIRECT);
+  gcc_assert (res_l && res_h);
+
+  /* Merge the data back into the right place.  */
+  d.target = dest;
+  d.op0 = gen_lowpart (qimode, res_l);
+  d.op1 = gen_lowpart (qimode, res_h);
+  d.vmode = qimode;
+  d.nelt = GET_MODE_NUNITS (qimode);
+  d.one_operand_p = false;
+  d.testing_p = false;
+
+  if (qimode == V16QImode)
+    {
+      /* For SSE2, we used an full interleave, so the desired
+         results are in the even elements.  */
+      for (i = 0; i < 16; ++i)
+        d.perm[i] = i * 2;
+    }
+  else
+    {
+      /* For AVX, the interleave used above was not cross-lane.  So the
+         extraction is evens but with the second and third quarter swapped.
+         Happily, that is even one insn shorter than even extraction.  */
+      for (i = 0; i < 32; ++i)
+        d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+    }
+
+  ok = ix86_expand_vec_perm_const_1 (&d);
+  gcc_assert (ok);
+
+  set_unique_reg_note (get_last_insn (), REG_EQUAL,
+                       gen_rtx_fmt_ee (code, qimode, op1, op2));
+}
+
 void
 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
 {
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6c54d33..2f361a6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5213,70 +5213,13 @@
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
-(define_insn_and_split "mul<mode>3"
+(define_expand "mul<mode>3"
   [(set (match_operand:VI1_AVX2 0 "register_operand")
        (mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
                       (match_operand:VI1_AVX2 2 "register_operand")))]
-  "TARGET_SSE2
-   && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
-  [(const_int 0)]
+  "TARGET_SSE2"
 {
-  rtx t[6];
-  int i;
-  enum machine_mode mulmode = <sseunpackmode>mode;
-
-  for (i = 0; i < 6; ++i)
-    t[i] = gen_reg_rtx (<MODE>mode);
-
-  /* Unpack data such that we've got a source byte in each low byte of
-     each word.  We don't care what goes into the high byte of each word.
-     Rather than trying to get zero in there, most convenient is to let
-     it be a copy of the low byte.  */
-  emit_insn (gen_<vec_avx2>_interleave_high<mode> (t[0], operands[1],
-                                                   operands[1]));
-  emit_insn (gen_<vec_avx2>_interleave_high<mode> (t[1], operands[2],
-                                                   operands[2]));
-  emit_insn (gen_<vec_avx2>_interleave_low<mode> (t[2], operands[1],
-                                                  operands[1]));
-  emit_insn (gen_<vec_avx2>_interleave_low<mode> (t[3], operands[2],
-                                                  operands[2]));
-
-  /* Multiply words.  The end-of-line annotations here give a picture of what
-     the output of that instruction looks like.  Dot means don't care; the
-     letters are the bytes of the result with A being the most significant.  */
-  emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (mulmode, t[4]),
-                          gen_rtx_MULT (mulmode,   /* .A.B.C.D.E.F.G.H */
-                                        gen_lowpart (mulmode, t[0]),
-                                        gen_lowpart (mulmode, t[1]))));
-  emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (mulmode, t[5]),
-                          gen_rtx_MULT (mulmode,   /* .I.J.K.L.M.N.O.P */
-                                        gen_lowpart (mulmode, t[2]),
-                                        gen_lowpart (mulmode, t[3]))));
-
-  /* Extract the even bytes and merge them back together.  */
-  if (<MODE>mode == V16QImode)
-    ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0);
-  else
-    {
-      /* Since avx2_interleave_{low,high}v32qi used above aren't cross-lane,
-         this can't be normal even extraction, but one where additionally
-         the second and third quarter are swapped.  That is even one insn
-         shorter than even extraction.  */
-      rtvec v = rtvec_alloc (32);
-      for (i = 0; i < 32; ++i)
-        RTVEC_ELT (v, i)
-          = GEN_INT (i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0));
-      t[0] = operands[0];
-      t[1] = t[5];
-      t[2] = t[4];
-      t[3] = gen_rtx_CONST_VECTOR (<MODE>mode, v);
-      ix86_expand_vec_perm_const (t);
-    }
-
-  set_unique_reg_note (get_last_insn (), REG_EQUAL,
-                       gen_rtx_MULT (<MODE>mode, operands[1], operands[2]));
+  ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
   DONE;
 })
 
-- 
1.7.10.2
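
For reference, the byte-level arithmetic behind the new ix86_expand_vecop_qihi
can be checked with a throwaway scalar model.  The program below is not GCC
code and none of its names come from the patch; it only mimics what
punpck{l,h}bw with equal operands, pmullw, and the even-element extraction do
for a V16QImode multiply, and verifies that the low byte of each product word
is exactly the byte product.  The same argument covers any CODE whose low
result byte depends only on the low operand bytes, which is the property the
"don't care" comment in the new function relies on.

/* Standalone scalar model (not GCC code; names invented for illustration)
   of the unpack / word-multiply / even-extract strategy used by
   ix86_expand_vecop_qihi for a V16QImode multiply.  */
#include <stdint.h>
#include <stdio.h>

/* Model punpcklbw/punpckhbw with both operands equal: every 16-bit word
   becomes the source byte duplicated into both halves.  */
static void
interleave_self (const uint8_t *src, uint16_t *lo, uint16_t *hi)
{
  for (int i = 0; i < 8; ++i)
    {
      lo[i] = (uint16_t) (src[i] << 8 | src[i]);
      hi[i] = (uint16_t) (src[i + 8] << 8 | src[i + 8]);
    }
}

int
main (void)
{
  uint8_t a[16], b[16], res[16];
  uint16_t al[8], ah[8], bl[8], bh[8], rl[8], rh[8];

  for (int i = 0; i < 16; ++i)
    {
      a[i] = (uint8_t) (3 * i + 1);
      b[i] = (uint8_t) (251 - 7 * i);
    }

  interleave_self (a, al, ah);
  interleave_self (b, bl, bh);

  /* Model pmullw.  The high byte of each product word is garbage, but the
     low byte is exactly a[i]*b[i] mod 256, because the duplicated high
     bytes only contribute at bit 8 and above.  */
  for (int i = 0; i < 8; ++i)
    {
      rl[i] = (uint16_t) ((uint32_t) al[i] * bl[i]);
      rh[i] = (uint16_t) ((uint32_t) ah[i] * bh[i]);
    }

  /* Model the even-byte extraction: keep the low byte of each word.  */
  for (int i = 0; i < 8; ++i)
    {
      res[i] = (uint8_t) (rl[i] & 0xff);
      res[i + 8] = (uint8_t) (rh[i] & 0xff);
    }

  for (int i = 0; i < 16; ++i)
    if (res[i] != (uint8_t) (a[i] * b[i]))
      {
        printf ("mismatch at element %d\n", i);
        return 1;
      }
  printf ("all 16 byte products match\n");
  return 0;
}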
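
The V32QImode permutation is worth a second look.  vpunpcklbw/vpunpckhbw
interleave within each 128-bit lane, so res_l ends up holding the words for
source bytes 0-7 and 16-23 while res_h holds those for bytes 8-15 and 24-31.
Plain even extraction of the res_l/res_h concatenation would therefore
misplace the second and third eight-byte groups; the perm formula compensates
by swapping those two quarters, exactly as the in-code comment says.  A
hypothetical standalone check of the indices (again, not part of the patch):

/* Check that the V32QImode permutation is even extraction with the second
   and third eight-element quarters exchanged.  Indices refer to the 64-byte
   concatenation of res_l (0..31) and res_h (32..63).  */
#include <assert.h>

int
main (void)
{
  for (int i = 0; i < 32; ++i)
    {
      int perm = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);

      /* Even extraction, but reading quarter 2 where quarter 1 of the
         output sits and vice versa.  */
      int swapped_i = (i & 24) == 8 ? i + 8 : (i & 24) == 16 ? i - 8 : i;
      assert (perm == 2 * swapped_i);
    }
  return 0;
}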