	* config/i386/i386.c (ix86_rtx_costs): Add reasonable costs for
	V*QImode shifts and multiply.
	(ix86_expand_vecop_qihi): Support shifts.
	* config/i386/i386.md (any_shift): New code iterator.
	* config/i386/sse.md (ashlv16qi3): Merge ...
	(<shift_insn>v16qi3): ... into ...
	(<shift_insn><mode>3): ... here.  Use ix86_expand_vecop_qihi
	to support SSE and AVX.
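
For reference, the core of the change: ix86_expand_vecop_qihi learns to
expand V16QI/V32QI shifts by widening each byte to a 16-bit word, doing
the shift at word width (psllw/psrlw/psraw exist, their byte-sized
counterparts do not), and keeping only the low byte of each result.
The scalar C model below is illustrative only and not part of the
patch; the enum and function names are invented for the example, and
shift counts are assumed in range (0-7).

#include <stdint.h>

enum qi_shift { QI_SHL, QI_LSHR, QI_ASHR };  /* ashift, lshiftrt, ashiftrt */

static uint8_t
model_qi_shift_via_hi (uint8_t byte, int count, enum qi_shift code)
{
  /* ix86_expand_sse_unpack: sign-extend for ashiftrt (uns_p false),
     zero-extend otherwise (uns_p true).  */
  uint16_t word = (code == QI_ASHR
		   ? (uint16_t) (int16_t) (int8_t) byte
		   : (uint16_t) byte);

  /* The HImode operation handed to expand_simple_binop.  */
  switch (code)
    {
    case QI_SHL:
      word = (uint16_t) (word << count);
      break;
    case QI_LSHR:
      word = (uint16_t) (word >> count);
      break;
    case QI_ASHR:
      /* Assumes the usual arithmetic >> on signed values.  */
      word = (uint16_t) ((int16_t) word >> count);
      break;
    }

  /* The final vec_perm keeps the even bytes, i.e. the low byte of
     each 16-bit result.  */
  return (uint8_t) word;
}
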
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7ae2060..fc30632 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31938,9 +31938,10 @@ ix86_set_reg_reg_cost (enum machine_mode mode)
    scanned.  In either case, *TOTAL contains the cost result.  */
 
 static bool
-ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
+ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 		bool speed)
 {
+  enum rtx_code code = (enum rtx_code) code_i;
   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
   enum machine_mode mode = GET_MODE (x);
   const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
@@ -32045,7 +32046,31 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
       /* ??? Should be SSE vector operation cost.  */
       /* At least for published AMD latencies, this really is the same
 	 as the latency for a simple fpu operation like fabs.  */
-      *total = cost->fabs;
+      /* V*QImode is emulated with 1-11 insns.  */
+      if (mode == V16QImode || mode == V32QImode)
+	{
+	  int count;
+	  if (TARGET_XOP && mode == V16QImode)
+	    {
+	      /* For XOP we use vpshab, which requires a broadcast of the
+		 value to the variable shift insn.  For constants this
+		 means a V16Q const in mem; even when we can perform the
+		 shift with one insn set the cost to prefer paddb.  */
+	      if (CONSTANT_P (XEXP (x, 1)))
+		{
+		  *total = (cost->fabs
+			    + rtx_cost (XEXP (x, 0), code, 0, speed)
+			    + (speed ? 2 : COSTS_N_BYTES (16)));
+		  return true;
+		}
+	      count = 3;
+	    }
+	  else
+	    count = TARGET_SSSE3 ? 7 : 11;
+	  *total = cost->fabs * count;
+	}
+      else
+	*total = cost->fabs;
       return false;
     }
   if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
@@ -32119,9 +32144,15 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
 	}
       else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	{
+	  /* V*QImode is emulated with 7-13 insns.  */
+	  if (mode == V16QImode || mode == V32QImode)
+	    {
+	      int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
+	      *total = cost->fmul * 2 + cost->fabs * extra;
+	    }
 	  /* Without sse4.1, we don't have PMULLD; it's emulated with 7
 	     insns, including two PMULUDQ.  */
-	  if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+	  else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
 	    *total = cost->fmul * 2 + cost->fabs * 5;
 	  else
 	    *total = cost->fmul;
@@ -38448,44 +38479,66 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   rtx (*gen_ih) (rtx, rtx, rtx);
   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
   struct expand_vec_perm_d d;
-  bool ok;
+  bool ok, full_interleave;
+  bool uns_p = false;
   int i;
 
-  if (qimode == V16QImode)
+  switch (qimode)
     {
+    case V16QImode:
       himode = V8HImode;
       gen_il = gen_vec_interleave_lowv16qi;
       gen_ih = gen_vec_interleave_highv16qi;
-    }
-  else if (qimode == V32QImode)
-    {
+      break;
+    case V32QImode:
       himode = V16HImode;
       gen_il = gen_avx2_interleave_lowv32qi;
       gen_ih = gen_avx2_interleave_highv32qi;
+      break;
+    default:
+      gcc_unreachable ();
     }
-  else
-    gcc_unreachable ();
 
-  /* Unpack data such that we've got a source byte in each low byte of
-     each word.  We don't care what goes into the high byte of each word.
-     Rather than trying to get zero in there, most convenient is to let
-     it be a copy of the low byte.  */
-  op1_l = gen_reg_rtx (qimode);
-  op1_h = gen_reg_rtx (qimode);
-  emit_insn (gen_il (op1_l, op1, op1));
-  emit_insn (gen_ih (op1_h, op1, op1));
+  op2_l = op2_h = op2;
+  switch (code)
+    {
+    case MULT:
+      /* Unpack data such that we've got a source byte in each low byte of
+	 each word.  We don't care what goes into the high byte of each word.
+	 Rather than trying to get zero in there, most convenient is to let
+	 it be a copy of the low byte.  */
+      op2_l = gen_reg_rtx (qimode);
+      op2_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op2_l, op2, op2));
+      emit_insn (gen_ih (op2_h, op2, op2));
+      /* FALLTHRU */
 
-  op2_l = gen_reg_rtx (qimode);
-  op2_h = gen_reg_rtx (qimode);
-  emit_insn (gen_il (op2_l, op2, op2));
-  emit_insn (gen_ih (op2_h, op2, op2));
+      op1_l = gen_reg_rtx (qimode);
+      op1_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op1_l, op1, op1));
+      emit_insn (gen_ih (op1_h, op1, op1));
+      full_interleave = qimode == V16QImode;
+      break;
+
+    case ASHIFT:
+    case LSHIFTRT:
+      uns_p = true;
+      /* FALLTHRU */
+    case ASHIFTRT:
+      op1_l = gen_reg_rtx (himode);
+      op1_h = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+      full_interleave = true;
+      break;
+    default:
+      gcc_unreachable ();
+    }
 
   /* Perform the operation.  */
-  res_l = expand_simple_binop (himode, code, gen_lowpart (himode, op1_l),
-			       gen_lowpart (himode, op2_l), NULL_RTX,
+  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
 			       1, OPTAB_DIRECT);
-  res_h = expand_simple_binop (himode, code, gen_lowpart (himode, op1_h),
-			       gen_lowpart (himode, op2_h), NULL_RTX,
+  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
 			       1, OPTAB_DIRECT);
   gcc_assert (res_l && res_h);
 
@@ -38498,11 +38551,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   d.one_operand_p = false;
   d.testing_p = false;
 
-  if (qimode == V16QImode)
+  if (full_interleave)
     {
       /* For SSE2, we used an full interleave, so the desired
 	 results are in the even elements.  */
-      for (i = 0; i < 16; ++i)
+      for (i = 0; i < 32; ++i)
 	d.perm[i] = i * 2;
     }
   else
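
An aside on the MULT path above: interleaving an operand with itself
leaves a copy of each byte in the high half of its word, and that copy
is harmless because every cross term of the word-sized product is a
multiple of 256, so the low byte of the pmullw result is exactly the
truncated byte product.  A scalar sketch of one low-half lane follows;
it is illustrative only, not part of the patch, and the function name
is invented.

#include <stdint.h>

static void
model_v16qi_mult_low_half (uint8_t dest[8],
			   const uint8_t a[8], const uint8_t b[8])
{
  int i;
  for (i = 0; i < 8; ++i)
    {
      /* punpcklbw x,x: each byte becomes the word (x << 8) | x.  */
      uint16_t wa = (uint16_t) ((a[i] << 8) | a[i]);
      uint16_t wb = (uint16_t) ((b[i] << 8) | b[i]);

      /* pmullw: low 16 bits of the product.  Every term of wa * wb
	 except a[i]*b[i] is a multiple of 256.  */
      uint16_t prod = (uint16_t) ((uint32_t) wa * wb);

      /* The vec_perm keeps the even bytes: the low byte of each word,
	 which equals (a[i] * b[i]) & 0xff.  */
      dest[i] = (uint8_t) prod;
    }
}
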
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 879b87b..da2f4b2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -711,6 +711,9 @@
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])
 
+;; Mapping of all shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Base name for define_insn
 (define_code_attr shift_insn
   [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c7c6392..691256d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10550,60 +10550,42 @@
    (set_attr "prefix_extra" "2")
    (set_attr "mode" "TI")])
 
-;; SSE2 doesn't have some shift variants, so define versions for XOP
-(define_expand "ashlv16qi3"
-  [(set (match_operand:V16QI 0 "register_operand")
-	(ashift:V16QI
-	  (match_operand:V16QI 1 "register_operand")
-	  (match_operand:SI 2 "nonmemory_operand")))]
-  "TARGET_XOP"
-{
-  rtx reg = gen_reg_rtx (V16QImode);
-  rtx par;
-  int i;
-
-  par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
-  for (i = 0; i < 16; i++)
-    XVECEXP (par, 0, i) = operands[2];
-
-  emit_insn (gen_vec_initv16qi (reg, par));
-  emit_insn (gen_xop_shav16qi3 (operands[0], operands[1], reg));
-  DONE;
-})
-
-(define_expand "<shift_insn>v16qi3"
-  [(set (match_operand:V16QI 0 "register_operand")
-	(any_shiftrt:V16QI
-	  (match_operand:V16QI 1 "register_operand")
+(define_expand "<shift_insn><mode>3"
+  [(set (match_operand:VI1_AVX2 0 "register_operand")
+	(any_shift:VI1_AVX2
+	  (match_operand:VI1_AVX2 1 "register_operand")
 	  (match_operand:SI 2 "nonmemory_operand")))]
-  "TARGET_XOP"
+  "TARGET_SSE2"
 {
-  rtx reg = gen_reg_rtx (V16QImode);
-  rtx par;
-  bool negate = false;
-  rtx (*shift_insn)(rtx, rtx, rtx);
-  int i;
-
-  if (CONST_INT_P (operands[2]))
-    operands[2] = GEN_INT (-INTVAL (operands[2]));
-  else
-    negate = true;
+  if (TARGET_XOP && <MODE>mode == V16QImode)
+    {
+      bool negate = false;
+      rtx (*gen) (rtx, rtx, rtx);
+      rtx tmp, par;
+      int i;
 
-  par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
-  for (i = 0; i < 16; i++)
-    XVECEXP (par, 0, i) = operands[2];
+      if (<CODE> != ASHIFT)
+	{
+	  if (CONST_INT_P (operands[2]))
+	    operands[2] = GEN_INT (-INTVAL (operands[2]));
+	  else
+	    negate = true;
+	}
+      par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
+      for (i = 0; i < 16; i++)
+	XVECEXP (par, 0, i) = operands[2];
 
-  emit_insn (gen_vec_initv16qi (reg, par));
+      tmp = gen_reg_rtx (V16QImode);
+      emit_insn (gen_vec_initv16qi (tmp, par));
 
-  if (negate)
-    emit_insn (gen_negv16qi2 (reg, reg));
+      if (negate)
+	emit_insn (gen_negv16qi2 (tmp, tmp));
 
-  if (<CODE> == LSHIFTRT)
-    shift_insn = gen_xop_shlv16qi3;
+      gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
+      emit_insn (gen (operands[0], operands[1], tmp));
+    }
   else
-    shift_insn = gen_xop_shav16qi3;
-
-  emit_insn (shift_insn (operands[0], operands[1], reg));
+    ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
   DONE;
 })
-- 
1.7.10.2
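
A closing note on the XOP branch of the sse.md expander above: vpshlb
(logical) and vpshab (arithmetic) read a per-byte signed count and
shift left for positive counts, right for negative ones.  That is why
the expander negates the broadcast count for the right shifts, with
GEN_INT (-INTVAL (...)) for constant counts and gen_negv16qi2 at run
time otherwise.  A per-lane scalar model follows; it is illustrative
only, the function names are invented, and in-range counts are assumed
(the real insns produce zero, or all sign bits, once the magnitude
reaches 8).

#include <stdint.h>

/* One lane of vpshlb: a logical shift whose direction is the sign of
   the per-byte count (left if positive, right if negative).  */
static uint8_t
model_vpshlb_lane (uint8_t byte, int count)
{
  return count >= 0 ? (uint8_t) (byte << count)
		    : (uint8_t) (byte >> -count);
}

/* One lane of vpshab: the same, but the right shift is arithmetic
   (assuming the usual arithmetic >> on signed values).  */
static int8_t
model_vpshab_lane (int8_t byte, int count)
{
  return count >= 0 ? (int8_t) ((uint8_t) byte << count)
		    : (int8_t) (byte >> -count);
}
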