On Mon, Jun 15, 2020 at 09:29:29PM +0800, Hongtao Liu via Gcc-patches wrote: > Basically i "copy" this optimization from clang i386 backend, Refer > to pr95524 for details. > Bootstrap is ok, regression test on i386/x86-64 backend is ok. > > gcc/ChangeLog: > PR target/95524 > * gcc/config/i386/i386-expand.c > (ix86_expand_vec_shift_qihi_constant): New function.
No gcc/ prefix in gcc/ChangeLog (git push would be refused). And with or without the prefix it fits, so: * config/i386/i386-expand.c (ix86_expand_vec_shift_qihi_constant): New function. > * gcc/config/i386/i386-protos.h: Declare. This needs to mention the function name again. > * gcc/config/i386/sse.md: Optimize shift V*QImode by constant. And thus needs to mention the define_expand, (<shift_insn><mode>3) in this case. + machine_mode qimode, himode; + unsigned int shift_constant, and_constant, xor_constant; + rtx vec_const_and, vec_const_xor; + rtx tmp, op1_subreg; + rtx (*gen_shift) (rtx, rtx, rtx); + rtx (*gen_and) (rtx, rtx, rtx); + rtx (*gen_xor) (rtx, rtx, rtx); + rtx (*gen_sub) (rtx, rtx, rtx); + + /* Only optimize shift by constant. */ + if (!CONST_INT_P (op2)) + return false; + + qimode = GET_MODE (dest); + shift_constant = INTVAL (op2); I wonder if shift_constant shouldn't be unsigned HOST_WIDE_INT instead, and which >= 8 values you should try to do something about rather than punt on optimizing. If this is just from normal C shifts, then perhaps anything >= 32U would be invalid, if it is from intrinsics, perhaps there is a different behavior (masked with something?). + /* Shift constant greater equal 8 result into 0. */ + if (shift_constant > 7) + { + if (code == ASHIFT || code == LSHIFTRT) + { + emit_move_insn (dest, CONST0_RTX (qimode)); + return true; + } + /* Sign bit not known. */ + else if (code == ASHIFTRT) + return false; + else + gcc_unreachable (); + } + + gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT); + /* Record sign bit. */ + xor_constant = 1 << (8 - shift_constant - 1); + + /* Zero upper/lower bits shift from left/right element. */ + and_constant = code == ASHIFT ? 256 - (1 << shift_constant) : + (1 << (8 - shift_constant)) - 1; Formatting. Should be: and_constant = (code == ASHIFT ? 256 - (1 << shift_constant) : (1 << (8 - shift_constant)) - 1); or so. + + switch (qimode) + { + case V16QImode: + himode = V8HImode; + gen_shift = (code == ASHIFT) ? gen_ashlv8hi3 : + (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3; Similarly. : or ? should just never appear at the end of line. And probably no reason to wrap the comparisons in parens, instead wrap the whole expression. So: gen_shift = (code == ASHIFT ? gen_ashlv8hi3 : code == ASHIFTRT ? gen_ashrv8hi3 : gen_lshrv8hi3); or so? + gen_and = gen_andv16qi3; + gen_xor = gen_xorv16qi3; + gen_sub = gen_subv16qi3; + break; + case V32QImode: + himode = V16HImode; + gen_shift = (code == ASHIFT) ? gen_ashlv16hi3 : + (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3; Ditto. + gen_and = gen_andv32qi3; + gen_xor = gen_xorv32qi3; + gen_sub = gen_subv32qi3; + break; + case V64QImode: + himode = V32HImode; + gen_shift = (code == ASHIFT) ? gen_ashlv32hi3 : + (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3; Ditto. + tmp = gen_reg_rtx (himode); + vec_const_and = gen_reg_rtx (qimode); + op1_subreg = simplify_gen_subreg (himode, op1, qimode, 0); Just use lowpart_subreg? + + /* For ASHIFT and LSHIFTRT, perform operation like + vpsllw/vpsrlw $shift_constant, %op1, %dest. + vpand %vec_const_and, %dest. */ + emit_insn (gen_shift (tmp, op1_subreg, op2)); + emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0)); + emit_move_insn (vec_const_and, + ix86_build_const_vector (qimode, true, + GEN_INT (and_constant))); + emit_insn (gen_and (dest, dest, vec_const_and)); + + /* For ASHIFTRT, perform extra operation like + vpxor %vec_const_xor, %dest, %dest + vpsubb %vec_const_xor, %dest, %dest */ + if (code == ASHIFTRT) + { + vec_const_xor = gen_reg_rtx (qimode); + emit_move_insn (vec_const_xor, + ix86_build_const_vector (qimode, true, + GEN_INT (xor_constant))); + emit_insn (gen_xor (dest, dest, vec_const_xor)); + emit_insn (gen_sub (dest, dest, vec_const_xor)); + } + return true; +} + /* Expand a vector operation CODE for a V*QImode in terms of the same operation on V*HImode. */ diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index f5320494fa1..7c2ce618f3f 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -206,6 +206,7 @@ extern void ix86_expand_round_sse4 (rtx, rtx); extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx); extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx); +extern bool ix86_expand_vec_shift_qihi_constant (enum rtx_code, rtx, rtx, rtx); extern rtx ix86_split_stack_guard (void); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index aa9fdc87c68..b466950af40 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -19863,6 +19863,9 @@ gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3); emit_insn (gen (operands[0], operands[1], tmp)); } + else if (ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0], + operands[1], operands[2])) + DONE; Perhaps do instead: else if (!ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0], operands[1], operands[2])) ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]); DONE; ? Jakub