https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93613
--- Comment #1 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
I've tried:

--- gcc/config/i386/sse.md.jj   2020-02-06 13:40:27.485007762 +0100
+++ gcc/config/i386/sse.md      2020-02-06 15:24:35.097743017 +0100
@@ -81,7 +81,6 @@ (define_c_enum "unspec" [
 
   ;; For AVX2 support
   UNSPEC_VPERMVAR
-  UNSPEC_VPERMTI
   UNSPEC_GATHER
   UNSPEC_VSIBADDR
 
@@ -20224,15 +20223,55 @@ (define_insn "avx512f_perm<mode>_1<mask_
    (set_attr "prefix" "<mask_prefix2>")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "avx2_permv2ti"
-  [(set (match_operand:V4DI 0 "register_operand" "=x")
-        (unspec:V4DI
-          [(match_operand:V4DI 1 "register_operand" "x")
-           (match_operand:V4DI 2 "nonimmediate_operand" "xm")
-           (match_operand:SI 3 "const_0_to_255_operand" "n")]
-          UNSPEC_VPERMTI))]
+(define_expand "avx2_permv2ti"
+  [(match_operand:V4DI 0 "register_operand")
+   (match_operand:V4DI 1 "register_operand")
+   (match_operand:V4DI 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_255_operand")]
   "TARGET_AVX2"
-  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+{
+  int mask = INTVAL (operands[3]);
+  int first = (mask & 0x08) ? 8 : (mask & 0x03) * 2;
+  int second = (mask & 0x80) ? 8 : (mask & 0x30) / 8;
+  emit_insn (gen_avx2_permv2ti_1 (operands[0], operands[1],
+                                  operands[2], CONST0_RTX (V8DImode),
+                                  GEN_INT (first),
+                                  GEN_INT (first + 1),
+                                  GEN_INT (second),
+                                  GEN_INT (second + 1)));
+  DONE;
+})
+
+(define_insn "avx2_permv2ti_1"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+        (vec_select:V4DI
+          (vec_concat:V16DI
+            (vec_concat:V8DI
+              (match_operand:V4DI 1 "register_operand" "x")
+              (match_operand:V4DI 2 "nonimmediate_operand" "xm"))
+            (match_operand:V8DI 3 "const0_operand" "C"))
+          (parallel [(match_operand 4 "const_0_to_15_operand")
+                     (match_operand 5 "const_0_to_15_operand")
+                     (match_operand 6 "const_0_to_15_operand")
+                     (match_operand 7 "const_0_to_15_operand")])))]
+  "TARGET_AVX2
+   && (INTVAL (operands[4]) & 2) == 0
+   && INTVAL (operands[5]) == INTVAL (operands[4]) + 1
+   && (INTVAL (operands[6]) & 2) == 0
+   && INTVAL (operands[7]) == INTVAL (operands[6]) + 1"
+{
+  int mask = 0;
+  if (INTVAL (operands[4]) >= 8)
+    mask |= 0x08;
+  else
+    mask |= INTVAL (operands[4]) / 2;
+  if (INTVAL (operands[6]) >= 8)
+    mask |= 0x80;
+  else
+    mask |= INTVAL (operands[6]) * 8;
+  operands[4] = GEN_INT (mask);
+  return "vperm2i128\t{%4, %2, %1, %0|%0, %1, %2, %4}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])

but unfortunately it doesn't help; I guess we'll need to improve simplify-rtx.c to deal with that (and for the last 3 functions it even makes things worse, as combine then simplifies those patterns to vector constants, but we don't have an instruction that would force the const_vector into memory that combine could match and that could be split before reload). For those I guess we want gimple folding of the builtin.

Of course, people really should use __builtin_shuffle instead of this mess... ;)