https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93613

--- Comment #1 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
I've tried:
--- gcc/config/i386/sse.md.jj   2020-02-06 13:40:27.485007762 +0100
+++ gcc/config/i386/sse.md      2020-02-06 15:24:35.097743017 +0100
@@ -81,7 +81,6 @@ (define_c_enum "unspec" [

   ;; For AVX2 support
   UNSPEC_VPERMVAR
-  UNSPEC_VPERMTI
   UNSPEC_GATHER
   UNSPEC_VSIBADDR

@@ -20224,15 +20223,55 @@ (define_insn "avx512f_perm<mode>_1<mask_
    (set_attr "prefix" "<mask_prefix2>")
    (set_attr "mode" "<sseinsnmode>")])

-(define_insn "avx2_permv2ti"
-  [(set (match_operand:V4DI 0 "register_operand" "=x")
-       (unspec:V4DI
-         [(match_operand:V4DI 1 "register_operand" "x")
-          (match_operand:V4DI 2 "nonimmediate_operand" "xm")
-          (match_operand:SI 3 "const_0_to_255_operand" "n")]
-         UNSPEC_VPERMTI))]
+(define_expand "avx2_permv2ti"
+  [(match_operand:V4DI 0 "register_operand")
+   (match_operand:V4DI 1 "register_operand")
+   (match_operand:V4DI 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_255_operand")]
   "TARGET_AVX2"
-  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+{
+  int mask = INTVAL (operands[3]);  /* Immediate selector byte (0..255).  */
+  int first = (mask & 0x08) ? 8 : (mask & 0x03) * 2;  /* Start index of the first pair: 8 if bit 3 is set, else 2 * bits[1:0].  */
+  int second = (mask & 0x80) ? 8 : (mask & 0x30) / 8;  /* Start index of the second pair: 8 if bit 7 is set, else 2 * bits[5:4].  */
+  emit_insn (gen_avx2_permv2ti_1 (operands[0], operands[1],
+                                 operands[2], CONST0_RTX (V8DImode),  /* All-zero V8DI operand.  */
+                                 GEN_INT (first),
+                                 GEN_INT (first + 1),
+                                 GEN_INT (second),
+                                 GEN_INT (second + 1)));
+  DONE;
+})
+
+(define_insn "avx2_permv2ti_1"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (vec_select:V4DI
+         (vec_concat:V16DI
+           (vec_concat:V8DI
+             (match_operand:V4DI 1 "register_operand" "x")
+             (match_operand:V4DI 2 "nonimmediate_operand" "xm"))
+           (match_operand:V8DI 3 "const0_operand" "C"))
+         (parallel [(match_operand 4 "const_0_to_15_operand")
+                    (match_operand 5 "const_0_to_15_operand")
+                    (match_operand 6 "const_0_to_15_operand")
+                    (match_operand 7 "const_0_to_15_operand")])))]
+  "TARGET_AVX2
+   && (INTVAL (operands[4]) & 1) == 0
+   && INTVAL (operands[5]) == INTVAL (operands[4]) + 1
+   && (INTVAL (operands[6]) & 1) == 0
+   && INTVAL (operands[7]) == INTVAL (operands[6]) + 1"
+{  /* The condition admits only even start indices, each followed by its successor; indices 0-3 select operand 1, 4-7 operand 2, 8-15 the zero operand 3.  */
+  int mask = 0;  /* Immediate byte rebuilt from the selector indices.  */
+  if (INTVAL (operands[4]) >= 8)  /* First pair comes from the zero vector: set bit 3.  */
+    mask |= 0x08;
+  else
+    mask |= INTVAL (operands[4]) / 2;  /* Start index 0/2/4/6 -> bits[1:0] = 0..3.  */
+  if (INTVAL (operands[6]) >= 8)  /* Second pair comes from the zero vector: set bit 7.  */
+    mask |= 0x80;
+  else
+    mask |= INTVAL (operands[6]) * 8;  /* Start index 0/2/4/6 -> bits[5:4] = 0..3.  */
+  operands[4] = GEN_INT (mask);  /* Reuse operand 4 to carry the immediate into the template.  */
+  return "vperm2i128\t{%4, %2, %1, %0|%0, %1, %2, %4}";
+}
   [(set_attr "type" "sselog")
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])

but unfortunately it doesn't help; I guess we'll need to improve simplify-rtx.c
to deal with that.  For the last 3 functions it even makes things worse:
combine then simplifies those patterns to vector constants, but we don't have
an instruction that combine could match, that would force the const_vector
into memory, and that could be split before reload.  For those I guess we want
gimple folding of the builtin.  Of course, people really should use
__builtin_shuffle instead of this mess... ;)

Reply via email to