On Wed, Nov 25, 2020 at 11:57:00AM +0100, Uros Bizjak wrote:
> > I haven't done the AVX512{BW,VL} define_insn_and_split, they'll need
> > to match the UNSPEC_PCMP which are used for avx512 comparisons.
> >
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> I *think* this could be done with a combine splitter. The benefit of
> using otherwise extremely picky combine splitter (it also doesn't
> report why it can't split a combined insn) is that the split insn can
> be used in follow-up combine attempts.

So like this?  Passes the new testcase, full bootstrap/regtest queued.

2020-11-25  Jakub Jelinek  <ja...@redhat.com>

        PR target/96906
        * config/i386/sse.md (VI12_AVX2): Remove V64QI/V32HI modes.
        (VI12_AVX2_AVX512BW): New mode iterator.
        (<sse2_avx2>_<plusminus_insn><mode>3<mask_name>,
        *<sse2_avx2>_<plusminus_insn><mode>3<mask_name>,
        uavg<mode>3_ceil, <sse2_avx2>_uavg<mode>3<mask_name>,
        *<sse2_avx2>_uavg<mode>3<mask_name>): Use VI12_AVX2_AVX512BW
        iterator instead of VI12_AVX2.
        (*<sse2_avx2>_<plusminus_insn><mode>3<mask_name>): Add a new
        define_split after this insn.

        * gcc.target/i386/pr96906-1.c: New test.

--- gcc/config/i386/sse.md.jj   2020-11-25 11:58:13.003565443 +0100
+++ gcc/config/i386/sse.md      2020-11-25 12:24:44.931585573 +0100
@@ -466,6 +466,10 @@ (define_mode_iterator SSESCALARMODE
   [(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") TI])
 
 (define_mode_iterator VI12_AVX2
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI])
+
+(define_mode_iterator VI12_AVX2_AVX512BW
   [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI
    (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI])
 
@@ -11395,18 +11399,18 @@ (define_insn "*<plusminus_insn><mode>3_m
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "<sse2_avx2>_<plusminus_insn><mode>3<mask_name>"
-  [(set (match_operand:VI12_AVX2 0 "register_operand")
-       (sat_plusminus:VI12_AVX2
-         (match_operand:VI12_AVX2 1 "vector_operand")
-         (match_operand:VI12_AVX2 2 "vector_operand")))]
+  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand")
+       (sat_plusminus:VI12_AVX2_AVX512BW
+         (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand")
+         (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))]
   "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
 (define_insn "*<sse2_avx2>_<plusminus_insn><mode>3<mask_name>"
-  [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v")
-       (sat_plusminus:VI12_AVX2
-         (match_operand:VI12_AVX2 1 "vector_operand" "<comm>0,v")
-         (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm")))]
+  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand" "=x,v")
+       (sat_plusminus:VI12_AVX2_AVX512BW
+         (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand" "<comm>0,v")
+         (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand" "xBm,vm")))]
   "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>
    && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "@
@@ -11418,6 +11422,24 @@ (define_insn "*<sse2_avx2>_<plusminus_in
    (set_attr "prefix" "orig,maybe_evex")
    (set_attr "mode" "TI")])
 
+;; PR96906 - optimize psubusw compared to 0 into pminuw compared to op0.
+(define_split
+  [(set (match_operand:VI12_AVX2 0 "register_operand")
+       (eq:VI12_AVX2
+         (us_minus:VI12_AVX2
+           (match_operand:VI12_AVX2 1 "vector_operand")
+           (match_operand:VI12_AVX2 2 "vector_operand"))
+         (match_operand:VI12_AVX2 3 "const0_operand")))]
+  "TARGET_SSE2
+   && (<MODE>mode != V8HImode || TARGET_SSE4_1)
+   && ix86_pre_reload_split ()
+   && ix86_binary_operator_ok (US_MINUS, <MODE>mode, operands)"
+  [(set (match_dup 4)
+        (umin:VI12_AVX2 (match_dup 1) (match_dup 2)))
+   (set (match_dup 0)
+       (eq:VI12_AVX2 (match_dup 4) (match_dup 1)))]
+  "operands[4] = gen_reg_rtx (<MODE>mode);")
+
 (define_expand "mulv8qi3"
   [(set (match_operand:V8QI 0 "register_operand")
        (mult:V8QI (match_operand:V8QI 1 "register_operand")
@@ -12022,15 +12044,15 @@ (define_expand "sdot_prodv4si"
 })
 
 (define_expand "uavg<mode>3_ceil"
-  [(set (match_operand:VI12_AVX2 0 "register_operand")
-       (truncate:VI12_AVX2
+  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand")
+       (truncate:VI12_AVX2_AVX512BW
          (lshiftrt:<ssedoublemode>
            (plus:<ssedoublemode>
              (plus:<ssedoublemode>
                (zero_extend:<ssedoublemode>
-                 (match_operand:VI12_AVX2 1 "vector_operand"))
+                 (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand"))
                (zero_extend:<ssedoublemode>
-                 (match_operand:VI12_AVX2 2 "vector_operand")))
+                 (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))
              (match_dup 3))
            (const_int 1))))]
   "TARGET_SSE2"
@@ -15744,15 +15766,15 @@ (define_expand "vec_unpacks_hi_<mode>"
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (define_expand "<sse2_avx2>_uavg<mode>3<mask_name>"
-  [(set (match_operand:VI12_AVX2 0 "register_operand")
-       (truncate:VI12_AVX2
+  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand")
+       (truncate:VI12_AVX2_AVX512BW
          (lshiftrt:<ssedoublemode>
            (plus:<ssedoublemode>
              (plus:<ssedoublemode>
                (zero_extend:<ssedoublemode>
-                 (match_operand:VI12_AVX2 1 "vector_operand"))
+                 (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand"))
                (zero_extend:<ssedoublemode>
-                 (match_operand:VI12_AVX2 2 "vector_operand")))
+                 (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))
              (match_dup <mask_expand_op3>))
            (const_int 1))))]
   "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
@@ -15762,15 +15784,15 @@ (define_expand "<sse2_avx2>_uavg<mode>3<
 })
 
 (define_insn "*<sse2_avx2>_uavg<mode>3<mask_name>"
-  [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v")
-       (truncate:VI12_AVX2
+  [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand" "=x,v")
+       (truncate:VI12_AVX2_AVX512BW
          (lshiftrt:<ssedoublemode>
            (plus:<ssedoublemode>
              (plus:<ssedoublemode>
                (zero_extend:<ssedoublemode>
-                 (match_operand:VI12_AVX2 1 "vector_operand" "%0,v"))
+                 (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand" "%0,v"))
                (zero_extend:<ssedoublemode>
-                 (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm")))
+                 (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand" 
"xBm,vm")))
              (match_operand:<ssedoublemode> <mask_expand_op3> 
"const1_operand"))
            (const_int 1))))]
   "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>
--- gcc/testsuite/gcc.target/i386/pr96906-1.c.jj        2020-11-25 
12:22:57.510798838 +0100
+++ gcc/testsuite/gcc.target/i386/pr96906-1.c   2020-11-25 12:22:57.510798838 
+0100
@@ -0,0 +1,62 @@
+/* PR target/96906 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpminub\[^\n\r]*ymm" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpminuw\[^\n\r]*ymm" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpcmpeqb\[^\n\r]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpcmpeqw\[^\n\r]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpcmpeqb\[^\n\r]*ymm" 2 } } */
+/* { dg-final { scan-assembler-times "\tvpcmpeqw\[^\n\r]*ymm" 2 } } */
+/* { dg-final { scan-assembler-not "\tvpsubus\[bw]" } } */
+
+#include <x86intrin.h>
+
+__m128i
+f1 (__m128i x, __m128i y)
+{
+  return _mm_cmpeq_epi16 (_mm_subs_epu16 (x, y), _mm_setzero_si128 ());
+}
+
+__m128i
+f2 (__m128i x, __m128i y)
+{
+  return _mm_cmpeq_epi16 (_mm_min_epu16 (x, y), x);
+}
+
+__m128i
+f3 (__m128i x, __m128i y)
+{
+  return _mm_cmpeq_epi8 (_mm_subs_epu8 (x, y), _mm_setzero_si128 ());
+}
+
+__m128i
+f4 (__m128i x, __m128i y)
+{
+  return _mm_cmpeq_epi8 (_mm_min_epu8 (x, y), x);
+}
+
+__m256i
+f5 (__m256i x, __m256i y)
+{
+  return _mm256_cmpeq_epi16 (_mm256_subs_epu16 (x, y), _mm256_setzero_si256 
());
+}
+
+__m256i
+f6 (__m256i x, __m256i y)
+{
+  return _mm256_cmpeq_epi16 (_mm256_min_epu16 (x, y), x);
+}
+
+__m256i
+f7 (__m256i x, __m256i y)
+{
+  return _mm256_cmpeq_epi8 (_mm256_subs_epu8 (x, y), _mm256_setzero_si256 ());
+}
+
+__m256i
+f8 (__m256i x, __m256i y)
+{
+  return _mm256_cmpeq_epi8 (_mm256_min_epu8 (x, y), x);
+}

        Jakub

Reply via email to