Here is the patch that enables auto-vectorization for 32-bit vector modes.

I am sending it as an RFC because the patch makes some vectorizer dump
scans fail, since it enables more vectorization than the tests expect:

Running target unix
FAIL: gcc.dg/vect/pr71264.c -flto -ffat-lto-objects  scan-tree-dump vect "vectorized 1 loops in function"
FAIL: gcc.dg/vect/pr71264.c scan-tree-dump vect "vectorized 1 loops in function"
FAIL: gcc.dg/vect/slp-28.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorized 1 loops" 1
FAIL: gcc.dg/vect/slp-28.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 1
FAIL: gcc.dg/vect/slp-28.c scan-tree-dump-times vect "vectorized 1 loops" 1
FAIL: gcc.dg/vect/slp-28.c scan-tree-dump-times vect "vectorizing stmts using SLP" 1
FAIL: gcc.dg/vect/slp-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorized 3 loops" 1
FAIL: gcc.dg/vect/slp-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 3
FAIL: gcc.dg/vect/slp-3.c scan-tree-dump-times vect "vectorized 3 loops" 1
FAIL: gcc.dg/vect/slp-3.c scan-tree-dump-times vect "vectorizing stmts using SLP" 3


Running target unix/-m32
FAIL: gcc.dg/vect/no-vfa-vect-101.c scan-tree-dump-times vect "can't determine dependence" 1
FAIL: gcc.dg/vect/no-vfa-vect-102.c scan-tree-dump-times vect "possible dependence between data-refs" 1
FAIL: gcc.dg/vect/no-vfa-vect-102a.c scan-tree-dump-times vect "possible dependence between data-refs" 1
FAIL: gcc.dg/vect/no-vfa-vect-37.c scan-tree-dump-times vect "can't determine dependence" 2
FAIL: gcc.dg/vect/pr71264.c -flto -ffat-lto-objects  scan-tree-dump vect "vectorized 1 loops in function"
FAIL: gcc.dg/vect/pr71264.c scan-tree-dump vect "vectorized 1 loops in function"
FAIL: gcc.dg/vect/slp-28.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorized 1 loops" 1
FAIL: gcc.dg/vect/slp-28.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 1
FAIL: gcc.dg/vect/slp-28.c scan-tree-dump-times vect "vectorized 1 loops" 1
FAIL: gcc.dg/vect/slp-28.c scan-tree-dump-times vect "vectorizing stmts using SLP" 1
FAIL: gcc.dg/vect/slp-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorized 3 loops" 1
FAIL: gcc.dg/vect/slp-3.c -flto -ffat-lto-objects  scan-tree-dump-times vect "vectorizing stmts using SLP" 3
FAIL: gcc.dg/vect/slp-3.c scan-tree-dump-times vect "vectorized 3 loops" 1
FAIL: gcc.dg/vect/slp-3.c scan-tree-dump-times vect "vectorizing stmts using SLP" 3
FAIL: gcc.dg/vect/vect-104.c -flto -ffat-lto-objects  scan-tree-dump-times vect "possible dependence between data-refs" 1
FAIL: gcc.dg/vect/vect-104.c scan-tree-dump-times vect "possible dependence between data-refs" 1

Please also note that the V4QI and V2HI modes do not use MMX registers, so
auto-vectorization can be enabled on 32-bit x86 targets as well.

Uros.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f3b451835da..f43f3ba060e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -22187,12 +22187,15 @@ ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
       modes->safe_push (V16QImode);
       modes->safe_push (V32QImode);
     }
-  else if (TARGET_MMX_WITH_SSE)
+  else if (TARGET_SSE2)
     modes->safe_push (V16QImode);
 
   if (TARGET_MMX_WITH_SSE)
     modes->safe_push (V8QImode);
 
+  if (TARGET_SSE2)
+    modes->safe_push (V4QImode);
+
   return 0;
 }
 
diff --git a/gcc/testsuite/gcc.target/i386/pr100637-3b.c b/gcc/testsuite/gcc.target/i386/pr100637-3b.c
new file mode 100644
index 00000000000..16df70059a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100637-3b.c
@@ -0,0 +1,56 @@
+/* PR target/100637 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse4" } */
+
+char r[4], a[4], b[4];
+unsigned char ur[4], ua[4], ub[4];
+
+void maxs (void)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    r[i] = a[i] > b[i] ? a[i] : b[i];
+}
+
+/* { dg-final { scan-assembler "pmaxsb" } } */
+
+void maxu (void)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    ur[i] = ua[i] > ub[i] ? ua[i] : ub[i];
+}
+
+/* { dg-final { scan-assembler "pmaxub" } } */
+
+void mins (void)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    r[i] = a[i] < b[i] ? a[i] : b[i];
+}
+
+/* { dg-final { scan-assembler "pminsb" } } */
+
+void minu (void)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    ur[i] = ua[i] < ub[i] ? ua[i] : ub[i];
+}
+
+/* { dg-final { scan-assembler "pminub" } } */
+
+void _abs (void)
+{
+  int i;
+
+  for (i = 0; i < 4; i++)
+    r[i] = a[i] < 0 ? -a[i] : a[i];
+}
+
+/* { dg-final { scan-assembler "pabsb" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100637-3w.c b/gcc/testsuite/gcc.target/i386/pr100637-3w.c
new file mode 100644
index 00000000000..7f1882e7a56
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100637-3w.c
@@ -0,0 +1,86 @@
+/* PR target/100637 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse4" } */
+
+short r[2], a[2], b[2];
+unsigned short ur[2], ua[2], ub[2];
+
+void mulh (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    r[i] = ((int) a[i] * b[i]) >> 16;
+}
+
+/* { dg-final { scan-assembler "pmulhw" { xfail *-*-* } } } */
+
+void mulhu (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    ur[i] = ((unsigned int) ua[i] * ub[i]) >> 16;
+}
+
+/* { dg-final { scan-assembler "pmulhuw" { xfail *-*-* } } } */
+
+void mulhrs (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    r[i] = ((((int) a[i] * b[i]) >> 14) + 1) >> 1;
+}
+
+/* { dg-final { scan-assembler "pmulhrsw" } } */
+
+void maxs (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    r[i] = a[i] > b[i] ? a[i] : b[i];
+}
+
+/* { dg-final { scan-assembler "pmaxsw" } } */
+
+void maxu (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    ur[i] = ua[i] > ub[i] ? ua[i] : ub[i];
+}
+
+/* { dg-final { scan-assembler "pmaxuw" } } */
+
+void mins (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    r[i] = a[i] < b[i] ? a[i] : b[i];
+}
+
+/* { dg-final { scan-assembler "pminsw" } } */
+
+void minu (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    ur[i] = ua[i] < ub[i] ? ua[i] : ub[i];
+}
+
+/* { dg-final { scan-assembler "pminuw" } } */
+
+void _abs (void)
+{
+  int i;
+
+  for (i = 0; i < 2; i++)
+    r[i] = a[i] < 0 ? -a[i] : a[i];
+}
+
+/* { dg-final { scan-assembler "pabsw" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100637-4b.c b/gcc/testsuite/gcc.target/i386/pr100637-4b.c
new file mode 100644
index 00000000000..198e3dd3352
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100637-4b.c
@@ -0,0 +1,19 @@
+/* PR target/100637 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+
+typedef char T;
+
+#define M 4
+
+extern T a[M], b[M], s1[M], s2[M], r[M];
+
+void foo (void)
+{
+  int j;
+
+  for (j = 0; j < M; j++)
+    r[j] = (a[j] < b[j]) ? s1[j] : s2[j];
+}
+
+/* { dg-final { scan-assembler "pcmpgtb" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100637-4w.c b/gcc/testsuite/gcc.target/i386/pr100637-4w.c
new file mode 100644
index 00000000000..0f5dacce906
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100637-4w.c
@@ -0,0 +1,19 @@
+/* PR target/100637 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+
+typedef short T;
+
+#define M 2
+
+extern T a[M], b[M], s1[M], s2[M], r[M];
+
+void foo (void)
+{
+  int j;
+
+  for (j = 0; j < M; j++)
+    r[j] = (a[j] < b[j]) ? s1[j] : s2[j];
+}
+
+/* { dg-final { scan-assembler "pcmpgtw" { xfail *-*-* } } } */

Reply via email to