On 30/11/2022 15:37, Jakub Jelinek wrote:
On Wed, Nov 30, 2022 at 03:17:30PM +0000, Andrew Stubbs wrote:
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16.c
@@ -0,0 +1,89 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
...
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */

Maybe better add -ffat-lto-objects to dg-additional-options and drop
the dg-skip-if (if it works with that, for all similar tests)?

The tests are already run with -ffat-lto-objects and the test still fails (pattern found zero times). I don't know why.

Aside from that, I've made all the other changes you requested.

OK now?

Andrew
vect: inbranch SIMD clones

There has been support for generating "inbranch" SIMD clones for a long time,
but nothing actually uses them (as far as I can see).

This patch adds support for a subset of the possible cases (those using
mask_mode == VOIDmode).  The other cases fail to vectorize, just as before,
so there should be no regressions.

The supported subset should cover all cases needed by amdgcn, at present.

gcc/ChangeLog:

        * internal-fn.cc (expand_MASK_CALL): New.
        * internal-fn.def (MASK_CALL): New.
        * internal-fn.h (expand_MASK_CALL): New prototype.
        * omp-simd-clone.cc (simd_clone_adjust_argument_types): Set vector_type
        for mask arguments also.
        * tree-if-conv.cc: Include cgraph.h.
        (if_convertible_stmt_p): Do if-conversion for calls to functions with
        SIMD clones.
        (predicate_statements): Convert functions to IFN_MASK_CALL.
        * tree-vect-loop.cc (vect_get_datarefs_in_loop): Recognise
        IFN_MASK_CALL as a SIMD function call.
        * tree-vect-stmts.cc (vectorizable_simd_clone_call): Handle
        IFN_MASK_CALL as an inbranch SIMD function call.
        Generate the mask vector arguments.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/vect-simd-clone-16.c: New test.
        * gcc.dg/vect/vect-simd-clone-16b.c: New test.
        * gcc.dg/vect/vect-simd-clone-16c.c: New test.
        * gcc.dg/vect/vect-simd-clone-16d.c: New test.
        * gcc.dg/vect/vect-simd-clone-16e.c: New test.
        * gcc.dg/vect/vect-simd-clone-16f.c: New test.
        * gcc.dg/vect/vect-simd-clone-17.c: New test.
        * gcc.dg/vect/vect-simd-clone-17b.c: New test.
        * gcc.dg/vect/vect-simd-clone-17c.c: New test.
        * gcc.dg/vect/vect-simd-clone-17d.c: New test.
        * gcc.dg/vect/vect-simd-clone-17e.c: New test.
        * gcc.dg/vect/vect-simd-clone-17f.c: New test.
        * gcc.dg/vect/vect-simd-clone-18.c: New test.
        * gcc.dg/vect/vect-simd-clone-18b.c: New test.
        * gcc.dg/vect/vect-simd-clone-18c.c: New test.
        * gcc.dg/vect/vect-simd-clone-18d.c: New test.
        * gcc.dg/vect/vect-simd-clone-18e.c: New test.
        * gcc.dg/vect/vect-simd-clone-18f.c: New test.

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 9471f543191..d9e11bfc62a 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4527,3 +4527,10 @@ void
 expand_ASSUME (internal_fn, gcall *)
 {
 }
+
+void
+expand_MASK_CALL (internal_fn, gcall *)
+{
+  /* This IFN should only exist between ifcvt and vect passes.  */
+  gcc_unreachable ();
+}
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 61516dab66d..301c3780659 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -466,6 +466,9 @@ DEF_INTERNAL_FN (TRAP, ECF_CONST | ECF_LEAF | ECF_NORETURN
 DEF_INTERNAL_FN (ASSUME, ECF_CONST | ECF_LEAF | ECF_NOTHROW
                         | ECF_LOOPING_CONST_OR_PURE, NULL)
 
+/* For if-conversion of inbranch SIMD clones.  */
+DEF_INTERNAL_FN (MASK_CALL, ECF_NOVOPS, NULL)
+
 #undef DEF_INTERNAL_INT_FN
 #undef DEF_INTERNAL_FLT_FN
 #undef DEF_INTERNAL_FLT_FLOATN_FN
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 21b1ce43df6..ced92c041bb 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -244,6 +244,7 @@ extern void expand_SHUFFLEVECTOR (internal_fn, gcall *);
 extern void expand_SPACESHIP (internal_fn, gcall *);
 extern void expand_TRAP (internal_fn, gcall *);
 extern void expand_ASSUME (internal_fn, gcall *);
+extern void expand_MASK_CALL (internal_fn, gcall *);
 
 extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
 
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 21d69aa8747..afb7d99747b 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -937,6 +937,7 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
        }
       sc->args[i].orig_type = base_type;
       sc->args[i].arg_type = SIMD_CLONE_ARG_TYPE_MASK;
+      sc->args[i].vector_type = adj.type;
     }
 
   if (node->definition)
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16.c
new file mode 100644
index 00000000000..ffaabb30d1e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16.c
@@ -0,0 +1,89 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+/* Test that simd inbranch clones work correctly.  */
+
+#ifndef TYPE
+#define TYPE int
+#endif
+
+/* A simple function that will be cloned.  */
+#pragma omp declare simd
+TYPE __attribute__((noinline))
+foo (TYPE a)
+{
+  return a + 1;
+}
+
+/* Check that "inbranch" clones are called correctly.  */
+
+void __attribute__((noinline))
+masked (TYPE * __restrict a, TYPE * __restrict b, int size)
+{
+  #pragma omp simd
+  for (int i = 0; i < size; i++)
+    b[i] = a[i]<1 ? foo(a[i]) : a[i];
+}
+
+/* Check that "inbranch" works when there might be unrolling.  */
+
+void __attribute__((noinline))
+masked_fixed (TYPE * __restrict a, TYPE * __restrict b)
+{
+  #pragma omp simd
+  for (int i = 0; i < 128; i++)
+    b[i] = a[i]<1 ? foo(a[i]) : a[i];
+}
+
+/* Validate the outputs.  */
+
+void
+check_masked (TYPE *b, int size)
+{
+  for (int i = 0; i < size; i++)
+    if (((TYPE)i < 1 && b[i] != (TYPE)(i + 1))
+       || ((TYPE)i >= 1 && b[i] != (TYPE)i))
+      {
+       __builtin_printf ("error at %d\n", i);
+       __builtin_exit (1);
+      }
+}
+
+int
+main ()
+{
+  TYPE a[1024];
+  TYPE b[1024];
+
+  for (int i = 0; i < 1024; i++)
+    a[i] = i;
+
+  masked_fixed (a, b);
+  check_masked (b, 128);
+
+  /* Test various sizes to cover machines with different vectorization
+     factors.  */
+  for (int size = 8; size <= 1024; size *= 2)
+    {
+      masked (a, b, size);
+      check_masked (b, size);
+    }
+
+  /* Test sizes that might exercise the partial vector code-path.  */
+  for (int size = 8; size <= 1024; size *= 2)
+    {
+      masked (a, b, size-4);
+      check_masked (b, size-4);
+    }
+
+  return 0;
+}
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 18 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 7 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16b.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16b.c
new file mode 100644
index 00000000000..a503ef85238
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16b.c
@@ -0,0 +1,14 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE float
+#include "vect-simd-clone-16.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 19 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 7 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16c.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16c.c
new file mode 100644
index 00000000000..6563879df71
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16c.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE short
+#include "vect-simd-clone-16.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 7 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=short.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 16 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16d.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16d.c
new file mode 100644
index 00000000000..6c5e69482e5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16d.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE char
+#include "vect-simd-clone-16.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 7 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=char.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 16 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16e.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16e.c
new file mode 100644
index 00000000000..6690844deae
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16e.c
@@ -0,0 +1,14 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE double
+#include "vect-simd-clone-16.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 19 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 7 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
new file mode 100644
index 00000000000..e7b35a6a2dc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE __INT64_TYPE__
+#include "vect-simd-clone-16.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 7 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=int64.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 18 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17.c
new file mode 100644
index 00000000000..6f5d374a417
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17.c
@@ -0,0 +1,89 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+/* Test that simd inbranch clones work correctly.  */
+
+#ifndef TYPE
+#define TYPE int
+#endif
+
+/* A simple function that will be cloned.  */
+#pragma omp declare simd uniform(b)
+TYPE __attribute__((noinline))
+foo (TYPE a, TYPE b)
+{
+  return a + b;
+}
+
+/* Check that "inbranch" clones are called correctly.  */
+
+void __attribute__((noinline))
+masked (TYPE * __restrict a, TYPE * __restrict b, int size)
+{
+  #pragma omp simd
+  for (int i = 0; i < size; i++)
+    b[i] = a[i]<1 ? foo(a[i], 1) : a[i];
+}
+
+/* Check that "inbranch" works when there might be unrolling.  */
+
+void __attribute__((noinline))
+masked_fixed (TYPE * __restrict a, TYPE * __restrict b)
+{
+  #pragma omp simd
+  for (int i = 0; i < 128; i++)
+    b[i] = a[i]<1 ? foo(a[i], 1) : a[i];
+}
+
+/* Validate the outputs.  */
+
+void
+check_masked (TYPE *b, int size)
+{
+  for (int i = 0; i < size; i++)
+    if (((TYPE)i < 1 && b[i] != (TYPE)(i + 1))
+       || ((TYPE)i >= 1 && b[i] != (TYPE)i))
+      {
+       __builtin_printf ("error at %d\n", i);
+       __builtin_exit (1);
+      }
+}
+
+int
+main ()
+{
+  TYPE a[1024];
+  TYPE b[1024];
+
+  for (int i = 0; i < 1024; i++)
+    a[i] = i;
+
+  masked_fixed (a, b);
+  check_masked (b, 128);
+
+  /* Test various sizes to cover machines with different vectorization
+     factors.  */
+  for (int size = 8; size <= 1024; size *= 2)
+    {
+      masked (a, b, size);
+      check_masked (b, size);
+    }
+
+  /* Test sizes that might exercise the partial vector code-path.  */
+  for (int size = 8; size <= 1024; size *= 2)
+    {
+      masked (a, b, size-4);
+      check_masked (b, size-4);
+    }
+
+  return 0;
+}
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 18 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17b.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17b.c
new file mode 100644
index 00000000000..1e2c3ab11b3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17b.c
@@ -0,0 +1,14 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE float
+#include "vect-simd-clone-17.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 19 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17c.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17c.c
new file mode 100644
index 00000000000..007001de669
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17c.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE short
+#include "vect-simd-clone-17.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=short.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 16 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17d.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17d.c
new file mode 100644
index 00000000000..abb85a4ceee
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17d.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE char
+#include "vect-simd-clone-17.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=char.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 16 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17e.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17e.c
new file mode 100644
index 00000000000..2c1d8a659bd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17e.c
@@ -0,0 +1,14 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE double
+#include "vect-simd-clone-17.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 19 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
new file mode 100644
index 00000000000..582e690304f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE __INT64_TYPE__
+#include "vect-simd-clone-17.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=int64.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 18 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18.c
new file mode 100644
index 00000000000..750a3f92b62
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18.c
@@ -0,0 +1,89 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+/* Test that simd inbranch clones work correctly.  */
+
+#ifndef TYPE
+#define TYPE int
+#endif
+
+/* A simple function that will be cloned.  */
+#pragma omp declare simd uniform(b)
+TYPE __attribute__((noinline))
+foo (TYPE b, TYPE a)
+{
+  return a + b;
+}
+
+/* Check that "inbranch" clones are called correctly.  */
+
+void __attribute__((noinline))
+masked (TYPE * __restrict a, TYPE * __restrict b, int size)
+{
+  #pragma omp simd
+  for (int i = 0; i < size; i++)
+    b[i] = a[i]<1 ? foo(1, a[i]) : a[i];
+}
+
+/* Check that "inbranch" works when there might be unrolling.  */
+
+void __attribute__((noinline))
+masked_fixed (TYPE * __restrict a, TYPE * __restrict b)
+{
+  #pragma omp simd
+  for (int i = 0; i < 128; i++)
+    b[i] = a[i]<1 ? foo(1, a[i]) : a[i];
+}
+
+/* Validate the outputs.  */
+
+void
+check_masked (TYPE *b, int size)
+{
+  for (int i = 0; i < size; i++)
+    if (((TYPE)i < 1 && b[i] != (TYPE)(i + 1))
+       || ((TYPE)i >= 1 && b[i] != (TYPE)i))
+      {
+       __builtin_printf ("error at %d\n", i);
+       __builtin_exit (1);
+      }
+}
+
+int
+main ()
+{
+  TYPE a[1024];
+  TYPE b[1024];
+
+  for (int i = 0; i < 1024; i++)
+    a[i] = i;
+
+  masked_fixed (a, b);
+  check_masked (b, 128);
+
+  /* Test various sizes to cover machines with different vectorization
+     factors.  */
+  for (int size = 8; size <= 1024; size *= 2)
+    {
+      masked (a, b, size);
+      check_masked (b, size);
+    }
+
+  /* Test sizes that might exercise the partial vector code-path.  */
+  for (int size = 8; size <= 1024; size *= 2)
+    {
+      masked (a, b, size-4);
+      check_masked (b, size-4);
+    }
+
+  return 0;
+}
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 18 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18b.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18b.c
new file mode 100644
index 00000000000..a77ccf3bfcc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18b.c
@@ -0,0 +1,14 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE float
+#include "vect-simd-clone-18.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 19 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18c.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18c.c
new file mode 100644
index 00000000000..bee5f338abe
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18c.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE short
+#include "vect-simd-clone-18.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=short.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 16 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18d.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18d.c
new file mode 100644
index 00000000000..a749edefdd7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18d.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE char
+#include "vect-simd-clone-18.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=char.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 16 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18e.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18e.c
new file mode 100644
index 00000000000..061e0dc2621
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18e.c
@@ -0,0 +1,14 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE double
+#include "vect-simd-clone-18.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 19 "optimized" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
new file mode 100644
index 00000000000..a3037f5809a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
@@ -0,0 +1,16 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+#define TYPE __INT64_TYPE__
+#include "vect-simd-clone-18.c"
+
+/* Ensure the in-branch simd clones are used on targets that support
+   them.  These counts include all calls and definitions.  */
+
+/* { dg-final { scan-tree-dump-times "simdclone" 6 "optimized" { target amdgcn-*-* } } } */
+/* TODO: aarch64 */
+
+/* Fails to use in-branch clones for TYPE=int64.  */
+/* { dg-skip-if "" { x86_64-*-* } { "-flto" } { "" } } */
+/* { dg-final { scan-tree-dump-times "simdclone" 18 "optimized" { target x86_64-*-* } } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 64b20b4a9e1..127ee873d25 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -123,6 +123,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-dse.h"
 #include "tree-vectorizer.h"
 #include "tree-eh.h"
+#include "cgraph.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -1063,7 +1064,8 @@ if_convertible_gimple_assign_stmt_p (gimple *stmt,
    A statement is if-convertible if:
    - it is an if-convertible GIMPLE_ASSIGN,
    - it is a GIMPLE_LABEL or a GIMPLE_COND,
-   - it is builtins call.  */
+   - it is builtins call.
+   - it is a call to a function with a SIMD clone.  */
 
 static bool
 if_convertible_stmt_p (gimple *stmt, vec<data_reference_p> refs)
@@ -1083,13 +1085,23 @@ if_convertible_stmt_p (gimple *stmt, vec<data_reference_p> refs)
        tree fndecl = gimple_call_fndecl (stmt);
        if (fndecl)
          {
+           /* We can vectorize some builtins and functions with SIMD
+              "inbranch" clones.  */
            int flags = gimple_call_flags (stmt);
+           struct cgraph_node *node = cgraph_node::get (fndecl);
            if ((flags & ECF_CONST)
                && !(flags & ECF_LOOPING_CONST_OR_PURE)
-               /* We can only vectorize some builtins at the moment,
-                  so restrict if-conversion to those.  */
                && fndecl_built_in_p (fndecl))
              return true;
+           else if (node && node->simd_clones != NULL)
+             /* Ensure that at least one clone can be "inbranch".  */
+             for (struct cgraph_node *n = node->simd_clones; n != NULL;
+                  n = n->simdclone->next_clone)
+               if (n->simdclone->inbranch)
+                 {
+                   need_to_predicate = true;
+                   return true;
+                 }
          }
        return false;
       }
@@ -2603,6 +2615,29 @@ predicate_statements (loop_p loop)
              gimple_assign_set_rhs1 (stmt, ifc_temp_var (type, rhs, &gsi));
              update_stmt (stmt);
            }
+
+         /* Convert functions that have a SIMD clone to IFN_MASK_CALL.  This
+            will cause the vectorizer to match the "in branch" clone variants,
+            and serves to build the mask vector in a natural way.  */
+         gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
+         if (call && !gimple_call_internal_p (call))
+           {
+             tree orig_fn = gimple_call_fn (call);
+             int orig_nargs = gimple_call_num_args (call);
+             auto_vec<tree> args;
+             args.safe_push (orig_fn);
+             for (int i=0; i < orig_nargs; i++)
+               args.safe_push (gimple_call_arg (call, i));
+             args.safe_push (cond);
+
+             /* Replace the call with an IFN_MASK_CALL that has the extra
+                condition parameter.  */
+             gcall *new_call = gimple_build_call_internal_vec (IFN_MASK_CALL,
+                                                               args);
+             gimple_call_set_lhs (new_call, gimple_call_lhs (call));
+             gsi_replace (&gsi, new_call, true);
+           }
+
          lhs = gimple_get_lhs (gsi_stmt (gsi));
          if (lhs && TREE_CODE (lhs) == SSA_NAME)
            ssa_names.add (lhs);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index aacbb12580c..60efd4d7525 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2121,6 +2121,15 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
            if (is_gimple_call (stmt) && loop->safelen)
              {
                tree fndecl = gimple_call_fndecl (stmt), op;
+               if (fndecl == NULL_TREE
+                   && gimple_call_internal_p (stmt)
+                   && gimple_call_internal_fn (stmt) == IFN_MASK_CALL)
+                 {
+                   fndecl = gimple_call_arg (stmt, 0);
+                   gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
+                   fndecl = TREE_OPERAND (fndecl, 0);
+                   gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
+                 }
                if (fndecl != NULL_TREE)
                  {
                    cgraph_node *node = cgraph_node::get (fndecl);
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 5485da58b38..0310c80c79d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3987,6 +3987,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   size_t i, nargs;
   tree lhs, rtype, ratype;
   vec<constructor_elt, va_gc> *ret_ctor_elts = NULL;
+  int arg_offset = 0;
 
   /* Is STMT a vectorizable call?   */
   gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
@@ -3994,6 +3995,16 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     return false;
 
   fndecl = gimple_call_fndecl (stmt);
+  if (fndecl == NULL_TREE
+      && gimple_call_internal_p (stmt)
+      && gimple_call_internal_fn (stmt) == IFN_MASK_CALL)
+    {
+      fndecl = gimple_call_arg (stmt, 0);
+      gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
+      fndecl = TREE_OPERAND (fndecl, 0);
+      gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
+      arg_offset = 1;
+    }
   if (fndecl == NULL_TREE)
     return false;
 
@@ -4024,7 +4035,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     return false;
 
   /* Process function arguments.  */
-  nargs = gimple_call_num_args (stmt);
+  nargs = gimple_call_num_args (stmt) - arg_offset;
 
   /* Bail out if the function has zero arguments.  */
   if (nargs == 0)
@@ -4042,7 +4053,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       thisarginfo.op = NULL_TREE;
       thisarginfo.simd_lane_linear = false;
 
-      op = gimple_call_arg (stmt, i);
+      op = gimple_call_arg (stmt, i + arg_offset);
       if (!vect_is_simple_use (op, vinfo, &thisarginfo.dt,
                               &thisarginfo.vectype)
          || thisarginfo.dt == vect_uninitialized_def)
@@ -4057,16 +4068,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
          || thisarginfo.dt == vect_external_def)
        gcc_assert (thisarginfo.vectype == NULL_TREE);
       else
-       {
-         gcc_assert (thisarginfo.vectype != NULL_TREE);
-         if (VECTOR_BOOLEAN_TYPE_P (thisarginfo.vectype))
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "vector mask arguments are not supported\n");
-             return false;
-           }
-       }
+       gcc_assert (thisarginfo.vectype != NULL_TREE);
 
       /* For linear arguments, the analyze phase should have saved
         the base and step in STMT_VINFO_SIMD_CLONE_INFO.  */
@@ -4159,9 +4161,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
        if (target_badness < 0)
          continue;
        this_badness += target_badness * 512;
-       /* FORNOW: Have to add code to add the mask argument.  */
-       if (n->simdclone->inbranch)
-         continue;
        for (i = 0; i < nargs; i++)
          {
            switch (n->simdclone->args[i].arg_type)
@@ -4169,7 +4168,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
              case SIMD_CLONE_ARG_TYPE_VECTOR:
                if (!useless_type_conversion_p
                        (n->simdclone->args[i].orig_type,
-                        TREE_TYPE (gimple_call_arg (stmt, i))))
+                        TREE_TYPE (gimple_call_arg (stmt, i + arg_offset))))
                  i = -1;
                else if (arginfo[i].dt == vect_constant_def
                         || arginfo[i].dt == vect_external_def
@@ -4199,7 +4198,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
                i = -1;
                break;
              case SIMD_CLONE_ARG_TYPE_MASK:
-               gcc_unreachable ();
+               break;
              }
            if (i == (size_t) -1)
              break;
@@ -4225,18 +4224,55 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
     return false;
 
   for (i = 0; i < nargs; i++)
-    if ((arginfo[i].dt == vect_constant_def
-        || arginfo[i].dt == vect_external_def)
-       && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
-      {
-       tree arg_type = TREE_TYPE (gimple_call_arg (stmt, i));
-       arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type,
-                                                         slp_node);
-       if (arginfo[i].vectype == NULL
-           || !constant_multiple_p (bestn->simdclone->simdlen,
-                                    simd_clone_subparts (arginfo[i].vectype)))
+    {
+      if ((arginfo[i].dt == vect_constant_def
+          || arginfo[i].dt == vect_external_def)
+         && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
+       {
+         tree arg_type = TREE_TYPE (gimple_call_arg (stmt, i + arg_offset));
+         arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type,
+                                                           slp_node);
+         if (arginfo[i].vectype == NULL
+             || !constant_multiple_p (bestn->simdclone->simdlen,
+                                      simd_clone_subparts (arginfo[i].vectype)))
+           return false;
+       }
+
+      if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR
+         && VECTOR_BOOLEAN_TYPE_P (bestn->simdclone->args[i].vector_type))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "vector mask arguments are not supported.\n");
          return false;
-      }
+       }
+
+      if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK
+         && bestn->simdclone->mask_mode == VOIDmode
+         && (simd_clone_subparts (bestn->simdclone->args[i].vector_type)
+             != simd_clone_subparts (arginfo[i].vectype)))
+       {
+         /* FORNOW we only have partial support for vector-type masks that
+            can't hold all of simdlen.  */
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                            vect_location,
+                            "in-branch vector clones are not yet"
+                            " supported for mismatched vector sizes.\n");
+         return false;
+       }
+      if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK
+         && bestn->simdclone->mask_mode != VOIDmode)
+       {
+         /* FORNOW don't support integer-type masks.  */
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                            vect_location,
+                            "in-branch vector clones are not yet"
+                            " supported for integer mask modes.\n");
+         return false;
+       }
+    }
 
   fndecl = bestn->decl;
   nunits = bestn->simdclone->simdlen;
@@ -4326,7 +4362,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
        {
          unsigned int k, l, m, o;
          tree atype;
-         op = gimple_call_arg (stmt, i);
+         op = gimple_call_arg (stmt, i + arg_offset);
          switch (bestn->simdclone->args[i].arg_type)
            {
            case SIMD_CLONE_ARG_TYPE_VECTOR:
@@ -4425,6 +4461,65 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
                    }
                }
              break;
+           case SIMD_CLONE_ARG_TYPE_MASK:
+             atype = bestn->simdclone->args[i].vector_type;
+             if (bestn->simdclone->mask_mode != VOIDmode)
+               {
+                 /* FORNOW: this is disabled above.  */
+                 gcc_unreachable ();
+               }
+             else
+               {
+                 tree elt_type = TREE_TYPE (atype);
+                 tree one = fold_convert (elt_type, integer_one_node);
+                 tree zero = fold_convert (elt_type, integer_zero_node);
+                 o = vector_unroll_factor (nunits,
+                                           simd_clone_subparts (atype));
+                 for (m = j * o; m < (j + 1) * o; m++)
+                   {
+                     if (simd_clone_subparts (atype)
+                         < simd_clone_subparts (arginfo[i].vectype))
+                       {
+                         /* The mask type has fewer elements than simdlen.  */
+
+                         /* FORNOW */
+                         gcc_unreachable ();
+                       }
+                     else if (simd_clone_subparts (atype)
+                              == simd_clone_subparts (arginfo[i].vectype))
+                       {
+                         /* The SIMD clone function has the same number of
+                            elements as the current function.  */
+                         if (m == 0)
+                           {
+                             vect_get_vec_defs_for_operand (vinfo, stmt_info,
+                                                            o * ncopies,
+                                                            op,
+                                                            &vec_oprnds[i]);
+                             vec_oprnds_i[i] = 0;
+                           }
+                         vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
+                         vec_oprnd0
+                           = build3 (VEC_COND_EXPR, atype, vec_oprnd0,
+                                     build_vector_from_val (atype, one),
+                                     build_vector_from_val (atype, zero));
+                         gassign *new_stmt
+                           = gimple_build_assign (make_ssa_name (atype),
+                                                  vec_oprnd0);
+                         vect_finish_stmt_generation (vinfo, stmt_info,
+                                                      new_stmt, gsi);
+                         vargs.safe_push (gimple_assign_lhs (new_stmt));
+                       }
+                     else
+                       {
+                         /* The mask type has more elements than simdlen.  */
+
+                         /* FORNOW */
+                         gcc_unreachable ();
+                       }
+                   }
+               }
+             break;
            case SIMD_CLONE_ARG_TYPE_UNIFORM:
              vargs.safe_push (op);
              break;

Reply via email to