On 09/27/13 09:23, Jakub Jelinek wrote:
On Thu, Sep 26, 2013 at 02:31:33PM -0500, Aldy Hernandez wrote:
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42806,6 +42806,43 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
return val;
}
+/* Return the default vector mangling ISA code when none is specified
+ in a `processor' clause. */
+
+static char
+ix86_cilkplus_default_vector_mangling_isa_code (struct cgraph_node *clone
+ ATTRIBUTE_UNUSED)
+{
+ return 'x';
+}
I think rth was suggesting using vecsize_mangle, vecsize_modifier or something
else,
instead of ISA, because it won't represent the ISA on all targets.
It is just some magic letter used in mangling of the simd functions.
I thought he was only talking about the local vecsize_mangle() function,
not the target hooks. Fair enough, I have changed all the ISA
references when they can be replaced with *mangle* or something similar.
+
+ /* To distinguish from an OpenMP simd clone, Cilk Plus functions to
+ be cloned have a distinctive artificial label in addition to "omp
+ declare simd". */
+ bool cilk_clone = flag_enable_cilkplus
+ && lookup_attribute ("cilk plus elemental",
+ DECL_ATTRIBUTES (new_node->symbol.decl));
Formatting. I'd say it should be
bool cilk_clone
= (flag_enable_cilkplus
&& lookup_attribute ("cilk plus elemental",
DECL_ATTRIBUTES (new_node->symbol.decl)));
+ if (cilk_clone)
+ remove_attribute ("cilk plus elemental",
+ DECL_ATTRIBUTES (new_node->symbol.decl));
I think it doesn't make sense to remove the attribute.
Done.
+ pretty_printer vars_pp;
Do you really need two different pretty printers?
Whoops, fixed. Nice catch.
Can't you just print "_ZGV%c%c%d into pp (is pp_printf
that cheap, wouldn't it be better to pp_string (&pp, "_ZGV"),
2 pp_character + one pp_decimal_int?), and then do the loop over
the args, which right now writes into vars_pp and finally
pp_underscore and pp_string the normally mangled name?
pp_printf() would be cheap. It's only used for a few cloned functions
in a compilation unit. I like printf. It's pretty and clean. Not
using it, is like saving sex for your old age ;-). But just to keep you
happy, I changed it...global maintainers are free to live their celibate
monk lives as they see fit :).
+/* Create a simd clone of OLD_NODE and return it. */
+
+static struct cgraph_node *
+simd_clone_create (struct cgraph_node *old_node)
+{
+ struct cgraph_node *new_node;
+ new_node = cgraph_function_versioning (old_node, vNULL, NULL, NULL, false,
+ NULL, NULL, "simdclone");
+
My understanding of how IPA cloning etc. works is that you first
set up various data structures describing how you change the arguments
and only then actually do cgraph_function_versioning which already during
the copying will do some of the transformations of the IL.
But perhaps those transformations are too complicated to describe for
tree-inline.c to make them for you.
Sure, we can worry about that when we're actually emitting the actual
clones (as discussed below), and when we start adapting the vectorizer.
+ tree attr = lookup_attribute ("omp declare simd",
+ DECL_ATTRIBUTES (node->symbol.decl));
+ if (!attr)
+ return;
+ do
+ {
+ struct cgraph_node *new_node = simd_clone_create (node);
+
+ bool inbranch_clause;
+ simd_clone_clauses_extract (new_node, TREE_VALUE (attr),
+ &inbranch_clause);
+ simd_clone_compute_isa_and_simdlen (new_node);
+ simd_clone_mangle (node, new_node);
As discussed on IRC, I was hoping that for OpenMP simd and selected
targets (e.g. i?86-linux and x86_64-linux) we could do better than that,
creating not just one or two clones as we do for Cilk+ where one can
select which CPU (and thus ISA) he wants to build the clones for, but
creating clones for all ISAs, and just based on command line options
either emit just one of them as the really optimized one and the others
just as thunks that would just call other simd clone functions or the
normal function possibly several times.
The thunk sounds like a good idea, long term. How about we start by
emitting all the variants up-front and then we can optimize these cases
later?
I was thinking, in the absence of a `simdlen' clause, we can provide a
target hook that returns a vector of (struct { int hw_vector_size; char
vecsize_mangle }) which would give us the different clone variants we
should generate. If the user provides `simdlen', we can continue
generating just one clone (or two with *inbranch) with the present
generic algorithm in simd_clone_compute_vecsize_and_simdlen().
Or do you have any other ideas? But I'd like to leave the thunking
business after we get the general infrastructure working.
Aldy
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 1a12eda..c3a70b6 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,44 @@
+2013-09-24 Aldy Hernandez <al...@redhat.com>
+
+ * Makefile.in (omp-low.o): Depend on PRETTY_PRINT_H.
+ * ipa-cp.c (determine_versionability): Nodes with SIMD clones are
+ not versionable.
+ * ggc.h (ggc_alloc_cleared_simd_clone_stat): New.
+ * cgraph.h (enum linear_stride_type): New.
+ (struct simd_clone_arg): New.
+ (struct simd_clone): New.
+ (struct cgraph_node): Add `simdclone' field.
+ Add `has_simd_clones' field.
+ * omp-low.c: Add new pass_omp_simd_clone support code.
+ (vecsize_mangle): New.
+ (ipa_omp_simd_clone): New.
+ (simd_clone_clauses_extract): New.
+ (simd_clone_compute_base_data_type): New.
+ (simd_clone_compute_vecsize_and_simdlen): New.
+ (simd_clone_create): New.
+ (simd_clone_mangle): New.
+ (simd_clone_struct_alloc): New.
+ (simd_clone_struct_copy): New.
+ (class argno_map): New.
+ (argno_map::argno_map(tree)): New.
+ (argno_map::~argno_map): New.
+ (argno_map::to_tree): New.
+ * tree.h (OMP_CLAUSE_LINEAR_VARIABLE_STRIDE): New.
+ * tree-core.h (OMP_CLAUSE_LINEAR_VARIABLE_STRIDE): Document.
+ * tree-pass.h (make_pass_omp_simd_clone): New.
+ * passes.def (pass_omp_simd_clone): New.
+ * target.def: Define new hook prefix "TARGET_CILKPLUS_".
+ (default_vecsize_mangle): New.
+ (vecsize_for_mangle): New.
+ * doc/tm.texi.in: Add placeholder for
+ TARGET_CILKPLUS_DEFAULT_VECSIZE_MANGLE and
+ TARGET_CILKPLUS_VECSIZE_FOR_MANGLE.
+ * doc/tm.texi: Regenerate.
+ * config/i386/i386.c (ix86_cilkplus_default_vecsize_mangle): New.
+ (ix86_cilkplus_vecsize_for_mangle): New.
+ (TARGET_CILKPLUS_DEFAULT_VECSIZE_MANGLE): New.
+ (TARGET_CILKPLUS_VECSIZE_FOR_MANGLE): New.
+
2013-09-19 Jakub Jelinek <ja...@redhat.com>
PR tree-optimization/58472
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index c006711..4fc7e48 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -2573,6 +2573,7 @@ omp-low.o : omp-low.c $(CONFIG_H) $(SYSTEM_H) coretypes.h
$(TM_H) $(TREE_H) \
$(RTL_H) $(GIMPLE_H) $(TREE_INLINE_H) langhooks.h $(DIAGNOSTIC_CORE_H) \
$(TREE_SSA_H) $(FLAGS_H) $(EXPR_H) $(DIAGNOSTIC_CORE_H) \
$(TREE_PASS_H) $(GGC_H) $(EXCEPT_H) $(SPLAY_TREE_H) $(OPTABS_H) \
+ $(PRETTY_PRINT_H) \
$(CFGLOOP_H) tree-iterator.h $(TARGET_H) gt-omp-low.h
tree-browser.o : tree-browser.c tree-browser.def $(CONFIG_H) $(SYSTEM_H) \
coretypes.h $(HASH_TABLE_H) $(TREE_H) $(TREE_PRETTY_PRINT_H)
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index 50e8743..0552805 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -248,6 +248,70 @@ struct GTY(()) cgraph_clone_info
bitmap combined_args_to_skip;
};
+/* How the stride of a linear SIMD clone argument is specified. See
+ the `linear_stride' field of `struct simd_clone_arg'. */
+enum linear_stride_type {
+ /* The argument is not linear. */
+ LINEAR_STRIDE_NO,
+ /* The argument is linear and its stride is a constant. */
+ LINEAR_STRIDE_YES_CONSTANT,
+ /* The argument is linear and its stride is held in another function
+ argument. */
+ LINEAR_STRIDE_YES_VARIABLE
+};
+
+/* Function arguments in the original function of a SIMD clone.
+ Supplementary data for `struct simd_clone'. */
+
+struct GTY(()) simd_clone_arg {
+ /* A SIMD clone's argument can be either linear (constant or
+ variable), uniform, or vector. If the argument is neither linear
+ nor uniform, the default is vector. */
+
+ /* If the linear stride is a constant, `linear_stride' is
+ LINEAR_STRIDE_YES_CONSTANT, and `linear_stride_num' holds
+ the numeric stride.
+
+ If the linear stride is variable, `linear_stride' is
+ LINEAR_STRIDE_YES_VARIABLE, and `linear_stride_num' contains
+ the function argument containing the stride (as an index into the
+ function arguments starting at 0).
+
+ Otherwise, `linear_stride' is LINEAR_STRIDE_NO and
+ `linear_stride_num' is unused. */
+ enum linear_stride_type linear_stride;
+ unsigned HOST_WIDE_INT linear_stride_num;
+
+ /* Variable alignment if available, otherwise 0. */
+ unsigned int alignment;
+
+ /* True if variable is uniform. */
+ unsigned int uniform : 1;
+};
+
+/* Specific data for a SIMD function clone. */
+
+struct GTY(()) simd_clone {
+ /* Number of SIMD lanes for this clone. It is taken from the
+ `simdlen' clause when one is given; otherwise it is computed as
+ the hardware vector size divided by the size of the
+ characteristic data type. */
+ unsigned int simdlen;
+
+ /* Number of annotated function arguments in `args'. This is
+ usually the number of named arguments in FNDECL. */
+ unsigned int nargs;
+
+ /* Max hardware vector size in bits. */
+ unsigned int hw_vector_size;
+
+ /* The mangling character for a given vector size. This is used
+ to determine the ISA mangling bit as specified in the Intel
+ Vector ABI. */
+ unsigned char vecsize_mangle;
+
+ /* True if this is the masked, in-branch version of the clone,
+ otherwise false. */
+ unsigned int inbranch : 1;
+
+ /* True if this is a Cilk Plus variant. */
+ unsigned int cilk_elemental : 1;
+
+ /* Annotated function arguments for the original function. Declared
+ with a single element; the GTY length annotation gives the real
+ element count as `nargs'. */
+ struct simd_clone_arg GTY((length ("%h.nargs"))) args[1];
+};
+
/* The cgraph data structure.
Each function decl has assigned cgraph_node listing callees and callers. */
@@ -282,6 +346,10 @@ struct GTY(()) cgraph_node {
/* Declaration node used to be clone of. */
tree former_clone_of;
+ /* If this is a SIMD clone, this points to the SIMD specific
+ information for it. */
+ struct simd_clone *simdclone;
+
/* Interprocedural passes scheduled to have their transform functions
applied next time we execute local pass on them. We maintain it
per-function in order to allow IPA passes to introduce new functions. */
@@ -323,6 +391,8 @@ struct GTY(()) cgraph_node {
/* ?? We should be able to remove this. We have enough bits in
cgraph to calculate it. */
unsigned tm_clone : 1;
+ /* True if this function has SIMD clones. */
+ unsigned has_simd_clones : 1;
/* True if this decl is a dispatcher for function versions. */
unsigned dispatcher_function : 1;
};
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 46c37d8..726ee71 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42806,6 +42806,42 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
return val;
}
+/* Return the default mangling character when no vector size can be
+ determined from the `processor' clause. CLONE is not examined; the
+ answer is always 'x', the character that
+ ix86_cilkplus_vecsize_for_mangle maps to 128 bits. */
+
+static char
+ix86_cilkplus_default_vecsize_mangle (struct cgraph_node *clone
+ ATTRIBUTE_UNUSED)
+{
+ return 'x';
+}
+
+/* Return the hardware vector size (in bits) for a mangling
+ character. MANGLE must be one of 'x', 'y', 'Y' or 'z'; any other
+ character reaches gcc_unreachable. */
+
+static unsigned int
+ix86_cilkplus_vecsize_for_mangle (char mangle)
+{
+ /* ?? Intel currently has no ISA encoding character for AVX-512. */
+ switch (mangle)
+ {
+ case 'x':
+ /* xmm (SSE2). */
+ return 128;
+ case 'y':
+ /* ymm1 (AVX1). Deliberately shares the 256-bit answer with 'Y'
+ below. */
+ case 'Y':
+ /* ymm2 (AVX2). */
+ return 256;
+ case 'z':
+ /* zmm (MIC). */
+ return 512;
+ default:
+ gcc_unreachable ();
+ /* NOTE(review): presumably only here to give every path a return
+ value; not expected to execute. */
+ return 0;
+ }
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
@@ -43178,6 +43214,14 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
+#undef TARGET_CILKPLUS_DEFAULT_VECSIZE_MANGLE
+#define TARGET_CILKPLUS_DEFAULT_VECSIZE_MANGLE \
+ ix86_cilkplus_default_vecsize_mangle
+
+#undef TARGET_CILKPLUS_VECSIZE_FOR_MANGLE
+#define TARGET_CILKPLUS_VECSIZE_FOR_MANGLE \
+ ix86_cilkplus_vecsize_for_mangle
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-i386.h"
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 8d220f3..8bb9d1e 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5787,6 +5787,26 @@ The default is @code{NULL_TREE} which means to not
vectorize gather
loads.
@end deftypefn
+@deftypefn {Target Hook} char TARGET_CILKPLUS_DEFAULT_VECSIZE_MANGLE (struct
cgraph_node *@var{})
+This hook should return the default mangling character when no vector
+size can be determined by examining the Cilk Plus @code{processor} clause.
+This is as specified in the Intel Vector ABI document.
+
+This hook, as well as @code{max_vector_size_for_isa} below must be set
+to support the Cilk Plus @code{processor} clause.
+
+The only argument is a @var{cgraph_node} containing the clone.
+@end deftypefn
+
+@deftypefn {Target Hook} {unsigned int} TARGET_CILKPLUS_VECSIZE_FOR_MANGLE
(char)
+This hook returns the maximum hardware vector size in bits for a given
+mangling character. The character is as described in Intel's
+Vector ABI (see @var{ISA} character in the section on mangling).
+
+This hook must be defined in order to support the Cilk Plus @code{processor}
+clause.
+@end deftypefn
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 863e843a..db25787 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4414,6 +4414,10 @@ address; but often a machine-dependent strategy can
generate better code.
@hook TARGET_VECTORIZE_BUILTIN_GATHER
+@hook TARGET_CILKPLUS_DEFAULT_VECSIZE_MANGLE
+
+@hook TARGET_CILKPLUS_VECSIZE_FOR_MANGLE
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
diff --git a/gcc/ggc.h b/gcc/ggc.h
index b31bc80..eee90c6 100644
--- a/gcc/ggc.h
+++ b/gcc/ggc.h
@@ -276,4 +276,11 @@ ggc_alloc_cleared_gimple_statement_d_stat (size_t s
MEM_STAT_DECL)
ggc_internal_cleared_alloc_stat (s PASS_MEM_STAT);
}
+/* Allocate S bytes through ggc_internal_cleared_alloc_stat and return
+ them typed as a `struct simd_clone'. S is a byte count, not an
+ element count, so the caller must include room for the trailing
+ `args' array. */
+static inline struct simd_clone *
+ggc_alloc_cleared_simd_clone_stat (size_t s MEM_STAT_DECL)
+{
+ return (struct simd_clone *)
+ ggc_internal_cleared_alloc_stat (s PASS_MEM_STAT);
+}
+
#endif
diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 56b27b2..a04ee90 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -446,6 +446,13 @@ determine_versionability (struct cgraph_node *node)
reason = "not a tree_versionable_function";
else if (cgraph_function_body_availability (node) <= AVAIL_OVERWRITABLE)
reason = "insufficient body availability";
+ else if (node->has_simd_clones)
+ {
+ /* Ideally we should clone the SIMD clones themselves and create
+ vector copies of them, so IPA-cp and SIMD clones can happily
+ coexist, but that may not be worth the effort. */
+ reason = "function has SIMD clones";
+ }
if (reason && dump_file && !node->symbol.alias && !node->thunk.thunk_p)
fprintf (dump_file, "Function %s/%i is not versionable, reason: %s.\n",
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 2d7898f..3eeafc3 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -43,6 +43,7 @@ along with GCC; see the file COPYING3. If not see
#include "optabs.h"
#include "cfgloop.h"
#include "target.h"
+#include "pretty-print.h"
/* Lowering of OpenMP parallel and workshare constructs proceeds in two
@@ -10287,5 +10288,450 @@ make_pass_diagnose_omp_blocks (gcc::context *ctxt)
{
return new pass_diagnose_omp_blocks (ctxt);
}
+
+/* SIMD clone supporting code. */
+
+/* A map for function arguments.  This will map a zero-based integer
+   to the corresponding index into DECL_ARGUMENTS.  */
+class argno_map
+{
+  /* The entries of DECL_ARGUMENTS, in chain order.  */
+  vec<tree> tree_args;
+
+  /* Declared but not implemented by design.  The destructor releases
+     TREE_ARGS, so a default-constructed map would release storage
+     that was never created, and a copied map would release the same
+     storage twice.  The only valid constructor is the TREE version
+     below.  */
+  argno_map ();
+  argno_map (const argno_map &);
+  argno_map &operator= (const argno_map &);
+
+ public:
+  argno_map (tree fndecl);
+
+  ~argno_map () { tree_args.release (); }
+  tree to_tree (int n);
+};
+
+/* FNDECL is the function containing the arguments. Every entry of
+ DECL_ARGUMENTS (FNDECL) is recorded, in chain order, so that it can
+ later be looked up by position. */
+
+argno_map::argno_map (tree fndecl)
+{
+ /* Five is only the initial capacity; entries are added with
+ safe_push, which presumably grows the vector as needed. */
+ tree_args.create (5);
+ for (tree t = DECL_ARGUMENTS (fndecl); t; t = DECL_CHAIN (t))
+ tree_args.safe_push (t);
+}
+
+/* Return the DECL corresponding to the zero-based integer index N
+ into the function arguments. N is used directly as an index into
+ the recorded arguments; this function does no range check of its
+ own. */
+
+tree
+argno_map::to_tree (int n)
+{
+ return tree_args[n];
+}
+
+/* Allocate a fresh `simd_clone' and return it.  NARGS is the number
+   of arguments to reserve space for.  The returned structure comes
+   from the cleared allocator; `nargs' is left for the caller to
+   set.  */
+
+static struct simd_clone *
+simd_clone_struct_alloc (int nargs)
+{
+  /* `struct simd_clone' already declares one element of `args', so
+     this reserves one slot more than NARGS.  simd_clone_struct_copy
+     copies the same number of bytes, so keep the two in sync.  */
+  size_t len = (sizeof (struct simd_clone)
+                + nargs * sizeof (struct simd_clone_arg));
+
+  /* This function has no MEM_STAT_DECL parameters, so the call site
+     information has to be supplied with MEM_STAT_INFO; PASS_MEM_STAT
+     would refer to parameters that do not exist here.  */
+  return ggc_alloc_cleared_simd_clone_stat (len MEM_STAT_INFO);
+}
+
+/* Make a copy of the `struct simd_clone' in FROM to TO. The number
+ of bytes copied is the size of the structure plus FROM->nargs
+ argument records, which matches what simd_clone_struct_alloc
+ reserves for that many arguments; TO must be at least that large. */
+
+static inline void
+simd_clone_struct_copy (struct simd_clone *to, struct simd_clone *from)
+{
+ memcpy (to, from, sizeof (struct simd_clone)
+ + from->nargs * sizeof (struct simd_clone_arg));
+}
+
+/* Given a simd clone in NEW_NODE, extract the simd specific
+   information from the OMP clauses passed in CLAUSES, and set the
+   relevant bits in the cgraph node.  *INBRANCH_SPECIFIED is set to
+   TRUE if the `inbranch' or `notinbranch' clause specified, otherwise
+   set to FALSE.
+
+   A fresh `struct simd_clone' is always allocated and attached to
+   NEW_NODE->simdclone, even when CLAUSES carries no clauses;
+   NEW_NODE must not already have one.
+
+   For the LINEAR, UNIFORM and ALIGNED clauses, OMP_CLAUSE_DECL is
+   read as an INTEGER_CST holding the zero-based argument number,
+   not as a declaration.  */
+
+static void
+simd_clone_clauses_extract (struct cgraph_node *new_node, tree clauses,
+                            bool *inbranch_specified)
+{
+  tree t;
+  int n = 0;
+  *inbranch_specified = false;
+
+  /* Count the arguments so that one record per argument can be
+     reserved.  */
+  for (t = DECL_ARGUMENTS (new_node->symbol.decl); t; t = DECL_CHAIN (t))
+    ++n;
+
+  /* To distinguish from an OpenMP simd clone, Cilk Plus functions to
+     be cloned have a distinctive artificial label in addition to "omp
+     declare simd".  */
+  bool cilk_clone
+    = (flag_enable_cilkplus
+       && lookup_attribute ("cilk plus elemental",
+                            DECL_ATTRIBUTES (new_node->symbol.decl)));
+
+  struct simd_clone *clone_info = simd_clone_struct_alloc (n);
+  clone_info->nargs = n;
+  clone_info->cilk_elemental = cilk_clone;
+  gcc_assert (!new_node->simdclone);
+  new_node->simdclone = clone_info;
+
+  if (!clauses || TREE_CODE (clauses) != OMP_CLAUSE)
+    return;
+
+  for (t = clauses; t; t = OMP_CLAUSE_CHAIN (t))
+    {
+      switch (OMP_CLAUSE_CODE (t))
+        {
+        case OMP_CLAUSE_INBRANCH:
+          clone_info->inbranch = 1;
+          *inbranch_specified = true;
+          break;
+        case OMP_CLAUSE_NOTINBRANCH:
+          clone_info->inbranch = 0;
+          *inbranch_specified = true;
+          break;
+        case OMP_CLAUSE_SIMDLEN:
+          clone_info->simdlen
+            = TREE_INT_CST_LOW (OMP_CLAUSE_SIMDLEN_EXPR (t));
+          break;
+        case OMP_CLAUSE_LINEAR:
+          {
+            tree decl = OMP_CLAUSE_DECL (t);
+            tree step = OMP_CLAUSE_LINEAR_STEP (t);
+            int argno = TREE_INT_CST_LOW (decl);
+            if (OMP_CLAUSE_LINEAR_VARIABLE_STRIDE (t))
+              {
+                /* STEP is the number of the argument holding the
+                   stride; check it fits before using its low part.  */
+                gcc_assert (!TREE_INT_CST_HIGH (step));
+                clone_info->args[argno].linear_stride
+                  = LINEAR_STRIDE_YES_VARIABLE;
+                clone_info->args[argno].linear_stride_num
+                  = TREE_INT_CST_LOW (step);
+              }
+            else if (TREE_INT_CST_HIGH (step))
+              {
+                /* It looks like this can't really happen, since the
+                   front-ends generally issue:
+
+                   warning: integer constant is too large for its type.
+
+                   But let's assume somehow we got past all that.
+
+                   DECL is an argument number here, not a declaration,
+                   so it has no source location of its own; report at
+                   the clause instead.  */
+                warning_at (OMP_CLAUSE_LOCATION (t), 0,
+                            "ignoring large linear step");
+              }
+            else
+              {
+                clone_info->args[argno].linear_stride
+                  = LINEAR_STRIDE_YES_CONSTANT;
+                clone_info->args[argno].linear_stride_num
+                  = TREE_INT_CST_LOW (step);
+              }
+            break;
+          }
+        case OMP_CLAUSE_UNIFORM:
+          {
+            tree decl = OMP_CLAUSE_DECL (t);
+            int argno = tree_low_cst (decl, 1);
+            clone_info->args[argno].uniform = 1;
+            break;
+          }
+        case OMP_CLAUSE_ALIGNED:
+          {
+            tree decl = OMP_CLAUSE_DECL (t);
+            int argno = tree_low_cst (decl, 1);
+            clone_info->args[argno].alignment
+              = TREE_INT_CST_LOW (OMP_CLAUSE_ALIGNED_ALIGNMENT (t));
+            break;
+          }
+        default:
+          break;
+        }
+    }
+}
+
+/* Map a vector size in bits, VECSIZE, to the character used for it
+   when mangling the name of a SIMD clone.  */
+
+static char
+vecsize_mangle (unsigned int vecsize)
+{
+  /* The Intel Vector ABI does not provide a mangling character
+     for a 64-bit ISA, but this feels like it's keeping with the
+     design.  */
+  if (vecsize == 64)
+    return 'w';
+
+  if (vecsize == 128)
+    return 'x';
+  if (vecsize == 256)
+    return 'y';
+  if (vecsize == 512)
+    return 'z';
+
+  /* FIXME: We must come up with a default mangling bit.  */
+  return 'x';
+}
+
+/* Given a SIMD clone in NEW_NODE, calculate the characteristic data
+ type and return the corresponding type. The characteristic data
+ type is computed as described in the Intel Vector ABI.
+ NEW_NODE->simdclone must already be filled in, since its argument
+ records are consulted. */
+
+static tree
+simd_clone_compute_base_data_type (struct cgraph_node *new_node)
+{
+ /* Start from `int', which is the answer whenever no rule below
+ selects something else. */
+ tree type = integer_type_node;
+ tree fndecl = new_node->symbol.decl;
+
+ /* a) For non-void function, the characteristic data type is the
+ return type. */
+ if (TREE_CODE (TREE_TYPE (TREE_TYPE (fndecl))) != VOID_TYPE)
+ type = TREE_TYPE (TREE_TYPE (fndecl));
+
+ /* b) If the function has any non-uniform, non-linear parameters,
+ then the characteristic data type is the type of the first
+ such parameter. Note that this is the `else' arm of a), so
+ parameters are only looked at for functions returning void. */
+ else
+ {
+ argno_map map (fndecl);
+ for (unsigned int i = 0; i < new_node->simdclone->nargs; ++i)
+ {
+ struct simd_clone_arg arg = new_node->simdclone->args[i];
+ if (!arg.uniform && arg.linear_stride == LINEAR_STRIDE_NO)
+ {
+ type = TREE_TYPE (map.to_tree (i));
+ break;
+ }
+ }
+ }
+
+ /* c) If the characteristic data type determined by a) or b) above
+ is struct, union, or class type which is pass-by-value (except
+ for the type that maps to the built-in complex data type), the
+ characteristic data type is int.
+ NOTE(review): the COMPLEX_TYPE test looks redundant if
+ RECORD_OR_UNION_TYPE_P only accepts record and union codes --
+ confirm. */
+ if (RECORD_OR_UNION_TYPE_P (type)
+ && !aggregate_value_p (type, NULL)
+ && TREE_CODE (type) != COMPLEX_TYPE)
+ return integer_type_node;
+
+ /* d) If none of the above three classes is applicable, the
+ characteristic data type is int. This is what the initial value
+ of TYPE provides. */
+
+ return type;
+
+ /* e) For Intel Xeon Phi native and offload compilation, if the
+ resulting characteristic data type is 8-bit or 16-bit integer
+ data type, the characteristic data type is int. */
+ /* Well, we don't handle Xeon Phi yet. */
+}
+
+/* Given a SIMD clone in NEW_NODE, compute simdlen and vector size,
+   and store them in NEW_NODE->simdclone.  The mangling character is
+   stored as well.  A simdlen that is already nonzero (one taken from
+   a `simdlen' clause) is left alone.  */
+
+static void
+simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *new_node)
+{
+  struct simd_clone *clone_info = new_node->simdclone;
+  char vmangle = clone_info->vecsize_mangle;
+  /* Vector size for this clone.  */
+  unsigned int vecsize = 0;
+  /* Base vector type, based on function arguments.  */
+  tree base_type = simd_clone_compute_base_data_type (new_node);
+  unsigned int base_type_size = GET_MODE_BITSIZE (TYPE_MODE (base_type));
+
+  /* Calculate everything for Cilk Plus clones with appropriate target
+     support.  This is as specified in the Intel Vector ABI.
+
+     Note: Any target which supports the Cilk Plus processor clause
+     must also provide appropriate target hooks for calculating
+     default ISA/processor (default_vecsize_mangle), and for
+     calculating hardware vector size based on ISA/processor
+     (vecsize_for_mangle).  Both hooks default to NULL, so both are
+     checked before either is called; a target providing only one of
+     them gets the generic calculation.  */
+  if (clone_info->cilk_elemental
+      && targetm.cilkplus.default_vecsize_mangle
+      && targetm.cilkplus.vecsize_for_mangle)
+    {
+      if (!vmangle)
+        vmangle = targetm.cilkplus.default_vecsize_mangle (new_node);
+      vecsize = targetm.cilkplus.vecsize_for_mangle (vmangle);
+    }
+  /* Calculate everything else generically.  */
+  else
+    {
+      vecsize = GET_MODE_BITSIZE (targetm.vectorize.preferred_simd_mode
+                                  (TYPE_MODE (base_type)));
+      vmangle = vecsize_mangle (vecsize);
+    }
+
+  /* NOTE(review): BASE_TYPE_SIZE is used as a divisor without being
+     checked.  Confirm that the characteristic type can never have a
+     mode of zero size here.  */
+  if (!clone_info->simdlen)
+    clone_info->simdlen = vecsize / base_type_size;
+
+  clone_info->vecsize_mangle = vmangle;
+  clone_info->hw_vector_size = vecsize;
+}
+
+/* Give the SIMD clone NEW_NODE its mangled assembler name. The name
+ is built as
+
+ "_ZGV" <vecsize char> <mask> <simdlen> <args> "_" <name>
+
+ where <mask> is 'M' for the in-branch (masked) clone and 'N'
+ otherwise, <name> is the assembler name of OLD_NODE, and <args> has
+ one entry per argument of the clone:
+
+ 'u' uniform
+ 'l' [stride] linear, constant stride (number omitted for 1)
+ 's' <argno> linear, stride held in argument <argno>
+ 'v' vector (anything else)
+
+ each optionally followed by 'a' <alignment> when an alignment was
+ recorded. The vecsize character and simdlen must already have been
+ computed in NEW_NODE->simdclone. */
+static void
+simd_clone_mangle (struct cgraph_node *old_node, struct cgraph_node *new_node)
+{
+ char vecsize_mangle = new_node->simdclone->vecsize_mangle;
+ char mask = new_node->simdclone->inbranch ? 'M' : 'N';
+ unsigned int simdlen = new_node->simdclone->simdlen;
+ unsigned int n;
+ pretty_printer pp;
+
+ gcc_assert (vecsize_mangle && simdlen);
+
+ pp_string (&pp, "_ZGV");
+ pp_character (&pp, vecsize_mangle);
+ pp_character (&pp, mask);
+ pp_decimal_int (&pp, simdlen);
+
+ for (n = 0; n < new_node->simdclone->nargs; ++n)
+ {
+ struct simd_clone_arg arg = new_node->simdclone->args[n];
+
+ /* Uniform is tested first, so an argument marked both uniform and
+ linear is mangled as uniform. */
+ if (arg.uniform)
+ pp_character (&pp, 'u');
+ else if (arg.linear_stride == LINEAR_STRIDE_YES_CONSTANT)
+ {
+ gcc_assert (arg.linear_stride_num != 0);
+ pp_character (&pp, 'l');
+ /* A stride of 1 is implied by a bare 'l'. */
+ if (arg.linear_stride_num > 1)
+ pp_unsigned_wide_integer (&pp,
+ arg.linear_stride_num);
+ }
+ else if (arg.linear_stride == LINEAR_STRIDE_YES_VARIABLE)
+ {
+ pp_character (&pp, 's');
+ pp_unsigned_wide_integer (&pp, arg.linear_stride_num);
+ }
+ else
+ pp_character (&pp, 'v');
+ if (arg.alignment)
+ {
+ pp_character (&pp, 'a');
+ pp_decimal_int (&pp, arg.alignment);
+ }
+ }
+
+ pp_underscore (&pp);
+ pp_string (&pp,
+ IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (old_node->symbol.decl)));
+ const char *str = pp_formatted_text (&pp);
+ change_decl_assembler_name (new_node->symbol.decl,
+ get_identifier (str));
+}
+
+/* Create a simd clone of OLD_NODE and return it, or return NULL if
+   no clone could be created.  On success OLD_NODE is marked as
+   having SIMD clones and the "omp declare simd" attribute is removed
+   from the clone.  */
+
+static struct cgraph_node *
+simd_clone_create (struct cgraph_node *old_node)
+{
+  struct cgraph_node *new_node
+    = cgraph_function_versioning (old_node, vNULL, NULL, NULL, false,
+                                  NULL, NULL, "simdclone");
+
+  /* Versioning can be refused; leave OLD_NODE untouched in that case
+     and let the caller decide what to do.  */
+  if (!new_node)
+    return NULL;
+
+  /* Keep cgraph friends from removing the clone.  */
+  new_node->symbol.externally_visible
+    = old_node->symbol.externally_visible;
+  TREE_PUBLIC (new_node->symbol.decl) = TREE_PUBLIC (old_node->symbol.decl);
+  old_node->has_simd_clones = true;
+
+  DECL_ATTRIBUTES (new_node->symbol.decl)
+    = remove_attribute ("omp declare simd",
+                        DECL_ATTRIBUTES (new_node->symbol.decl));
+
+  return new_node;
+}
+
+/* If the function in NODE is tagged as an elemental SIMD function,
+   create the appropriate SIMD clones.  One clone is created per
+   "omp declare simd" attribute, plus a masked twin when the attribute
+   carries neither `inbranch' nor `notinbranch'.  Clones that cannot
+   be created are silently skipped.  */
+
+static void
+expand_simd_clones (struct cgraph_node *node)
+{
+  if (cgraph_function_body_availability (node) < AVAIL_OVERWRITABLE)
+    return;
+
+  tree attr = lookup_attribute ("omp declare simd",
+                                DECL_ATTRIBUTES (node->symbol.decl));
+  if (!attr)
+    return;
+  do
+    {
+      struct cgraph_node *new_node = simd_clone_create (node);
+
+      /* `continue' in a do-while jumps to the loop condition, so the
+         next attribute is still looked up.  */
+      if (!new_node)
+        continue;
+
+      bool inbranch_clause;
+      simd_clone_clauses_extract (new_node, TREE_VALUE (attr),
+                                  &inbranch_clause);
+      simd_clone_compute_vecsize_and_simdlen (new_node);
+      simd_clone_mangle (node, new_node);
+
+      // FIXME: Adjust clone parameters to their appropriate vector types.
+
+      /* If no inbranch clause was specified, we need both variants.
+         We have already created the not-in-branch version above, by
+         virtue of .inbranch being clear.  Create the masked in-branch
+         version.  */
+      if (!inbranch_clause)
+        {
+          struct cgraph_node *n = simd_clone_create (node);
+          if (n)
+            {
+              /* The twin shares every setting with the unmasked
+                 clone except the mask bit, so copy rather than
+                 re-extract.  */
+              struct simd_clone *clone
+                = simd_clone_struct_alloc (new_node->simdclone->nargs);
+              simd_clone_struct_copy (clone, new_node->simdclone);
+              clone->inbranch = 1;
+              n->simdclone = clone;
+              simd_clone_mangle (node, n);
+            }
+        }
+    }
+  while ((attr = lookup_attribute ("omp declare simd", TREE_CHAIN (attr))));
+}
+
+/* Entry point for IPA simd clone creation pass. Runs
+ expand_simd_clones on every defined function; functions without an
+ "omp declare simd" attribute are left alone by that routine.
+ Always returns 0. */
+
+static unsigned int
+ipa_omp_simd_clone (void)
+{
+ struct cgraph_node *node;
+ FOR_EACH_DEFINED_FUNCTION (node)
+ expand_simd_clones (node);
+ return 0;
+}
+
+namespace {
+
+/* Static description of the SIMD clone creation pass. */
+const pass_data pass_data_omp_simd_clone =
+{
+ SIMPLE_IPA_PASS, /* type */
+ "simdclone", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ true, /* has_gate */
+ true, /* has_execute */
+ TV_NONE, /* tv_id */
+ ( PROP_ssa | PROP_cfg ), /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+/* Simple IPA pass that creates SIMD clones for functions carrying an
+ "omp declare simd" attribute. */
+class pass_omp_simd_clone : public simple_ipa_opt_pass
+{
+public:
+ pass_omp_simd_clone(gcc::context *ctxt)
+ : simple_ipa_opt_pass(pass_data_omp_simd_clone, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ /* The pass only runs when OpenMP or Cilk Plus is enabled. */
+ bool gate () { return flag_openmp || flag_enable_cilkplus; }
+ unsigned int execute () { return ipa_omp_simd_clone (); }
+};
+
+} // anon namespace
+
+/* Return a newly allocated instance of the SIMD clone creation pass
+ for the context CTXT. */
+simple_ipa_opt_pass *
+make_pass_omp_simd_clone (gcc::context *ctxt)
+{
+ return new pass_omp_simd_clone (ctxt);
+}
#include "gt-omp-low.h"
diff --git a/gcc/passes.def b/gcc/passes.def
index 84eb3f3..6803399 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -97,6 +97,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_feedback_split_functions);
POP_INSERT_PASSES ()
NEXT_PASS (pass_ipa_increase_alignment);
+ NEXT_PASS (pass_omp_simd_clone);
NEXT_PASS (pass_ipa_tm);
NEXT_PASS (pass_ipa_lower_emutls);
TERMINATE_PASS_LIST ()
diff --git a/gcc/target.def b/gcc/target.def
index 6de513f..92cbd73 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1508,6 +1508,35 @@ hook_int_uint_mode_1)
HOOK_VECTOR_END (sched)
+/* Functions relating to Cilk Plus. */
+#undef HOOK_PREFIX
+#define HOOK_PREFIX "TARGET_CILKPLUS_"
+HOOK_VECTOR (TARGET_CILKPLUS, cilkplus)
+
+DEFHOOK
+(default_vecsize_mangle,
+"This hook should return the default mangling character when no vector\n\
+size can be determined by examining the Cilk Plus @code{processor} clause.\n\
+This is as specified in the Intel Vector ABI document.\n\
+\n\
+This hook, as well as @code{max_vector_size_for_isa} below must be set\n\
+to support the Cilk Plus @code{processor} clause.\n\
+\n\
+The only argument is a @var{cgraph_node} containing the clone.",
+char, (struct cgraph_node *), NULL)
+
+DEFHOOK
+(vecsize_for_mangle,
+"This hook returns the maximum hardware vector size in bits for a given\n\
+mangling character. The character is as described in Intel's\n\
+Vector ABI (see @var{ISA} character in the section on mangling).\n\
+\n\
+This hook must be defined in order to support the Cilk Plus @code{processor}\n\
+clause.",
+unsigned int, (char), NULL)
+
+HOOK_VECTOR_END (cilkplus)
+
/* Functions relating to vectorization. */
#undef HOOK_PREFIX
#define HOOK_PREFIX "TARGET_VECTORIZE_"
diff --git a/gcc/testsuite/gcc.dg/gomp/simd-clones-1.c
b/gcc/testsuite/gcc.dg/gomp/simd-clones-1.c
new file mode 100644
index 0000000..486b67a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/simd-clones-1.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-fopenmp -fdump-tree-optimized -O3" } */
+
+/* Test that functions that have SIMD clone counterparts are not
+ cloned by IPA-cp. For example, special_add() below has SIMD clones
+ created for it. However, if IPA-cp later decides to clone a
+ specialization of special_add(x, 666) when analyzing fillit(), we
+ will forever keep the vectorizer from using the SIMD versions of
+ special_add in a loop.
+
+ If IPA-CP gets taught how to adjust the SIMD clones as well, this
+ test could be removed. */
+
+#pragma omp declare simd simdlen(4)
+static int __attribute__ ((noinline))
+special_add (int x, int y)
+{
+ if (y == 666)
+ return x + y + 123;
+ else
+ return x + y;
+}
+
+void fillit(int *tot)
+{
+ int i;
+
+ for (i=0; i < 10000; ++i)
+ tot[i] = special_add (i, 666);
+}
+
+/* { dg-final { scan-tree-dump-not "special_add.constprop" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
b/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
new file mode 100644
index 0000000..8ab3131
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-fopenmp -fdump-tree-optimized -O -msse2" } */
+
+#pragma omp declare simd inbranch uniform(c) linear(b:66) // addit.simdclone.2
+#pragma omp declare simd notinbranch aligned(c:32) // addit.simdclone.1
+int addit(int a, int b, int c)
+{
+ return a + b;
+}
+
+#pragma omp declare simd uniform(a) aligned(a:32) linear(k:1) notinbranch
+float setArray(float *a, float x, int k)
+{
+ a[k] = a[k] + x;
+ return a[k];
+}
+
+/* { dg-final { scan-tree-dump "clone.0 \\(_ZGVxN4ua32vl_setArray" "optimized" } } */
+/* { dg-final { scan-tree-dump "clone.1 \\(_ZGVxN4vvva32_addit" "optimized" } } */
+/* { dg-final { scan-tree-dump "clone.2 \\(_ZGVxM4vl66u_addit" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/gomp/simd-clones-3.c
b/gcc/testsuite/gcc.dg/gomp/simd-clones-3.c
new file mode 100644
index 0000000..1ce9692
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/simd-clones-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-fopenmp -fdump-tree-optimized -O -msse2" } */
+
+/* Test that if there are no *inbranch clauses, both the masked and
+ the unmasked versions are created. */
+
+#pragma omp declare simd
+int addit(int a, int b, int c)
+{
+ return a + b;
+}
+
+/* { dg-final { scan-tree-dump "clone.* \\(_ZGVxN4vvv_addit" "optimized" } } */
+/* { dg-final { scan-tree-dump "clone.* \\(_ZGVxM4vvv_addit" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/tree-core.h b/gcc/tree-core.h
index 4a0d437..c436b72 100644
--- a/gcc/tree-core.h
+++ b/gcc/tree-core.h
@@ -885,6 +885,9 @@ struct GTY(()) tree_base {
CALL_ALLOCA_FOR_VAR_P in
CALL_EXPR
+ OMP_CLAUSE_LINEAR_VARIABLE_STRIDE in
+ OMP_CLAUSE_LINEAR
+
side_effects_flag:
TREE_SIDE_EFFECTS in
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index ea1a62f..718f259 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -474,6 +474,7 @@ extern ipa_opt_pass_d *make_pass_ipa_pure_const
(gcc::context *ctxt);
extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt);
extern ipa_opt_pass_d *make_pass_ipa_lto_finish_out (gcc::context *ctxt);
extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt);
+extern simple_ipa_opt_pass *make_pass_omp_simd_clone (gcc::context *ctxt);
extern ipa_opt_pass_d *make_pass_ipa_profile (gcc::context *ctxt);
extern ipa_opt_pass_d *make_pass_ipa_cdtor_merge (gcc::context *ctxt);
diff --git a/gcc/tree.h b/gcc/tree.h
index b13cb2b..3e9818c 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -1318,6 +1318,10 @@ extern void protected_set_expr_location (tree,
location_t);
#define OMP_CLAUSE_LINEAR_NO_COPYOUT(NODE) \
TREE_PRIVATE (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_LINEAR))
+/* True if a LINEAR clause has a stride that is variable. */
+#define OMP_CLAUSE_LINEAR_VARIABLE_STRIDE(NODE) \
+ TREE_PROTECTED (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_LINEAR))
+
#define OMP_CLAUSE_LINEAR_STEP(NODE) \
OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_LINEAR), 1)