This is the patch I committed. Bernd pointed out that I didn't need to be so
coy about the branches in the middle of blocks at that point of the compilation
anyway. So we remove a couple of unneeded insn patterns.
nathan
2015-07-09 Nathan Sidwell <nat...@codesourcery.com>
Infrastructure:
* gimple.h (gimple_call_internal_unique_p): Declare.
* gimple.c (gimple_call_same_target_p): Add check for
gimple_call_internal_unique_p.
* internal-fn.c (gimple_call_internal_unique_p): New.
* omp-low.h (OACC_LOOP_MASK): Define here...
* omp-low.c (OACC_LOOP_MASK): ... not here.
* tree-ssa-threadedge.c (record_temporary_equivalences_from_stmts):
Add check for gimple_call_internal_unique_p.
* tree-ssa-tail-merge.c (same_succ_def::equal): Add EQ check for
the gimple statements.
Additions:
* internal-fn.def (GOACC_FORK, GOACC_JOIN): New.
* internal-fn.c (gimple_call_internal_unique_p): Add check for
IFN_GOACC_FORK, IFN_GOACC_JOIN.
(expand_GOACC_FORK, expand_GOACC_JOIN): New.
* omp-low.c (gen_oacc_fork, gen_oacc_join): New.
(expand_omp_for_static_nochunk): Add oacc loop fork & join calls.
(expand_omp_for_static_chunk): Likewise.
* config/nvptx/nvptx-protos.h (nvptx_expand_oacc_fork,
nvptx_expand_oacc_join): Declare.
* config/nvptx/nvptx.md (UNSPEC_BIT_CONV, UNSPEC_BROADCAST,
UNSPEC_BR_UNIFIED): New unspecs.
(UNSPECV_FORK, UNSPECV_FORKED, UNSPECV_JOINING, UNSPECV_JOIN): New.
(BITS, BITD): New mode iterators.
(br_true_uni, br_false_uni): New unified branches.
(nvptx_fork, nvptx_forked, nvptx_joining, nvptx_join): New insns.
	(oacc_fork, oacc_join): New expanders.
(nvptx_broadcast<mode>): New insn.
(unpack<mode>si2, packsi<mode>2): New insns.
(worker_load<mode>, worker_store<mode>): New insns.
(nvptx_barsync): Renamed from ...
(threadbarrier_insn): ... here.
	* config/nvptx/nvptx.c: Include hash-map.h, dominance.h, cfg.h &
omp-low.h.
(worker_bcast_hwm, worker_bcast_align, worker_bcast_name,
worker_bcast_sym): New.
(nvptx_option_override): Initialize worker_bcast_sym.
(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): New.
(nvptx_gen_unpack, nvptx_gen_pack): New.
(struct wcast_data_t, propagate_mask): New types.
(nvptx_gen_vcast, nvptx_gen_wcast): New.
	(struct parallel): New struct.
(parallel::parallel, parallel::~parallel): Ctor & dtor.
(bb_insn_map_t): New map.
(insn_bb_t, insn_bb_vec_t): New tuple & vector of.
(nvptx_split_blocks, nvptx_discover_pre): New.
	(bb_par_t, bb_par_vec_t): New tuple & vector of.
	(nvptx_dump_pars, nvptx_discover_pars): New.
(nvptx_propagate): New.
	(vprop_gen, nvptx_vpropagate): New.
(wprop_gen, nvptx_wpropagate): New.
(nvptx_wsync): New.
(nvptx_single, nvptx_skip_par): New.
(nvptx_process_pars): New.
(nvptx_neuter_pars): New.
(nvptx_reorg): Add liveness DF problem. Call nvptx_split_blocks,
nvptx_discover_pars, nvptx_process_pars & nvptx_neuter_pars.
	(nvptx_cannot_copy_insn_p): Check for broadcast, sync, fork & join insns.
(nvptx_file_end): Output worker broadcast array definition.
Deletions:
* builtins.c (expand_oacc_thread_barrier): Delete.
(expand_oacc_thread_broadcast): Delete.
(expand_builtin): Adjust.
	* gimple.h (struct gimple_statement_omp_parallel_layout): Remove
broadcast_array member.
(gimple_omp_target_broadcast_array): Delete.
(gimple_omp_target_set_broadcast_array): Delete.
* omp-low.c (omp_region): Remove broadcast_array member.
(oacc_broadcast): Delete.
(build_oacc_threadbarrier): Delete.
(oacc_loop_needs_threadbarrier_p): Delete.
(oacc_alloc_broadcast_storage): Delete.
(find_omp_target_region): Remove call to
gimple_omp_target_broadcast_array.
(enclosing_target_region, required_predication_mask,
generate_vector_broadcast, generate_oacc_broadcast,
make_predication_test, predicate_bb, find_predicatable_bbs,
predicate_omp_regions): Delete.
(use, gen, live_in): Delete.
(populate_loop_live_in, oacc_populate_live_in_1,
oacc_populate_live_in, populate_loop_use, oacc_broadcast_1,
oacc_broadcast): Delete.
(execute_expand_omp): Remove predicate_omp_regions call.
(lower_omp_target): Remove oacc_alloc_broadcast_storage call.
Remove gimple_omp_target_set_broadcast_array call.
(make_gimple_omp_edges): Remove oacc_loop_needs_threadbarrier_p
check.
* tree-ssa-alias.c (ref_maybe_used_by_call_p_1): Remove
BUILT_IN_GOACC_THREADBARRIER.
* omp-builtins.def (BUILT_IN_GOACC_THREAD_BROADCAST,
BUILT_IN_GOACC_THREAD_BROADCAST_LL,
BUILT_IN_GOACC_THREADBARRIER): Delete.
* config/nvptx/nvptx.md (UNSPECV_WARPBCAST): Delete.
(br_true, br_false): Remove U format specifier.
(oacc_thread_broadcastsi, oacc_thread_broadcast_di): Delete.
(oacc_threadbarrier): Delete.
	* config/nvptx/nvptx.c (condition_unidirectional_p): Delete.
	(nvptx_print_operand): Remove 'U' specifier.
	(nvptx_reorg_subreg): Remove unidirectionality checking.
	(nvptx_cannot_copy_insn_p): Remove broadcast and barrier insns.
* config/nvptx/nvptx.h (machine_function): Remove
	warp_equal_pseudos.
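
To illustrate the new markers (this example is not part of the patch):
after these changes, a worker-partitioned OpenACC loop leaves
expand_omp_for_static_nochunk bracketed by the two internal functions,
along the lines of the sketch below.  The dump syntax, block numbers
and the numeric level argument are illustrative only:

	<bb 3>:
	  GOACC_FORK (1);	// IFN_GOACC_FORK; 1 assumed == OACC_worker
	  ... iteration-space partitioning and loop body ...

	<bb 7>:
	  GOACC_JOIN (1);	// IFN_GOACC_JOIN, pairing with the fork

Because gimple_call_internal_unique_p reports these calls as unique
markers, jump threading and tail merging will neither duplicate nor
merge them.  On nvptx the oacc_fork/oacc_join expanders turn them into
the fork/forked/joining/join unspec markers that reorg rebuilds the
parallel structure from; targets without those expanders emit no code
for them at all.
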
Index: internal-fn.c
===================================================================
--- internal-fn.c (revision 225323)
+++ internal-fn.c (working copy)
@@ -98,6 +98,20 @@ init_internal_fns ()
internal_fn_fnspec_array[IFN_LAST] = 0;
}
+/* Return true if this internal fn call is a unique marker -- it
+ should not be duplicated or merged. */
+
+bool
+gimple_call_internal_unique_p (const_gimple gs)
+{
+ switch (gimple_call_internal_fn (gs))
+ {
+ default: return false;
+ case IFN_GOACC_FORK: return true;
+ case IFN_GOACC_JOIN: return true;
+ }
+}
+
/* ARRAY_TYPE is an array of vector modes. Return the associated insn
for load-lanes-style optab OPTAB. The insn must exist. */
@@ -1990,6 +2004,26 @@ expand_GOACC_DATA_END_WITH_ARG (gcall *s
gcc_unreachable ();
}
+static void
+expand_GOACC_FORK (gcall *stmt)
+{
+ rtx mode = expand_normal (gimple_call_arg (stmt, 0));
+
+#ifdef HAVE_oacc_fork
+ emit_insn (gen_oacc_fork (mode));
+#endif
+}
+
+static void
+expand_GOACC_JOIN (gcall *stmt)
+{
+ rtx mode = expand_normal (gimple_call_arg (stmt, 0));
+
+#ifdef HAVE_oacc_join
+ emit_insn (gen_oacc_join (mode));
+#endif
+}
+
/* Routines to expand each internal function, indexed by function number.
Each routine has the prototype:
Index: tree-ssa-threadedge.c
===================================================================
--- tree-ssa-threadedge.c (revision 225323)
+++ tree-ssa-threadedge.c (working copy)
@@ -310,6 +310,17 @@ record_temporary_equivalences_from_stmts
&& gimple_asm_volatile_p (as_a <gasm *> (stmt)))
return NULL;
+  /* If the statement is a unique internal function, we cannot thread
+ through here. */
+ if (gimple_code (stmt) == GIMPLE_CALL)
+ {
+ gcall *call = as_a <gcall *> (stmt);
+
+ if (gimple_call_internal_p (call)
+ && gimple_call_internal_unique_p (call))
+ return NULL;
+ }
+
/* If duplicating this block is going to cause too much code
expansion, then do not thread through this block. */
stmt_count++;
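
For the record, the failure mode this check prevents, with an invented
CFG (not from the patch): jump threading duplicates the threaded-through
block into a predecessor, and doing that to a block holding a fork
marker would leave two forks paired with a single join, which
nvptx_discover_pars cannot represent:

	bb5:	GOACC_FORK (1);		<- threading would clone bb5,
		...			   yielding a second FORK
	bb9:	GOACC_JOIN (1);		<- ...but still only one JOIN

Returning NULL makes the threader treat such blocks like the volatile
asms just above: never thread through them.
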
Index: builtins.c
===================================================================
--- builtins.c (revision 225323)
+++ builtins.c (working copy)
@@ -5947,20 +5947,6 @@ expand_builtin_acc_on_device (tree exp A
#endif
}
-/* Expand a thread synchronization point for OpenACC threads. */
-static void
-expand_oacc_threadbarrier (void)
-{
-#ifdef HAVE_oacc_threadbarrier
- rtx insn = GEN_FCN (CODE_FOR_oacc_threadbarrier) ();
- if (insn != NULL_RTX)
- {
- emit_insn (insn);
- }
-#endif
-}
-
-
/* Expand a thread-id/thread-count builtin for OpenACC. */
static rtx
@@ -6032,47 +6018,6 @@ expand_oacc_ganglocal_ptr (rtx target AT
return NULL_RTX;
}
-/* Handle a GOACC_thread_broadcast builtin call EXP with target TARGET.
- Return the result. */
-
-static rtx
-expand_builtin_oacc_thread_broadcast (tree exp, rtx target)
-{
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- enum insn_code icode;
-
- enum machine_mode mode = TYPE_MODE (TREE_TYPE (arg0));
- gcc_assert (INTEGRAL_MODE_P (mode));
- do
- {
- icode = direct_optab_handler (oacc_thread_broadcast_optab, mode);
- mode = GET_MODE_WIDER_MODE (mode);
- }
- while (icode == CODE_FOR_nothing && mode != VOIDmode);
- if (icode == CODE_FOR_nothing)
- return expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
-
- rtx tmp = target;
- machine_mode mode0 = insn_data[icode].operand[0].mode;
- machine_mode mode1 = insn_data[icode].operand[1].mode;
- if (!tmp || !REG_P (tmp) || GET_MODE (tmp) != mode0)
- tmp = gen_reg_rtx (mode0);
- rtx op1 = expand_expr (arg0, NULL_RTX, mode1, EXPAND_NORMAL);
- if (GET_MODE (op1) != mode1)
- op1 = convert_to_mode (mode1, op1, 0);
-
- /* op1 might be an immediate, place it inside a register. */
- op1 = force_reg (mode1, op1);
-
- rtx insn = GEN_FCN (icode) (tmp, op1);
- if (insn != NULL_RTX)
- {
- emit_insn (insn);
- return tmp;
- }
- return const0_rtx;
-}
-
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
@@ -7225,14 +7170,6 @@ expand_builtin (tree exp, rtx target, rt
return target;
break;
- case BUILT_IN_GOACC_THREAD_BROADCAST:
- case BUILT_IN_GOACC_THREAD_BROADCAST_LL:
- return expand_builtin_oacc_thread_broadcast (exp, target);
-
- case BUILT_IN_GOACC_THREADBARRIER:
- expand_oacc_threadbarrier ();
- return const0_rtx;
-
default: /* just do library call, if unknown builtin */
break;
}
Index: tree-ssa-tail-merge.c
===================================================================
--- tree-ssa-tail-merge.c (revision 225323)
+++ tree-ssa-tail-merge.c (working copy)
@@ -608,10 +608,13 @@ same_succ_def::equal (const same_succ_de
{
s1 = gsi_stmt (gsi1);
s2 = gsi_stmt (gsi2);
- if (gimple_code (s1) != gimple_code (s2))
- return 0;
- if (is_gimple_call (s1) && !gimple_call_same_target_p (s1, s2))
- return 0;
+ if (s1 != s2)
+ {
+ if (gimple_code (s1) != gimple_code (s2))
+ return 0;
+ if (is_gimple_call (s1) && !gimple_call_same_target_p (s1, s2))
+ return 0;
+ }
gsi_next_nondebug (&gsi1);
gsi_next_nondebug (&gsi2);
gsi_advance_fw_nondebug_nonlocal (&gsi1);
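
The s1 != s2 shortcut deserves a note.  gimple_call_same_target_p now
returns false for a unique internal call even when both operands are
the very same statement, so without the shortcut a same_succ entry
containing such a call would never compare equal to itself, and so
could never be found again in the hash table.  A sketch (the level
argument is made up):

	tree arg = build_int_cst (unsigned_type_node, 1);
	gcall *c1 = gimple_build_call_internal (IFN_GOACC_FORK, 1, arg);
	gcall *c2 = gimple_build_call_internal (IFN_GOACC_FORK, 1, arg);

	gimple_call_same_target_p (c1, c2);  /* false: unique marker.  */
	gimple_call_same_target_p (c1, c1);  /* also false; hence the
						s1 != s2 guard above.  */
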
Index: config/nvptx/nvptx.h
===================================================================
--- config/nvptx/nvptx.h (revision 225323)
+++ config/nvptx/nvptx.h (working copy)
@@ -235,7 +235,6 @@ struct nvptx_pseudo_info
struct GTY(()) machine_function
{
rtx_expr_list *call_args;
- char *warp_equal_pseudos;
rtx start_call;
tree funtype;
bool has_call_with_varargs;
Index: config/nvptx/nvptx-protos.h
===================================================================
--- config/nvptx/nvptx-protos.h (revision 225323)
+++ config/nvptx/nvptx-protos.h (working copy)
@@ -32,6 +32,8 @@ extern void nvptx_register_pragmas (void
extern const char *nvptx_section_for_decl (const_tree);
#ifdef RTX_CODE
+extern void nvptx_expand_oacc_fork (rtx);
+extern void nvptx_expand_oacc_join (rtx);
extern void nvptx_expand_call (rtx, rtx);
extern rtx nvptx_expand_compare (rtx);
extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
Index: config/nvptx/nvptx.md
===================================================================
--- config/nvptx/nvptx.md (revision 225323)
+++ config/nvptx/nvptx.md (working copy)
@@ -52,15 +52,24 @@
UNSPEC_NID
UNSPEC_SHARED_DATA
+
+ UNSPEC_BIT_CONV
+
+ UNSPEC_BROADCAST
+ UNSPEC_BR_UNIFIED
])
(define_c_enum "unspecv" [
UNSPECV_LOCK
UNSPECV_CAS
UNSPECV_XCHG
- UNSPECV_WARP_BCAST
UNSPECV_BARSYNC
UNSPECV_ID
+
+ UNSPECV_FORK
+ UNSPECV_FORKED
+ UNSPECV_JOINING
+ UNSPECV_JOIN
])
(define_attr "subregs_ok" "false,true"
@@ -253,6 +262,8 @@
(define_mode_iterator QHSIM [QI HI SI])
(define_mode_iterator SDFM [SF DF])
(define_mode_iterator SDCM [SC DC])
+(define_mode_iterator BITS [SI SF])
+(define_mode_iterator BITD [DI DF])
;; This mode iterator allows :P to be used for patterns that operate on
;; pointer-sized quantities. Exactly one of the two alternatives will match.
@@ -813,7 +824,7 @@
(label_ref (match_operand 1 "" ""))
(pc)))]
""
- "%j0\\tbra%U0\\t%l1;")
+ "%j0\\tbra\\t%l1;")
(define_insn "br_false"
[(set (pc)
@@ -822,7 +833,24 @@
(label_ref (match_operand 1 "" ""))
(pc)))]
""
- "%J0\\tbra%U0\\t%l1;")
+ "%J0\\tbra\\t%l1;")
+
+;; unified conditional branch
+(define_insn "br_true_uni"
+ [(set (pc) (if_then_else
+ (ne (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
+ UNSPEC_BR_UNIFIED) (const_int 0))
+ (label_ref (match_operand 1 "" "")) (pc)))]
+ ""
+ "%j0\\tbra.uni\\t%l1;")
+
+(define_insn "br_false_uni"
+ [(set (pc) (if_then_else
+ (eq (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
+ UNSPEC_BR_UNIFIED) (const_int 0))
+ (label_ref (match_operand 1 "" "")) (pc)))]
+ ""
+ "%J0\\tbra.uni\\t%l1;")
(define_expand "cbranch<mode>4"
[(set (pc)
@@ -1326,37 +1354,92 @@
return asms[INTVAL (operands[1])];
})
-(define_insn "oacc_thread_broadcastsi"
- [(set (match_operand:SI 0 "nvptx_register_operand" "")
- (unspec_volatile:SI [(match_operand:SI 1 "nvptx_register_operand" "")]
- UNSPECV_WARP_BCAST))]
+(define_insn "nvptx_fork"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_FORK)]
""
- "%.\\tshfl.idx.b32\\t%0, %1, 0, 31;")
+ "// fork %0;"
+)
-(define_expand "oacc_thread_broadcastdi"
- [(set (match_operand:DI 0 "nvptx_register_operand" "")
- (unspec_volatile:DI [(match_operand:DI 1 "nvptx_register_operand" "")]
- UNSPECV_WARP_BCAST))]
- ""
-{
- rtx t = gen_reg_rtx (DImode);
- emit_insn (gen_lshrdi3 (t, operands[1], GEN_INT (32)));
- rtx op0 = force_reg (SImode, gen_lowpart (SImode, t));
- rtx op1 = force_reg (SImode, gen_lowpart (SImode, operands[1]));
- rtx targ0 = gen_reg_rtx (SImode);
- rtx targ1 = gen_reg_rtx (SImode);
- emit_insn (gen_oacc_thread_broadcastsi (targ0, op0));
- emit_insn (gen_oacc_thread_broadcastsi (targ1, op1));
- rtx t2 = gen_reg_rtx (DImode);
- rtx t3 = gen_reg_rtx (DImode);
- emit_insn (gen_extendsidi2 (t2, targ0));
- emit_insn (gen_extendsidi2 (t3, targ1));
- rtx t4 = gen_reg_rtx (DImode);
- emit_insn (gen_ashldi3 (t4, t2, GEN_INT (32)));
- emit_insn (gen_iordi3 (operands[0], t3, t4));
- DONE;
+(define_insn "nvptx_forked"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_FORKED)]
+ ""
+ "// forked %0;"
+)
+
+(define_insn "nvptx_joining"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_JOINING)]
+ ""
+ "// joining %0;"
+)
+
+(define_insn "nvptx_join"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_JOIN)]
+ ""
+ "// join %0;"
+)
+
+(define_expand "oacc_fork"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_FORKED)]
+ ""
+{
+ nvptx_expand_oacc_fork (operands[0]);
})
+(define_expand "oacc_join"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_JOIN)]
+ ""
+{
+ nvptx_expand_oacc_join (operands[0]);
+})
+
+;; only 32-bit shuffles exist.
+(define_insn "nvptx_broadcast<mode>"
+ [(set (match_operand:BITS 0 "nvptx_register_operand" "")
+ (unspec:BITS
+ [(match_operand:BITS 1 "nvptx_register_operand" "")]
+ UNSPEC_BROADCAST))]
+ ""
+ "%.\\tshfl.idx.b32\\t%0, %1, 0, 31;")
+
+;; Extract parts of a 64-bit object into 2 32-bit ints.
+(define_insn "unpack<mode>si2"
+ [(set (match_operand:SI 0 "nvptx_register_operand" "")
+ (unspec:SI [(match_operand:BITD 2 "nvptx_register_operand" "")
+ (const_int 0)] UNSPEC_BIT_CONV))
+ (set (match_operand:SI 1 "nvptx_register_operand" "")
+ (unspec:SI [(match_dup 2) (const_int 1)] UNSPEC_BIT_CONV))]
+ ""
+ "%.\\tmov.b64 {%0,%1}, %2;")
+
+;; Pack 2 32-bit ints into a 64-bit object.
+(define_insn "packsi<mode>2"
+ [(set (match_operand:BITD 0 "nvptx_register_operand" "")
+ (unspec:BITD [(match_operand:SI 1 "nvptx_register_operand" "")
+ (match_operand:SI 2 "nvptx_register_operand" "")]
+ UNSPEC_BIT_CONV))]
+ ""
+ "%.\\tmov.b64 %0, {%1,%2};")
+
+(define_insn "worker_load<mode>"
+ [(set (match_operand:SDISDFM 0 "nvptx_register_operand" "=R")
+ (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "m")]
+ UNSPEC_SHARED_DATA))]
+ ""
+ "%.\\tld.shared%u0\\t%0,%1;")
+
+(define_insn "worker_store<mode>"
+ [(set (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "=m")]
+ UNSPEC_SHARED_DATA)
+ (match_operand:SDISDFM 0 "nvptx_register_operand" "R"))]
+ ""
+ "%.\\tst.shared%u1\\t%1,%0;")
+
(define_insn "ganglocal_ptr<mode>"
[(set (match_operand:P 0 "nvptx_register_operand" "")
(unspec:P [(const_int 0)] UNSPEC_SHARED_DATA))]
@@ -1462,14 +1545,8 @@
"%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
;; ??? Mark as not predicable later?
-(define_insn "threadbarrier_insn"
- [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
-		    UNSPECV_BARSYNC)]
+(define_insn "nvptx_barsync"
+ [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_BARSYNC)]
""
"bar.sync\\t%0;")
-
-(define_expand "oacc_threadbarrier"
- [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
-		    UNSPECV_BARSYNC)]
- ""
-{
- operands[0] = const0_rtx;
-})
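
For reference, the PTX the new branch and broadcast patterns emit
(predicate, register and label names invented): a unified branch tells
the PTX JIT that all threads of a warp take the same path, and the
broadcast is the same lane-0 shuffle the deleted oacc_thread_broadcastsi
pattern used to emit:

	@%r30	bra.uni	$L5;			// br_true_uni
		shfl.idx.b32	%r23, %r23, 0, 31;	// nvptx_broadcastsi

Only 32-bit shuffles exist, so the DImode/DFmode broadcasts are built
in nvptx.c by unpacking into two SImode halves, shuffling each, and
packing the result back (see nvptx_gen_vcast below).
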
Index: config/nvptx/nvptx.c
===================================================================
--- config/nvptx/nvptx.c (revision 225323)
+++ config/nvptx/nvptx.c (working copy)
@@ -24,6 +24,7 @@
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
+#include "hash-map.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
@@ -74,6 +75,9 @@
#include "df.h"
#include "dumpfile.h"
#include "builtins.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "omp-low.h"
/* Record the function decls we've written, and the libfuncs and function
decls corresponding to them. */
@@ -97,6 +101,16 @@ static GTY((cache))
static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
+/* Size of buffer needed to broadcast across workers. This is used
+ for both worker-neutering and worker broadcasting. It is shared
+ by all functions emitted. The buffer is placed in shared memory.
+ It'd be nice if PTX supported common blocks, because then this
+ could be shared across TUs (taking the largest size). */
+static unsigned worker_bcast_hwm;
+static unsigned worker_bcast_align;
+#define worker_bcast_name "__worker_bcast"
+static GTY(()) rtx worker_bcast_sym;
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
@@ -124,6 +138,8 @@ nvptx_option_override (void)
needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
declared_libfuncs_htab
= hash_table<declared_libfunc_hasher>::create_ggc (17);
+
+ worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
}
/* Return the mode to be used when declaring a ptx object for OBJ.
@@ -1066,6 +1082,210 @@ nvptx_expand_compare (rtx compare)
return gen_rtx_NE (BImode, pred, const0_rtx);
}
+
+/* Expand the oacc fork & join primitives into ptx-required unspecs.  */
+
+void
+nvptx_expand_oacc_fork (rtx mode)
+{
+ /* Emit fork for worker level. */
+ if (UINTVAL (mode) == OACC_worker)
+ emit_insn (gen_nvptx_fork (mode));
+}
+
+void
+nvptx_expand_oacc_join (rtx mode)
+{
+ /* Emit joining for all pars. */
+ emit_insn (gen_nvptx_joining (mode));
+}
+
+/* Generate instruction(s) to unpack a 64-bit object into 2 32-bit
+   objects.  */
+
+static rtx
+nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
+{
+ rtx res;
+
+ switch (GET_MODE (src))
+ {
+ case DImode:
+ res = gen_unpackdisi2 (dst0, dst1, src);
+ break;
+ case DFmode:
+ res = gen_unpackdfsi2 (dst0, dst1, src);
+ break;
+ default: gcc_unreachable ();
+ }
+ return res;
+}
+
+/* Generate instruction(s) to pack 2 32-bit objects into a 64-bit
+   object.  */
+
+static rtx
+nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
+{
+ rtx res;
+
+ switch (GET_MODE (dst))
+ {
+ case DImode:
+ res = gen_packsidi2 (dst, src0, src1);
+ break;
+ case DFmode:
+ res = gen_packsidf2 (dst, src0, src1);
+ break;
+ default: gcc_unreachable ();
+ }
+ return res;
+}
+
+/* Generate an instruction or sequence to broadcast register REG
+ across the vectors of a single warp. */
+
+static rtx
+nvptx_gen_vcast (rtx reg)
+{
+ rtx res;
+
+ switch (GET_MODE (reg))
+ {
+ case SImode:
+ res = gen_nvptx_broadcastsi (reg, reg);
+ break;
+ case SFmode:
+ res = gen_nvptx_broadcastsf (reg, reg);
+ break;
+ case DImode:
+ case DFmode:
+ {
+ rtx tmp0 = gen_reg_rtx (SImode);
+ rtx tmp1 = gen_reg_rtx (SImode);
+
+ start_sequence ();
+ emit_insn (nvptx_gen_unpack (tmp0, tmp1, reg));
+ emit_insn (nvptx_gen_vcast (tmp0));
+ emit_insn (nvptx_gen_vcast (tmp1));
+ emit_insn (nvptx_gen_pack (reg, tmp0, tmp1));
+ res = get_insns ();
+ end_sequence ();
+ }
+ break;
+ case BImode:
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+
+ start_sequence ();
+ emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
+ emit_insn (nvptx_gen_vcast (tmp));
+ emit_insn (gen_rtx_SET (BImode, reg,
+ gen_rtx_NE (BImode, tmp, const0_rtx)));
+ res = get_insns ();
+ end_sequence ();
+ }
+ break;
+
+ case HImode:
+ case QImode:
+    default: debug_rtx (reg); gcc_unreachable ();
+ }
+ return res;
+}
+
+/* Structure used when generating a worker-level spill or fill. */
+
+struct wcast_data_t
+{
+ rtx base;
+ rtx ptr;
+ unsigned offset;
+};
+
+/* Direction of the spill/fill and looping setup/teardown indicator. */
+
+enum propagate_mask
+ {
+ PM_read = 1 << 0,
+ PM_write = 1 << 1,
+ PM_loop_begin = 1 << 2,
+ PM_loop_end = 1 << 3,
+
+ PM_read_write = PM_read | PM_write
+ };
+
+/* Generate instruction(s) to spill or fill register REG to/from the
+ worker broadcast array. PM indicates what is to be done, REP
+ how many loop iterations will be executed (0 for not a loop). */
+
+static rtx
+nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
+{
+ rtx res;
+ machine_mode mode = GET_MODE (reg);
+
+ switch (mode)
+ {
+ case BImode:
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+
+ start_sequence ();
+ if (pm & PM_read)
+ emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
+ emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+ if (pm & PM_write)
+ emit_insn (gen_rtx_SET (BImode, reg,
+ gen_rtx_NE (BImode, tmp, const0_rtx)));
+ res = get_insns ();
+ end_sequence ();
+ }
+ break;
+
+ default:
+ {
+ rtx addr = data->ptr;
+
+ if (!addr)
+ {
+ unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
+
+ if (align > worker_bcast_align)
+ worker_bcast_align = align;
+ data->offset = (data->offset + align - 1) & ~(align - 1);
+ addr = data->base;
+ if (data->offset)
+ addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
+ }
+
+ addr = gen_rtx_MEM (mode, addr);
+ addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
+ if (pm & PM_read)
+ res = gen_rtx_SET (mode, addr, reg);
+ if (pm & PM_write)
+ res = gen_rtx_SET (mode, reg, addr);
+
+ if (data->ptr)
+ {
+ /* We're using a ptr, increment it. */
+ start_sequence ();
+
+ emit_insn (res);
+ emit_insn (gen_adddi3 (data->ptr, data->ptr,
+ GEN_INT (GET_MODE_SIZE (GET_MODE (res)))));
+ res = get_insns ();
+ end_sequence ();
+ }
+ else
+ rep = 1;
+ data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
+ }
+ break;
+ }
+ return res;
+}
+
/* When loading an operand ORIG_OP, verify whether an address space
conversion to generic is required, and if so, perform it. Also
check for SYMBOL_REFs for function decls and call
@@ -1647,23 +1867,6 @@ nvptx_print_operand_address (FILE *file,
nvptx_print_address_operand (file, addr, VOIDmode);
}
-/* Return true if the value of COND is the same across all threads in a
- warp. */
-
-static bool
-condition_unidirectional_p (rtx cond)
-{
- if (CONSTANT_P (cond))
- return true;
- if (GET_CODE (cond) == REG)
- return cfun->machine->warp_equal_pseudos[REGNO (cond)];
- if (GET_RTX_CLASS (GET_CODE (cond)) == RTX_COMPARE
- || GET_RTX_CLASS (GET_CODE (cond)) == RTX_COMM_COMPARE)
- return (condition_unidirectional_p (XEXP (cond, 0))
- && condition_unidirectional_p (XEXP (cond, 1)));
- return false;
-}
-
/* Print an operand, X, to FILE, with an optional modifier in CODE.
Meaning of CODE:
@@ -1676,9 +1879,7 @@ condition_unidirectional_p (rtx cond)
f -- print a full reg even for something that must always be split
t -- print a type opcode suffix, promoting QImode to 32 bits
T -- print a type size in bits
- u -- print a type opcode suffix without promotions.
- U -- print ".uni" if a condition consists only of values equal across all
- threads in a warp. */
+ u -- print a type opcode suffix without promotions. */
static void
nvptx_print_operand (FILE *file, rtx x, int code)
@@ -1739,11 +1940,6 @@ nvptx_print_operand (FILE *file, rtx x,
fprintf (file, "@!");
goto common;
- case 'U':
- if (condition_unidirectional_p (x))
- fprintf (file, ".uni");
- break;
-
case 'c':
op_mode = GET_MODE (XEXP (x, 0));
switch (x_code)
@@ -1900,7 +2096,7 @@ get_replacement (struct reg_replace *r)
conversion copyin/copyout instructions. */
static void
-nvptx_reorg_subreg (int max_regs)
+nvptx_reorg_subreg ()
{
struct reg_replace qiregs, hiregs, siregs, diregs;
rtx_insn *insn, *next;
@@ -1914,11 +2110,6 @@ nvptx_reorg_subreg (int max_regs)
siregs.mode = SImode;
diregs.mode = DImode;
- cfun->machine->warp_equal_pseudos
- = ggc_cleared_vec_alloc<char> (max_regs);
-
- auto_vec<unsigned> warp_reg_worklist;
-
for (insn = get_insns (); insn; insn = next)
{
next = NEXT_INSN (insn);
@@ -1934,18 +2125,6 @@ nvptx_reorg_subreg (int max_regs)
diregs.n_in_use = 0;
extract_insn (insn);
- if (recog_memoized (insn) == CODE_FOR_oacc_thread_broadcastsi
- || (GET_CODE (PATTERN (insn)) == SET
- && CONSTANT_P (SET_SRC (PATTERN (insn)))))
- {
- rtx dest = recog_data.operand[0];
- if (REG_P (dest) && REG_N_SETS (REGNO (dest)) == 1)
- {
- cfun->machine->warp_equal_pseudos[REGNO (dest)] = true;
- warp_reg_worklist.safe_push (REGNO (dest));
- }
- }
-
enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
for (int i = 0; i < recog_data.n_operands; i++)
{
@@ -1999,71 +2178,745 @@ nvptx_reorg_subreg (int max_regs)
validate_change (insn, recog_data.operand_loc[i], new_reg, false);
}
}
+}
- while (!warp_reg_worklist.is_empty ())
+/* Loop structure of the function.  The entire function is described as
+ a NULL loop. We should be able to extend this to represent
+ superblocks. */
+
+#define OACC_null OACC_HWM
+
+struct parallel
+{
+ /* Parent parallel. */
+ parallel *parent;
+
+ /* Next sibling parallel. */
+ parallel *next;
+
+ /* First child parallel. */
+ parallel *inner;
+
+ /* Partitioning mode of the parallel. */
+ unsigned mode;
+
+ /* Partitioning used within inner parallels. */
+ unsigned inner_mask;
+
+ /* Location of parallel forked and join. The forked is the first
+     block in the parallel and the join is the first block after
+     the partition.  */
+ basic_block forked_block;
+ basic_block join_block;
+
+ rtx_insn *forked_insn;
+ rtx_insn *join_insn;
+
+ rtx_insn *fork_insn;
+ rtx_insn *joining_insn;
+
+ /* Basic blocks in this parallel, but not in child parallels. The
+ FORKED and JOINING blocks are in the partition. The FORK and JOIN
+ blocks are not. */
+ auto_vec<basic_block> blocks;
+
+public:
+ parallel (parallel *parent, unsigned mode);
+ ~parallel ();
+};
+
+/* Constructor links the new parallel into its parent's chain of
+ children. */
+
+parallel::parallel (parallel *parent_, unsigned mode_)
+ :parent (parent_), next (0), inner (0), mode (mode_), inner_mask (0)
+{
+ forked_block = join_block = 0;
+ forked_insn = join_insn = 0;
+ fork_insn = joining_insn = 0;
+
+ if (parent)
{
- int regno = warp_reg_worklist.pop ();
+ next = parent->inner;
+ parent->inner = this;
+ }
+}
+
+parallel::~parallel ()
+{
+ delete inner;
+ delete next;
+}
+
+/* Map of basic blocks to insns */
+typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
+
+/* A tuple of an insn of interest and the BB in which it resides. */
+typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
+typedef auto_vec<insn_bb_t> insn_bb_vec_t;
+
+/* Split basic blocks so that each forked and join unspec is at the
+   start of its basic block.  Thus, afterwards each block will have
+   a single partitioning mode.  We also do the same for return
+   insns, as they are executed by every thread.  Populate MAP with
+   head and tail blocks.  We also clear the BB visited flag, which
+   is used when finding partitions.  */
+
+static void
+nvptx_split_blocks (bb_insn_map_t *map)
+{
+ insn_bb_vec_t worklist;
+ basic_block block;
+ rtx_insn *insn;
+
+ /* Locate all the reorg instructions of interest. */
+ FOR_ALL_BB_FN (block, cfun)
+ {
+ bool seen_insn = false;
+
+      /* Clear visited flag, for use by the parallel locator.  */
+ block->flags &= ~BB_VISITED;
- df_ref use = DF_REG_USE_CHAIN (regno);
- for (; use; use = DF_REF_NEXT_REG (use))
+ FOR_BB_INSNS (block, insn)
{
- rtx_insn *insn;
- if (!DF_REF_INSN_INFO (use))
- continue;
- insn = DF_REF_INSN (use);
- if (DEBUG_INSN_P (insn))
+ if (!INSN_P (insn))
continue;
-
- /* The only insns we have to exclude are those which refer to
- memory. */
- rtx pat = PATTERN (insn);
- if (GET_CODE (pat) == SET
- && (MEM_P (SET_SRC (pat)) || MEM_P (SET_DEST (pat))))
- continue;
-
- df_ref insn_use;
- bool all_equal = true;
- FOR_EACH_INSN_USE (insn_use, insn)
+ switch (recog_memoized (insn))
{
- unsigned insn_regno = DF_REF_REGNO (insn_use);
- if (!cfun->machine->warp_equal_pseudos[insn_regno])
- {
- all_equal = false;
- break;
- }
+ default:
+ seen_insn = true;
+ continue;
+ case CODE_FOR_nvptx_forked:
+ case CODE_FOR_nvptx_join:
+ break;
+
+ case CODE_FOR_return:
+ /* We also need to split just before return insns, as
+ that insn needs executing by all threads, but the
+ block it is in probably does not. */
+ break;
}
- if (!all_equal)
- continue;
- df_ref insn_def;
- FOR_EACH_INSN_DEF (insn_def, insn)
+
+ if (seen_insn)
+ /* We've found an instruction that must be at the start of
+ a block, but isn't. Add it to the worklist. */
+ worklist.safe_push (insn_bb_t (insn, block));
+ else
+ /* It was already the first instruction. Just add it to
+ the map. */
+ map->get_or_insert (block) = insn;
+ seen_insn = true;
+ }
+ }
+
+ /* Split blocks on the worklist. */
+ unsigned ix;
+ insn_bb_t *elt;
+ basic_block remap = 0;
+ for (ix = 0; worklist.iterate (ix, &elt); ix++)
+ {
+ if (remap != elt->second)
+ {
+ block = elt->second;
+ remap = block;
+ }
+
+ /* Split block before insn. The insn is in the new block */
+ edge e = split_block (block, PREV_INSN (elt->first));
+
+ block = e->dest;
+ map->get_or_insert (block) = elt->first;
+ }
+}
+
+/* BLOCK is a basic block containing a head or tail instruction.
+ Locate the associated prehead or pretail instruction, which must be
+ in the single predecessor block. */
+
+static rtx_insn *
+nvptx_discover_pre (basic_block block, int expected)
+{
+ gcc_assert (block->preds->length () == 1);
+ basic_block pre_block = (*block->preds)[0]->src;
+ rtx_insn *pre_insn;
+
+ for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
+ pre_insn = PREV_INSN (pre_insn))
+ gcc_assert (pre_insn != BB_HEAD (pre_block));
+
+ gcc_assert (recog_memoized (pre_insn) == expected);
+ return pre_insn;
+}
+
+/* Dump this parallel and all its inner parallels. */
+
+static void
+nvptx_dump_pars (parallel *par, unsigned depth)
+{
+ fprintf (dump_file, "%u: mode %d head=%d, tail=%d\n",
+ depth, par->mode,
+ par->forked_block ? par->forked_block->index : -1,
+ par->join_block ? par->join_block->index : -1);
+
+ fprintf (dump_file, " blocks:");
+
+ basic_block block;
+ for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
+ fprintf (dump_file, " %d", block->index);
+ fprintf (dump_file, "\n");
+ if (par->inner)
+ nvptx_dump_pars (par->inner, depth + 1);
+
+ if (par->next)
+ nvptx_dump_pars (par->next, depth);
+}
+
+typedef std::pair<basic_block, parallel *> bb_par_t;
+typedef auto_vec<bb_par_t> bb_par_vec_t;
+
+/* Walk the CFG looking for fork & join markers.  Construct a
+   loop structure for the function.  MAP is a mapping of basic blocks
+   to head & tail markers, discovered when splitting blocks.  This
+   speeds up the discovery.  We rely on the BB visited flag having
+   been cleared when splitting blocks.  */
+
+static parallel *
+nvptx_discover_pars (bb_insn_map_t *map)
+{
+ parallel *outer_par = new parallel (0, OACC_null);
+ bb_par_vec_t worklist;
+ basic_block block;
+
+ // Mark entry and exit blocks as visited.
+ block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+ block->flags |= BB_VISITED;
+ block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+ worklist.safe_push (bb_par_t (block, outer_par));
+
+ while (worklist.length ())
+ {
+ bb_par_t bb_par = worklist.pop ();
+ parallel *l = bb_par.second;
+
+ block = bb_par.first;
+
+ // Have we met this block?
+ if (block->flags & BB_VISITED)
+ continue;
+ block->flags |= BB_VISITED;
+
+ rtx_insn **endp = map->get (block);
+ if (endp)
+ {
+ rtx_insn *end = *endp;
+
+ /* This is a block head or tail, or return instruction. */
+ switch (recog_memoized (end))
{
- unsigned dregno = DF_REF_REGNO (insn_def);
- if (cfun->machine->warp_equal_pseudos[dregno])
- continue;
- cfun->machine->warp_equal_pseudos[dregno] = true;
- warp_reg_worklist.safe_push (dregno);
+ case CODE_FOR_return:
+ /* Return instructions are in their own block, and we
+ don't need to do anything more. */
+ continue;
+
+ case CODE_FOR_nvptx_forked:
+ /* Loop head, create a new inner loop and add it into
+ our parent's child list. */
+ {
+ unsigned mode = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+ l = new parallel (l, mode);
+ l->forked_block = block;
+ l->forked_insn = end;
+ if (mode == OACC_worker)
+ l->fork_insn
+ = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
+ }
+ break;
+
+ case CODE_FOR_nvptx_join:
+ /* A loop tail. Finish the current loop and return to
+ parent. */
+ {
+ unsigned mode = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+ gcc_assert (l->mode == mode);
+ l->join_block = block;
+ l->join_insn = end;
+ if (mode == OACC_worker)
+ l->joining_insn
+ = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
+ l = l->parent;
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
}
}
+
+ /* Add this block onto the current loop's list of blocks. */
+ l->blocks.safe_push (block);
+
+ /* Push each destination block onto the work list. */
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, block->succs)
+ worklist.safe_push (bb_par_t (e->dest, l));
}
if (dump_file)
- for (int i = 0; i < max_regs; i++)
- if (cfun->machine->warp_equal_pseudos[i])
- fprintf (dump_file, "Found warp invariant pseudo %d\n", i);
+ {
+ fprintf (dump_file, "\nLoops\n");
+ nvptx_dump_pars (outer_par, 0);
+ fprintf (dump_file, "\n");
+ }
+
+ return outer_par;
+}
+
+/* Propagate live state at the start of a partitioned region.  BLOCK
+   provides the live register information, and might not contain
+   INSN.  Propagation is inserted just after INSN.  RW indicates
+   whether we are reading and/or writing state.  This separation is
+   needed for worker-level propagation, where we essentially do a
+   spill & fill.  FN is the underlying worker function to generate
+   the propagation instructions for a single register.  DATA is
+   user data.
+
+ We propagate the live register set and the entire frame. We could
+ do better by (a) propagating just the live set that is used within
+ the partitioned regions and (b) only propagating stack entries that
+ are used. The latter might be quite hard to determine. */
+
+static void
+nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
+ rtx (*fn) (rtx, propagate_mask,
+ unsigned, void *), void *data)
+{
+ bitmap live = DF_LIVE_IN (block);
+ bitmap_iterator iterator;
+ unsigned ix;
+
+ /* Copy the frame array. */
+ HOST_WIDE_INT fs = get_frame_size ();
+ if (fs)
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+ rtx idx = NULL_RTX;
+ rtx ptr = gen_reg_rtx (Pmode);
+ rtx pred = NULL_RTX;
+ rtx_code_label *label = NULL;
+
+ gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
+ fs /= GET_MODE_SIZE (DImode);
+ /* Detect single iteration loop. */
+ if (fs == 1)
+ fs = 0;
+
+ start_sequence ();
+ emit_insn (gen_rtx_SET (Pmode, ptr, frame_pointer_rtx));
+ if (fs)
+ {
+ idx = gen_reg_rtx (SImode);
+ pred = gen_reg_rtx (BImode);
+ label = gen_label_rtx ();
+
+ emit_insn (gen_rtx_SET (SImode, idx, GEN_INT (fs)));
+ /* Allow worker function to initialize anything needed */
+ rtx init = fn (tmp, PM_loop_begin, fs, data);
+ if (init)
+ emit_insn (init);
+ emit_label (label);
+ LABEL_NUSES (label)++;
+ emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
+ }
+ if (rw & PM_read)
+ emit_insn (gen_rtx_SET (DImode, tmp, gen_rtx_MEM (DImode, ptr)));
+ emit_insn (fn (tmp, rw, fs, data));
+ if (rw & PM_write)
+ emit_insn (gen_rtx_SET (DImode, gen_rtx_MEM (DImode, ptr), tmp));
+ if (fs)
+ {
+ emit_insn (gen_rtx_SET (SImode, pred,
+ gen_rtx_NE (BImode, idx, const0_rtx)));
+ emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
+ emit_insn (gen_br_true_uni (pred, label));
+ rtx fini = fn (tmp, PM_loop_end, fs, data);
+ if (fini)
+ emit_insn (fini);
+ emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
+ }
+ emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
+ emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
+ rtx cpy = get_insns ();
+ end_sequence ();
+ insn = emit_insn_after (cpy, insn);
+ }
+
+ /* Copy live registers. */
+ EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+ {
+ rtx reg = regno_reg_rtx[ix];
+
+ if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+ {
+ rtx bcast = fn (reg, rw, 0, data);
+
+ insn = emit_insn_after (bcast, insn);
+ }
+ }
+}
+
+/* Worker for nvptx_vpropagate. */
+
+static rtx
+vprop_gen (rtx reg, propagate_mask pm,
+ unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+{
+ if (!(pm & PM_read_write))
+ return 0;
+
+ return nvptx_gen_vcast (reg);
}
-/* PTX-specific reorganization
- 1) mark now-unused registers, so function begin doesn't declare
- unused registers.
- 2) replace subregs with suitable sequences.
-*/
+/* Propagate state that is live at start of BLOCK across the vectors
+ of a single warp. Propagation is inserted just after INSN. */
static void
-nvptx_reorg (void)
+nvptx_vpropagate (basic_block block, rtx_insn *insn)
{
- struct reg_replace qiregs, hiregs, siregs, diregs;
- rtx_insn *insn, *next;
+ nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+}
+
+/* Worker for nvptx_wpropagate. */
+
+static rtx
+wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+{
+ wcast_data_t *data = (wcast_data_t *)data_;
+
+ if (pm & PM_loop_begin)
+ {
+ /* Starting a loop, initialize pointer. */
+ unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
+
+ if (align > worker_bcast_align)
+ worker_bcast_align = align;
+ data->offset = (data->offset + align - 1) & ~(align - 1);
+
+ data->ptr = gen_reg_rtx (Pmode);
+
+ return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
+ }
+ else if (pm & PM_loop_end)
+ {
+ rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
+ data->ptr = NULL_RTX;
+ return clobber;
+ }
+ else
+ return nvptx_gen_wcast (reg, pm, rep, data);
+}
+
+/* Spill or fill the state that is live at the start of BLOCK.  PRE_P
+ indicates if this is just before partitioned mode (do spill), or
+ just after it starts (do fill). Sequence is inserted just after
+ INSN. */
+
+static void
+nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+{
+ wcast_data_t data;
+
+ data.base = gen_reg_rtx (Pmode);
+ data.offset = 0;
+ data.ptr = NULL_RTX;
+
+ nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+ if (data.offset)
+ {
+ /* Stuff was emitted, initialize the base pointer now. */
+ rtx init = gen_rtx_SET (Pmode, data.base, worker_bcast_sym);
+ emit_insn_after (init, insn);
+
+ if (worker_bcast_hwm < data.offset)
+ worker_bcast_hwm = data.offset;
+ }
+}
+
+/* Emit a worker-level synchronization barrier. */
+
+static void
+nvptx_wsync (bool tail_p, rtx_insn *insn)
+{
+ emit_insn_after (gen_nvptx_barsync (GEN_INT (tail_p)), insn);
+}
+
+/* Single neutering according to MASK. FROM is the incoming block and
+ TO is the outgoing block. These may be the same block. Insert at
+ start of FROM:
+
+ if (tid.<axis>) goto end.
+
+ and insert before ending branch of TO (if there is such an insn):
+
+ end:
+ <possibly-broadcast-cond>
+ <branch>
+
+   We currently only use different FROM and TO when skipping an entire
+ loop. We could do more if we detected superblocks. */
+
+static void
+nvptx_single (unsigned mask, basic_block from, basic_block to)
+{
+ rtx_insn *head = BB_HEAD (from);
+ rtx_insn *tail = BB_END (to);
+ unsigned skip_mask = mask;
+
+ /* Find first insn of from block */
+ while (head != BB_END (from) && !INSN_P (head))
+ head = NEXT_INSN (head);
+
+ /* Find last insn of to block */
+ rtx_insn *limit = from == to ? head : BB_HEAD (to);
+ while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
+ tail = PREV_INSN (tail);
+
+ /* Detect if tail is a branch. */
+ rtx tail_branch = NULL_RTX;
+ rtx cond_branch = NULL_RTX;
+ if (tail && INSN_P (tail))
+ {
+ tail_branch = PATTERN (tail);
+ if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
+ tail_branch = NULL_RTX;
+ else
+ {
+ cond_branch = SET_SRC (tail_branch);
+ if (GET_CODE (cond_branch) != IF_THEN_ELSE)
+ cond_branch = NULL_RTX;
+ }
+ }
+
+ if (tail == head)
+ {
+ /* If this is empty, do nothing. */
+ if (!head || !INSN_P (head))
+ return;
+
+ /* If this is a dummy insn, do nothing. */
+ switch (recog_memoized (head))
+ {
+ default:break;
+ case CODE_FOR_nvptx_fork:
+ case CODE_FOR_nvptx_forked:
+ case CODE_FOR_nvptx_joining:
+ case CODE_FOR_nvptx_join:
+ return;
+ }
+ if (cond_branch)
+ {
+ /* If we're only doing vector single, there's no need to
+ emit skip code because we'll not insert anything. */
+ if (!(mask & OACC_LOOP_MASK (OACC_vector)))
+ skip_mask = 0;
+ }
+ else if (tail_branch)
+ /* Block with only unconditional branch. Nothing to do. */
+ return;
+ }
+
+ /* Insert the vector test inside the worker test. */
+ unsigned mode;
+ rtx_insn *before = tail;
+ for (mode = OACC_worker; mode <= OACC_vector; mode++)
+ if (OACC_LOOP_MASK (mode) & skip_mask)
+ {
+ rtx id = gen_reg_rtx (SImode);
+ rtx pred = gen_reg_rtx (BImode);
+ rtx_code_label *label = gen_label_rtx ();
+
+ emit_insn_before (gen_oacc_id (id, GEN_INT (mode)), head);
+ rtx cond = gen_rtx_SET (BImode, pred,
+ gen_rtx_NE (BImode, id, const0_rtx));
+ emit_insn_before (cond, head);
+ rtx br;
+ if (mode == OACC_vector)
+ br = gen_br_true (pred, label);
+ else
+ br = gen_br_true_uni (pred, label);
+ emit_insn_before (br, head);
+
+ LABEL_NUSES (label)++;
+ if (tail_branch)
+ before = emit_label_before (label, before);
+ else
+ emit_label_after (label, tail);
+ }
+
+ /* Now deal with propagating the branch condition. */
+ if (cond_branch)
+ {
+ rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
+
+ if (OACC_LOOP_MASK (OACC_vector) == mask)
+ {
+ /* Vector mode only, do a shuffle. */
+ emit_insn_before (nvptx_gen_vcast (pvar), tail);
+ }
+ else
+ {
+	  /* Includes worker mode, do spill & fill.  By construction
+ we should never have worker mode only. */
+ wcast_data_t data;
+
+ data.base = worker_bcast_sym;
+ data.ptr = 0;
+
+ if (worker_bcast_hwm < GET_MODE_SIZE (SImode))
+ worker_bcast_hwm = GET_MODE_SIZE (SImode);
+
+ data.offset = 0;
+ emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+ before);
+ emit_insn_before (gen_nvptx_barsync (GEN_INT (2)), tail);
+ data.offset = 0;
+ emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data),
+ tail);
+ }
+
+ extract_insn (tail);
+ rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
+ UNSPEC_BR_UNIFIED);
+ validate_change (tail, recog_data.operand_loc[0], unsp, false);
+ }
+}
+
+/* PAR is a parallel that is being skipped in its entirety according to
+ MASK. Treat this as skipping a superblock starting at forked
+ and ending at joining. */
+
+static void
+nvptx_skip_par (unsigned mask, parallel *par)
+{
+ basic_block tail = par->join_block;
+ gcc_assert (tail->preds->length () == 1);
+
+ basic_block pre_tail = (*tail->preds)[0]->src;
+ gcc_assert (pre_tail->succs->length () == 1);
+
+ nvptx_single (mask, par->forked_block, pre_tail);
+}
+
+/* Process the parallel PAR and all its contained
+ parallels. We do everything but the neutering. Return mask of
+ partitioned modes used within this parallel. */
+
+static unsigned
+nvptx_process_pars (parallel *par)
+{
+ unsigned inner_mask = OACC_LOOP_MASK (par->mode);
+
+ /* Do the inner parallels first. */
+ if (par->inner)
+ {
+ par->inner_mask = nvptx_process_pars (par->inner);
+ inner_mask |= par->inner_mask;
+ }
+
+ switch (par->mode)
+ {
+ case OACC_null:
+ /* Dummy parallel. */
+ break;
+
+ case OACC_vector:
+ nvptx_vpropagate (par->forked_block, par->forked_insn);
+ break;
+
+ case OACC_worker:
+ {
+ nvptx_wpropagate (false, par->forked_block,
+ par->forked_insn);
+ nvptx_wpropagate (true, par->forked_block, par->fork_insn);
+ /* Insert begin and end synchronizations. */
+ nvptx_wsync (false, par->forked_insn);
+ nvptx_wsync (true, par->joining_insn);
+ }
+ break;
+
+ case OACC_gang:
+ break;
+
+ default:gcc_unreachable ();
+ }
+
+ /* Now do siblings. */
+ if (par->next)
+ inner_mask |= nvptx_process_pars (par->next);
+ return inner_mask;
+}
+
+/* Neuter the parallel described by PAR. We recurse in depth-first
+ order. MODES are the partitioning of the execution and OUTER is
+ the partitioning of the parallels we are contained in. */
+
+static void
+nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
+{
+ unsigned me = (OACC_LOOP_MASK (par->mode)
+ & (OACC_LOOP_MASK (OACC_worker)
+ | OACC_LOOP_MASK (OACC_vector)));
+ unsigned skip_mask = 0, neuter_mask = 0;
+
+ if (par->inner)
+ nvptx_neuter_pars (par->inner, modes, outer | me);
+
+ for (unsigned mode = OACC_worker; mode <= OACC_vector; mode++)
+ {
+ if ((outer | me) & OACC_LOOP_MASK (mode))
+ { /* Mode is partitioned: no neutering. */ }
+ else if (!(modes & OACC_LOOP_MASK (mode)))
+ { /* Mode is not used: nothing to do. */ }
+ else if (par->inner_mask & OACC_LOOP_MASK (mode)
+ || !par->forked_insn)
+	/* Partitioned in inner parallels, or we're not partitioned
+ at all: neuter individual blocks. */
+ neuter_mask |= OACC_LOOP_MASK (mode);
+ else if (!par->parent || !par->parent->forked_insn
+ || par->parent->inner_mask & OACC_LOOP_MASK (mode))
+	/* Parent isn't a parallel, or already contains this
+	   partitioning: skip the parallel at this level.  */
+ skip_mask |= OACC_LOOP_MASK (mode);
+ else
+ { /* Parent will skip this parallel itself. */ }
+ }
+
+ if (neuter_mask)
+ {
+ basic_block block;
+
+ for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
+ nvptx_single (neuter_mask, block, block);
+ }
+
+ if (skip_mask)
+ nvptx_skip_par (skip_mask, par);
+
+ if (par->next)
+ nvptx_neuter_pars (par->next, modes, outer);
+}
+
+/* NVPTX machine dependent reorg.
+ Insert vector and worker single neutering code and state
+   propagation when entering partitioned mode.  Fix up subregs.  */
+
+static void
+nvptx_reorg (void)
+{
/* We are freeing block_for_insn in the toplev to keep compatibility
with old MDEP_REORGS that are not CFG based. Recompute it now. */
compute_bb_for_insn ();
@@ -2072,19 +2925,36 @@ nvptx_reorg (void)
df_clear_flags (DF_LR_RUN_DCE);
df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
+ df_live_add_problem ();
+
+ /* Split blocks and record interesting unspecs. */
+ bb_insn_map_t bb_insn_map;
+
+ nvptx_split_blocks (&bb_insn_map);
+
+ /* Compute live regs */
df_analyze ();
regstat_init_n_sets_and_refs ();
- int max_regs = max_reg_num ();
-
+ if (dump_file)
+ df_dump (dump_file);
+
/* Mark unused regs as unused. */
+ int max_regs = max_reg_num ();
for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
regno_reg_rtx[i] = const0_rtx;
- /* Replace subregs. */
- nvptx_reorg_subreg (max_regs);
+ parallel *pars = nvptx_discover_pars (&bb_insn_map);
+
+ nvptx_process_pars (pars);
+ nvptx_neuter_pars (pars, (OACC_LOOP_MASK (OACC_vector)
+ | OACC_LOOP_MASK (OACC_worker)), 0);
+ delete pars;
+
+ nvptx_reorg_subreg ();
+
regstat_free_n_sets_and_refs ();
df_finish_pass (true);
@@ -2133,19 +3003,24 @@ nvptx_vector_alignment (const_tree type)
return MIN (align, BIGGEST_ALIGNMENT);
}
-/* Indicate that INSN cannot be duplicated. This is true for insns
- that generate a unique id. To be on the safe side, we also
- exclude instructions that have to be executed simultaneously by
- all threads in a warp. */
+/* Indicate that INSN cannot be duplicated. */
static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
- if (recog_memoized (insn) == CODE_FOR_oacc_thread_broadcastsi)
- return true;
- if (recog_memoized (insn) == CODE_FOR_threadbarrier_insn)
- return true;
- return false;
+ switch (recog_memoized (insn))
+ {
+ case CODE_FOR_nvptx_broadcastsi:
+ case CODE_FOR_nvptx_broadcastsf:
+ case CODE_FOR_nvptx_barsync:
+ case CODE_FOR_nvptx_fork:
+ case CODE_FOR_nvptx_forked:
+ case CODE_FOR_nvptx_joining:
+ case CODE_FOR_nvptx_join:
+ return true;
+ default:
+ return false;
+ }
}
/* Record a symbol for mkoffload to enter into the mapping table. */
@@ -2185,6 +3060,21 @@ nvptx_file_end (void)
FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
nvptx_record_fndecl (decl, true);
fputs (func_decls.str().c_str(), asm_out_file);
+
+ if (worker_bcast_hwm)
+ {
+ /* Define the broadcast buffer. */
+
+ if (worker_bcast_align < GET_MODE_SIZE (SImode))
+ worker_bcast_align = GET_MODE_SIZE (SImode);
+ worker_bcast_hwm = (worker_bcast_hwm + worker_bcast_align - 1)
+ & ~(worker_bcast_align - 1);
+
+ fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_bcast_name);
+ fprintf (asm_out_file, ".shared.align %d .u8 %s[%d];\n",
+ worker_bcast_align,
+ worker_bcast_name, worker_bcast_hwm);
+ }
}
#undef TARGET_OPTION_OVERRIDE
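
The effect of the nvptx_file_end change, for illustration (the numbers
are invented; the real ones come from the high-water mark and alignment
recorded while emitting code): a translation unit whose largest worker
broadcast needed 8 bytes at 8-byte alignment would end with

	// BEGIN VAR DEF: __worker_bcast
	.shared.align 8 .u8 __worker_bcast[8];

which is the shared-memory buffer that nvptx_gen_wcast's spill and fill
instructions address via worker_bcast_sym.
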
Index: tree-ssa-alias.c
===================================================================
--- tree-ssa-alias.c (revision 225323)
+++ tree-ssa-alias.c (working copy)
@@ -1764,7 +1764,6 @@ ref_maybe_used_by_call_p_1 (gcall *call,
case BUILT_IN_GOMP_ATOMIC_END:
case BUILT_IN_GOMP_BARRIER:
case BUILT_IN_GOMP_BARRIER_CANCEL:
- case BUILT_IN_GOACC_THREADBARRIER:
case BUILT_IN_GOMP_TASKWAIT:
case BUILT_IN_GOMP_TASKGROUP_END:
case BUILT_IN_GOMP_CRITICAL_START:
Index: gimple.c
===================================================================
--- gimple.c (revision 225323)
+++ gimple.c (working copy)
@@ -1380,12 +1380,27 @@ bool
gimple_call_same_target_p (const_gimple c1, const_gimple c2)
{
if (gimple_call_internal_p (c1))
- return (gimple_call_internal_p (c2)
- && gimple_call_internal_fn (c1) == gimple_call_internal_fn (c2));
+ {
+ if (!gimple_call_internal_p (c2)
+ || gimple_call_internal_fn (c1) != gimple_call_internal_fn (c2))
+ return false;
+
+ if (gimple_call_internal_unique_p (c1))
+ return false;
+
+ return true;
+ }
+ else if (gimple_call_fn (c1) == gimple_call_fn (c2))
+ return true;
else
- return (gimple_call_fn (c1) == gimple_call_fn (c2)
- || (gimple_call_fndecl (c1)
- && gimple_call_fndecl (c1) == gimple_call_fndecl (c2)));
+ {
+ tree decl = gimple_call_fndecl (c1);
+
+ if (!decl || decl != gimple_call_fndecl (c2))
+ return false;
+
+ return true;
+ }
}
/* Detect flags from a GIMPLE_CALL. This is just like
Index: gimple.h
===================================================================
--- gimple.h (revision 225323)
+++ gimple.h (working copy)
@@ -581,10 +581,6 @@ struct GTY((tag("GSS_OMP_PARALLEL_LAYOUT
/* [ WORD 11 ]
Size of the gang-local memory to allocate. */
tree ganglocal_size;
-
- /* [ WORD 12 ]
- A pointer to the array to be used for broadcasting across threads. */
- tree broadcast_array;
};
/* GIMPLE_OMP_PARALLEL or GIMPLE_TASK */
@@ -2693,6 +2689,11 @@ gimple_call_internal_fn (const_gimple gs
return static_cast <const gcall *> (gs)->u.internal_fn;
}
+/* Return true if this internal gimple call is unique.  */
+
+extern bool
+gimple_call_internal_unique_p (const_gimple);
+
/* If CTRL_ALTERING_P is true, mark GIMPLE_CALL S to be a stmt
that could alter control flow. */
@@ -5248,25 +5249,6 @@ gimple_omp_target_set_ganglocal_size (go
}
-/* Return the pointer to the broadcast array associated with OMP_TARGET GS. */
-
-static inline tree
-gimple_omp_target_broadcast_array (const gomp_target *omp_target_stmt)
-{
- return omp_target_stmt->broadcast_array;
-}
-
-
-/* Set PTR to be the broadcast array associated with OMP_TARGET
- GS. */
-
-static inline void
-gimple_omp_target_set_broadcast_array (gomp_target *omp_target_stmt, tree ptr)
-{
- omp_target_stmt->broadcast_array = ptr;
-}
-
-
/* Return the clauses associated with OMP_TEAMS GS. */
static inline tree
Index: omp-low.c
===================================================================
--- omp-low.c (revision 225323)
+++ omp-low.c (working copy)
@@ -166,14 +166,8 @@ struct omp_region
/* For an OpenACC loop, the level of parallelism requested. */
int gwv_this;
-
- tree broadcast_array;
};
-/* Levels of parallelism as defined by OpenACC. Increasing numbers
- correspond to deeper loop nesting levels. */
-#define OACC_LOOP_MASK(X) (1 << (X))
-
/* Context structure. Used to store information about each parallel
directive in the code. */
@@ -292,8 +286,6 @@ static vec<omp_context *> taskreg_contex
static void scan_omp (gimple_seq *, omp_context *);
static tree scan_omp_1_op (tree *, int *, void *);
-static basic_block oacc_broadcast (basic_block, basic_block,
- struct omp_region *);
#define WALK_SUBSTMTS \
case GIMPLE_BIND: \
@@ -3487,15 +3479,6 @@ build_omp_barrier (tree lhs)
return g;
}
-/* Build a call to GOACC_threadbarrier. */
-
-static gcall *
-build_oacc_threadbarrier (void)
-{
- tree fndecl = builtin_decl_explicit (BUILT_IN_GOACC_THREADBARRIER);
- return gimple_build_call (fndecl, 0);
-}
-
/* If a context was created for STMT when it was scanned, return it. */
static omp_context *
@@ -3506,6 +3489,37 @@ maybe_lookup_ctx (gimple stmt)
return n ? (omp_context *) n->value : NULL;
}
+/* Generate loop head markers in outer->inner order. */
+
+static void
+gen_oacc_fork (gimple_seq *seq, unsigned mask)
+{
+ unsigned level;
+
+ for (level = OACC_gang; level != OACC_HWM; level++)
+ if (mask & OACC_LOOP_MASK (level))
+ {
+ tree arg = build_int_cst (unsigned_type_node, level);
+ gcall *call = gimple_build_call_internal (IFN_GOACC_FORK, 1, arg);
+ gimple_seq_add_stmt (seq, call);
+ }
+}
+
+/* Generate loop tail markers in inner->outer order. */
+
+static void
+gen_oacc_join (gimple_seq *seq, unsigned mask)
+{
+ unsigned level;
+
+ for (level = OACC_HWM; level-- != OACC_gang; )
+ if (mask & OACC_LOOP_MASK (level))
+ {
+ tree arg = build_int_cst (unsigned_type_node, level);
+ gcall *call = gimple_build_call_internal (IFN_GOACC_JOIN, 1, arg);
+ gimple_seq_add_stmt (seq, call);
+ }
+}
/* Find the mapping for DECL in CTX or the immediately enclosing
context that has a mapping for DECL.
@@ -6777,21 +6791,6 @@ expand_omp_for_generic (struct omp_regio
}
}
-
-/* True if a barrier is needed after a loop partitioned over
- gangs/workers/vectors as specified by GWV_BITS. OpenACC semantics specify
- that a (conceptual) barrier is needed after worker and vector-partitioned
- loops, but not after gang-partitioned loops. Currently we are relying on
- warp reconvergence to synchronise threads within a warp after vector loops,
- so an explicit barrier is not helpful after those. */
-
-static bool
-oacc_loop_needs_threadbarrier_p (int gwv_bits)
-{
- return !(gwv_bits & OACC_LOOP_MASK (OACC_gang))
- && (gwv_bits & OACC_LOOP_MASK (OACC_worker));
-}
-
/* A subroutine of expand_omp_for. Generate code for a parallel
loop with static schedule and no specified chunk size. Given
parameters:
@@ -6827,6 +6826,11 @@ oacc_loop_needs_threadbarrier_p (int gwv
V += STEP;
if (V cond e) goto L1;
L2:
+
+ For OpenACC the above is wrapped in an OACC_FORK/OACC_JOIN pair.
+ Currently we wrap the whole sequence, but it'd be better to place the
+ markers just inside the outer conditional, so they can be entirely
+ eliminated if the loop is unreachable.
*/
static void
@@ -6868,10 +6872,6 @@ expand_omp_for_static_nochunk (struct om
}
exit_bb = region->exit;
- /* Broadcast variables to OpenACC threads. */
- entry_bb = oacc_broadcast (entry_bb, fin_bb, region);
- region->entry = entry_bb;
-
/* Iteration space partitioning goes in ENTRY_BB. */
gsi = gsi_last_bb (entry_bb);
gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
@@ -6893,6 +6893,15 @@ expand_omp_for_static_nochunk (struct om
t = fold_binary (fd->loop.cond_code, boolean_type_node,
fold_convert (type, fd->loop.n1),
fold_convert (type, fd->loop.n2));
+
+ if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
+ {
+ gimple_seq seq = NULL;
+
+ gen_oacc_fork (&seq, region->gwv_this);
+ gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+ }
+
if (fd->collapse == 1
&& TYPE_UNSIGNED (type)
&& (t == NULL_TREE || !integer_onep (t)))
@@ -7134,17 +7143,17 @@ expand_omp_for_static_nochunk (struct om
/* Replace the GIMPLE_OMP_RETURN with a barrier, or nothing. */
gsi = gsi_last_bb (exit_bb);
- if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
+ if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
+ {
+ gimple_seq seq = NULL;
+
+ gen_oacc_join (&seq, region->gwv_this);
+ gsi_insert_seq_after (&gsi, seq, GSI_SAME_STMT);
+ }
+ else if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
{
t = gimple_omp_return_lhs (gsi_stmt (gsi));
- if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
- {
- gcc_checking_assert (t == NULL_TREE);
- if (oacc_loop_needs_threadbarrier_p (region->gwv_this))
- gsi_insert_after (&gsi, build_oacc_threadbarrier (), GSI_SAME_STMT);
- }
- else
- gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
+ gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
}
gsi_remove (&gsi, true);
@@ -7248,6 +7257,11 @@ find_phi_with_arg_on_edge (tree arg, edg
trip += 1;
goto L0;
L4:
+
+ For OpenACC the above is wrapped in an OACC_FORK/OACC_JOIN pair.
+ Currently we wrap the whole sequence, but it'd be better to place the
+ markers just inside the outer conditional, so they can be entirely
+ eliminated if the loop is unreachable.
*/
static void
@@ -7281,10 +7295,6 @@ expand_omp_for_static_chunk (struct omp_
gcc_assert (EDGE_COUNT (iter_part_bb->succs) == 2);
fin_bb = BRANCH_EDGE (iter_part_bb)->dest;
- /* Broadcast variables to OpenACC threads. */
- entry_bb = oacc_broadcast (entry_bb, fin_bb, region);
- region->entry = entry_bb;
-
gcc_assert (broken_loop
|| fin_bb == FALLTHRU_EDGE (cont_bb)->dest);
seq_start_bb = split_edge (FALLTHRU_EDGE (iter_part_bb));
@@ -7318,6 +7328,14 @@ expand_omp_for_static_chunk (struct omp_
t = fold_binary (fd->loop.cond_code, boolean_type_node,
fold_convert (type, fd->loop.n1),
fold_convert (type, fd->loop.n2));
+ if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
+ {
+ gimple_seq seq = NULL;
+
+ gen_oacc_fork (&seq, region->gwv_this);
+ gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+ }
+
if (fd->collapse == 1
&& TYPE_UNSIGNED (type)
&& (t == NULL_TREE || !integer_onep (t)))
@@ -7576,17 +7594,18 @@ expand_omp_for_static_chunk (struct omp_
/* Replace the GIMPLE_OMP_RETURN with a barrier, or nothing. */
gsi = gsi_last_bb (exit_bb);
- if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
+
+ if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
+ {
+ gimple_seq seq = NULL;
+
+ gen_oacc_join (&seq, region->gwv_this);
+ gsi_insert_seq_after (&gsi, seq, GSI_SAME_STMT);
+ }
+ else if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
{
t = gimple_omp_return_lhs (gsi_stmt (gsi));
- if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
- {
- gcc_checking_assert (t == NULL_TREE);
- if (oacc_loop_needs_threadbarrier_p (region->gwv_this))
- gsi_insert_after (&gsi, build_oacc_threadbarrier (), GSI_SAME_STMT);
- }
- else
- gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
+ gsi_insert_after (&gsi, build_omp_barrier (t), GSI_SAME_STMT);
}
gsi_remove (&gsi, true);
@@ -9158,20 +9177,6 @@ expand_omp_atomic (struct omp_region *re
expand_omp_atomic_mutex (load_bb, store_bb, addr, loaded_val, stored_val);
}
-/* Allocate storage for OpenACC worker threads in CTX to broadcast
- condition results. */
-
-static void
-oacc_alloc_broadcast_storage (omp_context *ctx)
-{
- tree vull_type_node = build_qualified_type (long_long_unsigned_type_node,
- TYPE_QUAL_VOLATILE);
-
- ctx->worker_sync_elt
- = alloc_var_ganglocal (NULL_TREE, vull_type_node, ctx,
- TYPE_SIZE_UNIT (vull_type_node));
-}
-
/* Mark the loops inside the kernels region starting at REGION_ENTRY and ending
at REGION_EXIT. */
@@ -9947,7 +9952,6 @@ find_omp_target_region_data (struct omp_
region->gwv_this |= OACC_LOOP_MASK (OACC_worker);
if (find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH))
region->gwv_this |= OACC_LOOP_MASK (OACC_vector);
- region->broadcast_array = gimple_omp_target_broadcast_array (stmt);
}
/* Helper for build_omp_regions. Scan the dominator tree starting at
@@ -10091,669 +10095,6 @@ build_omp_regions (void)
build_omp_regions_1 (ENTRY_BLOCK_PTR_FOR_FN (cfun), NULL, false);
}
-/* Walk the tree upwards from region until a target region is found
- or we reach the end, then return it. */
-static omp_region *
-enclosing_target_region (omp_region *region)
-{
- while (region != NULL
- && region->type != GIMPLE_OMP_TARGET)
- region = region->outer;
- return region;
-}
-
-/* Return a mask of GWV_ values indicating the kind of OpenACC
- predication required for basic blocks in REGION. */
-
-static int
-required_predication_mask (omp_region *region)
-{
- while (region
- && region->type != GIMPLE_OMP_FOR && region->type != GIMPLE_OMP_TARGET)
- region = region->outer;
- if (!region)
- return 0;
-
- int outer_masks = region->gwv_this;
- omp_region *outer_target = region;
- while (outer_target != NULL && outer_target->type != GIMPLE_OMP_TARGET)
- {
- if (outer_target->type == GIMPLE_OMP_FOR)
- outer_masks |= outer_target->gwv_this;
- outer_target = outer_target->outer;
- }
- if (!outer_target)
- return 0;
-
- int mask = 0;
- if ((outer_target->gwv_this & OACC_LOOP_MASK (OACC_worker)) != 0
- && (region->type == GIMPLE_OMP_TARGET
- || (outer_masks & OACC_LOOP_MASK (OACC_worker)) == 0))
- mask |= OACC_LOOP_MASK (OACC_worker);
- if ((outer_target->gwv_this & OACC_LOOP_MASK (OACC_vector)) != 0
- && (region->type == GIMPLE_OMP_TARGET
- || (outer_masks & OACC_LOOP_MASK (OACC_vector)) == 0))
- mask |= OACC_LOOP_MASK (OACC_vector);
- return mask;
-}
-
-/* Generate a broadcast across OpenACC vector threads (a warp on GPUs)
- so that VAR is broadcast to DEST_VAR. The new statements are added
- after WHERE. Return the stmt after which the block should be split. */
-
-static gimple
-generate_vector_broadcast (tree dest_var, tree var,
- gimple_stmt_iterator &where)
-{
- gimple retval = gsi_stmt (where);
- tree vartype = TREE_TYPE (var);
- tree call_arg_type = unsigned_type_node;
- enum built_in_function fn = BUILT_IN_GOACC_THREAD_BROADCAST;
-
- if (TYPE_PRECISION (vartype) > TYPE_PRECISION (call_arg_type))
- {
- fn = BUILT_IN_GOACC_THREAD_BROADCAST_LL;
- call_arg_type = long_long_unsigned_type_node;
- }
-
- bool need_conversion = !types_compatible_p (vartype, call_arg_type);
- tree casted_var = var;
-
- if (need_conversion)
- {
- gassign *conv1 = NULL;
- casted_var = create_tmp_var (call_arg_type);
-
- /* Handle floats and doubles. */
- if (!INTEGRAL_TYPE_P (vartype))
- {
- tree t = fold_build1 (VIEW_CONVERT_EXPR, call_arg_type, var);
- conv1 = gimple_build_assign (casted_var, t);
- }
- else
- conv1 = gimple_build_assign (casted_var, NOP_EXPR, var);
-
- gsi_insert_after (&where, conv1, GSI_CONTINUE_LINKING);
- }
-
- tree decl = builtin_decl_explicit (fn);
- gimple call = gimple_build_call (decl, 1, casted_var);
- gsi_insert_after (&where, call, GSI_NEW_STMT);
- tree casted_dest = dest_var;
-
- if (need_conversion)
- {
- gassign *conv2 = NULL;
- casted_dest = create_tmp_var (call_arg_type);
-
- if (!INTEGRAL_TYPE_P (vartype))
- {
- tree t = fold_build1 (VIEW_CONVERT_EXPR, vartype, casted_dest);
- conv2 = gimple_build_assign (dest_var, t);
- }
- else
- conv2 = gimple_build_assign (dest_var, NOP_EXPR, casted_dest);
-
- gsi_insert_after (&where, conv2, GSI_CONTINUE_LINKING);
- }
-
- gimple_call_set_lhs (call, casted_dest);
- return retval;
-}
-
-/* Generate a broadcast across OpenACC threads in REGION so that VAR
- is broadcast to DEST_VAR. MASK specifies the parallelism level and
- thereby the broadcast method. If it is only vector, we
- can use a warp broadcast, otherwise we fall back to memory
- store/load. */
-
-static gimple
-generate_oacc_broadcast (omp_region *region, tree dest_var, tree var,
- gimple_stmt_iterator &where, int mask)
-{
- if (mask == OACC_LOOP_MASK (OACC_vector))
- return generate_vector_broadcast (dest_var, var, where);
-
- omp_region *parent = enclosing_target_region (region);
-
- tree elttype = build_qualified_type (TREE_TYPE (var), TYPE_QUAL_VOLATILE);
- tree ptr = create_tmp_var (build_pointer_type (elttype));
- gassign *cast1 = gimple_build_assign (ptr, NOP_EXPR,
- parent->broadcast_array);
- gsi_insert_after (&where, cast1, GSI_NEW_STMT);
- gassign *st = gimple_build_assign (build_simple_mem_ref (ptr), var);
- gsi_insert_after (&where, st, GSI_NEW_STMT);
-
- gsi_insert_after (&where, build_oacc_threadbarrier (), GSI_NEW_STMT);
-
- gassign *cast2 = gimple_build_assign (ptr, NOP_EXPR,
- parent->broadcast_array);
- gsi_insert_after (&where, cast2, GSI_NEW_STMT);
- gassign *ld = gimple_build_assign (dest_var, build_simple_mem_ref (ptr));
- gsi_insert_after (&where, ld, GSI_NEW_STMT);
-
- gsi_insert_after (&where, build_oacc_threadbarrier (), GSI_NEW_STMT);
-
- return st;
-}
-
-/* Build a test for OpenACC predication. TRUE_EDGE is the edge that should be
- taken if the block should be executed. SKIP_DEST_BB is the destination to
- jump to otherwise. MASK specifies the type of predication, it can contain
- the bits for VECTOR and/or WORKER. */
-
-static void
-make_predication_test (edge true_edge, basic_block skip_dest_bb, int mask)
-{
- basic_block cond_bb = true_edge->src;
-
- gimple_stmt_iterator tmp_gsi = gsi_last_bb (cond_bb);
- tree decl = builtin_decl_explicit (BUILT_IN_GOACC_ID);
- tree comp_var = NULL_TREE;
- unsigned ix;
-
- for (ix = OACC_worker; ix <= OACC_vector; ix++)
- if (OACC_LOOP_MASK (ix) & mask)
- {
- gimple call = gimple_build_call
- (decl, 1, build_int_cst (unsigned_type_node, ix));
- tree var = create_tmp_var (unsigned_type_node);
-
- gimple_call_set_lhs (call, var);
- gsi_insert_after (&tmp_gsi, call, GSI_NEW_STMT);
- if (comp_var)
- {
- tree new_comp = create_tmp_var (unsigned_type_node);
- gassign *ior = gimple_build_assign (new_comp,
- BIT_IOR_EXPR, comp_var, var);
- gsi_insert_after (&tmp_gsi, ior, GSI_NEW_STMT);
- comp_var = new_comp;
- }
- else
- comp_var = var;
- }
-
- tree cond = build2 (EQ_EXPR, boolean_type_node, comp_var,
- fold_convert (unsigned_type_node, integer_zero_node));
- gimple cond_stmt = gimple_build_cond_empty (cond);
- gsi_insert_after (&tmp_gsi, cond_stmt, GSI_NEW_STMT);
-
- true_edge->flags = EDGE_TRUE_VALUE;
-
- /* Force an abnormal edge before a broadcast operation that might be present
- in SKIP_DEST_BB. This is only done for the non-execution edge (with
- respect to the predication done by this function) -- the opposite
- (execution) edge that reaches the broadcast operation must be made
- abnormal also, e.g. in this function's caller. */
- edge e = make_edge (cond_bb, skip_dest_bb, EDGE_FALSE_VALUE);
- basic_block false_abnorm_bb = split_edge (e);
- edge abnorm_edge = single_succ_edge (false_abnorm_bb);
- abnorm_edge->flags |= EDGE_ABNORMAL;
-}
-
-/* Apply OpenACC predication to basic block BB which is in
- region PARENT. MASK has a bitmask of levels that need to be
- applied; VECTOR and/or WORKER may be set. */
-
-static void
-predicate_bb (basic_block bb, struct omp_region *parent, int mask)
-{
- /* We handle worker-single vector-partitioned loops by jumping
- around them if not in the controlling worker. Don't insert
- unnecessary (and incorrect) predication. */
- if (parent->type == GIMPLE_OMP_FOR
- && (parent->gwv_this & OACC_LOOP_MASK (OACC_vector)))
- mask &= ~OACC_LOOP_MASK (OACC_worker);
-
- if (mask == 0 || parent->type == GIMPLE_OMP_ATOMIC_LOAD)
- return;
-
- gimple_stmt_iterator gsi;
- gimple stmt;
-
- gsi = gsi_last_bb (bb);
- stmt = gsi_stmt (gsi);
- if (stmt == NULL)
- return;
-
- basic_block skip_dest_bb = NULL;
-
- if (gimple_code (stmt) == GIMPLE_OMP_ENTRY_END)
- return;
-
- if (gimple_code (stmt) == GIMPLE_COND)
- {
- tree cond_var = create_tmp_var (boolean_type_node);
- tree broadcast_cond = create_tmp_var (boolean_type_node);
- gassign *asgn = gimple_build_assign (cond_var,
- gimple_cond_code (stmt),
- gimple_cond_lhs (stmt),
- gimple_cond_rhs (stmt));
- gsi_insert_before (&gsi, asgn, GSI_CONTINUE_LINKING);
- gimple_stmt_iterator gsi_asgn = gsi_for_stmt (asgn);
-
- gimple splitpoint = generate_oacc_broadcast (parent, broadcast_cond,
- cond_var, gsi_asgn,
- mask);
-
- edge e = split_block (bb, splitpoint);
- e->flags = EDGE_ABNORMAL;
- skip_dest_bb = e->dest;
-
- gimple_cond_set_condition (as_a <gcond *> (stmt), EQ_EXPR,
- broadcast_cond, boolean_true_node);
- }
- else if (gimple_code (stmt) == GIMPLE_SWITCH)
- {
- gswitch *sstmt = as_a <gswitch *> (stmt);
- tree var = gimple_switch_index (sstmt);
- tree new_var = create_tmp_var (TREE_TYPE (var));
-
- gassign *asgn = gimple_build_assign (new_var, var);
- gsi_insert_before (&gsi, asgn, GSI_CONTINUE_LINKING);
- gimple_stmt_iterator gsi_asgn = gsi_for_stmt (asgn);
-
- gimple splitpoint = generate_oacc_broadcast (parent, new_var, var,
- gsi_asgn, mask);
-
- edge e = split_block (bb, splitpoint);
- e->flags = EDGE_ABNORMAL;
- skip_dest_bb = e->dest;
-
- gimple_switch_set_index (sstmt, new_var);
- }
- else if (is_gimple_omp (stmt))
- {
- gsi_prev (&gsi);
- gimple split_stmt = gsi_stmt (gsi);
- enum gimple_code code = gimple_code (stmt);
-
- /* First, see if we must predicate away an entire loop or atomic region. */
- if (code == GIMPLE_OMP_FOR
- || code == GIMPLE_OMP_ATOMIC_LOAD)
- {
- omp_region *inner;
- inner = *bb_region_map->get (FALLTHRU_EDGE (bb)->dest);
- skip_dest_bb = single_succ (inner->exit);
- gcc_assert (inner->entry == bb);
- if (code != GIMPLE_OMP_FOR
- || ((inner->gwv_this & OACC_LOOP_MASK (OACC_vector))
- && !(inner->gwv_this & OACC_LOOP_MASK (OACC_worker))
- && (mask & OACC_LOOP_MASK (OACC_worker))))
- {
- gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
- gsi_prev (&head_gsi);
- edge e0 = split_block (bb, gsi_stmt (head_gsi));
- int mask2 = mask;
- if (code == GIMPLE_OMP_FOR)
- mask2 &= ~OACC_LOOP_MASK (OACC_vector);
- if (!split_stmt || code != GIMPLE_OMP_FOR)
- {
- /* The simple case: nothing here except the for,
- so we just need to make one branch around the
- entire loop. */
- inner->entry = e0->dest;
- make_predication_test (e0, skip_dest_bb, mask2);
- return;
- }
- basic_block for_block = e0->dest;
- /* The general case, make two conditions - a full one around the
- code preceding the for, and one branch around the loop. */
- edge e1 = split_block (for_block, split_stmt);
- basic_block bb3 = e1->dest;
- edge e2 = split_block (for_block, split_stmt);
- basic_block bb2 = e2->dest;
-
- make_predication_test (e0, bb2, mask);
- make_predication_test (single_pred_edge (bb3), skip_dest_bb,
- mask2);
- inner->entry = bb3;
- return;
- }
- }
-
- /* Only a few statements need special treatment. */
- if (gimple_code (stmt) != GIMPLE_OMP_FOR
- && gimple_code (stmt) != GIMPLE_OMP_CONTINUE
- && gimple_code (stmt) != GIMPLE_OMP_RETURN)
- {
- edge e = single_succ_edge (bb);
- skip_dest_bb = e->dest;
- }
- else
- {
- if (!split_stmt)
- return;
- edge e = split_block (bb, split_stmt);
- skip_dest_bb = e->dest;
- if (gimple_code (stmt) == GIMPLE_OMP_CONTINUE)
- {
- gcc_assert (parent->cont == bb);
- parent->cont = skip_dest_bb;
- }
- else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
- {
- gcc_assert (parent->exit == bb);
- parent->exit = skip_dest_bb;
- }
- else if (gimple_code (stmt) == GIMPLE_OMP_FOR)
- {
- omp_region *inner;
- inner = *bb_region_map->get (FALLTHRU_EDGE (skip_dest_bb)->dest);
- gcc_assert (inner->entry == bb);
- inner->entry = skip_dest_bb;
- }
- }
- }
- else if (single_succ_p (bb))
- {
- edge e = single_succ_edge (bb);
- skip_dest_bb = e->dest;
- if (gimple_code (stmt) == GIMPLE_GOTO)
- gsi_prev (&gsi);
- if (gsi_stmt (gsi) == 0)
- return;
- }
-
- if (skip_dest_bb != NULL)
- {
- gimple_stmt_iterator head_gsi = gsi_start_bb (bb);
- gsi_prev (&head_gsi);
- edge e2 = split_block (bb, gsi_stmt (head_gsi));
- make_predication_test (e2, skip_dest_bb, mask);
- }
-}
-
-/* Walk the dominator tree starting at BB to collect basic blocks in
- WORKLIST which need OpenACC vector predication applied to them. */
-
-static void
-find_predicatable_bbs (basic_block bb, vec<basic_block> &worklist)
-{
- struct omp_region *parent = *bb_region_map->get (bb);
- if (required_predication_mask (parent) != 0)
- worklist.safe_push (bb);
- basic_block son;
- for (son = first_dom_son (CDI_DOMINATORS, bb);
- son;
- son = next_dom_son (CDI_DOMINATORS, son))
- find_predicatable_bbs (son, worklist);
-}
-
-/* Apply OpenACC vector predication to all basic blocks. HEAD_BB is the
- first. */
-
-static void
-predicate_omp_regions (basic_block head_bb)
-{
- vec<basic_block> worklist = vNULL;
- find_predicatable_bbs (head_bb, worklist);
- int i;
- basic_block bb;
- FOR_EACH_VEC_ELT (worklist, i, bb)
- {
- omp_region *region = *bb_region_map->get (bb);
- int mask = required_predication_mask (region);
- predicate_bb (bb, region, mask);
- }
-}
-
-/* USE and GET sets for variable broadcasting. */
-static std::set<tree> use, gen, live_in;
-
-/* This is an extremely conservative live in analysis. We only want to
- detect is any compiler temporary used inside an acc loop is local to
- that loop or not. So record all decl uses in all the basic blocks
- post-dominating the acc loop in question. */
-static tree
-populate_loop_live_in (tree *tp, int *walk_subtrees,
- void *data_ ATTRIBUTE_UNUSED)
-{
- struct walk_stmt_info *wi = (struct walk_stmt_info *) data_;
-
- if (wi && wi->is_lhs)
- {
- if (VAR_P (*tp))
- live_in.insert (*tp);
- }
- else if (IS_TYPE_OR_DECL_P (*tp))
- *walk_subtrees = 0;
-
- return NULL_TREE;
-}
-
-static void
-oacc_populate_live_in_1 (basic_block entry_bb, basic_block exit_bb,
- basic_block loop_bb)
-{
- basic_block son;
- gimple_stmt_iterator gsi;
-
- if (entry_bb == exit_bb)
- return;
-
- if (!dominated_by_p (CDI_DOMINATORS, loop_bb, entry_bb))
- return;
-
- for (gsi = gsi_start_bb (entry_bb); !gsi_end_p (gsi); gsi_next (&gsi))
- {
- struct walk_stmt_info wi;
- gimple stmt;
-
- memset (&wi, 0, sizeof (wi));
- stmt = gsi_stmt (gsi);
-
- walk_gimple_op (stmt, populate_loop_live_in, &wi);
- }
-
- /* Continue walking the dominator tree. */
- for (son = first_dom_son (CDI_DOMINATORS, entry_bb);
- son;
- son = next_dom_son (CDI_DOMINATORS, son))
- oacc_populate_live_in_1 (son, exit_bb, loop_bb);
-}
-
-static void
-oacc_populate_live_in (basic_block entry_bb, omp_region *region)
-{
- /* Find the innermost OMP_TARGET region. */
- while (region && region->type != GIMPLE_OMP_TARGET)
- region = region->outer;
-
- if (!region)
- return;
-
- basic_block son;
-
- for (son = first_dom_son (CDI_DOMINATORS, region->entry);
- son;
- son = next_dom_son (CDI_DOMINATORS, son))
- oacc_populate_live_in_1 (son, region->exit, entry_bb);
-}
-
-static tree
-populate_loop_use (tree *tp, int *walk_subtrees, void *data_)
-{
- struct walk_stmt_info *wi = (struct walk_stmt_info *) data_;
- std::set<tree>::iterator it;
-
- /* There isn't much to do for LHS ops. There shouldn't be any pointers
- or references here. */
- if (wi && wi->is_lhs)
- return NULL_TREE;
-
- if (VAR_P (*tp))
- {
- tree type;
-
- *walk_subtrees = 0;
-
- /* Filter out incompatible decls. */
- if (INDIRECT_REF_P (*tp) || is_global_var (*tp))
- return NULL_TREE;
-
- type = TREE_TYPE (*tp);
-
- /* Aggregate types aren't supported either. */
- if (AGGREGATE_TYPE_P (type))
- return NULL_TREE;
-
- /* Filter out decls inside GEN. */
- it = gen.find (*tp);
- if (it == gen.end ())
- use.insert (*tp);
- }
- else if (IS_TYPE_OR_DECL_P (*tp))
- *walk_subtrees = 0;
-
- return NULL_TREE;
-}
-
-/* INIT is true if this is the first time this function is called. */
-
-static void
-oacc_broadcast_1 (basic_block entry_bb, basic_block exit_bb, bool init,
- int mask)
-{
- basic_block son;
- gimple_stmt_iterator gsi;
- gimple stmt;
- tree block, var;
-
- if (entry_bb == exit_bb)
- return;
-
- /* Populate the GEN set. */
-
- gsi = gsi_start_bb (entry_bb);
- stmt = gsi_stmt (gsi);
-
- /* There's nothing to do if stmt is empty or if this is the entry basic
- block to the vector loop. The entry basic block to pre-expanded loops
- do not have an entry label. As such, the scope containing the initial
- entry_bb should not be added to the gen set. */
- if (stmt != NULL && !init && (block = gimple_block (stmt)) != NULL)
- for (var = BLOCK_VARS (block); var; var = DECL_CHAIN (var))
- gen.insert(var);
-
- /* Populate the USE set. */
-
- for (gsi = gsi_start_bb (entry_bb); !gsi_end_p (gsi); gsi_next (&gsi))
- {
- struct walk_stmt_info wi;
-
- memset (&wi, 0, sizeof (wi));
- stmt = gsi_stmt (gsi);
-
- walk_gimple_op (stmt, populate_loop_use, &wi);
- }
-
- /* Continue processing the children of this basic block. */
- for (son = first_dom_son (CDI_DOMINATORS, entry_bb);
- son;
- son = next_dom_son (CDI_DOMINATORS, son))
- oacc_broadcast_1 (son, exit_bb, false, mask);
-}
-
-/* Broadcast variables to OpenACC vector loops. This function scans
- all of the basic blocks withing an acc vector loop. It maintains
- two sets of decls, a GEN set and a USE set. The GEN set contains
- all of the decls in the the basic block's scope. The USE set
- consists of decls used in current basic block, but are not in the
- GEN set, globally defined or were transferred into the the accelerator
- via a data movement clause.
-
- The vector loop begins at ENTRY_BB and end at EXIT_BB, where EXIT_BB
- is a latch back to ENTRY_BB. Once a set of used variables have been
- determined, they will get broadcasted in a pre-header to ENTRY_BB. */
-
-static basic_block
-oacc_broadcast (basic_block entry_bb, basic_block exit_bb, omp_region *region)
-{
- gimple_stmt_iterator gsi;
- std::set<tree>::iterator it;
- int mask = region->gwv_this;
-
- /* Nothing to do if this isn't an acc worker or vector loop. */
- if (mask == 0)
- return entry_bb;
-
- use.empty ();
- gen.empty ();
- live_in.empty ();
-
- /* Currently, subroutines aren't supported. */
- gcc_assert (!lookup_attribute ("oacc function",
- DECL_ATTRIBUTES (current_function_decl)));
-
- /* Populate live_in. */
- oacc_populate_live_in (entry_bb, region);
-
- /* Populate the set of used decls. */
- oacc_broadcast_1 (entry_bb, exit_bb, true, mask);
-
- /* Filter out all of the GEN decls from the USE set. Also filter out
- any compiler temporaries that which are not present in LIVE_IN. */
- for (it = use.begin (); it != use.end (); it++)
- {
- std::set<tree>::iterator git, lit;
-
- git = gen.find (*it);
- lit = live_in.find (*it);
- if (git != gen.end () || lit == live_in.end ())
- use.erase (it);
- }
-
- if (mask == OACC_LOOP_MASK (OACC_vector))
- {
- /* Broadcast all decls in USE right before the last instruction in
- entry_bb. */
- gsi = gsi_last_bb (entry_bb);
-
- gimple_seq seq = NULL;
- gimple_stmt_iterator g2 = gsi_start (seq);
-
- for (it = use.begin (); it != use.end (); it++)
- generate_oacc_broadcast (region, *it, *it, g2, mask);
-
- gsi_insert_seq_before (&gsi, seq, GSI_CONTINUE_LINKING);
- }
- else if (mask & OACC_LOOP_MASK (OACC_worker))
- {
- if (use.empty ())
- return entry_bb;
-
- /* If this loop contains a worker, then each broadcast must be
- predicated. */
-
- for (it = use.begin (); it != use.end (); it++)
- {
- /* Worker broadcasting requires predication. To do that, there
- needs to be several new parent basic blocks before the omp
- for instruction. */
-
- gimple_seq seq = NULL;
- gimple_stmt_iterator g2 = gsi_start (seq);
- gimple splitpoint = generate_oacc_broadcast (region, *it, *it,
- g2, mask);
- gsi = gsi_last_bb (entry_bb);
- gsi_insert_seq_before (&gsi, seq, GSI_CONTINUE_LINKING);
- edge e = split_block (entry_bb, splitpoint);
- e->flags |= EDGE_ABNORMAL;
- basic_block dest_bb = e->dest;
- gsi_prev (&gsi);
- edge e2 = split_block (entry_bb, gsi_stmt (gsi));
- e2->flags |= EDGE_ABNORMAL;
- make_predication_test (e2, dest_bb, mask);
-
- /* Update entry_bb. */
- entry_bb = dest_bb;
- }
- }
-
- return entry_bb;
-}
-
/* Main entry point for expanding OMP-GIMPLE into runtime calls. */
static unsigned int
@@ -10772,8 +10113,6 @@ execute_expand_omp (void)
fprintf (dump_file, "\n");
}
- predicate_omp_regions (ENTRY_BLOCK_PTR_FOR_FN (cfun));
-
remove_exit_barriers (root_omp_region);
expand_omp (root_omp_region);
@@ -12342,10 +11681,7 @@ lower_omp_target (gimple_stmt_iterator *
orlist = NULL;
if (is_gimple_omp_oacc (stmt))
- {
- oacc_init_count_vars (ctx, clauses);
- oacc_alloc_broadcast_storage (ctx);
- }
+ oacc_init_count_vars (ctx, clauses);
if (has_reduction)
{
@@ -12631,7 +11967,6 @@ lower_omp_target (gimple_stmt_iterator *
gsi_insert_seq_before (gsi_p, sz_ilist, GSI_SAME_STMT);
gimple_omp_target_set_ganglocal_size (stmt, sz);
- gimple_omp_target_set_broadcast_array (stmt, ctx->worker_sync_elt);
pop_gimplify_context (NULL);
}
@@ -13348,16 +12683,7 @@ make_gimple_omp_edges (basic_block bb, s
((for_stmt = last_stmt (cur_region->entry))))
== GF_OMP_FOR_KIND_OACC_LOOP)
{
- /* Called before OMP expansion, so this information has not been
- recorded in cur_region->gwv_this yet. */
- int gwv_bits = find_omp_for_region_gwv (for_stmt);
- if (oacc_loop_needs_threadbarrier_p (gwv_bits))
- {
- make_edge (bb, bb->next_bb, EDGE_FALLTHRU | EDGE_ABNORMAL);
- fallthru = false;
- }
- else
- fallthru = true;
+ fallthru = true;
}
else
/* In the case of a GIMPLE_OMP_SECTION, the edge will go
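
(For illustration only, not part of the patch: for a loop whose
region->gwv_this is OACC_LOOP_MASK (OACC_worker) | OACC_LOOP_MASK
(OACC_vector), gen_oacc_fork and gen_oacc_join bracket the loop in
properly nested order, roughly:

  GOACC_FORK (1);    /* OACC_worker, outermost level first.  */
  GOACC_FORK (2);    /* OACC_vector.  */
  ... partitioned loop ...
  GOACC_JOIN (2);    /* Joins are emitted inner->outer.  */
  GOACC_JOIN (1);

The ascending loop in gen_oacc_fork and the descending loop in
gen_oacc_join guarantee the pairs nest.)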
Index: omp-low.h
===================================================================
--- omp-low.h (revision 225323)
+++ omp-low.h (working copy)
@@ -20,6 +20,8 @@ along with GCC; see the file COPYING3.
#ifndef GCC_OMP_LOW_H
#define GCC_OMP_LOW_H
+/* Levels of parallelism as defined by OpenACC. Increasing numbers
+ correspond to deeper loop nesting levels. */
enum oacc_loop_levels
{
OACC_gang,
@@ -27,6 +29,7 @@ enum oacc_loop_levels
OACC_vector,
OACC_HWM
};
+#define OACC_LOOP_MASK(X) (1 << (X))
struct omp_region;
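
(A quick illustration of the mask encoding, mine rather than the
patch's: each oacc_loop_levels value selects one bit, so a loop
partitioned over workers and vectors carries

  unsigned mask = OACC_LOOP_MASK (OACC_worker)    /* 1 << 1 */
		  | OACC_LOOP_MASK (OACC_vector); /* 1 << 2 */
  if (mask & OACC_LOOP_MASK (OACC_vector))
    /* Vector-partitioned: true here.  */;

which matches the mask & OACC_LOOP_MASK (level) tests used in
omp-low.c above.)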
Index: omp-builtins.def
===================================================================
--- omp-builtins.def (revision 225323)
+++ omp-builtins.def (working copy)
@@ -69,13 +69,6 @@ DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_GA
BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST)
DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DEVICEPTR, "GOACC_deviceptr",
BT_FN_PTR_PTR, ATTR_CONST_NOTHROW_LEAF_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_THREAD_BROADCAST, "GOACC_thread_broadcast",
- BT_FN_UINT_UINT, ATTR_NOTHROW_LEAF_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_THREAD_BROADCAST_LL, "GOACC_thread_broadcast_ll",
- BT_FN_ULONGLONG_ULONGLONG, ATTR_NOTHROW_LEAF_LIST)
-DEF_GOACC_BUILTIN (BUILT_IN_GOACC_THREADBARRIER, "GOACC_threadbarrier",
- BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
-
DEF_GOACC_BUILTIN_COMPILER (BUILT_IN_ACC_ON_DEVICE, "acc_on_device",
BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
Index: internal-fn.def
===================================================================
--- internal-fn.def (revision 225323)
+++ internal-fn.def (working copy)
@@ -64,3 +64,5 @@ DEF_INTERNAL_FN (MUL_OVERFLOW, ECF_CONST
DEF_INTERNAL_FN (TSAN_FUNC_EXIT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (VA_ARG, ECF_NOTHROW | ECF_LEAF, NULL)
DEF_INTERNAL_FN (GOACC_DATA_END_WITH_ARG, ECF_NOTHROW, ".r")
+DEF_INTERNAL_FN (GOACC_FORK, ECF_NOTHROW | ECF_LEAF, ".")
+DEF_INTERNAL_FN (GOACC_JOIN, ECF_NOTHROW | ECF_LEAF, ".")
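
(Each DEF_INTERNAL_FN entry is paired with an expander in internal-fn.c
named expand_<NAME>.  As a sketch of the shape only; the signature and
the empty body here are my assumptions, since the real expanders hand
the markers to the backend's oacc_fork/oacc_join patterns:

  /* Sketch: expander stub paired with DEF_INTERNAL_FN (GOACC_FORK, ...).
     A target without an oacc_fork pattern can treat the marker as a
     no-op.  */
  static void
  expand_GOACC_FORK (gcall *stmt)
  {
    (void) stmt;
  }

and likewise expand_GOACC_JOIN.)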