This patch contains the PTX backend changes for partitioned execution.
We implement some new expanders:
oacc_dim_size, oacc_dim_pos -- span and location within the compute cube
oacc_fork, oacc_join -- expanders for IFN_UNIQUE (OACC_FORK) & IFN_UNIQUE
(OACC_JOIN)
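For reference, oacc_dim_size and oacc_dim_pos simply read the PTX launch
geometry; the axis mapping (as in the asm templates of the .md changes
below) is:

  gang    size=%nctaid.x  pos=%ctaid.x
  worker  size=%ntid.y    pos=%tid.y
  vector  size=%ntid.x    pos=%tid.x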
The fork & join markers are non-copyable instructions and preserve the
nested loop structure of the CFG. In mach_dep_reorg we scan for these
instructions, reconstructing the partitioned loop structure and determining
which BBs reside in which loops. Once that is determined we can apply the
neutering algorithm, which in a single-partitioned region forces all but
engine-zero to skip to the end of the block. For blocks that end in a
branch, it skips to just before the branch. For blocks that end in a
conditional branch we insert code to propagate the branch condition from
engine-zero to the other engines, so they all go the same way at the
branch.
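Schematically, a neutered block that ends in a conditional branch comes
out like this (pseudo-PTX; the predicate and label names are invented
for illustration):

  @ %notzero bra skip;     // all but engine-zero skip the body
  ...block body, computing %cond...
  skip:
  <propagate %cond from engine-zero to the other engines>
  @ %cond bra.uni target;  // unified branch, all engines agree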
There are two axes of interest:
* vector, these can propagate via a machine 'shuffle' instruction
* worker, these can propagate via a location in local shared memory
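Schematically again (eliding the BImode<->SImode conversions), the
condition propagation comes out as:

  vector: shfl.idx.b32 %cond, %cond, 0, 31;       // read lane zero's copy
  worker: st.shared.u32 [__worker_bcast], %cond;  // engine-zero writes
          bar.sync 0;
          ld.shared.u32 %cond, [__worker_bcast];  // everyone reads
          bar.sync 1;  // engine-zero must not reuse the buffer too early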
At the beginning of a partitioned region, we have to propagate the live
register state and the stack frame from engine-zero to the other engines
(just as would happen on a regular 'fork' call). Again, how this is done
depends on the axis of propagation:
* vector, use shuffle instructions just after the fork
* worker, spill to buffer in shared memory just before the fork and then fill
from that buffer just after the fork.
For the worker axis, explicit sync instructions are needed before and after
accessing the shared memory state.
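Putting that together, entry to a worker-partitioned region ends up
looking schematically like (register and offset invented):

  // fork: only worker-zero executing
  st.shared.u32 [__worker_bcast+0], %r23;  // spill a live register
  // forked: all workers active from here
  bar.sync 0;                              // make the spill visible
  ld.shared.u32 %r23, [__worker_bcast+0];  // fill
  ...partitioned code...
  bar.sync 1;                              // before the joining marker

One such spill/fill pair is emitted per live register, plus a small loop
for the stack frame.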
Bernd, any comments?
nathan
2015-10-20 Nathan Sidwell <nat...@codesourcery.com>
* config/nvptx/nvptx.h (struct machine_function): Add
axis_predicate.
* config/nvptx/nvptx-protos.h (nvptx_expand_oacc_fork,
nvptx_expand_oacc_join): Declare.
* config/nvptx/nvptx.md (UNSPEC_NTID, UNSPEC_TID): Delete.
(UNSPEC_DIM_SIZE, UNSPEC_SHARED_DATA, UNSPEC_BIT_CONV,
UNSPEC_SHUFFLE, UNSPEC_BR_UNIFIED): New.
(UNSPECV_BARSYNC, UNSPECV_DIM_POS, UNSPECV_FORK, UNSPECV_FORKED,
UNSPECV_JOINING, UNSPECV_JOIN): New.
(BITS, BITD): New mode iterators.
(br_true_uni, br_false_uni): New.
(*oacc_ntid_insn, oacc_ntid, *oacc_tid_insn, oacc_tid): Delete.
(oacc_dim_size, oacc_dim_pos): New.
(nvptx_fork, nvptx_forked, nvptx_joining, nvptx_join): New.
(oacc_fork, oacc_join): New.
(nvptx_shuffle<mode>, unpack<mode>si2, packsi<mode>2): New.
(worker_load<mode>, worker_store<mode>): New.
(nvptx_barsync): New.
* config/nvptx/nvptx.c: Include gimple.h & dumpfile.h.
(SHUFFLE_UP, SHUFFLE_DOWN, SHUFFLE_BFLY, SHUFFLE_IDX): Define.
(worker_bcast_hwm, worker_bcast_align, worker_bcast_name,
worker_bcast_sym): New.
(nvptx_option_override): Initialize worker broadcast buffer.
(nvptx_emit_forking, nvptx_emit_joining): New.
(nvptx_init_axis_predicate): New.
(nvptx_declare_function_name): Init axis predicates.
(nvptx_expand_call): Add fork/join markers around routine call.
(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): New.
(nvptx_gen_unpack, nvptx_gen_pack, nvptx_gen_shuffle): New.
(nvptx_gen_vcast): New.
(struct wcast_data_t): New.
(enum propagate_mask): New.
(nvptx_gen_wcast): New.
(nvptx_print_operand): Add 'S' case.
(struct parallel): New.
(parallel::parallel, parallel::~parallel): New.
(bb_insn_map_t, insn_bb_t, insn_bb_vec_t): New typedefs.
(nvptx_split_blocks, nvptx_discover_pre, nvptx_dump_pars,
nvptx_find_par, nvptx_discover_pars): New.
(nvptx_propagate): New.
(vprop_gen, nvptx_vpropagate): New.
(wprop_gen, nvptx_wpropagate): New.
(nvptx_wsync): New.
(nvptx_single, nvptx_skip_par): New.
(nvptx_process_pars, nvptx_neuter_pars): New.
(nvptx_reorg): Split blocks, generate parallel structure, apply
neutering.
(nvptx_cannot_copy_insn_p): New.
(nvptx_file_end): Emit worker broadcast decl.
(TARGET_CANNOT_COPY_INSN_P): Override.
Index: gcc/config/nvptx/nvptx-protos.h
===================================================================
--- gcc/config/nvptx/nvptx-protos.h (revision 229096)
+++ gcc/config/nvptx/nvptx-protos.h (working copy)
@@ -32,6 +32,8 @@ extern void nvptx_register_pragmas (void
extern const char *nvptx_section_for_decl (const_tree);
#ifdef RTX_CODE
+extern void nvptx_expand_oacc_fork (unsigned);
+extern void nvptx_expand_oacc_join (unsigned);
extern void nvptx_expand_call (rtx, rtx);
extern rtx nvptx_expand_compare (rtx);
extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c (revision 229096)
+++ gcc/config/nvptx/nvptx.c (working copy)
@@ -51,14 +51,21 @@
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
+#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
+#include "dumpfile.h"
/* This file should be included last. */
#include "target-def.h"
+#define SHUFFLE_UP 0
+#define SHUFFLE_DOWN 1
+#define SHUFFLE_BFLY 2
+#define SHUFFLE_IDX 3
+
/* Record the function decls we've written, and the libfuncs and function
decls corresponding to them. */
static std::stringstream func_decls;
@@ -81,6 +88,16 @@ struct tree_hasher : ggc_cache_ptr_hash<
static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
+/* Size of buffer needed to broadcast across workers. This is used
+ for both worker-neutering and worker broadcasting. It is shared
+ by all functions emitted. The buffer is placed in shared memory.
+ It'd be nice if PTX supported common blocks, because then this
+ could be shared across TUs (taking the largest size). */
+static unsigned worker_bcast_hwm;
+static unsigned worker_bcast_align;
+#define worker_bcast_name "__worker_bcast"
+static GTY(()) rtx worker_bcast_sym;
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
@@ -108,6 +125,9 @@ nvptx_option_override (void)
needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
declared_libfuncs_htab
= hash_table<declared_libfunc_hasher>::create_ggc (17);
+
+ worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
+ worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}
/* Return the mode to be used when declaring a ptx object for OBJ.
@@ -194,6 +214,44 @@ nvptx_split_reg_p (machine_mode mode)
return false;
}
+/* Emit forking instructions for MASK. */
+
+static void
+nvptx_emit_forking (unsigned mask, bool is_call)
+{
+ mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+ | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+ if (mask)
+ {
+ rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
+
+ /* Emit fork at all levels; this helps form SESE regions. */
+ if (!is_call)
+ emit_insn (gen_nvptx_fork (op));
+ emit_insn (gen_nvptx_forked (op));
+ }
+}
+
+/* Emit joining instructions for MASK. */
+
+static void
+nvptx_emit_joining (unsigned mask, bool is_call)
+{
+ mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+ | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+ if (mask)
+ {
+ rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
+
+ /* Emit joining for all non-call pars to ensure there's a single
+ predecessor for the block the join insn ends up in. This is
+ needed for skipping entire loops. */
+ if (!is_call)
+ emit_insn (gen_nvptx_joining (op));
+ emit_insn (gen_nvptx_join (op));
+ }
+}
+
#define PASS_IN_REG_P(MODE, TYPE) \
((GET_MODE_CLASS (MODE) == MODE_INT \
|| GET_MODE_CLASS (MODE) == MODE_FLOAT \
@@ -500,6 +558,19 @@ nvptx_record_needed_fndecl (tree decl)
*slot = decl;
}
+/* Emit code to initialize the REGNO predicate register, which is
+ set when we are not lane zero on the NAME axis. */
+
+static void
+nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
+{
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
+ fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
+ fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
+ fprintf (file, "\t}\n");
+}
+
/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
function, including local var decls and copies from the arguments to
local regs. */
@@ -623,6 +694,14 @@ nvptx_declare_function_name (FILE *file,
if (stdarg_p (fntype))
fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
GET_MODE_BITSIZE (Pmode));
+
+ /* Emit axis predicates. */
+ if (cfun->machine->axis_predicate[0])
+ nvptx_init_axis_predicate (file,
+ REGNO (cfun->machine->axis_predicate[0]), "y");
+ if (cfun->machine->axis_predicate[1])
+ nvptx_init_axis_predicate (file,
+ REGNO (cfun->machine->axis_predicate[1]), "x");
}
/* Output a return instruction. Also copy the return value to its outgoing
@@ -779,6 +858,7 @@ nvptx_expand_call (rtx retval, rtx addre
bool external_decl = false;
rtx varargs = NULL_RTX;
tree decl_type = NULL_TREE;
+ unsigned parallel = 0;
for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
nargs++;
@@ -799,6 +879,22 @@ nvptx_expand_call (rtx retval, rtx addre
cfun->machine->has_call_with_sc = true;
if (DECL_EXTERNAL (decl))
external_decl = true;
+ tree attr = get_oacc_fn_attrib (decl);
+ if (attr)
+ {
+ tree dims = TREE_VALUE (attr);
+
+ parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
+ for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
+ {
+ if (TREE_PURPOSE (dims)
+ && !integer_zerop (TREE_PURPOSE (dims)))
+ break;
+ /* Not on this axis. */
+ parallel ^= GOMP_DIM_MASK (ix);
+ dims = TREE_CHAIN (dims);
+ }
+ }
}
}
@@ -860,7 +956,11 @@ nvptx_expand_call (rtx retval, rtx addre
write_func_decl_from_insn (func_decls, retval, pat, callee);
}
}
+
+ nvptx_emit_forking (parallel, true);
emit_call_insn (pat);
+ nvptx_emit_joining (parallel, true);
+
if (tmp_retval != retval)
emit_move_insn (retval, tmp_retval);
}
@@ -1069,6 +1169,214 @@ nvptx_expand_compare (rtx compare)
return gen_rtx_NE (BImode, pred, const0_rtx);
}
+/* Expand the oacc fork & join primitive into ptx-required unspecs. */
+
+void
+nvptx_expand_oacc_fork (unsigned mode)
+{
+ nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
+}
+
+void
+nvptx_expand_oacc_join (unsigned mode)
+{
+ nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
+}
+
+/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
+ objects. */
+
+static rtx
+nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
+{
+ rtx res;
+
+ switch (GET_MODE (src))
+ {
+ case DImode:
+ res = gen_unpackdisi2 (dst0, dst1, src);
+ break;
+ case DFmode:
+ res = gen_unpackdfsi2 (dst0, dst1, src);
+ break;
+ default: gcc_unreachable ();
+ }
+ return res;
+}
+
+/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
+ object. */
+
+static rtx
+nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
+{
+ rtx res;
+
+ switch (GET_MODE (dst))
+ {
+ case DImode:
+ res = gen_packsidi2 (dst, src0, src1);
+ break;
+ case DFmode:
+ res = gen_packsidf2 (dst, src0, src1);
+ break;
+ default: gcc_unreachable ();
+ }
+ return res;
+}
+
+/* Generate an instruction or sequence to shuffle register SRC into
+ DST across the lanes of a single warp, according to IDX and KIND. */
+
+static rtx
+nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
+{
+ rtx res;
+
+ switch (GET_MODE (dst))
+ {
+ case SImode:
+ res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
+ break;
+ case SFmode:
+ res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
+ break;
+ case DImode:
+ case DFmode:
+ {
+ rtx tmp0 = gen_reg_rtx (SImode);
+ rtx tmp1 = gen_reg_rtx (SImode);
+
+ start_sequence ();
+ emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
+ emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
+ emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
+ emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
+ res = get_insns ();
+ end_sequence ();
+ }
+ break;
+ case BImode:
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+
+ start_sequence ();
+ emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
+ emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
+ emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
+ res = get_insns ();
+ end_sequence ();
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ return res;
+}
+
+/* Generate an instruction or sequence to broadcast register REG
+ across the vectors of a single warp. */
+
+static rtx
+nvptx_gen_vcast (rtx reg)
+{
+ return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
+}
+
+/* Structure used when generating a worker-level spill or fill. */
+
+struct wcast_data_t
+{
+ rtx base;
+ rtx ptr;
+ unsigned offset;
+};
+
+/* Direction of the spill/fill and looping setup/teardown indicator. */
+
+enum propagate_mask
+ {
+ PM_read = 1 << 0,
+ PM_write = 1 << 1,
+ PM_loop_begin = 1 << 2,
+ PM_loop_end = 1 << 3,
+
+ PM_read_write = PM_read | PM_write
+ };
+
+/* Generate instruction(s) to spill or fill register REG to/from the
+ worker broadcast array. PM indicates what is to be done, REP
+ how many loop iterations will be executed (0 for not a loop). */
+
+static rtx
+nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
+{
+ rtx res;
+ machine_mode mode = GET_MODE (reg);
+
+ switch (mode)
+ {
+ case BImode:
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+
+ start_sequence ();
+ if (pm & PM_read)
+ emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
+ emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+ if (pm & PM_write)
+ emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
+ res = get_insns ();
+ end_sequence ();
+ }
+ break;
+
+ default:
+ {
+ rtx addr = data->ptr;
+
+ if (!addr)
+ {
+ unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
+
+ if (align > worker_bcast_align)
+ worker_bcast_align = align;
+ data->offset = (data->offset + align - 1) & ~(align - 1);
+ addr = data->base;
+ if (data->offset)
+ addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
+ }
+
+ addr = gen_rtx_MEM (mode, addr);
+ addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
+ if (pm == PM_read)
+ res = gen_rtx_SET (addr, reg);
+ else if (pm == PM_write)
+ res = gen_rtx_SET (reg, addr);
+ else
+ gcc_unreachable ();
+
+ if (data->ptr)
+ {
+ /* We're using a ptr, increment it. */
+ start_sequence ();
+
+ emit_insn (res);
+ emit_insn (gen_adddi3 (data->ptr, data->ptr,
+ GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
+ res = get_insns ();
+ end_sequence ();
+ }
+ else
+ rep = 1;
+ data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
+ }
+ break;
+ }
+ return res;
+}
+
/* When loading an operand ORIG_OP, verify whether an address space
conversion to generic is required, and if so, perform it. Also
check for SYMBOL_REFs for function decls and call
@@ -1660,6 +1968,7 @@ nvptx_print_operand_address (FILE *file,
c -- print an opcode suffix for a comparison operator, including a type code
d -- print a CONST_INT as a vector dimension (x, y, or z)
f -- print a full reg even for something that must always be split
+ S -- print a shuffle kind specified by CONST_INT
t -- print a type opcode suffix, promoting QImode to 32 bits
T -- print a type size in bits
u -- print a type opcode suffix without promotions. */
@@ -1723,6 +2032,15 @@ nvptx_print_operand (FILE *file, rtx x,
fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
break;
+ case 'S':
+ {
+ unsigned kind = UINTVAL (x);
+ static const char *const kinds[] =
+ {"up", "down", "bfly", "idx"};
+ fprintf (file, ".%s", kinds[kind]);
+ }
+ break;
+
case 'T':
fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
break;
@@ -1973,10 +2291,744 @@ nvptx_reorg_subreg (void)
}
}
+/* Loop structure of the function. The entire function is described as
+ a NULL loop. We should be able to extend this to represent
+ superblocks. */
+
+struct parallel
+{
+ /* Parent parallel. */
+ parallel *parent;
+
+ /* Next sibling parallel. */
+ parallel *next;
+
+ /* First child parallel. */
+ parallel *inner;
+
+ /* Partitioning mask of the parallel. */
+ unsigned mask;
+
+ /* Partitioning used within inner parallels. */
+ unsigned inner_mask;
+
+ /* Location of parallel forked and join. The forked is the first
+ block in the parallel and the join is the first block after
+ the partition. */
+ basic_block forked_block;
+ basic_block join_block;
+
+ rtx_insn *forked_insn;
+ rtx_insn *join_insn;
+
+ rtx_insn *fork_insn;
+ rtx_insn *joining_insn;
+
+ /* Basic blocks in this parallel, but not in child parallels. The
+ FORKED and JOINING blocks are in the partition. The FORK and JOIN
+ blocks are not. */
+ auto_vec<basic_block> blocks;
+
+public:
+ parallel (parallel *parent, unsigned mode);
+ ~parallel ();
+};
+
+/* Constructor links the new parallel into its parent's chain of
+ children. */
+
+parallel::parallel (parallel *parent_, unsigned mask_)
+ :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
+{
+ forked_block = join_block = 0;
+ forked_insn = join_insn = 0;
+ fork_insn = joining_insn = 0;
+
+ if (parent)
+ {
+ next = parent->inner;
+ parent->inner = this;
+ }
+}
+
+parallel::~parallel ()
+{
+ delete inner;
+ delete next;
+}
+
+/* Map of basic blocks to insns. */
+typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
+
+/* A tuple of an insn of interest and the BB in which it resides. */
+typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
+typedef auto_vec<insn_bb_t> insn_bb_vec_t;
+
+/* Split basic blocks so that each forked and join unspec is at the
+ start of its basic block. Thus afterwards each block will have a
+ single partitioning mode. We also do the same for return insns, as
+ they are executed by every thread. Populate MAP with head and tail
+ blocks. We also clear the BB visited flag, which is used when
+ finding partitions. */
+
+static void
+nvptx_split_blocks (bb_insn_map_t *map)
+{
+ insn_bb_vec_t worklist;
+ basic_block block;
+ rtx_insn *insn;
+
+ /* Locate all the reorg instructions of interest. */
+ FOR_ALL_BB_FN (block, cfun)
+ {
+ bool seen_insn = false;
+
+ /* Clear visited flag, for use by parallel locator. */
+ block->flags &= ~BB_VISITED;
+
+ FOR_BB_INSNS (block, insn)
+ {
+ if (!INSN_P (insn))
+ continue;
+ switch (recog_memoized (insn))
+ {
+ default:
+ seen_insn = true;
+ continue;
+ case CODE_FOR_nvptx_forked:
+ case CODE_FOR_nvptx_join:
+ break;
+
+ case CODE_FOR_return:
+ /* We also need to split just before return insns, as
+ that insn needs executing by all threads, but the
+ block it is in probably does not. */
+ break;
+ }
+
+ if (seen_insn)
+ /* We've found an instruction that must be at the start of
+ a block, but isn't. Add it to the worklist. */
+ worklist.safe_push (insn_bb_t (insn, block));
+ else
+ /* It was already the first instruction. Just add it to
+ the map. */
+ map->get_or_insert (block) = insn;
+ seen_insn = true;
+ }
+ }
+
+ /* Split blocks on the worklist. */
+ unsigned ix;
+ insn_bb_t *elt;
+ basic_block remap = 0;
+ for (ix = 0; worklist.iterate (ix, &elt); ix++)
+ {
+ if (remap != elt->second)
+ {
+ block = elt->second;
+ remap = block;
+ }
+
+ /* Split block before insn. The insn is in the new block. */
+ edge e = split_block (block, PREV_INSN (elt->first));
+
+ block = e->dest;
+ map->get_or_insert (block) = elt->first;
+ }
+}
+
+/* BLOCK is a basic block containing a head or tail instruction.
+ Locate the associated prehead or pretail instruction, which must be
+ in the single predecessor block. */
+
+static rtx_insn *
+nvptx_discover_pre (basic_block block, int expected)
+{
+ gcc_assert (block->preds->length () == 1);
+ basic_block pre_block = (*block->preds)[0]->src;
+ rtx_insn *pre_insn;
+
+ for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
+ pre_insn = PREV_INSN (pre_insn))
+ gcc_assert (pre_insn != BB_HEAD (pre_block));
+
+ gcc_assert (recog_memoized (pre_insn) == expected);
+ return pre_insn;
+}
+
+/* Dump this parallel and all its inner parallels. */
+
+static void
+nvptx_dump_pars (parallel *par, unsigned depth)
+{
+ fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
+ depth, par->mask,
+ par->forked_block ? par->forked_block->index : -1,
+ par->join_block ? par->join_block->index : -1);
+
+ fprintf (dump_file, " blocks:");
+
+ basic_block block;
+ for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
+ fprintf (dump_file, " %d", block->index);
+ fprintf (dump_file, "\n");
+ if (par->inner)
+ nvptx_dump_pars (par->inner, depth + 1);
+
+ if (par->next)
+ nvptx_dump_pars (par->next, depth);
+}
+
+/* If BLOCK contains a fork/join marker, process it to create or
+ terminate a loop structure. Add this block to the current loop,
+ and then walk successor blocks. */
+
+static parallel *
+nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
+{
+ if (block->flags & BB_VISITED)
+ return par;
+ block->flags |= BB_VISITED;
+
+ if (rtx_insn **endp = map->get (block))
+ {
+ rtx_insn *end = *endp;
+
+ /* This is a block head or tail, or return instruction. */
+ switch (recog_memoized (end))
+ {
+ case CODE_FOR_return:
+ /* Return instructions are in their own block, and we
+ don't need to do anything more. */
+ return par;
+
+ case CODE_FOR_nvptx_forked:
+ /* Loop head, create a new inner loop and add it into
+ our parent's child list. */
+ {
+ unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+ gcc_assert (mask);
+ par = new parallel (par, mask);
+ par->forked_block = block;
+ par->forked_insn = end;
+ if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+ && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+ par->fork_insn
+ = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
+ }
+ break;
+
+ case CODE_FOR_nvptx_join:
+ /* A loop tail. Finish the current loop and return to
+ parent. */
+ {
+ unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+ gcc_assert (par->mask == mask);
+ par->join_block = block;
+ par->join_insn = end;
+ if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+ && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+ par->joining_insn
+ = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
+ par = par->parent;
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ if (par)
+ /* Add this block onto the current loop's list of blocks. */
+ par->blocks.safe_push (block);
+ else
+ /* This must be the entry block. Create a NULL parallel. */
+ par = new parallel (0, 0);
+
+ /* Walk successor blocks. */
+ edge e;
+ edge_iterator ei;
+
+ FOR_EACH_EDGE (e, ei, block->succs)
+ nvptx_find_par (map, par, e->dest);
+
+ return par;
+}
+
+/* DFS walk the CFG looking for fork & join markers. Construct
+ loop structures as we go. MAP is a mapping of basic blocks
+ to head & tail markers, discovered when splitting blocks. This
+ speeds up the discovery. We rely on the BB visited flag having
+ been cleared when splitting blocks. */
+
+static parallel *
+nvptx_discover_pars (bb_insn_map_t *map)
+{
+ basic_block block;
+
+ /* Mark exit blocks as visited. */
+ block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+ block->flags |= BB_VISITED;
+
+ /* And entry block as not. */
+ block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+ block->flags &= ~BB_VISITED;
+
+ parallel *par = nvptx_find_par (map, 0, block);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nLoops\n");
+ nvptx_dump_pars (par, 0);
+ fprintf (dump_file, "\n");
+ }
+
+ return par;
+}
+
+/* Propagate live state at the start of a partitioned region. BLOCK
+ provides the live register information, and might not contain
+ INSN. Propagation is inserted just after INSN. RW indicates whether
+ we are reading and/or writing state. This separation is needed for
+ worker-level propagation where we essentially do a spill & fill. FN
+ is the underlying worker function to generate the propagation
+ instructions for a single register. DATA is user data.
+
+ We propagate the live register set and the entire frame. We could
+ do better by (a) propagating just the live set that is used within
+ the partitioned regions and (b) only propagating stack entries that
+ are used. The latter might be quite hard to determine. */
+
+static void
+nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
+ rtx (*fn) (rtx, propagate_mask,
+ unsigned, void *), void *data)
+{
+ bitmap live = DF_LIVE_IN (block);
+ bitmap_iterator iterator;
+ unsigned ix;
+
+ /* Copy the frame array. */
+ HOST_WIDE_INT fs = get_frame_size ();
+ if (fs)
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+ rtx idx = NULL_RTX;
+ rtx ptr = gen_reg_rtx (Pmode);
+ rtx pred = NULL_RTX;
+ rtx_code_label *label = NULL;
+
+ gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
+ fs /= GET_MODE_SIZE (DImode);
+ /* Detect single iteration loop. */
+ if (fs == 1)
+ fs = 0;
+
+ start_sequence ();
+ emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
+ if (fs)
+ {
+ idx = gen_reg_rtx (SImode);
+ pred = gen_reg_rtx (BImode);
+ label = gen_label_rtx ();
+
+ emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
+ /* Allow worker function to initialize anything needed. */
+ rtx init = fn (tmp, PM_loop_begin, fs, data);
+ if (init)
+ emit_insn (init);
+ emit_label (label);
+ LABEL_NUSES (label)++;
+ emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
+ }
+ if (rw & PM_read)
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
+ emit_insn (fn (tmp, rw, fs, data));
+ if (rw & PM_write)
+ emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
+ if (fs)
+ {
+ emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
+ emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
+ emit_insn (gen_br_true_uni (pred, label));
+ rtx fini = fn (tmp, PM_loop_end, fs, data);
+ if (fini)
+ emit_insn (fini);
+ emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
+ }
+ emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
+ emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
+ rtx cpy = get_insns ();
+ end_sequence ();
+ insn = emit_insn_after (cpy, insn);
+ }
+
+ /* Copy live registers. */
+ EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+ {
+ rtx reg = regno_reg_rtx[ix];
+
+ if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+ {
+ rtx bcast = fn (reg, rw, 0, data);
+
+ insn = emit_insn_after (bcast, insn);
+ }
+ }
+}
+
+/* Worker for nvptx_vpropagate. */
+
+static rtx
+vprop_gen (rtx reg, propagate_mask pm,
+ unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+{
+ if (!(pm & PM_read_write))
+ return 0;
+
+ return nvptx_gen_vcast (reg);
+}
+
+/* Propagate state that is live at start of BLOCK across the vectors
+ of a single warp. Propagation is inserted just after INSN. */
+
+static void
+nvptx_vpropagate (basic_block block, rtx_insn *insn)
+{
+ nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+}
+
+/* Worker for nvptx_wpropagate. */
+
+static rtx
+wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+{
+ wcast_data_t *data = (wcast_data_t *)data_;
+
+ if (pm & PM_loop_begin)
+ {
+ /* Starting a loop, initialize pointer. */
+ unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
+
+ if (align > worker_bcast_align)
+ worker_bcast_align = align;
+ data->offset = (data->offset + align - 1) & ~(align - 1);
+
+ data->ptr = gen_reg_rtx (Pmode);
+
+ return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
+ }
+ else if (pm & PM_loop_end)
+ {
+ rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
+ data->ptr = NULL_RTX;
+ return clobber;
+ }
+ else
+ return nvptx_gen_wcast (reg, pm, rep, data);
+}
+
+/* Spill or fill the live state at the start of BLOCK. PRE_P
+ indicates if this is just before partitioned mode (do spill), or
+ just after it starts (do fill). Sequence is inserted just after
+ INSN. */
+
+static void
+nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+{
+ wcast_data_t data;
+
+ data.base = gen_reg_rtx (Pmode);
+ data.offset = 0;
+ data.ptr = NULL_RTX;
+
+ nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+ if (data.offset)
+ {
+ /* Stuff was emitted, initialize the base pointer now. */
+ rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
+ emit_insn_after (init, insn);
+
+ if (worker_bcast_hwm < data.offset)
+ worker_bcast_hwm = data.offset;
+ }
+}
+
+/* Emit a worker-level synchronization barrier. We use different
+ markers for before and after synchronizations. */
+
+static rtx
+nvptx_wsync (bool after)
+{
+ return gen_nvptx_barsync (GEN_INT (after));
+}
+
+/* Single neutering according to MASK. FROM is the incoming block and
+ TO is the outgoing block. These may be the same block. Insert at
+ start of FROM:
+
+ if (tid.<axis>) goto end.
+
+ and insert before ending branch of TO (if there is such an insn):
+
+ end:
+ <possibly-broadcast-cond>
+ <branch>
+
+ We currently only use different FROM and TO when skipping an entire
+ loop. We could do more if we detected superblocks. */
+
+static void
+nvptx_single (unsigned mask, basic_block from, basic_block to)
+{
+ rtx_insn *head = BB_HEAD (from);
+ rtx_insn *tail = BB_END (to);
+ unsigned skip_mask = mask;
+
+ /* Find first insn of FROM block. */
+ while (head != BB_END (from) && !INSN_P (head))
+ head = NEXT_INSN (head);
+
+ /* Find last insn of TO block. */
+ rtx_insn *limit = from == to ? head : BB_HEAD (to);
+ while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
+ tail = PREV_INSN (tail);
+
+ /* Detect if tail is a branch. */
+ rtx tail_branch = NULL_RTX;
+ rtx cond_branch = NULL_RTX;
+ if (tail && INSN_P (tail))
+ {
+ tail_branch = PATTERN (tail);
+ if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
+ tail_branch = NULL_RTX;
+ else
+ {
+ cond_branch = SET_SRC (tail_branch);
+ if (GET_CODE (cond_branch) != IF_THEN_ELSE)
+ cond_branch = NULL_RTX;
+ }
+ }
+
+ if (tail == head)
+ {
+ /* If this is empty, do nothing. */
+ if (!head || !INSN_P (head))
+ return;
+
+ /* If this is a dummy insn, do nothing. */
+ switch (recog_memoized (head))
+ {
+ default: break;
+ case CODE_FOR_nvptx_fork:
+ case CODE_FOR_nvptx_forked:
+ case CODE_FOR_nvptx_joining:
+ case CODE_FOR_nvptx_join:
+ return;
+ }
+
+ if (cond_branch)
+ {
+ /* If we're only doing vector single, there's no need to
+ emit skip code because we'll not insert anything. */
+ if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
+ skip_mask = 0;
+ }
+ else if (tail_branch)
+ /* Block with only unconditional branch. Nothing to do. */
+ return;
+ }
+
+ /* Insert the vector test inside the worker test. */
+ unsigned mode;
+ rtx_insn *before = tail;
+ for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
+ if (GOMP_DIM_MASK (mode) & skip_mask)
+ {
+ rtx_code_label *label = gen_label_rtx ();
+ rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
+
+ if (!pred)
+ {
+ pred = gen_reg_rtx (BImode);
+ cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
+ }
+
+ rtx br;
+ if (mode == GOMP_DIM_VECTOR)
+ br = gen_br_true (pred, label);
+ else
+ br = gen_br_true_uni (pred, label);
+ emit_insn_before (br, head);
+
+ LABEL_NUSES (label)++;
+ if (tail_branch)
+ before = emit_label_before (label, before);
+ else
+ emit_label_after (label, tail);
+ }
+
+ /* Now deal with propagating the branch condition. */
+ if (cond_branch)
+ {
+ rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
+
+ if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
+ {
+ /* Vector mode only, do a shuffle. */
+ emit_insn_before (nvptx_gen_vcast (pvar), tail);
+ }
+ else
+ {
+ /* Includes worker mode, do spill & fill. By construction
+ we should never have worker mode only. */
+ wcast_data_t data;
+
+ data.base = worker_bcast_sym;
+ data.ptr = 0;
+
+ if (worker_bcast_hwm < GET_MODE_SIZE (SImode))
+ worker_bcast_hwm = GET_MODE_SIZE (SImode);
+
+ data.offset = 0;
+ emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+ before);
+ /* Barrier so other workers can see the write. */
+ emit_insn_before (nvptx_wsync (false), tail);
+ data.offset = 0;
+ emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
+ /* This barrier is needed to avoid worker zero clobbering
+ the broadcast buffer before all the other workers have
+ had a chance to read this instance of it. */
+ emit_insn_before (nvptx_wsync (true), tail);
+ }
+
+ extract_insn (tail);
+ rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
+ UNSPEC_BR_UNIFIED);
+ validate_change (tail, recog_data.operand_loc[0], unsp, false);
+ }
+}
+
+/* PAR is a parallel that is being skipped in its entirety according to
+ MASK. Treat this as skipping a superblock starting at forked
+ and ending at joining. */
+
+static void
+nvptx_skip_par (unsigned mask, parallel *par)
+{
+ basic_block tail = par->join_block;
+ gcc_assert (tail->preds->length () == 1);
+
+ basic_block pre_tail = (*tail->preds)[0]->src;
+ gcc_assert (pre_tail->succs->length () == 1);
+
+ nvptx_single (mask, par->forked_block, pre_tail);
+}
+
+/* Process the parallel PAR and all its contained
+ parallels. We do everything but the neutering. Return mask of
+ partitioned modes used within this parallel. */
+
+static unsigned
+nvptx_process_pars (parallel *par)
+{
+ unsigned inner_mask = par->mask;
+
+ /* Do the inner parallels first. */
+ if (par->inner)
+ {
+ par->inner_mask = nvptx_process_pars (par->inner);
+ inner_mask |= par->inner_mask;
+ }
+
+ if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
+ { /* No propagation needed for a call. */ }
+ else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ {
+ nvptx_wpropagate (false, par->forked_block, par->forked_insn);
+ nvptx_wpropagate (true, par->forked_block, par->fork_insn);
+ /* Insert begin and end synchronizations. */
+ emit_insn_after (nvptx_wsync (false), par->forked_insn);
+ emit_insn_before (nvptx_wsync (true), par->joining_insn);
+ }
+ else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ nvptx_vpropagate (par->forked_block, par->forked_insn);
+
+ /* Now do siblings. */
+ if (par->next)
+ inner_mask |= nvptx_process_pars (par->next);
+ return inner_mask;
+}
+
+/* Neuter the parallel described by PAR. We recurse in depth-first
+ order. MODES is the partitioning of the execution and OUTER is
+ the partitioning of the parallels we are contained in. */
+
+static void
+nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
+{
+ unsigned me = par->mask
+ & (GOMP_DIM_MASK (GOMP_DIM_WORKER) | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+ unsigned skip_mask = 0, neuter_mask = 0;
+
+ if (par->inner)
+ nvptx_neuter_pars (par->inner, modes, outer | me);
+
+ for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
+ {
+ if ((outer | me) & GOMP_DIM_MASK (mode))
+ { /* Mode is partitioned: no neutering. */ }
+ else if (!(modes & GOMP_DIM_MASK (mode)))
+ { /* Mode is not used: nothing to do. */ }
+ else if (par->inner_mask & GOMP_DIM_MASK (mode)
+ || !par->forked_insn)
+ /* Partitioned in inner parallels, or we're not partitioned
+ at all: neuter individual blocks. */
+ neuter_mask |= GOMP_DIM_MASK (mode);
+ else if (!par->parent || !par->parent->forked_insn
+ || par->parent->inner_mask & GOMP_DIM_MASK (mode))
+ /* Parent isn't a parallel, or already contains this
+ partitioning: skip the parallel at this level. */
+ skip_mask |= GOMP_DIM_MASK (mode);
+ else
+ { /* Parent will skip this parallel itself. */ }
+ }
+
+ if (neuter_mask)
+ {
+ int ix;
+ int len = par->blocks.length ();
+
+ for (ix = 0; ix != len; ix++)
+ {
+ basic_block block = par->blocks[ix];
+
+ nvptx_single (neuter_mask, block, block);
+ }
+ }
+
+ if (skip_mask)
+ nvptx_skip_par (skip_mask, par);
+
+ if (par->next)
+ nvptx_neuter_pars (par->next, modes, outer);
+}
+
/* PTX-specific reorganization
+ - Scan and release reduction buffers
+ - Split blocks at fork and join instructions
- Compute live registers
- Mark now-unused registers, so function begin doesn't declare
unused registers.
+ - Insert state propagation when entering partitioned mode
+ - Insert neutering instructions when in single mode
- Replace subregs with suitable sequences.
*/
@@ -1989,19 +3041,60 @@ nvptx_reorg (void)
thread_prologue_and_epilogue_insns ();
+ /* Split blocks and record interesting unspecs. */
+ bb_insn_map_t bb_insn_map;
+
+ nvptx_split_blocks (&bb_insn_map);
+
/* Compute live regs */
df_clear_flags (DF_LR_RUN_DCE);
df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
+ df_live_add_problem ();
+ df_live_set_all_dirty ();
df_analyze ();
regstat_init_n_sets_and_refs ();
- int max_regs = max_reg_num ();
-
+ if (dump_file)
+ df_dump (dump_file);
+
/* Mark unused regs as unused. */
+ int max_regs = max_reg_num ();
for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
regno_reg_rtx[i] = const0_rtx;
+ /* Determine launch dimensions of the function. If it is not an
+ offloaded function (i.e. this is a regular compilation), the
+ function has no neutering. */
+ tree attr = get_oacc_fn_attrib (current_function_decl);
+ if (attr)
+ {
+ /* If we determined this mask before RTL expansion, we could
+ elide emission of some levels of forks and joins. */
+ unsigned mask = 0;
+ tree dims = TREE_VALUE (attr);
+ unsigned ix;
+
+ for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+ {
+ int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
+ tree allowed = TREE_PURPOSE (dims);
+
+ if (size != 1 && !(allowed && integer_zerop (allowed)))
+ mask |= GOMP_DIM_MASK (ix);
+ }
+ /* If there is worker neutering, there must be vector
+ neutering. Otherwise the hardware will fail. */
+ gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+
+ /* Discover & process partitioned regions. */
+ parallel *pars = nvptx_discover_pars (&bb_insn_map);
+ nvptx_process_pars (pars);
+ nvptx_neuter_pars (pars, mask, 0);
+ delete pars;
+ }
+
/* Replace subregs. */
nvptx_reorg_subreg ();
@@ -2052,6 +3145,26 @@ nvptx_vector_alignment (const_tree type)
return MIN (align, BIGGEST_ALIGNMENT);
}
+
+/* Indicate that INSN cannot be duplicated. */
+
+static bool
+nvptx_cannot_copy_insn_p (rtx_insn *insn)
+{
+ switch (recog_memoized (insn))
+ {
+ case CODE_FOR_nvptx_shufflesi:
+ case CODE_FOR_nvptx_shufflesf:
+ case CODE_FOR_nvptx_barsync:
+ case CODE_FOR_nvptx_fork:
+ case CODE_FOR_nvptx_forked:
+ case CODE_FOR_nvptx_joining:
+ case CODE_FOR_nvptx_join:
+ return true;
+ default:
+ return false;
+ }
+}
/* Record a symbol for mkoffload to enter into the mapping table. */
@@ -2129,6 +3242,19 @@ nvptx_file_end (void)
FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
nvptx_record_fndecl (decl, true);
fputs (func_decls.str().c_str(), asm_out_file);
+
+ if (worker_bcast_hwm)
+ {
+ /* Define the broadcast buffer. */
+
+ worker_bcast_hwm = (worker_bcast_hwm + worker_bcast_align - 1)
+ & ~(worker_bcast_align - 1);
+
+ fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_bcast_name);
+ fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
+ worker_bcast_align,
+ worker_bcast_name, worker_bcast_hwm);
+ }
}
/* Validate compute dimensions of an OpenACC offload or routine, fill
@@ -2233,6 +3359,9 @@ nvptx_goacc_validate_dims (tree ARG_UNUS
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
+#undef TARGET_CANNOT_COPY_INSN_P
+#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
+
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
Index: gcc/config/nvptx/nvptx.h
===================================================================
--- gcc/config/nvptx/nvptx.h (revision 229096)
+++ gcc/config/nvptx/nvptx.h (working copy)
@@ -230,6 +230,7 @@ struct GTY(()) machine_function
HOST_WIDE_INT outgoing_stdarg_size;
int ret_reg_mode; /* machine_mode not defined yet. */
int punning_buffer_size;
+ rtx axis_predicate[2];
};
#endif
Index: gcc/config/nvptx/nvptx.md
===================================================================
--- gcc/config/nvptx/nvptx.md (revision 229096)
+++ gcc/config/nvptx/nvptx.md (working copy)
@@ -49,14 +49,27 @@
UNSPEC_ALLOCA
- UNSPEC_NTID
- UNSPEC_TID
+ UNSPEC_DIM_SIZE
+
+ UNSPEC_SHARED_DATA
+
+ UNSPEC_BIT_CONV
+
+ UNSPEC_SHUFFLE
+ UNSPEC_BR_UNIFIED
])
(define_c_enum "unspecv" [
UNSPECV_LOCK
UNSPECV_CAS
UNSPECV_XCHG
+ UNSPECV_BARSYNC
+ UNSPECV_DIM_POS
+
+ UNSPECV_FORK
+ UNSPECV_FORKED
+ UNSPECV_JOINING
+ UNSPECV_JOIN
])
(define_attr "subregs_ok" "false,true"
@@ -246,6 +259,8 @@
(define_mode_iterator QHSIM [QI HI SI])
(define_mode_iterator SDFM [SF DF])
(define_mode_iterator SDCM [SC DC])
+(define_mode_iterator BITS [SI SF])
+(define_mode_iterator BITD [DI DF])
;; This mode iterator allows :P to be used for patterns that operate on
;; pointer-sized quantities. Exactly one of the two alternatives will match.
@@ -817,6 +832,23 @@
""
"%J0\\tbra\\t%l1;")
+;; unified conditional branch
+(define_insn "br_true_uni"
+ [(set (pc) (if_then_else
+ (ne (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
+ UNSPEC_BR_UNIFIED) (const_int 0))
+ (label_ref (match_operand 1 "" "")) (pc)))]
+ ""
+ "%j0\\tbra.uni\\t%l1;")
+
+(define_insn "br_false_uni"
+ [(set (pc) (if_then_else
+ (eq (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
+ UNSPEC_BR_UNIFIED) (const_int 0))
+ (label_ref (match_operand 1 "" "")) (pc)))]
+ ""
+ "%J0\\tbra.uni\\t%l1;")
+
(define_expand "cbranch<mode>4"
[(set (pc)
(if_then_else (match_operator 0 "nvptx_comparison_operator"
@@ -1308,36 +1340,126 @@
DONE;
})
-(define_insn "*oacc_ntid_insn"
- [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
- (unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_NTID))]
+(define_insn "oacc_dim_size"
+ [(set (match_operand:SI 0 "nvptx_register_operand" "")
+ (unspec:SI [(match_operand:SI 1 "const_int_operand" "")]
+ UNSPEC_DIM_SIZE))]
""
- "%.\\tmov.u32 %0, %%ntid%d1;")
+{
+ static const char *const asms[] =
+{ /* Must match oacc_loop_levels ordering. */
+ "%.\\tmov.u32\\t%0, %%nctaid.x;", /* gang */
+ "%.\\tmov.u32\\t%0, %%ntid.y;", /* worker */
+ "%.\\tmov.u32\\t%0, %%ntid.x;", /* vector */
+};
+ return asms[INTVAL (operands[1])];
+})
-(define_expand "oacc_ntid"
+(define_insn "oacc_dim_pos"
[(set (match_operand:SI 0 "nvptx_register_operand" "")
- (unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_NTID))]
+ (unspec_volatile:SI [(match_operand:SI 1 "const_int_operand" "")]
+ UNSPECV_DIM_POS))]
""
{
- if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
- FAIL;
+ static const char *const asms[] =
+{ /* Must match oacc_loop_levels ordering. */
+ "%.\\tmov.u32\\t%0, %%ctaid.x;", /* gang */
+ "%.\\tmov.u32\\t%0, %%tid.y;", /* worker */
+ "%.\\tmov.u32\\t%0, %%tid.x;", /* vector */
+};
+ return asms[INTVAL (operands[1])];
})
-(define_insn "*oacc_tid_insn"
- [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
- (unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_TID))]
+(define_insn "nvptx_fork"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_FORK)]
""
- "%.\\tmov.u32 %0, %%tid%d1;")
+ "// fork %0;"
+)
-(define_expand "oacc_tid"
- [(set (match_operand:SI 0 "nvptx_register_operand" "")
- (unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_TID))]
+(define_insn "nvptx_forked"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_FORKED)]
+ ""
+ "// forked %0;"
+)
+
+(define_insn "nvptx_joining"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_JOINING)]
+ ""
+ "// joining %0;"
+)
+
+(define_insn "nvptx_join"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_JOIN)]
+ ""
+ "// join %0;"
+)
+
+(define_expand "oacc_fork"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_FORKED)]
""
{
- if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2)
- FAIL;
+ nvptx_expand_oacc_fork (INTVAL (operands[0]));
+ DONE;
})
+(define_expand "oacc_join"
+ [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_JOIN)]
+ ""
+{
+ nvptx_expand_oacc_join (INTVAL (operands[0]));
+ DONE;
+})
+
+;; only 32-bit shuffles exist.
+(define_insn "nvptx_shuffle<mode>"
+ [(set (match_operand:BITS 0 "nvptx_register_operand" "=R")
+ (unspec:BITS
+ [(match_operand:BITS 1 "nvptx_register_operand" "R")
+ (match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")
+ (match_operand:SI 3 "const_int_operand" "n")]
+ UNSPEC_SHUFFLE))]
+ ""
+ "%.\\tshfl%S3.b32\\t%0, %1, %2, 31;")
+
+;; extract parts of a 64 bit object into 2 32-bit ints
+(define_insn "unpack<mode>si2"
+ [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+ (unspec:SI [(match_operand:BITD 2 "nvptx_register_operand" "R")
+ (const_int 0)] UNSPEC_BIT_CONV))
+ (set (match_operand:SI 1 "nvptx_register_operand" "=R")
+ (unspec:SI [(match_dup 2) (const_int 1)] UNSPEC_BIT_CONV))]
+ ""
+ "%.\\tmov.b64\\t{%0,%1}, %2;")
+
+;; pack 2 32-bit ints into a 64 bit object
+(define_insn "packsi<mode>2"
+ [(set (match_operand:BITD 0 "nvptx_register_operand" "=R")
+ (unspec:BITD [(match_operand:SI 1 "nvptx_register_operand" "R")
+ (match_operand:SI 2 "nvptx_register_operand" "R")]
+ UNSPEC_BIT_CONV))]
+ ""
+ "%.\\tmov.b64\\t%0, {%1,%2};")
+
+(define_insn "worker_load<mode>"
+ [(set (match_operand:SDISDFM 0 "nvptx_register_operand" "=R")
+ (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "m")]
+ UNSPEC_SHARED_DATA))]
+ ""
+ "%.\\tld.shared%u0\\t%0, %1;")
+
+(define_insn "worker_store<mode>"
+ [(set (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "=m")]
+ UNSPEC_SHARED_DATA)
+ (match_operand:SDISDFM 0 "nvptx_register_operand" "R"))]
+ ""
+ "%.\\tst.shared%u1\\t%1, %0;")
+
;; Atomic insns.
(define_expand "atomic_compare_and_swap<mode>"
@@ -1423,3 +1545,9 @@
(match_dup 1))]
"0"
"%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
+
+(define_insn "nvptx_barsync"
+ [(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
+ UNSPECV_BARSYNC)]
+ ""
+ "\\tbar.sync\\t%0;")