On Wed, Jul 08, 2015 at 10:47:56AM -0400, Nathan Sidwell wrote: > +/* Generate loop head markers in outer->inner order. */ > + > +static void > +gen_oacc_fork (gimple_seq *seq, unsigned mask) > +{ > + { > + // TODDO: Determine this information from the parallel region itself
TODO ? > + // and emit it once in the offload function. Currently the target > + // geometry definition is being extracted early. For now inform > + // the backend we're using all axes of parallelism, which is a > + // safe default. > + gcall *call = gimple_build_call_internal > + (IFN_GOACC_MODES, 1, > + build_int_cst (unsigned_type_node, > + OACC_LOOP_MASK (OACC_gang) > + | OACC_LOOP_MASK (OACC_vector) > + | OACC_LOOP_MASK (OACC_worker))); The formatting is too ugly. I'd say you just want tree arg = build_int_cst (unsigned_type_node, OACC_LOOP_MASK (OACC_gang) | OACC_LOOP_MASK (OACC_vector) | OACC_LOOP_MASK (OACC_worker)); gcall *call = gimple_build_call_internal (IFN_GOACC_MODES, 1, arg); > + | OACC_LOOP_MASK (OACC_vector) > + for (level = OACC_gang; level != OACC_HWM; level++) > + if (mask & OACC_LOOP_MASK (level)) > + { > + tree arg = build_int_cst (unsigned_type_node, level); > + gcall *call = gimple_build_call_internal > + (IFN_GOACC_FORK, 1, arg); Why the line-break? That should fit into 80 columns just fine. > + gimple_seq_add_stmt (seq, call); > + } > +} > + > +/* Generate loop tail markers in inner->outer order. */ > + > +static void > +gen_oacc_join (gimple_seq *seq, unsigned mask) > +{ > + unsigned level; > + > + for (level = OACC_HWM; level-- != OACC_gang; ) > + if (mask & OACC_LOOP_MASK (level)) > + { > + tree arg = build_int_cst (unsigned_type_node, level); > + gcall *call = gimple_build_call_internal > + (IFN_GOACC_JOIN, 1, arg); > + gimple_seq_add_stmt (seq, call); > + } > +} > > /* Find the mapping for DECL in CTX or the immediately enclosing > context that has a mapping for DECL. > @@ -6777,21 +6808,6 @@ expand_omp_for_generic (struct omp_regio > } > } > > - > -/* True if a barrier is needed after a loop partitioned over > - gangs/workers/vectors as specified by GWV_BITS. OpenACC semantics specify > - that a (conceptual) barrier is needed after worker and vector-partitioned > - loops, but not after gang-partitioned loops. Currently we are relying on > - warp reconvergence to synchronise threads within a warp after vector > loops, > - so an explicit barrier is not helpful after those. */ > - > -static bool > -oacc_loop_needs_threadbarrier_p (int gwv_bits) > -{ > - return !(gwv_bits & OACC_LOOP_MASK (OACC_gang)) > - && (gwv_bits & OACC_LOOP_MASK (OACC_worker)); > -} > - > /* A subroutine of expand_omp_for. Generate code for a parallel > loop with static schedule and no specified chunk size. Given > parameters: > @@ -6800,6 +6816,7 @@ oacc_loop_needs_threadbarrier_p (int gwv > > where COND is "<" or ">", we generate pseudocode > > + OACC_FORK > if ((__typeof (V)) -1 > 0 && N2 cond N1) goto L2; > if (cond is <) > adj = STEP - 1; > @@ -6827,6 +6844,11 @@ oacc_loop_needs_threadbarrier_p (int gwv > V += STEP; > if (V cond e) goto L1; > L2: > + OACC_JOIN > + > + It'd be better to place the OACC_LOOP markers just inside the outer > + conditional, so they can be entirely eliminated if the loop is > + unreachable. Putting OACC_FORK/OACC_JOIN unconditionally into the comment is very confusing. The expand_omp_for_static_nochunk routine is used for #pragma omp for schedule(static), #pragma omp distribute etc. which certainly don't want to emit such markers in there. So perhaps mention somewhere that you wrap all the above sequence in between OACC_FORK/OACC_JOIN markers. > @@ -7220,6 +7249,7 @@ find_phi_with_arg_on_edge (tree arg, edg > > where COND is "<" or ">", we generate pseudocode > > +OACC_FORK > if ((__typeof (V)) -1 > 0 && N2 cond N1) goto L2; > if (cond is <) > adj = STEP - 1; > @@ -7230,6 +7260,7 @@ find_phi_with_arg_on_edge (tree arg, edg > else > n = (adj + N2 - N1) / STEP; > trip = 0; > + > V = threadid * CHUNK * STEP + N1; -- this extra definition of V is > here so that V is defined > if the loop is not entered > @@ -7248,6 +7279,7 @@ find_phi_with_arg_on_edge (tree arg, edg > trip += 1; > goto L0; > L4: > +OACC_JOIN > */ Likewise. > > static void > @@ -7281,10 +7313,6 @@ expand_omp_for_static_chunk (struct omp_ > gcc_assert (EDGE_COUNT (iter_part_bb->succs) == 2); > fin_bb = BRANCH_EDGE (iter_part_bb)->dest; > > - /* Broadcast variables to OpenACC threads. */ > - entry_bb = oacc_broadcast (entry_bb, fin_bb, region); > - region->entry = entry_bb; > - > gcc_assert (broken_loop > || fin_bb == FALLTHRU_EDGE (cont_bb)->dest); > seq_start_bb = split_edge (FALLTHRU_EDGE (iter_part_bb)); > @@ -7296,7 +7324,7 @@ expand_omp_for_static_chunk (struct omp_ > trip_update_bb = split_edge (FALLTHRU_EDGE (cont_bb)); > } > exit_bb = region->exit; > - > + Please avoid such whitespace changes. In any case, as it is a gomp-4_0-branch patch, I'll defer full review to the branch maintainers. Jakub