On 14-12-18 20:58, Tom de Vries wrote: > 0007-nvptx-consolidate-offloaded-function-attributes-into.patch
> diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c > index a3169febbb4..dcfa57d10ca 100644 > --- a/gcc/config/nvptx/nvptx.c > +++ b/gcc/config/nvptx/nvptx.c > @@ -2872,6 +2872,17 @@ nvptx_reorg_uniform_simt () > } > } > > +/* Offloading function attributes. */ > + > +struct offload_attrs > +{ > + unsigned mask; > + int num_gangs; > + int num_workers; > + int vector_length; > + int max_workers; > +}; > + I like the idea of factoring out the extraction of information from the function attributes. But max_workers is something derived from that information, so it doesn't seem proper to add it here. > /* Loop structure of the function. The entire function is described as > a NULL loop. */ > > @@ -4569,6 +4580,56 @@ nvptx_neuter_pars (parallel *par, unsigned modes, > unsigned outer) > nvptx_neuter_pars (par->next, modes, outer); > } > > +static void > +populate_offload_attrs (offload_attrs *oa) > +{ > + tree attr = oacc_get_fn_attrib (current_function_decl); > + tree dims = TREE_VALUE (attr); > + unsigned ix; > + > + oa->mask = 0; > + > + for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) > + { > + tree t = TREE_VALUE (dims); > + int size = (t == NULL_TREE) ? 0 : TREE_INT_CST_LOW (t); This is very strange. Why do we represent 'TREE_VALUE (dims) == NULL_TREE' with '0' (which means determined at runtime)? In oacc_validate_dims we use -1 for this, which means 'not set'. > + tree allowed = TREE_PURPOSE (dims); > + > + if (size != 1 && !(allowed && integer_zerop (allowed))) > + oa->mask |= GOMP_DIM_MASK (ix); > + > + switch (ix) > + { > + case GOMP_DIM_GANG: > + oa->num_gangs = size; > + break; > + > + case GOMP_DIM_WORKER: > + oa->num_workers = size; > + break; > + > + case GOMP_DIM_VECTOR: > + oa->vector_length = size; > + break; > + } > + } > + > + if (oa->vector_length == 0) > + { > + /* FIXME: Need a more graceful way to handle large vector > + lengths in OpenACC routines. */ > + if (!lookup_attribute ("omp target entrypoint", > + DECL_ATTRIBUTES (current_function_decl))) > + oa->vector_length = PTX_WARP_SIZE; > + else > + oa->vector_length = PTX_VECTOR_LENGTH; > + } The case that 'oa->vector_length == 0' is triggered by calling populate_offload_attrs from nvptx_adjust_parallelism, which is called before oacc_validate_dims has updated the function attributes. This kludge is trying to resolve a circular dependency: after calling oacc_validate_dims and updating the function attributes, we know the chosen vector length, which is necessary for nvptx_adjust_parallelism, which influences the used_mask given as parameter to ... oacc_validate_dims. The way the kludge tries to cut this circular dependency is by replicating setting of default dimensions (designed to be done in nvptx_goacc_validate_dims) here in populate_offload_attrs. In the counter-proposed nvptx_adjust_parallelism ( https://gcc.gnu.org/ml/gcc-patches/2018-12/msg01619.html ), we cut the circular dependency in a different way: by reusing nvptx_goacc_validate_dims in nvptx_adjust_parallelism. So, we can assume oa->vector_length > 0 here, and declare this dead code. > + if (oa->num_workers == 0) > + oa->max_workers = PTX_CTA_SIZE / oa->vector_length; > + else > + oa->max_workers = oa->num_workers; > +} > + I moved this bit to the patch introducing nvptx_mach_max_workers. > #if WORKAROUND_PTXJIT_BUG_2 > /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This > variant > is needed in the nvptx target because the branches generated for > @@ -4750,27 +4811,19 @@ nvptx_reorg (void) > { > /* If we determined this mask before RTL expansion, we could > elide emission of some levels of forks and joins. */ > - unsigned mask = 0; > - tree dims = TREE_VALUE (attr); > - unsigned ix; > + offload_attrs oa; > > - for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) > - { > - int size = TREE_INT_CST_LOW (TREE_VALUE (dims)); > - tree allowed = TREE_PURPOSE (dims); > + populate_offload_attrs (&oa); > > - if (size != 1 && !(allowed && integer_zerop (allowed))) > - mask |= GOMP_DIM_MASK (ix); > - } > /* If there is worker neutering, there must be vector > neutering. Otherwise the hardware will fail. */ > - gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) > - || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); > + gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) > + || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); > > /* Discover & process partitioned regions. */ > parallel *pars = nvptx_discover_pars (&bb_insn_map); > nvptx_process_pars (pars); > - nvptx_neuter_pars (pars, mask, 0); > + nvptx_neuter_pars (pars, oa.mask, 0); > delete pars; > } > Committed as attached. Thanks, - Tom
[nvptx] Factor out populate_offload_attrs Factor out populate_offload_attrs from nvptx_reorg. 2018-12-17 Tom de Vries <tdevr...@suse.de> * config/nvptx/nvptx.c (struct offload_attrs): New. (populate_offload_attrs): New function. Factor mask extraction out of nvptx_reorg. Add extraction of dimensions. (nvptx_reorg): Use populate_offload_attrs. --- gcc/config/nvptx/nvptx.c | 63 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 8cb58341c23..f527429ce2d 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -2873,6 +2873,16 @@ nvptx_reorg_uniform_simt () } } +/* Offloading function attributes. */ + +struct offload_attrs +{ + unsigned mask; + int num_gangs; + int num_workers; + int vector_length; +}; + /* Loop structure of the function. The entire function is described as a NULL loop. */ @@ -4576,6 +4586,41 @@ nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer) nvptx_neuter_pars (par->next, modes, outer); } +static void +populate_offload_attrs (offload_attrs *oa) +{ + tree attr = oacc_get_fn_attrib (current_function_decl); + tree dims = TREE_VALUE (attr); + unsigned ix; + + oa->mask = 0; + + for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) + { + tree t = TREE_VALUE (dims); + int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t); + tree allowed = TREE_PURPOSE (dims); + + if (size != 1 && !(allowed && integer_zerop (allowed))) + oa->mask |= GOMP_DIM_MASK (ix); + + switch (ix) + { + case GOMP_DIM_GANG: + oa->num_gangs = size; + break; + + case GOMP_DIM_WORKER: + oa->num_workers = size; + break; + + case GOMP_DIM_VECTOR: + oa->vector_length = size; + break; + } + } +} + #if WORKAROUND_PTXJIT_BUG_2 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant is needed in the nvptx target because the branches generated for @@ -4757,27 +4802,19 @@ nvptx_reorg (void) { /* If we determined this mask before RTL expansion, we could elide emission of some levels of forks and joins. */ - unsigned mask = 0; - tree dims = TREE_VALUE (attr); - unsigned ix; + offload_attrs oa; - for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) - { - int size = TREE_INT_CST_LOW (TREE_VALUE (dims)); - tree allowed = TREE_PURPOSE (dims); + populate_offload_attrs (&oa); - if (size != 1 && !(allowed && integer_zerop (allowed))) - mask |= GOMP_DIM_MASK (ix); - } /* If there is worker neutering, there must be vector neutering. Otherwise the hardware will fail. */ - gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) - || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); + gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) + || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); /* Discover & process partitioned regions. */ parallel *pars = nvptx_discover_pars (&bb_insn_map); nvptx_process_pars (pars); - nvptx_neuter_pars (pars, mask, 0); + nvptx_neuter_pars (pars, oa.mask, 0); delete pars; }