https://gcc.gnu.org/g:0a5e48f68ff8d2e581031def92eef5bf3338934b
commit 0a5e48f68ff8d2e581031def92eef5bf3338934b Author: Chung-Lin Tang <clt...@baylibre.com> Date: Sun Mar 30 21:04:45 2025 +0000 OpenACC: array reductions bug fixes This is a merge of the v4 to v5 diff patch from: https://gcc.gnu.org/pipermail/gcc-patches/2025-March/679682.html This patch fixes issues found for NVPTX sm_70 testing, and another issue related to copying to reduction buffer for worker/vector mode. gcc/ChangeLog: * config/gcn/gcn-tree.cc (gcn_goacc_reduction_setup): Fix array case copy source into reduction buffer. * config/nvptx/nvptx.cc (nvptx_expand_shared_addr): Move default size init setting place. (enum nvptx_builtins): Add NVPTX_BUILTIN_BAR_WARPSYNC. (nvptx_init_builtins): Add DEF() of nvptx_builtin_bar_warpsync. (nvptx_expand_builtin): Expand NVPTX_BUILTIN_BAR_WARPSYNC. (nvptx_goacc_reduction_setup): Fix array case copy source into reduction buffer. (nvptx_goacc_reduction_fini): Add bar.warpsync for at end of vector-mode reductions for sm_70 and above. libgomp/ChangeLog: * testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c: Adjust test. * testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c: Likewise. Diff: --- gcc/config/gcn/gcn-tree.cc | 3 +- gcc/config/nvptx/nvptx.cc | 32 ++++++++++++++++------ .../libgomp.oacc-c-c++-common/reduction-arrays-2.c | 24 ++++++++++++++++ .../libgomp.oacc-c-c++-common/reduction-arrays-3.c | 24 ++++++++++++++++ .../libgomp.oacc-c-c++-common/reduction-arrays-4.c | 24 ++++++++++++++++ .../libgomp.oacc-c-c++-common/reduction-arrays-5.c | 24 ++++++++++++++++ 6 files changed, 122 insertions(+), 9 deletions(-) diff --git a/gcc/config/gcn/gcn-tree.cc b/gcc/config/gcn/gcn-tree.cc index a71aadc395a0..97f35cd6f7a0 100644 --- a/gcc/config/gcn/gcn-tree.cc +++ b/gcc/config/gcn/gcn-tree.cc @@ -750,13 +750,14 @@ gcn_goacc_reduction_setup (gcall *call) tree offset = gimple_call_arg (call, 5); if (array_p) { + tree copy_src = !integer_zerop (ref_to_res) ? ref_to_res : array_addr; tree decl = gcn_goacc_get_worker_array_reduction_buffer (array_type, array_max_idx, &seq); tree ptr = make_ssa_name (TREE_TYPE (array_addr)); gimplify_assign (ptr, build_fold_addr_expr (decl), &seq); /* Store incoming value to worker reduction buffer. */ - oacc_build_array_copy (ptr, array_addr, array_max_idx, &seq); + oacc_build_array_copy (ptr, copy_src, array_max_idx, &seq); } else { diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 886a39a26e95..8e5d9ecce70a 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -6500,16 +6500,16 @@ nvptx_expand_shared_addr (tree exp, rtx target, if (TREE_CONSTANT (size_expr)) size = TREE_INT_CST_LOW (size_expr); + /* Default size for unknown size expression. */ + if (size == 0) + size = 256; + if (vector) { offload_attrs oa; populate_offload_attrs (&oa); - /* Default size for unknown size expression. */ - if (size == 0) - size = 256; - unsigned int psize = ROUND_UP (size + offset, align); unsigned int pnum = nvptx_mach_max_workers (); vector_red_partition = MAX (vector_red_partition, psize); @@ -6605,6 +6605,7 @@ enum nvptx_builtins NVPTX_BUILTIN_BAR_RED_AND, NVPTX_BUILTIN_BAR_RED_OR, NVPTX_BUILTIN_BAR_RED_POPC, + NVPTX_BUILTIN_BAR_WARPSYNC, NVPTX_BUILTIN_BREV, NVPTX_BUILTIN_BREVLL, NVPTX_BUILTIN_COND_UNI, @@ -6737,6 +6738,8 @@ nvptx_init_builtins (void) DEF (BAR_RED_POPC, "bar_red_popc", (UINT, UINT, UINT, UINT, UINT, NULL_TREE)); + DEF (BAR_WARPSYNC, "bar_warpsync", (VOID, VOID, NULL_TREE)); + DEF (BREV, "brev", (UINT, UINT, NULL_TREE)); DEF (BREVLL, "brevll", (LLUINT, LLUINT, NULL_TREE)); @@ -6787,6 +6790,10 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), case NVPTX_BUILTIN_BAR_RED_POPC: return nvptx_expand_bar_red (exp, target, mode, ignore); + case NVPTX_BUILTIN_BAR_WARPSYNC: + emit_insn (gen_nvptx_warpsync ()); + return NULL_RTX; + case NVPTX_BUILTIN_BREV: case NVPTX_BUILTIN_BREVLL: return nvptx_expand_brev (exp, target, mode, ignore); @@ -7758,11 +7765,11 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa) push_gimplify_context (true); + /* Copy the receiver object. */ + tree ref_to_res = gimple_call_arg (call, 1); + if (level != GOMP_DIM_GANG) { - /* Copy the receiver object. */ - tree ref_to_res = gimple_call_arg (call, 1); - if (!integer_zerop (ref_to_res)) { if (!array_p) @@ -7785,13 +7792,14 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa) tree call, ptr; if (array_p) { + tree copy_src = !integer_zerop (ref_to_res) ? ref_to_res : array_addr; tree array_elem_type = TREE_TYPE (array_type); call = nvptx_get_shared_red_addr (array_elem_type, array_max_idx, offset, level == GOMP_DIM_VECTOR); ptr = make_ssa_name (TREE_TYPE (call)); gimplify_assign (ptr, call, &seq); oacc_build_array_copy (fold_convert (TREE_TYPE (array_addr), ptr), - array_addr, array_max_idx, &seq); + copy_src, array_max_idx, &seq); } else { @@ -8026,6 +8034,14 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) else r = nvptx_reduction_update (gimple_location (call), &gsi, accum, var, op, level); + + if (TARGET_SM70 && level == GOMP_DIM_VECTOR) + { + /* After SM70, with Independent Thread Scheduling introduced, + place a warpsync after vector-mode update of accum buffer. */ + tree fn = nvptx_builtin_decl (NVPTX_BUILTIN_BAR_WARPSYNC, true); + gimple_seq_add_stmt (&seq, gimple_build_call (fn, 0)); + } } } diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c index 43e139f92314..db8b3749959f 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c @@ -24,6 +24,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel + #pragma acc loop gang reduction(+:a[1:2]) + ARRAY_BODY (a, 1, 2) + ARRAY_BODY (o, 1, 2) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a[3:2]) #pragma acc loop reduction(+:a[3:2]) ARRAY_BODY (a, 3, 2) @@ -32,6 +40,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a[3:2]) + #pragma acc loop worker reduction(+:a[3:2]) + ARRAY_BODY (a, 3, 2) + ARRAY_BODY (o, 3, 2) + for (int i = 0; i < 6; i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a) #pragma acc loop reduction(+:a[0:5]) ARRAY_BODY (a, 0, 5) @@ -40,6 +56,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a) + #pragma acc loop vector reduction(+:a[0:5]) + ARRAY_BODY (a, 0, 5) + ARRAY_BODY (o, 0, 5) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel #pragma acc loop reduction(+:a) ARRAY_BODY (a, 4, 1) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c index aeae2e06252c..0f023b7c4607 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c @@ -30,6 +30,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel + #pragma acc loop gang reduction(+:a[one:2]) + ARRAY_BODY (a, one, 2) + ARRAY_BODY (o, one, 2) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a[three:2]) #pragma acc loop reduction(+:a[three:2]) ARRAY_BODY (a, three, 2) @@ -38,6 +46,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a[three:2]) + #pragma acc loop worker reduction(+:a[three:2]) + ARRAY_BODY (a, three, 2) + ARRAY_BODY (o, three, 2) + for (int i = 0; i < 6; i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a) #pragma acc loop reduction(+:a[zero:5]) ARRAY_BODY (a, zero, 5) @@ -46,6 +62,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a) + #pragma acc loop vector reduction(+:a[zero:5]) + ARRAY_BODY (a, zero, 5) + ARRAY_BODY (o, zero, 5) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel #pragma acc loop reduction(+:a) ARRAY_BODY (a, four, 1) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c index c095284e5660..94dd4c473d57 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c @@ -31,6 +31,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel + #pragma acc loop gang reduction(+:a[one:two]) + ARRAY_BODY (a, one, two) + ARRAY_BODY (o, one, two) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a[three:two]) #pragma acc loop reduction(+:a[three:two]) ARRAY_BODY (a, three, two) @@ -39,6 +47,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a[three:two]) + #pragma acc loop worker reduction(+:a[three:two]) + ARRAY_BODY (a, three, two) + ARRAY_BODY (o, three, two) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a) #pragma acc loop reduction(+:a[zero:five]) ARRAY_BODY (a, zero, five) @@ -47,6 +63,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a) + #pragma acc loop vector reduction(+:a[zero:five]) + ARRAY_BODY (a, zero, five) + ARRAY_BODY (o, zero, five) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel #pragma acc loop reduction(+:a) ARRAY_BODY (a, four, one) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c index 4794350e8621..56ae020d0fc1 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c @@ -32,6 +32,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel + #pragma acc loop gang reduction(+:a[one:two]) + ARRAY_BODY (a, one, two) + ARRAY_BODY (o, one, two) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a[three:two]) #pragma acc loop reduction(+:a[three:two]) ARRAY_BODY (a, three, two) @@ -40,6 +48,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a[three:two]) + #pragma acc loop worker reduction(+:a[three:two]) + ARRAY_BODY (a, three, two) + ARRAY_BODY (o, three, two) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel copy(a) #pragma acc loop reduction(+:a[zero:five]) ARRAY_BODY (a, zero, five) @@ -48,6 +64,14 @@ int main (void) if (a[i] != o[i]) __builtin_abort (); + #pragma acc parallel copy(a) + #pragma acc loop vector reduction(+:a[zero:five]) + ARRAY_BODY (a, zero, five) + ARRAY_BODY (o, zero, five) + for (int i = 0; i < sizeof (a) / sizeof (int); i++) + if (a[i] != o[i]) + __builtin_abort (); + #pragma acc parallel #pragma acc loop reduction(+:a) ARRAY_BODY (a, four, one)