https://gcc.gnu.org/g:0a5e48f68ff8d2e581031def92eef5bf3338934b

commit 0a5e48f68ff8d2e581031def92eef5bf3338934b
Author: Chung-Lin Tang <clt...@baylibre.com>
Date:   Sun Mar 30 21:04:45 2025 +0000

    OpenACC: array reductions bug fixes
    
    This is a merge of the v4 to v5 diff patch from:
    https://gcc.gnu.org/pipermail/gcc-patches/2025-March/679682.html
    
    This patch fixes issues found for NVPTX sm_70 testing, and another issue
    related to copying to reduction buffer for worker/vector mode.
    
    gcc/ChangeLog:
    
            * config/gcn/gcn-tree.cc (gcn_goacc_reduction_setup): Fix array case
            copy source into reduction buffer.
            * config/nvptx/nvptx.cc (nvptx_expand_shared_addr): Move default 
size
            init setting place.
            (enum nvptx_builtins): Add NVPTX_BUILTIN_BAR_WARPSYNC.
            (nvptx_init_builtins): Add DEF() of nvptx_builtin_bar_warpsync.
            (nvptx_expand_builtin): Expand NVPTX_BUILTIN_BAR_WARPSYNC.
            (nvptx_goacc_reduction_setup): Fix array case copy source into 
reduction
            buffer.
            (nvptx_goacc_reduction_fini): Add bar.warpsync for at end of 
vector-mode
            reductions for sm_70 and above.
    
    libgomp/ChangeLog:
    
            * testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c: Adjust 
test.
            * testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c: 
Likewise.
            * testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c: 
Likewise.
            * testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c: 
Likewise.

Diff:
---
 gcc/config/gcn/gcn-tree.cc                         |  3 +-
 gcc/config/nvptx/nvptx.cc                          | 32 ++++++++++++++++------
 .../libgomp.oacc-c-c++-common/reduction-arrays-2.c | 24 ++++++++++++++++
 .../libgomp.oacc-c-c++-common/reduction-arrays-3.c | 24 ++++++++++++++++
 .../libgomp.oacc-c-c++-common/reduction-arrays-4.c | 24 ++++++++++++++++
 .../libgomp.oacc-c-c++-common/reduction-arrays-5.c | 24 ++++++++++++++++
 6 files changed, 122 insertions(+), 9 deletions(-)

diff --git a/gcc/config/gcn/gcn-tree.cc b/gcc/config/gcn/gcn-tree.cc
index a71aadc395a0..97f35cd6f7a0 100644
--- a/gcc/config/gcn/gcn-tree.cc
+++ b/gcc/config/gcn/gcn-tree.cc
@@ -750,13 +750,14 @@ gcn_goacc_reduction_setup (gcall *call)
       tree offset = gimple_call_arg (call, 5);
       if (array_p)
        {
+         tree copy_src = !integer_zerop (ref_to_res) ? ref_to_res : array_addr;
          tree decl = gcn_goacc_get_worker_array_reduction_buffer
            (array_type, array_max_idx, &seq);
          tree ptr = make_ssa_name (TREE_TYPE (array_addr));
          gimplify_assign (ptr, build_fold_addr_expr (decl), &seq);
 
          /* Store incoming value to worker reduction buffer.  */
-         oacc_build_array_copy (ptr, array_addr, array_max_idx, &seq);
+         oacc_build_array_copy (ptr, copy_src, array_max_idx, &seq);
        }
       else
        {
diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
index 886a39a26e95..8e5d9ecce70a 100644
--- a/gcc/config/nvptx/nvptx.cc
+++ b/gcc/config/nvptx/nvptx.cc
@@ -6500,16 +6500,16 @@ nvptx_expand_shared_addr (tree exp, rtx target,
   if (TREE_CONSTANT (size_expr))
     size = TREE_INT_CST_LOW (size_expr);
 
+  /* Default size for unknown size expression.  */
+  if (size == 0)
+    size = 256;
+
   if (vector)
     {
       offload_attrs oa;
 
       populate_offload_attrs (&oa);
 
-      /* Default size for unknown size expression.  */
-      if (size == 0)
-       size = 256;
-
       unsigned int psize = ROUND_UP (size + offset, align);
       unsigned int pnum = nvptx_mach_max_workers ();
       vector_red_partition = MAX (vector_red_partition, psize);
@@ -6605,6 +6605,7 @@ enum nvptx_builtins
   NVPTX_BUILTIN_BAR_RED_AND,
   NVPTX_BUILTIN_BAR_RED_OR,
   NVPTX_BUILTIN_BAR_RED_POPC,
+  NVPTX_BUILTIN_BAR_WARPSYNC,
   NVPTX_BUILTIN_BREV,
   NVPTX_BUILTIN_BREVLL,
   NVPTX_BUILTIN_COND_UNI,
@@ -6737,6 +6738,8 @@ nvptx_init_builtins (void)
   DEF (BAR_RED_POPC, "bar_red_popc",
        (UINT, UINT, UINT, UINT, UINT, NULL_TREE));
 
+  DEF (BAR_WARPSYNC, "bar_warpsync", (VOID, VOID, NULL_TREE));
+
   DEF (BREV, "brev", (UINT, UINT, NULL_TREE));
   DEF (BREVLL, "brevll", (LLUINT, LLUINT, NULL_TREE));
 
@@ -6787,6 +6790,10 @@ nvptx_expand_builtin (tree exp, rtx target, rtx 
ARG_UNUSED (subtarget),
     case NVPTX_BUILTIN_BAR_RED_POPC:
       return nvptx_expand_bar_red (exp, target, mode, ignore);
 
+    case NVPTX_BUILTIN_BAR_WARPSYNC:
+      emit_insn (gen_nvptx_warpsync ());
+      return NULL_RTX;
+
     case NVPTX_BUILTIN_BREV:
     case NVPTX_BUILTIN_BREVLL:
       return nvptx_expand_brev (exp, target, mode, ignore);
@@ -7758,11 +7765,11 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs 
*oa)
 
   push_gimplify_context (true);
 
+  /* Copy the receiver object.  */
+  tree ref_to_res = gimple_call_arg (call, 1);
+
   if (level != GOMP_DIM_GANG)
     {
-      /* Copy the receiver object.  */
-      tree ref_to_res = gimple_call_arg (call, 1);
-
       if (!integer_zerop (ref_to_res))
        {
          if (!array_p)
@@ -7785,13 +7792,14 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs 
*oa)
       tree call, ptr;
       if (array_p)
        {
+         tree copy_src = !integer_zerop (ref_to_res) ? ref_to_res : array_addr;
          tree array_elem_type = TREE_TYPE (array_type);
          call = nvptx_get_shared_red_addr (array_elem_type, array_max_idx,
                                            offset, level == GOMP_DIM_VECTOR);
          ptr = make_ssa_name (TREE_TYPE (call));
          gimplify_assign (ptr, call, &seq);
          oacc_build_array_copy (fold_convert (TREE_TYPE (array_addr), ptr),
-                                array_addr, array_max_idx, &seq);
+                                copy_src, array_max_idx, &seq);
        }
       else
        {
@@ -8026,6 +8034,14 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs 
*oa)
          else
            r = nvptx_reduction_update (gimple_location (call), &gsi,
                                        accum, var, op, level);
+
+         if (TARGET_SM70 && level == GOMP_DIM_VECTOR)
+           {
+             /* After SM70, with Independent Thread Scheduling introduced,
+                place a warpsync after vector-mode update of accum buffer.  */
+             tree fn = nvptx_builtin_decl (NVPTX_BUILTIN_BAR_WARPSYNC, true);
+             gimple_seq_add_stmt (&seq, gimple_build_call (fn, 0));
+           }
        }
     }
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c 
b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c
index 43e139f92314..db8b3749959f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-2.c
@@ -24,6 +24,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel
+  #pragma acc loop gang reduction(+:a[1:2])
+  ARRAY_BODY (a, 1, 2)
+  ARRAY_BODY (o, 1, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel copy(a[3:2])
   #pragma acc loop reduction(+:a[3:2])
   ARRAY_BODY (a, 3, 2)
@@ -32,6 +40,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel copy(a[3:2])
+  #pragma acc loop worker reduction(+:a[3:2])
+  ARRAY_BODY (a, 3, 2)
+  ARRAY_BODY (o, 3, 2)
+  for (int i = 0; i < 6; i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel copy(a)
   #pragma acc loop reduction(+:a[0:5])
   ARRAY_BODY (a, 0, 5)
@@ -40,6 +56,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel copy(a)
+  #pragma acc loop vector reduction(+:a[0:5])
+  ARRAY_BODY (a, 0, 5)
+  ARRAY_BODY (o, 0, 5)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel
   #pragma acc loop reduction(+:a)
   ARRAY_BODY (a, 4, 1)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c 
b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c
index aeae2e06252c..0f023b7c4607 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-3.c
@@ -30,6 +30,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel
+  #pragma acc loop gang reduction(+:a[one:2])
+  ARRAY_BODY (a, one, 2)
+  ARRAY_BODY (o, one, 2)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel copy(a[three:2])
   #pragma acc loop reduction(+:a[three:2])
   ARRAY_BODY (a, three, 2)
@@ -38,6 +46,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel copy(a[three:2])
+  #pragma acc loop worker reduction(+:a[three:2])
+  ARRAY_BODY (a, three, 2)
+  ARRAY_BODY (o, three, 2)
+  for (int i = 0; i < 6; i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel copy(a)
   #pragma acc loop reduction(+:a[zero:5])
   ARRAY_BODY (a, zero, 5)
@@ -46,6 +62,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel copy(a)
+  #pragma acc loop vector reduction(+:a[zero:5])
+  ARRAY_BODY (a, zero, 5)
+  ARRAY_BODY (o, zero, 5)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel
   #pragma acc loop reduction(+:a)
   ARRAY_BODY (a, four, 1)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c 
b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c
index c095284e5660..94dd4c473d57 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-4.c
@@ -31,6 +31,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel
+  #pragma acc loop gang reduction(+:a[one:two])
+  ARRAY_BODY (a, one, two)
+  ARRAY_BODY (o, one, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel copy(a[three:two])
   #pragma acc loop reduction(+:a[three:two])
   ARRAY_BODY (a, three, two)
@@ -39,6 +47,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel copy(a[three:two])
+  #pragma acc loop worker reduction(+:a[three:two])
+  ARRAY_BODY (a, three, two)
+  ARRAY_BODY (o, three, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel copy(a)
   #pragma acc loop reduction(+:a[zero:five])
   ARRAY_BODY (a, zero, five)
@@ -47,6 +63,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel copy(a)
+  #pragma acc loop vector reduction(+:a[zero:five])
+  ARRAY_BODY (a, zero, five)
+  ARRAY_BODY (o, zero, five)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel
   #pragma acc loop reduction(+:a)
   ARRAY_BODY (a, four, one)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c 
b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c
index 4794350e8621..56ae020d0fc1 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-arrays-5.c
@@ -32,6 +32,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel
+  #pragma acc loop gang reduction(+:a[one:two])
+  ARRAY_BODY (a, one, two)
+  ARRAY_BODY (o, one, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel copy(a[three:two])
   #pragma acc loop reduction(+:a[three:two])
   ARRAY_BODY (a, three, two)
@@ -40,6 +48,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel copy(a[three:two])
+  #pragma acc loop worker reduction(+:a[three:two])
+  ARRAY_BODY (a, three, two)
+  ARRAY_BODY (o, three, two)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel copy(a)
   #pragma acc loop reduction(+:a[zero:five])
   ARRAY_BODY (a, zero, five)
@@ -48,6 +64,14 @@ int main (void)
     if (a[i] != o[i])
       __builtin_abort ();
 
+  #pragma acc parallel copy(a)
+  #pragma acc loop vector reduction(+:a[zero:five])
+  ARRAY_BODY (a, zero, five)
+  ARRAY_BODY (o, zero, five)
+  for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+    if (a[i] != o[i])
+      __builtin_abort ();
+
   #pragma acc parallel
   #pragma acc loop reduction(+:a)
   ARRAY_BODY (a, four, one)

Reply via email to