On 2016/5/31 05:51 PM, Chung-Lin Tang wrote: > On 2016/5/31 3:28 PM, Thomas Schwinge wrote: >> > Hi! >> > >> > On Mon, 30 May 2016 18:53:41 +0200, Jakub Jelinek <ja...@redhat.com> wrote: >>> >> On Mon, May 30, 2016 at 10:38:59PM +0800, Chung-Lin Tang wrote: >>>> >>> Hi, a previous patch of Cesar's has made the middle-end omp-lowering >>>> >>> automatically create and insert a tofrom (i.e. present_or_copy) map for >>>> >>> parallel reductions. This allowed the user to not need explicit >>>> >>> clauses to copy out the reduction result, but because reduction >>>> >>> arguments >>>> >>> are not marked addressable, async does not work as expected, >>>> >>> i.e. the asynchronous copy-out results are not used in the compiler >>>> >>> generated code. >>> >> >>> >> If you need it only for async parallel/kernels? regions, can't you do >>> >> that >>> >> only for those and not for others? > That is achievable, but not in line with how we currently treat all other > data clause OMP_CLAUSE_MAPs, which are all marked addressable. Is this special > case handling really better here? >
Hi Jakub, here's a version of the patch with the addressable marking restricted to when there's an async clause. Tests re-ran to ensure no regressions. Please inform if this way seems better. Also attached are some new testcases. Thanks, Chung-Lin 2016-06-01 Chung-Lin Tang <clt...@codesourcery.com> c/ * c-typeck.c (c_finish_omp_clauses): Mark OpenACC reduction arguments as addressable when async clause exists. cp/ * semantics.c (finish_omp_clauses): Mark OpenACC reduction arguments as addressable when async clause exists. fortran/ * trans-openmp.c (gfc_trans_oacc_construct): Mark OpenACC reduction arguments as addressable. when async clause exists. (gfc_trans_oacc_combined_directive): Likewise. libgomp/testsuite/ * libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-8.c: New test. * libgomp/testsuite/libgomp.oacc-fortran/reduction-8.f90: New test.
Index: c/c-typeck.c =================================================================== --- c/c-typeck.c (revision 236845) +++ c/c-typeck.c (working copy) @@ -12529,6 +12529,7 @@ c_finish_omp_clauses (tree clauses, enum c_omp_reg tree *nowait_clause = NULL; bool ordered_seen = false; tree schedule_clause = NULL_TREE; + bool oacc_async = false; bitmap_obstack_initialize (NULL); bitmap_initialize (&generic_head, &bitmap_default_obstack); @@ -12539,6 +12540,14 @@ c_finish_omp_clauses (tree clauses, enum c_omp_reg bitmap_initialize (&map_field_head, &bitmap_default_obstack); bitmap_initialize (&oacc_reduction_head, &bitmap_default_obstack); + if (ort & C_ORT_ACC) + for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC) + { + oacc_async = true; + break; + } + for (pc = &clauses, c = clauses; c ; c = *pc) { bool remove = false; @@ -12575,6 +12584,8 @@ c_finish_omp_clauses (tree clauses, enum c_omp_reg remove = true; break; } + if (oacc_async) + c_mark_addressable (t); type = TREE_TYPE (t); if (TREE_CODE (t) == MEM_REF) type = TREE_TYPE (type); Index: cp/semantics.c =================================================================== --- cp/semantics.c (revision 236845) +++ cp/semantics.c (working copy) @@ -5774,6 +5774,7 @@ finish_omp_clauses (tree clauses, enum c_omp_regio bool branch_seen = false; bool copyprivate_seen = false; bool ordered_seen = false; + bool oacc_async = false; bitmap_obstack_initialize (NULL); bitmap_initialize (&generic_head, &bitmap_default_obstack); @@ -5784,6 +5785,14 @@ finish_omp_clauses (tree clauses, enum c_omp_regio bitmap_initialize (&map_field_head, &bitmap_default_obstack); bitmap_initialize (&oacc_reduction_head, &bitmap_default_obstack); + if (ort & C_ORT_ACC) + for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC) + { + oacc_async = true; + break; + } + for (pc = &clauses, c = clauses; c ; c = *pc) { bool remove = false; @@ -5827,6 +5836,8 @@ finish_omp_clauses (tree clauses, enum c_omp_regio t = n; goto check_dup_generic_t; } + if (oacc_async) + cxx_mark_addressable (t); goto check_dup_generic; case OMP_CLAUSE_COPYPRIVATE: copyprivate_seen = true; Index: fortran/trans-openmp.c =================================================================== --- fortran/trans-openmp.c (revision 236845) +++ fortran/trans-openmp.c (working copy) @@ -2704,6 +2704,15 @@ gfc_trans_oacc_construct (gfc_code *code) gfc_start_block (&block); oacc_clauses = gfc_trans_omp_clauses (&block, code->ext.omp_clauses, code->loc); + for (tree c = oacc_clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC) + { + for (c = oacc_clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION + && DECL_P (OMP_CLAUSE_DECL (c))) + TREE_ADDRESSABLE (OMP_CLAUSE_DECL (c)) = 1; + break; + } stmt = gfc_trans_omp_code (code->block->next, true); stmt = build2_loc (input_location, construct_code, void_type_node, stmt, oacc_clauses); @@ -3501,6 +3510,15 @@ gfc_trans_oacc_combined_directive (gfc_code *code) construct_clauses.lists[OMP_LIST_REDUCTION] = NULL; oacc_clauses = gfc_trans_omp_clauses (&block, &construct_clauses, code->loc); + for (tree c = oacc_clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC) + { + for (c = oacc_clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION + && DECL_P (OMP_CLAUSE_DECL (c))) + TREE_ADDRESSABLE (OMP_CLAUSE_DECL (c)) = 1; + break; + } } if (!loop_clauses.seq) pblock = █
Index: libgomp/testsuite/libgomp.oacc-fortran/reduction-8.f90 =================================================================== --- libgomp/testsuite/libgomp.oacc-fortran/reduction-8.f90 (revision 0) +++ libgomp/testsuite/libgomp.oacc-fortran/reduction-8.f90 (revision 0) @@ -0,0 +1,41 @@ +! { dg-do run } + +program reduction + implicit none + integer, parameter :: n = 100 + integer :: i, h1, h2, s1, s2, a1, a2 + + h1 = 0 + h2 = 0 + do i = 1, n + h1 = h1 + 1 + h2 = h2 + 2 + end do + + s1 = 0 + s2 = 0 + !$acc parallel loop reduction(+:s1, s2) + do i = 1, n + s1 = s1 + 1 + s2 = s2 + 2 + end do + !$acc end parallel loop + + a1 = 0 + a2 = 0 + !$acc parallel loop reduction(+:a1, a2) async(1) + do i = 1, n + a1 = a1 + 1 + a2 = a2 + 2 + end do + !$acc end parallel loop + + if (h1 .ne. s1) call abort () + if (h2 .ne. s2) call abort () + + !$acc wait(1) + + if (h1 .ne. a1) call abort () + if (h2 .ne. a2) call abort () + +end program reduction Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-8.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-8.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-8.c (revision 0) @@ -0,0 +1,30 @@ +const int n = 100; + +// Check async over parallel construct with reduction + +int +async_sum (int c) +{ + int s = 0; + +#pragma acc parallel loop num_gangs (10) gang reduction (+:s) async + for (int i = 0; i < n; i++) + s += i+c; + +#pragma acc wait + return s; +} + +int +main() +{ + int result = 0; + + for (int i = 0; i < n; i++) + result += i+1; + + if (async_sum (1) != result) + __builtin_abort (); + + return 0; +}