https://github.com/skatrak updated https://github.com/llvm/llvm-project/pull/150927
>From 35cecfa6f0ffb3e320fc00e107518f7cdae27f1b Mon Sep 17 00:00:00 2001 From: Sergio Afonso <[email protected]> Date: Fri, 25 Jul 2025 13:52:11 +0100 Subject: [PATCH 1/2] [OpenMPOpt] Make parallel regions reachable from new DeviceRTL loop functions This patch updates the OpenMP optimization pass to know about the new DeviceRTL functions for loop constructs. This change marks these functions as potentially containing parallel regions, which fixes a current bug with the state machine rewrite optimization. It previously failed to identify parallel regions located inside of the callbacks passed to these new DeviceRTL functions, causing the resulting code to skip executing these parallel regions. As a result, Generic kernels produced by Flang that contain parallel regions now work properly. One known related issue not fixed by this patch is that the presence of calls to these functions will prevent the SPMD-ization of Generic kernels by OpenMPOpt. Previously, this was due to assuming there was no parallel region. This is changed by this patch, but instead we now mark it temporarily as unsupported in an SPMD context. The reason is that, without additional changes, code intended for the main thread of the team located outside of the parallel region would not be guarded properly, resulting in race conditions and generally invalid behavior. --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 22 +++ .../fortran/target-generic-loops.f90 | 130 ++++++++++++++++++ .../offloading/fortran/target-spmd-loops.f90 | 39 ++++++ 3 files changed, 191 insertions(+) create mode 100644 offload/test/offloading/fortran/target-generic-loops.f90 create mode 100644 offload/test/offloading/fortran/target-spmd-loops.f90 diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 1f52f9a78b907..0a7e7cee6909c 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5020,6 +5020,28 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_free_shared: // Return without setting a fixpoint, to be resolved in updateImpl. return; + case OMPRTL___kmpc_distribute_static_loop_4: + case OMPRTL___kmpc_distribute_static_loop_4u: + case OMPRTL___kmpc_distribute_static_loop_8: + case OMPRTL___kmpc_distribute_static_loop_8u: + case OMPRTL___kmpc_distribute_for_static_loop_4: + case OMPRTL___kmpc_distribute_for_static_loop_4u: + case OMPRTL___kmpc_distribute_for_static_loop_8: + case OMPRTL___kmpc_distribute_for_static_loop_8u: + case OMPRTL___kmpc_for_static_loop_4: + case OMPRTL___kmpc_for_static_loop_4u: + case OMPRTL___kmpc_for_static_loop_8: + case OMPRTL___kmpc_for_static_loop_8u: + // Parallel regions might be reached by these calls, as they take a + // callback argument potentially arbitrary user-provided code. + ReachedUnknownParallelRegions.insert(&CB); + // TODO: The presence of these calls on their own does not prevent a + // kernel from being SPMD-izable. We mark it as such because we need + // further changes in order to also consider the contents of the + // callbacks passed to them. + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + SPMDCompatibilityTracker.insert(&CB); + break; default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, // generally. However, they do not hide parallel regions. diff --git a/offload/test/offloading/fortran/target-generic-loops.f90 b/offload/test/offloading/fortran/target-generic-loops.f90 new file mode 100644 index 0000000000000..07bcbfd2c8752 --- /dev/null +++ b/offload/test/offloading/fortran/target-generic-loops.f90 @@ -0,0 +1,130 @@ +! Offloading test for generic target regions containing different kinds of +! loop constructs inside. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + integer :: i1, i2, n1, n2, counter + + n1 = 100 + n2 = 50 + + counter = 0 + !$omp target map(tofrom:counter) + !$omp teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 2 200 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target map(tofrom:counter) + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp end target + + ! CHECK: 3 203 + print '(I2" "I0)', 3, counter + + counter = 0 + !$omp target map(tofrom: counter) + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp end target + + ! CHECK: 4 102 + print '(I2" "I0)', 4, counter + + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + end do + + ! CHECK: 5 5000 + print '(I2" "I0)', 5, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + end do + + ! CHECK: 6 5200 + print '(I2" "I0)', 6, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + end do + + ! CHECK: 7 10000 + print '(I2" "I0)', 7, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + end do + + ! CHECK: 8 10300 + print '(I2" "I0)', 8, counter +end program diff --git a/offload/test/offloading/fortran/target-spmd-loops.f90 b/offload/test/offloading/fortran/target-spmd-loops.f90 new file mode 100644 index 0000000000000..7407f0c0768cb --- /dev/null +++ b/offload/test/offloading/fortran/target-spmd-loops.f90 @@ -0,0 +1,39 @@ +! Offloading test for generic target regions containing different kinds of +! loop constructs inside. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + integer :: i1, n1, counter + + n1 = 100 + + counter = 0 + !$omp target parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + + ! CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 2 100 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target teams distribute parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + + ! CHECK: 3 100 + print '(I2" "I0)', 3, counter +end program >From 988241104054b095a9de2f8589a7703fb9481442 Mon Sep 17 00:00:00 2001 From: Sergio Afonso <[email protected]> Date: Thu, 14 Aug 2025 14:01:48 +0100 Subject: [PATCH 2/2] Address review comments --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 0a7e7cee6909c..626a44ff0bffa 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5033,7 +5033,8 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_for_static_loop_8: case OMPRTL___kmpc_for_static_loop_8u: // Parallel regions might be reached by these calls, as they take a - // callback argument potentially arbitrary user-provided code. + // callback argument potentially containing arbitrary user-provided + // code. ReachedUnknownParallelRegions.insert(&CB); // TODO: The presence of these calls on their own does not prevent a // kernel from being SPMD-izable. We mark it as such because we need _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
