jdoerfert updated this revision to Diff 311105.
jdoerfert added a comment.
Herald added a reviewer: aaron.ballman.
Herald added projects: clang, OpenMP.
Herald added a subscriber: cfe-commits.

Add remark and documentation


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93079/new/

https://reviews.llvm.org/D93079

Files:
  clang/include/clang/Basic/AttrDocs.td
  llvm/lib/Transforms/IPO/OpenMPOpt.cpp
  llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
  openmp/docs/remarks/OptimizationRemarks.rst

Index: openmp/docs/remarks/OptimizationRemarks.rst
===================================================================
--- openmp/docs/remarks/OptimizationRemarks.rst
+++ openmp/docs/remarks/OptimizationRemarks.rst
@@ -1,2 +1,30 @@
 OpenMP Optimization Remarks
 ===========================
+
+
+.. _omp100:
+.. _omp_no_external_caller_in_target_region:
+
+`[OMP100]` Potentially unknown OpenMP target region caller
+----------------------------------------------------------
+
+To facilitate OpenMP semantics on GPUs we provide a runtime mechanism through
+which the code that makes up the body of a parallel region is shared with the
+threads in the team. Generally we use the address of the outlined parallel
+region to identify the code that needs to be executed. If we know all target
+regions that reach the parallel region we can avoid this function pointer
+passing scheme and often improve the register usage on the GPU. However, If a
+parallel region on the GPU is in a function with external linkage we may not
+know all callers statically. If there are outside callers within target
+regions, this remark is to be ignored. If there are no such callers, users can
+modify the linkage and thereby help optimization with a `static` or
+`__attribute__((internal))` function annotation. If changing the linkage is
+impossible, e.g., because there are outside callers on the host, we provide an
+assumption for the target side which can be spelled either as
+`__attribute__((assume("omp_no_external_caller_in_target_region")))` or as
+OpenMP assumption, i.a., `#pragma omp begin assume
+ext_no_external_caller_in_target_region`.
+
+The assumption effectively allows the compiler to assume no caller outside the
+current translation unit will call the function from a target region, hence
+the function is not called from outside translation units on the device.
Index: llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
===================================================================
--- llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
+++ llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
@@ -7,12 +7,18 @@
 ;     #pragma omp parallel
 ;     { }
 ; }
+; __attribute__((assume("no_external_callers")))
+; void baz(void) {
+;     #pragma omp parallel
+;     { }
+; }
 ; void foo(void) {
 ;   #pragma omp target teams
 ;   {
 ;     #pragma omp parallel
 ;     {}
 ;     bar();
+;     baz();
 ;     #pragma omp parallel
 ;     {}
 ;   }
@@ -23,13 +29,16 @@
 ; another kernel.
 
 ; CHECK-DAG: @__omp_outlined__1_wrapper.ID = private constant i8 undef
+; CHECK-DAG: @__omp_outlined__2b_wrapper.ID = private constant i8 undef
 ; CHECK-DAG: @__omp_outlined__3_wrapper.ID = private constant i8 undef
 
 ; CHECK-DAG:   icmp eq i8* %5, @__omp_outlined__1_wrapper.ID
+; CHECK-DAG:   icmp eq i8* %b6, @__omp_outlined__2b_wrapper.ID
 ; CHECK-DAG:   icmp eq i8* %7, @__omp_outlined__3_wrapper.ID
 
 ; CHECK-DAG:   call void @__kmpc_kernel_prepare_parallel(i8* @__omp_outlined__1_wrapper.ID)
-; CHECK-DAG:   call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2_wrapper to i8*))
+; CHECK-DAG:   call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2a_wrapper to i8*))
+; CHECK-DAG:   call void @__kmpc_kernel_prepare_parallel(i8* @__omp_outlined__2b_wrapper.ID)
 ; CHECK-DAG:   call void @__kmpc_kernel_prepare_parallel(i8* @__omp_outlined__3_wrapper.ID)
 
 
@@ -69,11 +78,20 @@
 
 .check.next:                                      ; preds = %.execute.parallel
   %6 = load i8*, i8** %work_fn, align 8
-  %work_match1 = icmp eq i8* %6, bitcast (void ()* @__omp_outlined__2_wrapper to i8*)
-  br i1 %work_match1, label %.execute.fn2, label %.check.next3
+  %work_match1 = icmp eq i8* %6, bitcast (void ()* @__omp_outlined__2a_wrapper to i8*)
+  br i1 %work_match1, label %.execute.fn2a, label %.check.next2
+
+.execute.fn2a:                                     ; preds = %.check.next
+  call void @__omp_outlined__2a_wrapper()
+  br label %.terminate.parallel
+
+.check.next2:                                      ; preds = %.execute.parallel
+  %b6 = load i8*, i8** %work_fn, align 8
+  %work_match1b = icmp eq i8* %b6, bitcast (void ()* @__omp_outlined__2b_wrapper to i8*)
+  br i1 %work_match1b, label %.execute.fn2b, label %.check.next3
 
-.execute.fn2:                                     ; preds = %.check.next
-  call void @__omp_outlined__2_wrapper()
+.execute.fn2b:                                     ; preds = %.check.next
+  call void @__omp_outlined__2b_wrapper()
   br label %.terminate.parallel
 
 .check.next3:                                     ; preds = %.check.next
@@ -111,6 +129,7 @@
 define internal void @__omp_outlined__() {
   call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__1_wrapper to i8*))
   call void @bar()
+  call void @baz()
   call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__3_wrapper to i8*))
   ret void
 }
@@ -125,11 +144,20 @@
 }
 
 define hidden void @bar() {
-  call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2_wrapper to i8*))
+  call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2a_wrapper to i8*))
+  ret void
+}
+
+define hidden void @baz() #0 {
+  call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2b_wrapper to i8*))
+  ret void
+}
+
+define internal void @__omp_outlined__2a_wrapper() {
   ret void
 }
 
-define internal void @__omp_outlined__2_wrapper() {
+define internal void @__omp_outlined__2b_wrapper() {
   ret void
 }
 
@@ -147,6 +175,7 @@
 
 declare i32 @__kmpc_global_thread_num(%struct.ident_t* nocapture readnone)
 
+attributes #0 = { "llvm.assume"="abc,omp_no_external_caller_in_target_region,123" }
 
 !nvvm.annotations = !{!0}
 
Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/IR/Assumptions.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
@@ -1450,6 +1451,12 @@
   }
 };
 
+/// The "omp_no_external_caller_in_target_region" assumption guarantees that
+/// there are no external caller of a function which are inside an OpenMP
+/// target region.
+static KnownAssumptionString
+    NoExternalCallerInTargetRegion("omp_no_external_caller_in_target_region");
+
 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
   if (!OMPInfoCache.ModuleSlice.count(&F))
     return nullptr;
@@ -1469,8 +1476,17 @@
     }
 
     CachedKernel = nullptr;
-    if (!F.hasLocalLinkage())
+    if (!F.hasLocalLinkage() &&
+        !hasAssumption(F, NoExternalCallerInTargetRegion)) {
+
+      // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "[OMP100] Potentially unknown OpenMP target region caller";
+      };
+      emitRemarkOnFunction(&F, "OMP100", Remark);
+
       return nullptr;
+    }
   }
 
   auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
Index: clang/include/clang/Basic/AttrDocs.td
===================================================================
--- clang/include/clang/Basic/AttrDocs.td
+++ clang/include/clang/Basic/AttrDocs.td
@@ -3945,9 +3945,13 @@
     "omp_no_openmp"
     "omp_no_openmp_routines"
     "omp_no_parallelism"
+    "omp_no_external_caller_in_target_region"
 
-The OpenMP standard defines the meaning of OpenMP assumptions ("omp_XYZ" is
-spelled "XYZ" in the `OpenMP 5.1 Standard`_).
+The OpenMP standard defines the meaning of the first three OpenMP assumptions
+("omp_XYZ" is spelled "XYZ" in the `OpenMP 5.1 Standard`_) while other are
+clang extensions spelled described `here
+<http://openmp.llvm.org/docs/remarks/OptimizationRemarks.html>`_ and spelled
+without the `omp` in the OpenMP assume directive syntax.
 
 .. _`OpenMP 5.1 Standard`: https://www.openmp.org/spec-html/5.1/openmpsu37.html#x56-560002.5.2
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to