jdoerfert created this revision.
jdoerfert added a project: OpenMP.
jdoerfert added reviewers: ABataev, arpith-jacob, guraypp, gtbercea, hfinkel.
Herald added a project: clang.

This commit adds the Clang code generation for OpenMP target
constructs based on the target region (TRegion) interface.

The interface was introduced in https://reviews.llvm.org/D59319 .

This target code generation is a vastly simplified clone of the NVPTX
code generation, but:

- there is no NVPTX (or other) target-specific code, at least there should not
be any. The "checkArchForUnifiedAddressing" functionality should therefore be
moved to a target-specific location later on.
- we provide hooks for subclasses to perform front-end analyses, as an
alternative to LLVM-based optimizations, e.g., to enable SPMD mode (see
isKnownSPMDMode, mayNeedRuntimeSupport, and mayPerformDataSharing, sketched
below).
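
For illustration, a condensed excerpt of these hooks as declared in
CGOpenMPRuntimeTRegion.h below; the defaults are conservative and a derived,
target-specific subclass may override them based on front-end (AST) analysis:

  // Condensed excerpt; the real class derives from CGOpenMPRuntime.
  class CGOpenMPRuntimeTRegion /* : public CGOpenMPRuntime */ {
  protected:
    /// Hook: AST checks that justify SPMD mode.
    virtual bool isKnownSPMDMode() const { return false; }

    /// Hook: AST checks that justify execution without runtime support.
    virtual bool mayNeedRuntimeSupport() const { return true; }

    /// Hook: AST checks that justify execution without data sharing support.
    virtual bool mayPerformDataSharing() const { return true; }
  };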

The interface is deliberately simple so that it is easily analyzable in the
middle end. Design decisions include:

- Hide all (complex) implementation choices in the runtime library but allow
complete removal of the abstraction once the runtime is inlined.
- Provide all runtime calls with sufficient, easily encoded information; a
condensed sketch of the interface is given below.
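
For reference, a condensed sketch of the three runtime interface functions
targeted by this code generation, taken from the comments in
CGOpenMPRuntimeTRegion.cpp below (see D59319 for the authoritative
declarations):

  #include <stdint.h>

  // Initialize the kernel; the generated kernel executes the target region
  // body only in threads for which this call returns 1.
  char __kmpc_target_region_kernel_init(bool UseSPMDMode,
                                        bool UseStateMachine,
                                        bool RequiresOMPRuntime,
                                        bool RequiresDataSharing);

  // Deinitialize the OpenMP state in the runtime; called by all active threads.
  void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
                                          bool RequiredOMPRuntime);

  // Work function invoked for a parallel region; the payloads carry the
  // shared and (first)private captured variables.
  typedef void (*ParallelWorkFnTy)(void *SharedVars, void *PrivateVars);

  void __kmpc_target_region_kernel_parallel(bool UseSPMDMode,
                                            bool RequiredOMPRuntime,
                                            ParallelWorkFnTy WorkFn,
                                            void *SharedVars,
                                            uint16_t SharedVarsBytes,
                                            void *PrivateVars,
                                            uint16_t PrivateVarsBytes,
                                            bool SharedPointers);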


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D59328

Files:
  clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
  clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
  clang/lib/CodeGen/CMakeLists.txt
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/OpenMP/target_tregion_no_SPMD_mode.c
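
For orientation, the kernel skeleton emitted for a non-SPMD target region
roughly corresponds to the following condensed pseudo-C++ (derived from
emitKernelHeader/emitKernelFooter and the CHECK lines in the test below; the
name kernel_skeleton is illustrative):

  char __kmpc_target_region_kernel_init(bool, bool, bool, bool);
  void __kmpc_target_region_kernel_deinit(bool, bool);

  void kernel_skeleton(void) {
    char ExecMode = __kmpc_target_region_kernel_init(
        /* UseSPMDMode */ false, /* UseStateMachine */ true,
        /* RequiresOMPRuntime */ true, /* RequiresDataSharing */ true);
    if (ExecMode == 1) {
      // ... target region body; parallel regions are funneled through
      // __kmpc_target_region_kernel_parallel(...) via generated wrappers ...
      __kmpc_target_region_kernel_deinit(/* UseSPMDMode */ false,
                                         /* RequiredOMPRuntime */ true);
    }
  }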

Index: clang/test/OpenMP/target_tregion_no_SPMD_mode.c
===================================================================
--- /dev/null
+++ clang/test/OpenMP/target_tregion_no_SPMD_mode.c
@@ -0,0 +1,72 @@
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -mllvm -openmp-tregion-runtime -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+
+// CHECK: loop_in_loop_in_tregion
+// CHECK:  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+// CHECK:  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+void loop_in_loop_in_tregion(int *A, int *B) {
+#pragma omp target
+  for (int i = 0; i < 512; i++) {
+    for (int j = 0; j < 1024; j++)
+      A[j] += B[i + j];
+  }
+}
+
+// CHECK: parallel_loops_and_accesses_in_tregion
+// CHECK:  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+// CHECK:  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false)
+// CHECK:  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false)
+// CHECK:  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false)
+// CHECK:  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+void parallel_loops_and_accesses_in_tregion(int *A, int *B) {
+#pragma omp target
+  {
+#pragma omp parallel for
+    for (int j = 0; j < 1024; j++)
+      A[j] += B[0 + j];
+#pragma omp parallel for
+    for (int j = 0; j < 1024; j++)
+      A[j] += B[1 + j];
+#pragma omp parallel for
+    for (int j = 0; j < 1024; j++)
+      A[j] += B[2 + j];
+
+    // This needs a guard in SPMD mode
+    A[0] = B[0];
+  }
+}
+
+void extern_func();
+static void parallel_loop(int *A, int *B, int i) {
+#pragma omp parallel for
+  for (int j = 0; j < 1024; j++)
+    A[j] += B[i + j];
+}
+
+// CHECK: parallel_loop_in_function_in_loop_with_global_acc_in_tregion
+// CHECK:  %1 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+// CHECK:  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+int Global[512];
+void parallel_loop_in_function_in_loop_with_global_acc_in_tregion(int *A, int *B) {
+#pragma omp target
+  for (int i = 0; i < 512; i++) {
+    parallel_loop(A, B, i);
+    Global[i]++;
+  }
+}
+
+// CHECK: parallel_loop
+// CHECK:  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false)
+
+// CHECK: parallel_loops_in_functions_and_extern_func_in_tregion
+// CHECK:  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+// CHECK:  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+void parallel_loops_in_functions_and_extern_func_in_tregion(int *A, int *B) {
+#pragma omp target
+  {
+    parallel_loop(A, B, 1);
+    parallel_loop(A, B, 2);
+    extern_func();
+    parallel_loop(A, B, 3);
+  }
+}
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -20,6 +20,7 @@
 #include "CGOpenCLRuntime.h"
 #include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeNVPTX.h"
+#include "CGOpenMPRuntimeTRegion.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
 #include "ConstantEmitter.h"
@@ -67,6 +68,11 @@
     llvm::cl::desc("Emit limited coverage mapping information (experimental)"),
     llvm::cl::init(false));
 
+static llvm::cl::opt<bool> UseGenericTRegionInterface(
+    "openmp-tregion-runtime", llvm::cl::ZeroOrMore, llvm::cl::Hidden,
+    llvm::cl::desc("Use the generic target region OpenMP runtime interface"),
+    llvm::cl::init(false));
+
 static const char AnnotationSection[] = "llvm.metadata";
 
 static CGCXXABI *createCXXABI(CodeGenModule &CGM) {
@@ -206,7 +212,10 @@
   case llvm::Triple::nvptx64:
     assert(getLangOpts().OpenMPIsDevice &&
            "OpenMP NVPTX is only prepared to deal with device code.");
-    OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
+    if (UseGenericTRegionInterface)
+      OpenMPRuntime.reset(new CGOpenMPRuntimeTRegion(*this));
+    else
+      OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
     break;
   default:
     if (LangOpts.OpenMPSimd)
Index: clang/lib/CodeGen/CMakeLists.txt
===================================================================
--- clang/lib/CodeGen/CMakeLists.txt
+++ clang/lib/CodeGen/CMakeLists.txt
@@ -69,6 +69,7 @@
   CGOpenCLRuntime.cpp
   CGOpenMPRuntime.cpp
   CGOpenMPRuntimeNVPTX.cpp
+  CGOpenMPRuntimeTRegion.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
   CGStmtOpenMP.cpp
Index: clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
===================================================================
--- /dev/null
+++ clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
@@ -0,0 +1,260 @@
+//===-- CGOpenMPRuntimeTRegion.h --- OpenMP RT TRegion interface codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code generation interface for OpenMP target offloading through the generic
+// Target Region (TRegion) interface.
+//
+// See openmp/libomptarget/deviceRTLs/common/target_Region.h for further
+// information on the interface functions and their intended use.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
+
+#include "CGOpenMPRuntime.h"
+#include "llvm/ADT/SmallBitVector.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeTRegion : public CGOpenMPRuntime {
+  // TODO: The target region interface only covers kernels for now. The
+  //       codegen therefore implicitly assumes that the target region kernel
+  //       interface is targeted. Once a second target region interface is put
+  //       in place, e.g., one specialized to many-core offloading, we might
+  //       need to make the targeted interface explicit.
+
+  /// Create an outlined function for a target kernel.
+  ///
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+                  const RegionCodeGenTy &CodeGen);
+
+  /// Emit the entry header of a target directive kernel (generic mode).
+  void emitKernelHeader(CodeGenFunction &CGF, llvm::BasicBlock *&ExitBB);
+
+  /// Signal termination of generic mode execution.
+  void emitKernelFooter(CodeGenFunction &CGF, llvm::BasicBlock *ExitBB);
+
+  //
+  // Base class overrides.
+  //
+
+  /// Creates offloading entry for the provided entry ID \a ID,
+  /// address \a Addr, size \a Size, and flags \a Flags.
+  void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
+                          uint64_t Size, int32_t Flags,
+                          llvm::GlobalValue::LinkageTypes Linkage) override;
+
+  /// Emit outlined function for 'target' directive.
+  ///
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitTargetOutlinedFunction(const OMPExecutableDirective &D,
+                                  StringRef ParentName,
+                                  llvm::Function *&OutlinedFn,
+                                  llvm::Constant *&OutlinedFnID,
+                                  bool IsOffloadEntry,
+                                  const RegionCodeGenTy &CodeGen) override;
+
+protected:
+  /// Get the function name of an outlined region, customized to the target.
+  StringRef getOutlinedHelperName() const override { return ".omp_TRegion."; }
+
+public:
+  explicit CGOpenMPRuntimeTRegion(CodeGenModule &CGM);
+
+  /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
+  /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
+  virtual void emitProcBindClause(CodeGenFunction &CGF,
+                                  OpenMPProcBindClauseKind ProcBind,
+                                  SourceLocation Loc) override;
+
+  /// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
+  /// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
+  /// clause.
+  /// \param NumThreads An integer value of threads.
+  virtual void emitNumThreadsClause(CodeGenFunction &CGF,
+                                    llvm::Value *NumThreads,
+                                    SourceLocation Loc) override;
+
+  /// Set the number of teams to \p NumTeams and the thread limit to
+  /// \p ThreadLimit.
+  ///
+  /// \param NumTeams An integer expression of teams.
+  /// \param ThreadLimit An integer expression of threads.
+  void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
+                          const Expr *ThreadLimit, SourceLocation Loc) override;
+
+  /// Emits an outlined function for the specified OpenMP parallel directive
+  /// \a D.
+  /// This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Function *
+  emitParallelOutlinedFunction(const OMPExecutableDirective &D,
+                               const VarDecl *ThreadIDVar,
+                               OpenMPDirectiveKind InnermostKind,
+                               const RegionCodeGenTy &CodeGen) override;
+
+  /// Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                        llvm::Function *OutlinedFn,
+                        ArrayRef<llvm::Value *> CapturedVars,
+                        const Expr *IfCond) override;
+
+  /// Emits a critical region.
+  /// \param CriticalName Name of the critical region.
+  /// \param CriticalOpGen Generator for the statement associated with the given
+  /// critical region.
+  /// \param Hint Value of the 'hint' clause (optional).
+  void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName,
+                          const RegionCodeGenTy &CriticalOpGen,
+                          SourceLocation Loc,
+                          const Expr *Hint = nullptr) override;
+
+  /// Emit code for the reduction clause.
+  ///
+  /// \param Privates List of private copies for original reduction arguments.
+  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  /// \param Options List of options for reduction codegen:
+  ///     WithNowait true if parent directive has also nowait clause, false
+  ///     otherwise.
+  ///     SimpleReduction Emit reduction operation only. Used for omp simd
+  ///     directive on the host.
+  ///     ReductionKind The kind of reduction to perform.
+  virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
+                             ArrayRef<const Expr *> Privates,
+                             ArrayRef<const Expr *> LHSExprs,
+                             ArrayRef<const Expr *> RHSExprs,
+                             ArrayRef<const Expr *> ReductionOps,
+                             ReductionOptionsTy Options) override;
+
+  /// Emits OpenMP-specific function prolog.
+  /// Required for device constructs.
+  void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override;
+
+  /// Cleans up references to the objects in the finished function.
+  ///
+  void functionFinished(CodeGenFunction &CGF) override;
+
+  /// Choose a default value for the dist_schedule clause.
+  void
+  getDefaultDistScheduleAndChunk(CodeGenFunction &CGF,
+                                 const OMPLoopDirective &S,
+                                 OpenMPDistScheduleClauseKind &ScheduleKind,
+                                 llvm::Value *&Chunk) const override;
+
+  /// Choose a default value for the schedule clause.
+  void getDefaultScheduleAndChunk(CodeGenFunction &CGF,
+                                  const OMPLoopDirective &S,
+                                  OpenMPScheduleClauseKind &ScheduleKind,
+                                  const Expr *&ChunkExpr) const override;
+
+  /// Perform a check on the requires decl to ensure that the target
+  /// architecture supports unified addressing.
+  void checkArchForUnifiedAddressing(CodeGenModule &CGM,
+                                     const OMPRequiresDecl *D) const override;
+
+  /// Emits an outlined function for the specified OpenMP teams directive
+  /// \a D.
+  /// This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Function *
+  emitTeamsOutlinedFunction(const OMPExecutableDirective &D,
+                            const VarDecl *ThreadIDVar,
+                            OpenMPDirectiveKind InnermostKind,
+                            const RegionCodeGenTy &CodeGen) override;
+
+  /// Emits code for teams call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run by team masters. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  ///
+  void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
+                     SourceLocation Loc, llvm::Function *OutlinedFn,
+                     ArrayRef<llvm::Value *> CapturedVars) override;
+
+protected:
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// SPMD mode.
+  virtual bool isKnownSPMDMode() const { return false; }
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// execution without runtime support.
+  virtual bool mayNeedRuntimeSupport() const { return true; }
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// execution without data sharing support.
+  virtual bool mayPerformDataSharing() const { return true; }
+
+private:
+  /// Simple container for a wrapper of an outlined parallel function and the
+  /// layout of the passed variables (= captured variables, both shared and
+  /// firstprivate).
+  struct WrapperInfo {
+    llvm::Function *WrapperFn = nullptr;
+    llvm::StructType *SharedVarsStructTy = nullptr;
+    llvm::StructType *PrivateVarsStructTy = nullptr;
+    llvm::SmallBitVector CaptureIsPrivate;
+  };
+
+  /// Map an outlined function to its wrapper and variable struct types. The
+  /// latter define the layout of the payload and the wrapper will unpack that
+  /// payload and pass the values to the outlined function.
+  llvm::DenseMap<llvm::Function *, WrapperInfo> WrapperInfoMap;
+
+  /// Emit a function that wraps the outlined parallel region
+  /// and controls the parameters that are passed to this function.
+  /// The wrapper ensures that the outlined function is called
+  /// with the correct arguments when data is shared.
+  void createParallelDataSharingWrapper(llvm::Function *OutlinedParallelFn,
+                                        const OMPExecutableDirective &D);
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
Index: clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
===================================================================
--- /dev/null
+++ clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
@@ -0,0 +1,712 @@
+//===-- CGOpenMPRuntimeTRegion.cpp - OpenMP RT TRegion interface codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the code generation interface for OpenMP target offloading
+// through the Target Region (TRegion) interface.
+//
+// See the file comment in CGOpenMPRuntimeTRegion.h for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeTRegion.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtVisitor.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+namespace {
+
+/// Enums for all functions in the target region runtime library interface.
+enum OpenMPTargetRuntimeLibraryCalls {
+  OMPRTL__kmpc_target_region_kernel_init,
+  OMPRTL__kmpc_target_region_kernel_deinit,
+  OMPRTL__kmpc_target_region_kernel_parallel,
+};
+
+/// Return the runtime function declaration specified by \p Function.
+static llvm::Function *getOrCreateRuntimeFunctionDeclaration(
+    CGOpenMPRuntimeTRegion &CG, CodeGenModule &CGM,
+    OpenMPTargetRuntimeLibraryCalls Function) {
+
+  llvm::Function *RTFn;
+  auto *I1Ty = llvm::IntegerType::getInt1Ty(CGM.getLLVMContext());
+  switch (Function) {
+  case OMPRTL__kmpc_target_region_kernel_init: {
+    // char __kmpc_target_region_kernel_init(bool UseSPMDMode,
+    //                                       bool UseStateMachine,
+    //                                       bool RequiresOMPRuntime,
+    //                                       bool RequiresDataSharing);
+    llvm::Type *TypeParams[] = {I1Ty, I1Ty, I1Ty, I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.Int8Ty, TypeParams, /* isVarArg */ false);
+    RTFn = cast<llvm::Function>(
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_init")
+            .getCallee());
+    break;
+  }
+  case OMPRTL__kmpc_target_region_kernel_deinit: {
+    // void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+    //                                         bool RequiredOMPRuntime);
+    llvm::Type *TypeParams[] = {I1Ty, I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false);
+    RTFn = cast<llvm::Function>(
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_deinit")
+            .getCallee());
+    break;
+  }
+  case OMPRTL__kmpc_target_region_kernel_parallel: {
+    // typedef void (*ParallelWorkFnTy)(void *, void *);
+    auto *ParWorkFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, {CGM.VoidPtrTy, CGM.VoidPtrTy},
+                                /* isVarArg */ false);
+    //
+    // void __kmpc_target_region_kernel_parallel(bool UseSPMDMode,
+    //                                           bool RequiredOMPRuntime,
+    //                                           ParallelWorkFnTy WorkFn,
+    //                                           void *SharedVars,
+    //                                           uint16_t SharedVarsBytes,
+    //                                           void *PrivateVars,
+    //                                           uint16_t PrivateVarsBytes,
+    //                                           bool SharedPointers);
+    llvm::Type *TypeParams[] = {
+        I1Ty,          I1Ty,        ParWorkFnTy->getPointerTo(),
+        CGM.VoidPtrTy, CGM.Int16Ty, CGM.VoidPtrTy,
+        CGM.Int16Ty,   I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false);
+
+    RTFn = cast<llvm::Function>(
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_parallel")
+            .getCallee());
+
+    RTFn->addParamAttr(2, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(3, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(5, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(5, llvm::Attribute::ReadOnly);
+
+    // Add the callback metadata if it is not present already.
+    if (!RTFn->hasMetadata(llvm::LLVMContext::MD_callback)) {
+      llvm::LLVMContext &Ctx = RTFn->getContext();
+      llvm::MDBuilder MDB(Ctx);
+      // Annotate the callback behavior of __kmpc_target_region_kernel_parallel:
+      //  - The callback callee is WorkFn, argument 2 (counting from 0).
+      //  - The first callback payload is SharedVars.
+      //  - The second callback payload is PrivateVars.
+      RTFn->addMetadata(
+          llvm::LLVMContext::MD_callback,
+          *llvm::MDNode::get(
+              Ctx, {MDB.createCallbackEncoding(2, {3, 5},
+                                               /* VarArgsArePassed */ false)}));
+    }
+    break;
+  }
+  }
+
+  // TODO: Remove all globals and set this attribute.
+  //
+  // This is overwritten when the definition is linked in.
+  // RTFn->addFnAttr(llvm::Attribute::InaccessibleMemOrArgMemOnly);
+
+  return RTFn;
+}
+
+} // anonymous namespace
+
+void CGOpenMPRuntimeTRegion::emitKernel(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        const RegionCodeGenTy &CodeGen) {
+  WrapperInfoMap.clear();
+
+  // Emit target region as a standalone region.
+  class KernelPrePostActionTy : public PrePostActionTy {
+    CGOpenMPRuntimeTRegion &RT;
+    llvm::BasicBlock *ExitBB;
+
+  public:
+    KernelPrePostActionTy(CGOpenMPRuntimeTRegion &RT)
+        : RT(RT), ExitBB(nullptr) {}
+
+    void Enter(CodeGenFunction &CGF) override {
+      RT.emitKernelHeader(CGF, ExitBB);
+      // Skip target region initialization.
+      RT.setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true);
+    }
+
+    void Exit(CodeGenFunction &CGF) override {
+      RT.clearLocThreadIdInsertPt(CGF);
+      RT.emitKernelFooter(CGF, ExitBB);
+    }
+
+  } Action(*this);
+  CodeGen.setAction(Action);
+
+  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+                                   /* IsOffloadEntry */ true, CodeGen);
+}
+
+void CGOpenMPRuntimeTRegion::emitKernelHeader(CodeGenFunction &CGF,
+                                              llvm::BasicBlock *&ExitBB) {
+  CGBuilderTy &Bld = CGF.Builder;
+
+  // Setup BBs in entry function.
+  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
+  ExitBB = CGF.createBasicBlock(".exit");
+
+  llvm::Value *Args[] = {
+      /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()),
+      /* UseStateMachine */ Bld.getInt1(1),
+      /* RequiresOMPRuntime */
+      Bld.getInt1(mayNeedRuntimeSupport()),
+      /* RequiresDataSharing */ Bld.getInt1(mayPerformDataSharing())};
+  llvm::CallInst *InitCI = CGF.EmitRuntimeCall(
+      getOrCreateRuntimeFunctionDeclaration(
+          *this, CGM, OMPRTL__kmpc_target_region_kernel_init),
+      Args);
+
+  llvm::Value *ExecuteCnd = Bld.CreateICmpEQ(InitCI, Bld.getInt8(1));
+
+  Bld.CreateCondBr(ExecuteCnd, ExecuteBB, ExitBB);
+  CGF.EmitBlock(ExecuteBB);
+}
+
+void CGOpenMPRuntimeTRegion::emitKernelFooter(CodeGenFunction &CGF,
+                                              llvm::BasicBlock *ExitBB) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
+  CGF.EmitBranch(OMPDeInitBB);
+
+  CGF.EmitBlock(OMPDeInitBB);
+
+  CGBuilderTy &Bld = CGF.Builder;
+  // DeInitialize the OMP state in the runtime; called by all active threads.
+  llvm::Value *Args[] = {/* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()),
+                         /* RequiredOMPRuntime */
+                         Bld.getInt1(mayNeedRuntimeSupport())};
+
+  CGF.EmitRuntimeCall(getOrCreateRuntimeFunctionDeclaration(
+                          *this, CGM, OMPRTL__kmpc_target_region_kernel_deinit),
+                      Args);
+
+  CGF.EmitBranch(ExitBB);
+  CGF.EmitBlock(ExitBB);
+}
+
+void CGOpenMPRuntimeTRegion::emitTargetOutlinedFunction(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  if (!IsOffloadEntry) // Nothing to do.
+    return;
+
+  assert(!ParentName.empty() && "Invalid target region parent name!");
+
+  emitKernel(D, ParentName, OutlinedFn, OutlinedFnID, CodeGen);
+
+  // Create a unique global variable to indicate the execution mode of this
+  // target region. The execution mode is either 'non-SPMD' or 'SPMD'. Initially
+  // all regions are executed in non-SPMD mode. This variable is picked up by
+  // the offload library to set up the device appropriately before kernel launch.
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty, /* isConstant */ true,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::ConstantInt::get(CGM.Int8Ty, 1),
+      Twine(OutlinedFn->getName(), "_exec_mode"));
+  CGM.addCompilerUsedGlobal(GVMode);
+}
+
+CGOpenMPRuntimeTRegion::CGOpenMPRuntimeTRegion(CodeGenModule &CGM)
+    : CGOpenMPRuntime(CGM, "_", "$") {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("TRegion code generation does only handle device code!");
+}
+
+void CGOpenMPRuntimeTRegion::emitProcBindClause(
+    CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind,
+    SourceLocation Loc) {
+  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
+}
+
+void CGOpenMPRuntimeTRegion::emitNumThreadsClause(CodeGenFunction &CGF,
+                                                  llvm::Value *NumThreads,
+                                                  SourceLocation Loc) {
+  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
+}
+
+void CGOpenMPRuntimeTRegion::emitNumTeamsClause(CodeGenFunction &CGF,
+                                                const Expr *NumTeams,
+                                                const Expr *ThreadLimit,
+                                                SourceLocation Loc) {
+  // Nothing to do for kernel mode, no other modes supported yet.
+}
+
+llvm::Function *CGOpenMPRuntimeTRegion::emitParallelOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+  // Emit target region as a standalone region.
+  llvm::Function *OutlinedFun =
+      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
+          D, ThreadIDVar, InnermostKind, CodeGen));
+
+  createParallelDataSharingWrapper(OutlinedFun, D);
+
+  return OutlinedFun;
+}
+
+// TODO: Check if this is actually needed.
+static const ValueDecl *getUnderlyingVar(const Expr *E) {
+  E = E->IgnoreParens();
+  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) {
+    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+      Base = TempASE->getBase()->IgnoreParenImpCasts();
+    E = Base;
+  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(E)) {
+    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+      Base = TempOASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+      Base = TempASE->getBase()->IgnoreParenImpCasts();
+    E = Base;
+  }
+  E = E->IgnoreParenImpCasts();
+  if (const auto *DE = dyn_cast<DeclRefExpr>(E))
+    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
+  const auto *ME = cast<MemberExpr>(E);
+  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
+}
+
+void CGOpenMPRuntimeTRegion::createParallelDataSharingWrapper(
+    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
+  ASTContext &Ctx = CGM.getContext();
+  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
+
+  // Create a wrapper function taking pointers to the shared and private vars.
+  FunctionArgList WrapperArgs;
+  ImplicitParamDecl SharedVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(),
+                                      /* Id */ nullptr, Ctx.VoidPtrTy,
+                                      ImplicitParamDecl::Other);
+  ImplicitParamDecl PrivateVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(),
+                                       /* Id */ nullptr, Ctx.VoidPtrTy,
+                                       ImplicitParamDecl::Other);
+  WrapperArgs.emplace_back(&SharedVarsArgDecl);
+  WrapperArgs.emplace_back(&PrivateVarsArgDecl);
+
+  const CGFunctionInfo &CGFI =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
+
+  auto *WrapperFn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), WrapperFn, CGFI);
+
+  OutlinedParallelFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+  OutlinedParallelFn->setDoesNotRecurse();
+  WrapperFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+  WrapperFn->setDoesNotRecurse();
+
+  CodeGenFunction CGF(CGM, /* suppressNewContext */ true);
+  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WrapperFn, CGFI, WrapperArgs,
+                    D.getBeginLoc(), D.getBeginLoc());
+
+  auto AI = WrapperFn->arg_begin();
+  llvm::Argument &SharedVarsArg = *(AI++);
+  llvm::Argument &PrivateVarsArg = *(AI);
+  SharedVarsArg.setName("shared_vars");
+  PrivateVarsArg.setName("private_vars");
+
+  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+                                           /* DestWidth */ 32, /* Signed */ 1),
+                                       /* Name */ ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/* C */ 0));
+
+  setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true);
+
+  // Create the array of arguments and fill it with boilerplate values.
+  SmallVector<llvm::Value *, 8> Args;
+  Args.emplace_back(emitThreadIDAddress(CGF, D.getBeginLoc()).getPointer());
+  Args.emplace_back(ZeroAddr.getPointer());
+
+  CGBuilderTy &Bld = CGF.Builder;
+
+  // Collect all variables marked as shared.
+  llvm::SmallPtrSet<const ValueDecl *, 16> SharedVars;
+  for (const auto *C : D.getClausesOfKind<OMPSharedClause>())
+    for (const Expr *E : C->getVarRefs())
+      SharedVars.insert(getUnderlyingVar(E));
+
+  // Retrieve the shared and private variables from argument pointers and pass
+  // them to the outlined function.
+  llvm::SmallVector<llvm::Type *, 8> SharedStructMemberTypes;
+  llvm::SmallVector<llvm::Type *, 8> PrivateStructMemberTypes;
+
+  WrapperInfo &WI = WrapperInfoMap[OutlinedParallelFn];
+  WI.WrapperFn = WrapperFn;
+
+  auto ArgIt = OutlinedParallelFn->arg_begin() + /* global_tid & bound_tid */ 2;
+
+  // If we require loop bounds they are already part of the outlined function
+  // encoding, just after global_tid and bound_tid.
+  bool RequiresLoopBounds =
+      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind());
+  if (RequiresLoopBounds) {
+    // Register the lower bound in the wrapper info.
+    WI.CaptureIsPrivate.push_back(true);
+    PrivateStructMemberTypes.push_back((ArgIt++)->getType());
+    // Register the upper bound in the wrapper info.
+    WI.CaptureIsPrivate.push_back(true);
+    PrivateStructMemberTypes.push_back((ArgIt++)->getType());
+  }
+
+  auto CIt = CS.capture_begin();
+  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CIt) {
+    bool IsPrivate = CIt->capturesVariableArrayType() ||
+                     CIt->capturesVariableByCopy() ||
+                     !SharedVars.count(CIt->getCapturedVar());
+    WI.CaptureIsPrivate.push_back(IsPrivate);
+
+    auto &StructMemberTypes =
+        IsPrivate ? PrivateStructMemberTypes : SharedStructMemberTypes;
+    llvm::Type *ArgTy = (ArgIt++)->getType();
+    if (!IsPrivate) {
+      assert(ArgTy->isPointerTy());
+      ArgTy = ArgTy->getPointerElementType();
+    }
+    StructMemberTypes.push_back(ArgTy);
+  }
+
+  // Verify the position of the outlined function argument iterator as a sanity
+  // check.
+  assert(ArgIt == OutlinedParallelFn->arg_end() &&
+         "Not all arguments have been processed!");
+
+  llvm::Value *SharedVarsStructPtr = nullptr;
+  llvm::Value *PrivateVarsStructPtr = nullptr;
+  llvm::LLVMContext &LLVMCtx = OutlinedParallelFn->getContext();
+  if (!PrivateStructMemberTypes.empty()) {
+    WI.PrivateVarsStructTy = llvm::StructType::create(
+        LLVMCtx, PrivateStructMemberTypes, "omp.private.struct");
+    PrivateVarsStructPtr = Bld.CreateBitCast(
+        &PrivateVarsArg, WI.PrivateVarsStructTy->getPointerTo());
+  }
+  if (!SharedStructMemberTypes.empty()) {
+    WI.SharedVarsStructTy = llvm::StructType::create(
+        LLVMCtx, SharedStructMemberTypes, "omp.shared.struct");
+    SharedVarsStructPtr = Bld.CreateBitCast(
+        &SharedVarsArg, WI.SharedVarsStructTy->getPointerTo());
+  }
+
+  assert(WI.CaptureIsPrivate.size() + /* global_tid & bound_tid */ 2 ==
+             OutlinedParallelFn->arg_size() &&
+         "Not all arguments have been processed!");
+
+  unsigned PrivateIdx = 0, SharedIdx = 0;
+  for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+    bool IsPrivate = WI.CaptureIsPrivate[i];
+
+    llvm::Value *StructPtr =
+        IsPrivate ? PrivateVarsStructPtr : SharedVarsStructPtr;
+    unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx;
+
+    // TODO: Figure out the real alignment
+    if (IsPrivate) {
+      Args.emplace_back(
+          Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1));
+    } else {
+      llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++);
+      Args.emplace_back(GEP);
+    }
+  }
+
+  assert(Args.size() == OutlinedParallelFn->arg_size());
+  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
+
+  CGF.FinishFunction();
+
+  clearLocThreadIdInsertPt(CGF);
+}
+
+void CGOpenMPRuntimeTRegion::emitParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *Fn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  const WrapperInfo &WI = WrapperInfoMap[Fn];
+
+  auto &&ParGen = [this, CapturedVars, WI](CodeGenFunction &CGF,
+                                           PrePostActionTy &) {
+    CGBuilderTy &Bld = CGF.Builder;
+    assert(WI.WrapperFn && "Wrapper function does not exist!");
+
+    llvm::Value *SharedVarsSize = llvm::Constant::getNullValue(CGM.Int16Ty);
+    llvm::Value *PrivateVarsSize = SharedVarsSize;
+    llvm::Value *SharedStructAlloca = llvm::UndefValue::get(CGM.VoidPtrTy);
+    llvm::Value *PrivateStructAlloca = SharedStructAlloca;
+
+    if (WI.SharedVarsStructTy) {
+      SharedStructAlloca = CGF.CreateDefaultAlignTempAlloca(
+                                  WI.SharedVarsStructTy, ".shared.vars")
+                               .getPointer();
+      const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout();
+      SharedVarsSize = Bld.getInt16(DL.getTypeAllocSize(WI.SharedVarsStructTy));
+    }
+    if (WI.PrivateVarsStructTy) {
+      PrivateStructAlloca = CGF.CreateDefaultAlignTempAlloca(
+                                   WI.PrivateVarsStructTy, ".private.vars")
+                                .getPointer();
+      const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout();
+      PrivateVarsSize =
+          Bld.getInt16(DL.getTypeAllocSize(WI.PrivateVarsStructTy));
+    }
+
+    llvm::SmallVector<llvm::Value *, 4> Args;
+    Args.push_back(
+        /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()));
+    Args.push_back(
+        /* RequiredOMPRuntime */ Bld.getInt1(mayNeedRuntimeSupport()));
+    Args.push_back(WI.WrapperFn);
+    Args.push_back(CGF.EmitCastToVoidPtr(SharedStructAlloca));
+    Args.push_back(SharedVarsSize);
+    Args.push_back(CGF.EmitCastToVoidPtr(PrivateStructAlloca));
+    Args.push_back(PrivateVarsSize);
+    Args.push_back(
+        /* SharedPointers */ Bld.getInt1(0));
+
+    assert((CapturedVars.empty() ||
+            (WI.SharedVarsStructTy || WI.PrivateVarsStructTy)) &&
+           "Expected the shared or private struct type to be set if variables "
+           "are captured!");
+    assert((CapturedVars.empty() ||
+            CapturedVars.size() ==
+                (WI.SharedVarsStructTy ? WI.SharedVarsStructTy->getNumElements()
+                                       : 0) +
+                    (WI.PrivateVarsStructTy
+                         ? WI.PrivateVarsStructTy->getNumElements()
+                         : 0)) &&
+           "# elements in shared struct types should be number of captured "
+           "variables!");
+
+    // Store all captured variables into the local shared and private
+    // structures that are then passed to the runtime library.
+    unsigned PrivateIdx = 0, SharedIdx = 0;
+    for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+      bool IsPrivate = WI.CaptureIsPrivate[i];
+
+      llvm::Value *StructPtr =
+          IsPrivate ? PrivateStructAlloca : SharedStructAlloca;
+      unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx;
+      llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++);
+      llvm::Value *Var = IsPrivate ? CapturedVars[i]
+                                   : Bld.CreateAlignedLoad(CapturedVars[i], 1);
+      Bld.CreateDefaultAlignedStore(Var, GEP);
+    }
+
+    CGF.EmitRuntimeCall(
+        getOrCreateRuntimeFunctionDeclaration(
+            *this, CGM, OMPRTL__kmpc_target_region_kernel_parallel),
+        Args);
+
+    SharedIdx = 0;
+    for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+      bool IsPrivate = WI.CaptureIsPrivate[i];
+      if (IsPrivate)
+        continue;
+
+      llvm::Value *GEP = Bld.CreateStructGEP(SharedStructAlloca, SharedIdx++);
+      llvm::Value *Var = Bld.CreateAlignedLoad(GEP, 1);
+      Bld.CreateDefaultAlignedStore(Var, CapturedVars[i]);
+    }
+  };
+
+  auto &&SeqGen = [this, &ParGen, Loc](CodeGenFunction &CGF,
+                                       PrePostActionTy &Action) {
+    // Use an artificial "num_threads(1)" clause to force sequential execution
+    // if the expression in the 'if' clause evaluates to false. We expect the
+    // middle-end to clean this up.
+    emitNumThreadsClause(CGF, CGF.Builder.getInt32(/* C */ 1), Loc);
+    ParGen(CGF, Action);
+  };
+
+  if (IfCond) {
+    emitOMPIfClause(CGF, IfCond, ParGen, SeqGen);
+  } else {
+    CodeGenFunction::RunCleanupsScope Scope(CGF);
+    RegionCodeGenTy ThenRCG(ParGen);
+    ThenRCG(CGF);
+  }
+}
+
+llvm::Function *CGOpenMPRuntimeTRegion::emitTeamsOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+  // Emit target region as a standalone region.
+  llvm::Function *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
+      D, ThreadIDVar, InnermostKind, CodeGen);
+
+  return OutlinedFunVal;
+}
+
+void CGOpenMPRuntimeTRegion::emitTeamsCall(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc,
+    llvm::Function *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  Address ZeroAddr = CGF.CreateMemTemp(
+      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
+      /*Name*/ ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
+  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
+}
+
+void CGOpenMPRuntimeTRegion::emitCriticalRegion(
+    CodeGenFunction &CGF, StringRef CriticalName,
+    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
+    const Expr *Hint) {
+  llvm_unreachable(
+      "TODO: TRegion code generation does not support critical regions yet!");
+}
+
+void CGOpenMPRuntimeTRegion::emitReduction(
+    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
+    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
+    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
+  llvm_unreachable(
+      "TODO: TRegion code generation does not support reductions yet!");
+}
+
+void CGOpenMPRuntimeTRegion::emitFunctionProlog(CodeGenFunction &CGF,
+                                                const Decl *D) {}
+
+void CGOpenMPRuntimeTRegion::functionFinished(CodeGenFunction &CGF) {
+  CGOpenMPRuntime::functionFinished(CGF);
+}
+
+void CGOpenMPRuntimeTRegion::getDefaultDistScheduleAndChunk(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const {
+  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
+}
+
+void CGOpenMPRuntimeTRegion::getDefaultScheduleAndChunk(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const {
+  ScheduleKind = OMPC_SCHEDULE_static;
+  // Chunk size is 1 in this case.
+  llvm::APInt ChunkSize(32, 1);
+  ChunkExpr = IntegerLiteral::Create(
+      CGF.getContext(), ChunkSize,
+      CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ 0),
+      SourceLocation());
+}
+
+// ------------------------------------------------------------------------ //
+// TODO: The following CUDA-specific part should live somewhere else,
+//       potentially in a derived class.
+
+void CGOpenMPRuntimeTRegion::createOffloadEntry(
+    llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t,
+    llvm::GlobalValue::LinkageTypes) {
+  // TODO: Add support for global variables on the device after declare target
+  // support.
+  if (!isa<llvm::Function>(Addr))
+    return;
+  llvm::Module &M = CGM.getModule();
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+
+  // Get "nvvm.annotations" metadata node
+  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
+
+  llvm::Metadata *MDVals[] = {
+      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
+      llvm::ConstantAsMetadata::get(
+          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
+  // Append metadata to nvvm.annotations
+  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
+}
+
+#include "clang/Basic/Cuda.h"
+
+// Get current CudaArch and ignore any unknown values
+static CudaArch getCudaArch(CodeGenModule &CGM) {
+  if (!CGM.getTarget().hasFeature("ptx"))
+    return CudaArch::UNKNOWN;
+  llvm::StringMap<bool> Features;
+  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
+                                 CGM.getTarget().getTargetOpts().CPU,
+                                 CGM.getTarget().getTargetOpts().Features);
+  for (const auto &Feature : Features) {
+    if (Feature.getValue()) {
+      CudaArch Arch = StringToCudaArch(Feature.getKey());
+      if (Arch != CudaArch::UNKNOWN)
+        return Arch;
+    }
+  }
+  return CudaArch::UNKNOWN;
+}
+
+/// Check to see if target architecture supports unified addressing which is
+/// a restriction for OpenMP requires clause "unified_shared_memory".
+void CGOpenMPRuntimeTRegion::checkArchForUnifiedAddressing(
+    CodeGenModule &CGM, const OMPRequiresDecl *D) const {
+  for (const OMPClause *Clause : D->clauselists()) {
+    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
+      switch (getCudaArch(CGM)) {
+      case CudaArch::SM_20:
+      case CudaArch::SM_21:
+      case CudaArch::SM_30:
+      case CudaArch::SM_32:
+      case CudaArch::SM_35:
+      case CudaArch::SM_37:
+      case CudaArch::SM_50:
+      case CudaArch::SM_52:
+      case CudaArch::SM_53:
+      case CudaArch::SM_60:
+      case CudaArch::SM_61:
+      case CudaArch::SM_62:
+        CGM.Error(Clause->getBeginLoc(),
+                  "Target architecture does not support unified addressing");
+        return;
+      case CudaArch::SM_70:
+      case CudaArch::SM_72:
+      case CudaArch::SM_75:
+      case CudaArch::GFX600:
+      case CudaArch::GFX601:
+      case CudaArch::GFX700:
+      case CudaArch::GFX701:
+      case CudaArch::GFX702:
+      case CudaArch::GFX703:
+      case CudaArch::GFX704:
+      case CudaArch::GFX801:
+      case CudaArch::GFX802:
+      case CudaArch::GFX803:
+      case CudaArch::GFX810:
+      case CudaArch::GFX900:
+      case CudaArch::GFX902:
+      case CudaArch::GFX904:
+      case CudaArch::GFX906:
+      case CudaArch::GFX909:
+      case CudaArch::UNKNOWN:
+        break;
+      case CudaArch::LAST:
+        llvm_unreachable("Unexpected Cuda arch.");
+      }
+    }
+  }
+}