[PATCH] D29910: [OpenMP] Specialize default schedule on a worksharing loop on the NVPTX device.

Arpith Jacob via Phabricator via cfe-commits Mon, 13 Feb 2017 13:49:03 -0800

arpith-jacob created this revision.

The default schedule type on a worksharing loop is implementation
defined according to the OpenMP specifications.  Currently, the
compiler codegens a doubly nested loop that effectively implements
a schedule of type (static).  This is ideal for threads on CPUs.


On the NVPTX and other SIMT GPUs, this schedule provides very poor
performance because consecutive threads in a warp access loop arrays
in a non-coalesced manner.  That is, to achieve coalescing, and good
performance, the best schedule is static with a chunk size of 1.

This patch adds support for target devices to select the best default
schedule depending on their architecture.  It modifies loop codegen
to generate optimized code for (static,1) on the NVPTX device, i.e.,
by using a single loop instead of a doubly nested loop as is
currently the case.


https://reviews.llvm.org/D29910

Files:
  include/clang/AST/StmtOpenMP.h
  include/clang/Basic/OpenMPKinds.h
  lib/AST/StmtOpenMP.cpp
  lib/Basic/OpenMPKinds.cpp
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/Sema/SemaOpenMP.cpp
  test/OpenMP/nvptx_coalesced_scheduling_codegen.cpp

Index: test/OpenMP/nvptx_coalesced_scheduling_codegen.cpp
===================================================================
--- /dev/null
+++ test/OpenMP/nvptx_coalesced_scheduling_codegen.cpp
@@ -0,0 +1,322 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check that the execution mode of the target regions on the gpu is set to the right mode.
+// CHECK-DAG: {{@__omp_offloading_.+l19}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate() {
+  tx a[100];
+  tx b[10][10];
+
+  #pragma omp target parallel
+  {
+    #pragma omp for
+    for (int i = 0; i < 99; i++) {
+      a[i] = 1;
+    }
+
+    #pragma omp for schedule(auto)
+    for (int i = 0; i < 98; i++) {
+      a[i] = 2;
+    }
+
+    #pragma omp for schedule(static,1)
+    for (int i = 0; i < 97; i++) {
+      a[i] = 3;
+    }
+
+    #pragma omp for schedule(static,2)
+    for (int i = 0; i < 96; i++) {
+      a[i] = 1;
+    }
+
+    #pragma omp for schedule(static)
+    for (int i = 0; i < 95; i++) {
+      a[i] = 1;
+    }
+
+    #pragma omp for schedule(auto) ordered
+    for (int i = 0; i < 94; i++) {
+      a[i] = 1;
+    }
+
+    #pragma omp for schedule(runtime)
+    for (int i = 0; i < 93; i++) {
+      a[i] = 1;
+    }
+
+    #pragma omp for schedule(dynamic)
+    for (int i = 0; i < 92; i++) {
+      a[i] = 1;
+    }
+
+    #pragma omp for schedule(guided)
+    for (int i = 0; i < 91; i++) {
+      a[i] = 1;
+    }
+  }
+
+  return a[0] + b[9][9];
+}
+
+int bar(){
+  int a = 0;
+
+  a += ftemplate<int>();
+
+  return a;
+}
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l19}}(
+  // CHECK: call void @__kmpc_spmd_kernel_init(
+  // CHECK: br label {{%?}}[[EXEC:.+]]
+  //
+  // CHECK: [[EXEC]]
+  // CHECK: {{call|invoke}} void [[OP1:@.+]](i32*
+  // CHECK: br label {{%?}}[[DONE:.+]]
+  //
+  // CHECK: [[DONE]]
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  // CHECK: }
+
+  // CHECK: define internal void [[OP1]](
+
+  // No schedule clause.
+  //
+  // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align
+  // CHECK: store i32 98, i32* [[UB_PTR:%.+]], align
+  // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align
+  // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1)
+  // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align
+  // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align
+  // CHECK: br label {{%?}}[[FOR_COND:.+]]
+  //
+  // CHECK: [[FOR_COND]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 99
+  // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]]
+  //
+  // [[FOR_BODY]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1
+  // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]]
+  // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align
+  // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align
+  // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]]
+  // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64
+  // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]]
+  // CHECK: store i32 1, i32* [[ELEM_PTR]], align
+  // CHECK: br label {{%?}}[[FOR_CONT:.+]]
+  //
+  // CHECK: [[FOR_CONT]]
+  // CHECK: br label {{%?}}[[FOR_INC:.+]]
+  //
+  // CHECK: [[FOR_INC]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]]
+  // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align
+  // CHECK: br label {{%?}}[[FOR_COND]]
+  //
+  // CHECK: [[FOR_END]]
+
+
+
+  // schedule(auto) clause.
+  //
+  // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align
+  // CHECK: store i32 97, i32* [[UB_PTR:%.+]], align
+  // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align
+  // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1)
+  // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align
+  // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align
+  // CHECK: br label {{%?}}[[FOR_COND:.+]]
+  //
+  // CHECK: [[FOR_COND]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 98
+  // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]]
+  //
+  // [[FOR_BODY]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1
+  // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]]
+  // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align
+  // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align
+  // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]]
+  // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64
+  // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]]
+  // CHECK: store i32 2, i32* [[ELEM_PTR]], align
+  // CHECK: br label {{%?}}[[FOR_CONT:.+]]
+  //
+  // CHECK: [[FOR_CONT]]
+  // CHECK: br label {{%?}}[[FOR_INC:.+]]
+  //
+  // CHECK: [[FOR_INC]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]]
+  // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align
+  // CHECK: br label {{%?}}[[FOR_COND]]
+  //
+  // CHECK: [[FOR_END]]
+
+
+
+  // schedule(static,1) clause.
+  //
+  // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align
+  // CHECK: store i32 96, i32* [[UB_PTR:%.+]], align
+  // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align
+  // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1)
+  // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align
+  // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align
+  // CHECK: br label {{%?}}[[FOR_COND:.+]]
+  //
+  // CHECK: [[FOR_COND]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 97
+  // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]]
+  //
+  // [[FOR_BODY]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1
+  // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]]
+  // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align
+  // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align
+  // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]]
+  // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64
+  // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]]
+  // CHECK: store i32 3, i32* [[ELEM_PTR]], align
+  // CHECK: br label {{%?}}[[FOR_CONT:.+]]
+  //
+  // CHECK: [[FOR_CONT]]
+  // CHECK: br label {{%?}}[[FOR_INC:.+]]
+  //
+  // CHECK: [[FOR_INC]]
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]]
+  // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align
+  // CHECK: br label {{%?}}[[FOR_COND]]
+  //
+  // CHECK: [[FOR_END]]
+
+
+
+  // schedule(static,2) clause.  Non-coalesced codegen.
+  //
+  // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align
+  // CHECK: store i32 95, i32* [[UB_PTR:%.+]], align
+  // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align
+  // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 2)
+  // CHECK: br label {{%?}}[[DISPATCH_COND:.+]]
+  //
+  // CHECK: [[DISPATCH_COND]]
+  // CHECK: [[UB:%.+]] = load i32, i32* [[UB_PTR]], align
+  // CHECK: = icmp sgt i32 [[UB]], 95
+  //
+  // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align
+  // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align
+  //
+  // CHECK:  = getelementptr
+  //
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1
+  // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align
+
+
+
+  // schedule(static) clause.  Non-coalesced codegen.
+  //
+  // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align
+  // CHECK: store i32 94, i32* [[UB_PTR:%.+]], align
+  // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align
+  // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 34, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1)
+  // CHECK: [[UB:%.+]] = load i32, i32* [[UB_PTR]], align
+  // CHECK: = icmp sgt i32 [[UB]], 94
+  //
+  // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align
+  // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align
+  //
+  // CHECK:  = getelementptr
+  //
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1
+  // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align
+
+
+
+  // schedule(auto) ordered clause.  Non-coalesced codegen.
+  //
+  // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align
+  // CHECK: store i32 93, i32* [[UB_PTR:%.+]], align
+  // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align
+  // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 70
+  // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]])
+  //
+  // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align
+  // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align
+  //
+  // CHECK:  = getelementptr
+  //
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1
+  // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align
+
+
+
+  // schedule(runtime) clause.  Non-coalesced codegen.
+  //
+  // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align
+  // CHECK: store i32 92, i32* [[UB_PTR:%.+]], align
+  // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align
+  // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 37
+  // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]])
+  //
+  // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align
+  // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align
+  //
+  // CHECK:  = getelementptr
+  //
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1
+  // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align
+
+
+
+  // schedule(dynamic) clause.  Non-coalesced codegen.
+  //
+  // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align
+  // CHECK: store i32 91, i32* [[UB_PTR:%.+]], align
+  // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align
+  // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 35
+  // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]])
+  //
+  // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align
+  // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align
+  //
+  // CHECK:  = getelementptr
+  //
+  // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align
+  // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1
+  // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align
+
+
+
+  // CHECK: ret void
+  // CHECK: }
+
+#endif
Index: lib/Sema/SemaOpenMP.cpp
===================================================================
--- lib/Sema/SemaOpenMP.cpp
+++ lib/Sema/SemaOpenMP.cpp
@@ -3859,11 +3859,14 @@
 /// \return Returns 0 if one of the collapsed stmts is not canonical for loop,
 /// number of collapsed loops otherwise.
 static unsigned
-CheckOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
-                Expr *OrderedLoopCountExpr, Stmt *AStmt, Sema &SemaRef,
-                DSAStackTy &DSA,
+CheckOpenMPLoop(OpenMPDirectiveKind DKind, ArrayRef<OMPClause *> Clauses,
+                Expr *CollapseLoopCountExpr, Expr *OrderedLoopCountExpr,
+                Stmt *AStmt, Sema &SemaRef, DSAStackTy &DSA,
                 llvm::DenseMap<ValueDecl *, Expr *> &VarsWithImplicitDSA,
                 OMPLoopDirective::HelperExprs &Built) {
+  OpenMPDefaultScheduleKind DefaultScheduleKind =
+      getDefaultSchedule(SemaRef, DKind, Clauses);
+
   unsigned NestedLoopCount = 1;
   if (CollapseLoopCountExpr) {
     // Found 'collapse' clause - calculate collapse number.
@@ -4136,17 +4139,20 @@
   // Loop condition (IV < NumIterations) or (IV <= UB) for worksharing loops.
   SourceLocation CondLoc;
   ExprResult Cond =
-      (isOpenMPWorksharingDirective(DKind) ||
-       isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))
+      (DefaultScheduleKind != OMPDSK_static_chunkone &&
+       (isOpenMPWorksharingDirective(DKind) ||
+        isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind)))
           ? SemaRef.BuildBinOp(CurScope, CondLoc, BO_LE, IV.get(), UB.get())
           : SemaRef.BuildBinOp(CurScope, CondLoc, BO_LT, IV.get(),
                                NumIterations.get());
 
-  // Loop increment (IV = IV + 1)
+  // Loop increment (IV = IV + 1) or (IV = IV + ST) if (static,1) scheduling.
   SourceLocation IncLoc;
   ExprResult Inc =
-      SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(),
-                         SemaRef.ActOnIntegerConstant(IncLoc, 1).get());
+      DefaultScheduleKind == OMPDSK_static_chunkone
+          ? SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(), ST.get())
+          : SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(),
+                               SemaRef.ActOnIntegerConstant(IncLoc, 1).get());
   if (!Inc.isUsable())
     return 0;
   Inc = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, IV.get(), Inc.get());
@@ -4295,6 +4301,7 @@
   Built.NUB = NextUB.get();
   Built.PrevLB = PrevLB.get();
   Built.PrevUB = PrevUB.get();
+  Built.DefaultScheduleKind = DefaultScheduleKind;
 
   Expr *CounterVal = SemaRef.DefaultLvalueConversion(IV.get()).get();
   // Fill data for doacross depend clauses.
@@ -4417,9 +4424,10 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount = CheckOpenMPLoop(
-      OMPD_simd, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses),
-      AStmt, *this, *DSAStack, VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount =
+      CheckOpenMPLoop(OMPD_simd, Clauses, getCollapseNumberExpr(Clauses),
+                      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
+                      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -4456,9 +4464,10 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount = CheckOpenMPLoop(
-      OMPD_for, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses),
-      AStmt, *this, *DSAStack, VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount =
+      CheckOpenMPLoop(OMPD_for, Clauses, getCollapseNumberExpr(Clauses),
+                      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
+                      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -4493,7 +4502,7 @@
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_for_simd, getCollapseNumberExpr(Clauses),
+      CheckOpenMPLoop(OMPD_for_simd, Clauses, getCollapseNumberExpr(Clauses),
                       getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
                       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
@@ -4694,10 +4703,10 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_parallel_for, getCollapseNumberExpr(Clauses),
-                      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
-                      VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_parallel_for, Clauses, getCollapseNumberExpr(Clauses),
+      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -4739,10 +4748,10 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_parallel_for_simd, getCollapseNumberExpr(Clauses),
-                      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
-                      VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_parallel_for_simd, Clauses, getCollapseNumberExpr(Clauses),
+      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -5697,10 +5706,10 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_target_parallel_for, getCollapseNumberExpr(Clauses),
-                      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
-                      VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_target_parallel_for, Clauses, getCollapseNumberExpr(Clauses),
+      getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -5905,7 +5914,7 @@
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_taskloop, getCollapseNumberExpr(Clauses),
+      CheckOpenMPLoop(OMPD_taskloop, Clauses, getCollapseNumberExpr(Clauses),
                       /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
                       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
@@ -5936,10 +5945,10 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_taskloop_simd, getCollapseNumberExpr(Clauses),
-                      /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
-                      VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_taskloop_simd, Clauses, getCollapseNumberExpr(Clauses),
+      /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -5980,7 +5989,7 @@
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_distribute, getCollapseNumberExpr(Clauses),
+      CheckOpenMPLoop(OMPD_distribute, Clauses, getCollapseNumberExpr(Clauses),
                       nullptr /*ordered not a clause on distribute*/, AStmt,
                       *this, *DSAStack, VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
@@ -6013,7 +6022,7 @@
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount = CheckOpenMPLoop(
-      OMPD_distribute_parallel_for, getCollapseNumberExpr(Clauses),
+      OMPD_distribute_parallel_for, Clauses, getCollapseNumberExpr(Clauses),
       nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
@@ -6045,10 +6054,11 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount = CheckOpenMPLoop(
-      OMPD_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses),
-      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
-      VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount =
+      CheckOpenMPLoop(OMPD_distribute_parallel_for_simd, Clauses,
+                      getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, AStmt,
+                      *this, *DSAStack, VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -6081,10 +6091,10 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_distribute_simd, getCollapseNumberExpr(Clauses),
-                      nullptr /*ordered not a clause on distribute*/, AStmt,
-                      *this, *DSAStack, VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_distribute_simd, Clauses, getCollapseNumberExpr(Clauses),
+      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -6118,7 +6128,7 @@
   // In presence of clause 'collapse' or 'ordered' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount = CheckOpenMPLoop(
-      OMPD_target_parallel_for_simd, getCollapseNumberExpr(Clauses),
+      OMPD_target_parallel_for_simd, Clauses, getCollapseNumberExpr(Clauses),
       getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
@@ -6164,7 +6174,7 @@
   // In presence of clause 'collapse' with number of loops, it will define the
   // nested loops number.
   unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_target_simd, getCollapseNumberExpr(Clauses),
+      CheckOpenMPLoop(OMPD_target_simd, Clauses, getCollapseNumberExpr(Clauses),
                       getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
                       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
@@ -6210,10 +6220,10 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount =
-      CheckOpenMPLoop(OMPD_teams_distribute, getCollapseNumberExpr(Clauses),
-                      nullptr /*ordered not a clause on distribute*/, AStmt,
-                      *this, *DSAStack, VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount = CheckOpenMPLoop(
+      OMPD_teams_distribute, Clauses, getCollapseNumberExpr(Clauses),
+      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
+      VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -6244,7 +6254,7 @@
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
   unsigned NestedLoopCount = CheckOpenMPLoop(
-      OMPD_teams_distribute_simd, getCollapseNumberExpr(Clauses),
+      OMPD_teams_distribute_simd, Clauses, getCollapseNumberExpr(Clauses),
       nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
       VarsWithImplicitDSA, B);
 
@@ -6291,10 +6301,11 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  auto NestedLoopCount = CheckOpenMPLoop(
-      OMPD_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses),
-      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
-      VarsWithImplicitDSA, B);
+  auto NestedLoopCount =
+      CheckOpenMPLoop(OMPD_teams_distribute_parallel_for_simd, Clauses,
+                      getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, AStmt,
+                      *this, *DSAStack, VarsWithImplicitDSA, B);
 
   if (NestedLoopCount == 0)
     return StmtError();
@@ -6339,10 +6350,11 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  unsigned NestedLoopCount = CheckOpenMPLoop(
-      OMPD_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses),
-      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
-      VarsWithImplicitDSA, B);
+  unsigned NestedLoopCount =
+      CheckOpenMPLoop(OMPD_teams_distribute_parallel_for, Clauses,
+                      getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, AStmt,
+                      *this, *DSAStack, VarsWithImplicitDSA, B);
 
   if (NestedLoopCount == 0)
     return StmtError();
@@ -6406,8 +6418,7 @@
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
   auto NestedLoopCount = CheckOpenMPLoop(
-      OMPD_target_teams_distribute,
-      getCollapseNumberExpr(Clauses),
+      OMPD_target_teams_distribute, Clauses, getCollapseNumberExpr(Clauses),
       nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
       VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
@@ -6439,11 +6450,11 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  auto NestedLoopCount = CheckOpenMPLoop(
-      OMPD_target_teams_distribute_parallel_for,
-      getCollapseNumberExpr(Clauses),
-      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
-      VarsWithImplicitDSA, B);
+  auto NestedLoopCount =
+      CheckOpenMPLoop(OMPD_target_teams_distribute_parallel_for, Clauses,
+                      getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, AStmt,
+                      *this, *DSAStack, VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -6484,11 +6495,11 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  auto NestedLoopCount = CheckOpenMPLoop(
-      OMPD_target_teams_distribute_parallel_for_simd,
-      getCollapseNumberExpr(Clauses),
-      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
-      VarsWithImplicitDSA, B);
+  auto NestedLoopCount =
+      CheckOpenMPLoop(OMPD_target_teams_distribute_parallel_for_simd, Clauses,
+                      getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, AStmt,
+                      *this, *DSAStack, VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
@@ -6530,10 +6541,11 @@
   OMPLoopDirective::HelperExprs B;
   // In presence of clause 'collapse' with number of loops, it will
   // define the nested loops number.
-  auto NestedLoopCount = CheckOpenMPLoop(
-      OMPD_target_teams_distribute_simd, getCollapseNumberExpr(Clauses),
-      nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack,
-      VarsWithImplicitDSA, B);
+  auto NestedLoopCount =
+      CheckOpenMPLoop(OMPD_target_teams_distribute_simd, Clauses,
+                      getCollapseNumberExpr(Clauses),
+                      nullptr /*ordered not a clause on distribute*/, AStmt,
+                      *this, *DSAStack, VarsWithImplicitDSA, B);
   if (NestedLoopCount == 0)
     return StmtError();
 
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -2187,13 +2187,60 @@
       }
       const unsigned IVSize = getContext().getTypeSize(IVExpr->getType());
       const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation();
-      // OpenMP 4.5, 2.7.1 Loop Construct, Description.
-      // If the static schedule kind is specified or if the ordered clause is
-      // specified, and if no monotonic modifier is specified, the effect will
-      // be as if the monotonic modifier was specified.
-      if (RT.isStaticNonchunked(ScheduleKind.Schedule,
-                                /* Chunked */ Chunk != nullptr) &&
-          !Ordered) {
+      if (S.getDefaultSchedule() == OMPDSK_static_chunkone) {
+        // For NVPTX and other GPU targets high performance is often achieved
+        // if adjacent threads access memory in a coalesced manner.  This is
+        // true for loops that access memory with stride one if a static
+        // schedule with chunk size of 1 is used.  We generate such code
+        // whenever the OpenMP standard gives us freedom to do so.
+        //
+        // This case is called if there is no schedule clause, with a
+        // schedule(auto), or with a schedule(static,1).
+        //
+        // Codegen is optimized for this case.  Since chunk size is 1 we do not
+        // need to generate the inner loop, i.e., the chunk iterator can be
+        // removed.
+        // while(UB = min(UB, GlobalUB), idx = LB, idx < UB) {
+        //   BODY;
+        //   LB = LB + ST;
+        // }
+        if (!Chunk) // Force use of chunk=1
+          Chunk = Builder.getIntN(IVSize, 1);
+        if (isOpenMPSimdDirective(S.getDirectiveKind()))
+          EmitOMPSimdInit(S, /*IsMonotonic=*/true);
+
+        OpenMPScheduleTy LoopSchedule;
+        LoopSchedule.Schedule = OMPC_SCHEDULE_static;
+        RT.emitForStaticInit(*this, S.getLocStart(), LoopSchedule, IVSize,
+                             IVSigned, Ordered, IL.getAddress(),
+                             LB.getAddress(), UB.getAddress(), ST.getAddress(),
+                             Chunk);
+        auto LoopExit =
+            getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit"));
+
+        // IV = LB;
+        EmitIgnoredExpr(S.getInit());
+        EmitOMPInnerLoop(S, LoopScope.requiresCleanups(),
+                         S.getCond() /* IV < GlobalUB */,
+                         S.getInc() /* IV += Stride */,
+                         [&S, LoopExit](CodeGenFunction &CGF) {
+                           CGF.EmitOMPLoopBody(S, LoopExit);
+                           CGF.EmitStopPoint(&S);
+                         },
+                         [](CodeGenFunction &) {});
+        EmitBlock(LoopExit.getBlock());
+        // Tell the runtime we are done.
+        auto &&CodeGen = [&S](CodeGenFunction &CGF) {
+          CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getLocEnd());
+        };
+        OMPCancelStack.emitExit(*this, S.getDirectiveKind(), CodeGen);
+      } else if (RT.isStaticNonchunked(ScheduleKind.Schedule,
+                                       /* Chunked */ Chunk != nullptr) &&
+                 !Ordered) {
+        // OpenMP 4.5, 2.7.1 Loop Construct, Description.
+        // If the static schedule kind is specified or if the ordered clause is
+        // specified, and if no monotonic modifier is specified, the effect will
+        // be as if the monotonic modifier was specified.
         if (isOpenMPSimdDirective(S.getDirectiveKind()))
           EmitOMPSimdInit(S, /*IsMonotonic=*/true);
         // OpenMP [2.7.1, Loop Construct, Description, table 2-1]
Index: lib/Basic/OpenMPKinds.cpp
===================================================================
--- lib/Basic/OpenMPKinds.cpp
+++ lib/Basic/OpenMPKinds.cpp
@@ -12,7 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Basic/OpenMPKinds.h"
+#include "clang/AST/StmtOpenMP.h"
 #include "clang/Basic/IdentifierTable.h"
+#include "clang/Sema/SemaInternal.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -936,3 +939,53 @@
     llvm_unreachable("Unknown OpenMP directive");
   }
 }
+
+OpenMPDefaultScheduleKind
+clang::getDefaultSchedule(Sema &SemaRef, OpenMPDirectiveKind Kind,
+                          ArrayRef<OMPClause *> Clauses) {
+  OpenMPDefaultScheduleKind DefaultSchedule = OMPDSK_unknown;
+
+  if (SemaRef.getLangOpts().OpenMPIsDevice &&
+      SemaRef.Context.getTargetInfo().getTriple().isNVPTX()) {
+    // Force a schedule type of (static,1) if there is no schedule clause, or
+    // the user specifies schedule(auto) or schedule(static,1).
+    bool ChunkSizeOne = false;
+    auto ScheduleKind = OMPC_SCHEDULE_unknown;
+    auto ScheduleClause =
+        OMPExecutableDirective::getClausesOfKind<OMPScheduleClause>(Clauses);
+    if (ScheduleClause.begin() != ScheduleClause.end()) {
+      ScheduleKind = (*ScheduleClause.begin())->getScheduleKind();
+      if (const auto *Ch = (*ScheduleClause.begin())->getChunkSize()) {
+        if (!Ch->isValueDependent() && !Ch->isTypeDependent() &&
+            !Ch->isInstantiationDependent() &&
+            !Ch->containsUnexpandedParameterPack()) {
+          SourceLocation ChLoc = Ch->getLocStart();
+          ExprResult Val = SemaRef.PerformOpenMPImplicitIntegerConversion(
+              ChLoc, const_cast<Expr *>(Ch));
+          if (!Val.isInvalid()) {
+            Expr *ValExpr = Val.get();
+            llvm::APSInt Result;
+            ChunkSizeOne =
+                ValExpr->isIntegerConstantExpr(Result, SemaRef.Context) &&
+                Result == 1;
+          }
+        }
+      }
+    }
+
+    // Ordered clause requires dynamic dispatch.
+    auto OrderedClause =
+        OMPExecutableDirective::getClausesOfKind<OMPOrderedClause>(Clauses);
+    bool Ordered = OrderedClause.begin() != OrderedClause.end();
+
+    bool StaticOneSchedule =
+        (!Ordered && (ScheduleKind == OMPC_SCHEDULE_unknown ||
+                      ScheduleKind == OMPC_SCHEDULE_auto ||
+                      (ScheduleKind == OMPC_SCHEDULE_static && ChunkSizeOne)));
+
+    if (StaticOneSchedule)
+      DefaultSchedule = OMPDSK_static_chunkone;
+  }
+
+  return DefaultSchedule;
+}
Index: lib/AST/StmtOpenMP.cpp
===================================================================
--- lib/AST/StmtOpenMP.cpp
+++ lib/AST/StmtOpenMP.cpp
@@ -105,6 +105,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -156,6 +157,7 @@
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
   Dir->setHasCancel(HasCancel);
+  Dir->setDefaultSchedule(Exprs.DefaultScheduleKind);
   return Dir;
 }
 
@@ -207,6 +209,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -373,6 +376,7 @@
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
   Dir->setHasCancel(HasCancel);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -423,6 +427,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -760,6 +765,7 @@
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
   Dir->setHasCancel(HasCancel);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1003,6 +1009,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1077,6 +1084,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1133,6 +1141,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1188,6 +1197,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1242,6 +1252,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1336,6 +1347,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1389,6 +1401,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1444,6 +1457,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1502,6 +1516,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1582,6 +1597,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1640,6 +1656,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1701,6 +1718,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
@@ -1759,6 +1777,7 @@
   Dir->setUpdates(Exprs.Updates);
   Dir->setFinals(Exprs.Finals);
   Dir->setPreInits(Exprs.PreInits);
+  // TODO: Set default schedule.
   return Dir;
 }
 
Index: include/clang/Basic/OpenMPKinds.h
===================================================================
--- include/clang/Basic/OpenMPKinds.h
+++ include/clang/Basic/OpenMPKinds.h
@@ -15,10 +15,14 @@
 #ifndef LLVM_CLANG_BASIC_OPENMPKINDS_H
 #define LLVM_CLANG_BASIC_OPENMPKINDS_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace clang {
 
+class Sema;
+class OMPClause;
+
 /// \brief OpenMP directives.
 enum OpenMPDirectiveKind {
 #define OPENMP_DIRECTIVE(Name) \
@@ -127,6 +131,9 @@
   OpenMPScheduleClauseModifier M2 = OMPC_SCHEDULE_MODIFIER_unknown;
 };
 
+/// Default schedule type for any loop-based (#for) OpenMP directive.
+enum OpenMPDefaultScheduleKind { OMPDSK_static_chunkone, OMPDSK_unknown };
+
 OpenMPDirectiveKind getOpenMPDirectiveKind(llvm::StringRef Str);
 const char *getOpenMPDirectiveName(OpenMPDirectiveKind Kind);
 
@@ -239,7 +246,14 @@
 void getOpenMPCaptureRegions(
     llvm::SmallVectorImpl<OpenMPDirectiveKind> &CaptureRegions,
     OpenMPDirectiveKind DKind);
-}
 
+/// Get the default schedule type for any loop-based OpenMP directive,
+/// specialized for a particular target.  This is used to guide codegen
+/// if a) no 'schedule' clause is specified, or b) a 'schedule' type of
+/// 'auto' is specified by the user.
+OpenMPDefaultScheduleKind
+getDefaultSchedule(Sema &SemaRef, OpenMPDirectiveKind Kind,
+                   llvm::ArrayRef<OMPClause *> Clauses);
+}
 #endif
 
Index: include/clang/AST/StmtOpenMP.h
===================================================================
--- include/clang/AST/StmtOpenMP.h
+++ include/clang/AST/StmtOpenMP.h
@@ -314,6 +314,9 @@
   friend class ASTStmtReader;
   /// \brief Number of collapsed loops as specified by 'collapse' clause.
   unsigned CollapsedNum;
+  /// \brief DefaultScheduleKind - Schedule type to use for a given target
+  /// if no 'schedule' clause or a 'schedule' type 'auto' is specified.
+  OpenMPDefaultScheduleKind DefaultScheduleKind;
 
   /// \brief Offsets to the stored exprs.
   /// This enumeration contains offsets to all the pointers to children
@@ -412,7 +415,7 @@
       : OMPExecutableDirective(That, SC, Kind, StartLoc, EndLoc, NumClauses,
                                numLoopChildren(CollapsedNum, Kind) +
                                    NumSpecialChildren),
-        CollapsedNum(CollapsedNum) {}
+        CollapsedNum(CollapsedNum), DefaultScheduleKind(OMPDSK_unknown) {}
 
   /// \brief Offset to the start of children expression arrays.
   static unsigned getArraysOffset(OpenMPDirectiveKind Kind) {
@@ -521,6 +524,9 @@
            "expected worksharing loop directive");
     *std::next(child_begin(), PrevUpperBoundVariableOffset) = PrevUB;
   }
+  void setDefaultSchedule(OpenMPDefaultScheduleKind SK) {
+    DefaultScheduleKind = SK;
+  }
   void setCounters(ArrayRef<Expr *> A);
   void setPrivateCounters(ArrayRef<Expr *> A);
   void setInits(ArrayRef<Expr *> A);
@@ -567,6 +573,9 @@
     /// \brief PreviousUpperBound - local variable passed to runtime in the
     /// enclosing schedule or null if that does not apply.
     Expr *PrevUB;
+    /// \brief DefaultScheduleKind - Schedule type to use for the given target
+    /// if no 'schedule' clause or a 'schedule' type 'auto' is specified.
+    OpenMPDefaultScheduleKind DefaultScheduleKind;
     /// \brief Counters Loop counters.
     SmallVector<Expr *, 4> Counters;
     /// \brief PrivateCounters Loop counters.
@@ -608,6 +617,7 @@
       NumIterations = nullptr;
       PrevLB = nullptr;
       PrevUB = nullptr;
+      DefaultScheduleKind = OMPDSK_unknown;
       Counters.resize(Size);
       PrivateCounters.resize(Size);
       Inits.resize(Size);
@@ -739,6 +749,9 @@
     return const_cast<Expr *>(reinterpret_cast<const Expr *>(
         *std::next(child_begin(), PrevUpperBoundVariableOffset)));
   }
+  OpenMPDefaultScheduleKind getDefaultSchedule() const {
+    return DefaultScheduleKind;
+  }
   const Stmt *getBody() const {
     // This relies on the loop form is already checked by Sema.
     Stmt *Body = getAssociatedStmt()->IgnoreContainers(true);

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D29910: [OpenMP] Specialize default schedule on a worksharing loop on the NVPTX device.

Reply via email to