https://github.com/erichkeane updated 
https://github.com/llvm/llvm-project/pull/140971

>From ec6359330368ed72d6e4354afdcf6d0a32f9a2f9 Mon Sep 17 00:00:00 2001
From: erichkeane <eke...@nvidia.com>
Date: Tue, 20 May 2025 08:55:39 -0700
Subject: [PATCH 1/2] [OpenACC][CIR] Add lowering for 'copy' array indexes

The array indexes(and sections) are represented by the acc.bounds
operation, which this ensures we fill in properly.  The lowerbound is
required, so we always get that.

The upperbound or extent is required.  We typically do extent, since
that is the 'length' as specified by ACC, but in cases where we have
implicit length, we use the extent instead.

It isn't clear when 'stride' should be anything besides 1, though by my
reading, since we have full-types in the emitted code, we should
never have it be anything but 1.

This patch enables these for copy on compute and combined constructs,
and makes sure to test everything I could think of for
combinations/permutations.
---
 clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp |  88 ++-
 clang/test/CIR/CodeGenOpenACC/combined-copy.c | 501 ++++++++++++++++++
 clang/test/CIR/CodeGenOpenACC/compute-copy.c  | 425 +++++++++++++++
 3 files changed, 1001 insertions(+), 13 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp 
b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
index 91aa358638d5d..9e45ad8519e11 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
@@ -121,6 +121,11 @@ class OpenACCClauseCIREmitter final
     return constOp.getResult();
   }
 
+  mlir::Value createConstantInt(SourceLocation loc, unsigned width,
+                                int64_t value) {
+    return createConstantInt(cgf.cgm.getLoc(loc), width, value);
+  }
+
   mlir::acc::DeviceType decodeDeviceType(const IdentifierInfo *ii) {
     // '*' case leaves no identifier-info, just a nullptr.
     if (!ii)
@@ -184,37 +189,94 @@ class OpenACCClauseCIREmitter final
     mlir::Location beginLoc;
     mlir::Value varValue;
     llvm::StringRef name;
+    llvm::SmallVector<mlir::Value> bounds;
   };
 
+  mlir::Value createBound(mlir::Location boundLoc, mlir::Value lowerBound,
+                          mlir::Value upperBound, mlir::Value extent) {
+    // Arrays always have a start-idx of 0.
+    mlir::Value startIdx = createConstantInt(boundLoc, 64, 0);
+    // TODO: OpenACC: It isn't clear that stride would ever be anything other
+    // than '1'?  We emit the type of the reference 'correctly' as far as I
+    // know, so it should just be 1 element each time. We could perhaps use
+    // the 'inBytes' variant here, but it isn't clear what value that gets us.
+    // We might need to revisit this once we try to opt this and see what is
+    // going to happen.
+    mlir::Value stride = createConstantInt(boundLoc, 64, 1);
+
+    auto bound = builder.create<mlir::acc::DataBoundsOp>(boundLoc, lowerBound,
+                                                         upperBound);
+    bound.getStartIdxMutable().assign(startIdx);
+    if (extent)
+      bound.getExtentMutable().assign(extent);
+    bound.getStrideMutable().assign(stride);
+
+    return bound;
+  }
+
   // A helper function that gets the information from an operand to a data
   // clause, so that it can be used to emit the data operations.
-  inline DataOperandInfo getDataOperandInfo(OpenACCDirectiveKind dk,
-                                            const Expr *e) {
+  DataOperandInfo getDataOperandInfo(OpenACCDirectiveKind dk, const Expr *e) {
     // TODO: OpenACC: Cache was different enough as to need a separate
     // `ActOnCacheVar`, so we are going to need to do some investigations here
     // when it comes to implement this for cache.
     if (dk == OpenACCDirectiveKind::Cache) {
       cgf.cgm.errorNYI(e->getSourceRange(),
                        "OpenACC data operand for 'cache' directive");
-      return {cgf.cgm.getLoc(e->getBeginLoc()), {}, {}};
+      return {cgf.cgm.getLoc(e->getBeginLoc()), {}, {}, {}};
     }
 
     const Expr *curVarExpr = e->IgnoreParenImpCasts();
 
     mlir::Location exprLoc = cgf.cgm.getLoc(curVarExpr->getBeginLoc());
+    llvm::SmallVector<mlir::Value> bounds;
+
+    // Assemble the list of bounds.
+    while (isa<ArraySectionExpr, ArraySubscriptExpr>(curVarExpr)) {
+      mlir::Location boundLoc = cgf.cgm.getLoc(curVarExpr->getBeginLoc());
+      mlir::Value lowerBound;
+      mlir::Value upperBound;
+      mlir::Value extent;
+
+      if (const auto *section = dyn_cast<ArraySectionExpr>(curVarExpr)) {
+        if (const Expr *lb = section->getLowerBound())
+          lowerBound = emitIntExpr(lb);
+        else
+          lowerBound = createConstantInt(boundLoc, 64, 0);
+
+        if (const Expr *len = section->getLength()) {
+          extent = emitIntExpr(len);
+        } else {
+          QualType baseTy = ArraySectionExpr::getBaseOriginalType(
+              section->getBase()->IgnoreParenImpCasts());
+          // We know this is the case as implicit lengths are only allowed for
+          // array types with a constant size, or a dependent size.  AND since
+          // we are codegen we know we're not dependent.
+          auto *arrayTy = cgf.getContext().getAsConstantArrayType(baseTy);
+          // Rather than trying to calculate the extent based on the
+          // lower-bound, we can just emit this as an upper bound.
+          upperBound =
+              createConstantInt(boundLoc, 64, arrayTy->getLimitedSize() - 1);
+        }
 
-    // TODO: OpenACC: Assemble the list of bounds.
-    if (isa<ArraySectionExpr, ArraySubscriptExpr>(curVarExpr)) {
-      cgf.cgm.errorNYI(curVarExpr->getSourceRange(),
-                       "OpenACC data clause array subscript/section");
-      return {exprLoc, {}, {}};
+        curVarExpr = section->getBase()->IgnoreParenImpCasts();
+      } else {
+        const auto *subscript = dyn_cast<ArraySubscriptExpr>(curVarExpr);
+
+        lowerBound = emitIntExpr(subscript->getIdx());
+        // Length of an array index is always 1.
+        extent = createConstantInt(boundLoc, 64, 1);
+        curVarExpr = subscript->getBase()->IgnoreParenImpCasts();
+      }
+
+      bounds.push_back(createBound(boundLoc, lowerBound, upperBound, extent));
     }
 
     // TODO: OpenACC: if this is a member expr, emit the VarPtrPtr correctly.
     if (isa<MemberExpr>(curVarExpr)) {
       cgf.cgm.errorNYI(curVarExpr->getSourceRange(),
                        "OpenACC Data clause member expr");
-      return {exprLoc, {}, {}};
+      return {exprLoc, {}, {}, std::move(bounds)};
     }
 
     // Sema has made sure that only 4 types of things can get here, array
@@ -223,14 +285,14 @@ class OpenACCClauseCIREmitter final
     // right.
     const auto *dre = cast<DeclRefExpr>(curVarExpr);
     const auto *vd = cast<VarDecl>(dre->getFoundDecl()->getCanonicalDecl());
-    return {exprLoc, cgf.emitDeclRefLValue(dre).getPointer(), vd->getName()};
+    return {exprLoc, cgf.emitDeclRefLValue(dre).getPointer(), vd->getName(),
+            std::move(bounds)};
   }
 
   template <typename BeforeOpTy, typename AfterOpTy>
   void addDataOperand(const Expr *varOperand, mlir::acc::DataClause dataClause,
                       bool structured, bool implicit) {
     DataOperandInfo opInfo = getDataOperandInfo(dirKind, varOperand);
-    mlir::ValueRange bounds;
 
     // TODO: OpenACC: we should comprehend the 'modifier-list' here for the 
data
     // operand. At the moment, we don't have a uniform way to assign these
@@ -239,7 +301,7 @@ class OpenACCClauseCIREmitter final
 
     auto beforeOp =
         builder.create<BeforeOpTy>(opInfo.beginLoc, opInfo.varValue, 
structured,
-                                   implicit, opInfo.name, bounds);
+                                   implicit, opInfo.name, opInfo.bounds);
     operation.getDataClauseOperandsMutable().append(beforeOp.getResult());
 
     AfterOpTy afterOp;
@@ -248,7 +310,7 @@ class OpenACCClauseCIREmitter final
       builder.setInsertionPointAfter(operation);
       afterOp = builder.create<AfterOpTy>(opInfo.beginLoc, 
beforeOp.getResult(),
                                           opInfo.varValue, structured, 
implicit,
-                                          opInfo.name, bounds);
+                                          opInfo.name, opInfo.bounds);
     }
 
     // Set the 'rest' of the info for both operations.
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-copy.c 
b/clang/test/CIR/CodeGenOpenACC/combined-copy.c
index 8df6b9fc03454..3ebca6578c8a9 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-copy.c
+++ b/clang/test/CIR/CodeGenOpenACC/combined-copy.c
@@ -12,6 +12,8 @@ void acc_compute(int parmVar) {
   // CHECK-NEXT: %[[LOCAL3:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, 
["localVar3"] 
   // CHECK-NEXT: %[[LOCALPTR:.*]] = cir.alloca !cir.ptr<!s16i>, 
!cir.ptr<!cir.ptr<!s16i>>, ["localPointer"]
   // CHECK-NEXT: %[[LOCALARRAY:.*]] = cir.alloca !cir.array<!cir.float x 5>, 
!cir.ptr<!cir.array<!cir.float x 5>>, ["localArray"]
+  // CHECK-NEXT: %[[LOCALARRAYOFPTRS:.*]] = cir.alloca 
!cir.array<!cir.ptr<!cir.float> x 5>, !cir.ptr<!cir.array<!cir.ptr<!cir.float> 
x 5>>, ["localArrayOfPtrs"]
+  // CHECK-NEXT: %[[THREEDARRAY:.*]] = cir.alloca 
!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>, 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>, 
["threeDArray"]
   // CHECK-NEXT: cir.store %[[ARG]], %[[PARM]] : !s32i, !cir.ptr<!s32i>
 
 #pragma acc parallel loop copy(localVar1, parmVar) copy(localVar2) 
copy(localVar3, parmVar)
@@ -261,4 +263,503 @@ void acc_compute(int parmVar) {
   // CHECK-NEXT: acc.yield
   // CHECK-NEXT: } loc
   // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!s32i>)  
async(%[[ZERO_CAST]] : si32, %[[ONE_CAST]] : si32 [#acc.device_type<nvidia>], 
%[[ONE_CAST]] : si32 [#acc.device_type<radeon>]) to varPtr(%[[LOCAL1]] : 
!cir.ptr<!s32i>) {dataClause = #acc<data_clause acc_copy>, name = "localVar1"} 
loc
+
+#pragma acc parallel loop copy(localArray[3])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.parallel combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+  
+#pragma acc serial loop copy(localArray[1:3])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[THREE_CAST]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.serial combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc kernels loop copy(localArray[:3])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[THREE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.kernels combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc parallel loop copy(localArray[1:])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[FOUR_CONST:.*]] = arith.constant 4 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
upperbound(%[[FOUR_CONST]] : i64) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.parallel combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc serial loop copy(localArray[localVar1:localVar2])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.serial combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc kernels loop copy(localArray[:localVar2])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.kernels combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc parallel loop copy(localArray[localVar1:])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[FOUR_CONST:.*]] = arith.constant 4 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
upperbound(%[[FOUR_CONST]] : i64) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.parallel combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc serial loop copy(localPointer[3])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.serial combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+#pragma acc kernels loop copy(localPointer[1:3])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[THREE_CAST]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.kernels combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+#pragma acc parallel loop copy(localPointer[:3])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[THREE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.parallel combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+#pragma acc serial loop copy(localPointer[localVar1:localVar2])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.serial combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+#pragma acc kernels loop copy(localPointer[:localVar2])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.kernels combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+  float *localArrayOfPtrs[5];
+#pragma acc parallel loop copy(localArrayOfPtrs[3])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.parallel combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc serial loop copy(localArrayOfPtrs[3][2])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[TWO]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[TWO_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.serial combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc kernels loop copy(localArrayOfPtrs[localVar1:localVar2])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.kernels combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc parallel loop copy(localArrayOfPtrs[localVar1:])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[FOUR_CONST:.*]] = arith.constant 4 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
upperbound(%[[FOUR_CONST]] : i64) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.parallel combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc serial loop copy(localArrayOfPtrs[:localVar2])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.serial combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc kernels loop copy(localArrayOfPtrs[localVar1])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.kernels combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc parallel loop copy(localArrayOfPtrs[localVar1][localVar2])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV2_CAST]] : si16) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.parallel combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  
+#pragma acc serial loop copy(localArrayOfPtrs[localVar1][localVar2:parmVar])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[PV:.*]] = cir.load %[[PARM]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[PV_CAST:.*]] = builtin.unrealized_conversion_cast %[[PV]] 
: !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV2_CAST]] : si16) 
extent(%[[PV_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.serial combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+ 
+#pragma acc kernels loop copy(localArrayOfPtrs[localVar1][:parmVar])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[PV:.*]] = cir.load %[[PARM]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[PV_CAST:.*]] = builtin.unrealized_conversion_cast %[[PV]] 
: !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[PV_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.kernels combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc parallel loop copy(localArrayOfPtrs[localVar1:localVar2][:1])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[ONE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.parallel combined(loop)  dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc serial loop copy(localArrayOfPtrs[localVar1:localVar2][1:1])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.serial combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+  double threeDArray[5][6][7];
+#pragma acc kernels loop copy(threeDArray[1][2][3])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[TWO]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[TWO_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS3:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[THREEDARRAY]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
bounds(%[[BOUNDS]], %[[BOUNDS2]], %[[BOUNDS3]]) -> 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>> 
{dataClause = #acc<data_clause acc_copy>, name = "threeDArray"} loc
+  // CHECK-NEXT: acc.kernels combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
bounds(%[[BOUNDS]], %[[BOUNDS2]], %[[BOUNDS3]]) to varPtr(%[[THREEDARRAY]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "threeDArray"} loc
+
+#pragma acc parallel loop copy(threeDArray[1:1][2:1][3:1])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CAST]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[TWO]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[TWO_CAST]] : si32) 
extent(%[[ONE_CAST]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS3:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[THREEDARRAY]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
bounds(%[[BOUNDS]], %[[BOUNDS2]], %[[BOUNDS3]]) -> 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>> 
{dataClause = #acc<data_clause acc_copy>, name = "threeDArray"} loc
+  // CHECK-NEXT: acc.parallel combined(loop) dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) {
+  // CHECK-NEXT: acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
bounds(%[[BOUNDS]], %[[BOUNDS2]], %[[BOUNDS3]]) to varPtr(%[[THREEDARRAY]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "threeDArray"} loc
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-copy.c 
b/clang/test/CIR/CodeGenOpenACC/compute-copy.c
index a542409f07152..77e8cc8562c73 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-copy.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-copy.c
@@ -12,6 +12,9 @@ void acc_compute(int parmVar) {
   // CHECK-NEXT: %[[LOCAL3:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, 
["localVar3"] 
   // CHECK-NEXT: %[[LOCALPTR:.*]] = cir.alloca !cir.ptr<!s16i>, 
!cir.ptr<!cir.ptr<!s16i>>, ["localPointer"]
   // CHECK-NEXT: %[[LOCALARRAY:.*]] = cir.alloca !cir.array<!cir.float x 5>, 
!cir.ptr<!cir.array<!cir.float x 5>>, ["localArray"]
+  // CHECK-NEXT: %[[LOCALARRAYOFPTRS:.*]] = cir.alloca 
!cir.array<!cir.ptr<!cir.float> x 5>, !cir.ptr<!cir.array<!cir.ptr<!cir.float> 
x 5>>, ["localArrayOfPtrs"]
+  // CHECK-NEXT: %[[THREEDARRAY:.*]] = cir.alloca 
!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>, 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>, 
["threeDArray"]
+
   // CHECK-NEXT: cir.store %[[ARG]], %[[PARM]] : !s32i, !cir.ptr<!s32i>
 
 #pragma acc parallel copy(localVar1, parmVar) copy(localVar2) copy(localVar3, 
parmVar)
@@ -210,4 +213,426 @@ void acc_compute(int parmVar) {
   // CHECK-NEXT: acc.yield
   // CHECK-NEXT: } loc
   // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!s32i>)  
async(%[[ZERO_CAST]] : si32, %[[ONE_CAST]] : si32 [#acc.device_type<nvidia>], 
%[[ONE_CAST]] : si32 [#acc.device_type<radeon>]) to varPtr(%[[LOCAL1]] : 
!cir.ptr<!s32i>) {dataClause = #acc<data_clause acc_copy>, name = "localVar1"} 
loc
+
+#pragma acc parallel copy(localArray[3])
+  ;
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.parallel dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc serial copy(localArray[1:3])
+  ;
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[THREE_CAST]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.serial dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc kernels copy(localArray[:3])
+  ;
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[THREE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.kernels dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc parallel copy(localArray[1:])
+  ;
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[FOUR_CONST:.*]] = arith.constant 4 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
upperbound(%[[FOUR_CONST]] : i64) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.parallel dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+#pragma acc serial copy(localArray[localVar1:localVar2])
+  ;
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.serial dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc kernels copy(localArray[:localVar2])
+  ;
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.kernels dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc parallel copy(localArray[localVar1:])
+  ;
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[FOUR_CONST:.*]] = arith.constant 4 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
upperbound(%[[FOUR_CONST]] : i64) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAY]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.float x 5>> {dataClause = #acc<data_clause acc_copy>, 
name = "localArray"} loc
+  // CHECK-NEXT: acc.parallel dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.float x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAY]] : !cir.ptr<!cir.array<!cir.float x 5>>) {dataClause = 
#acc<data_clause acc_copy>, name = "localArray"} loc
+
+#pragma acc serial copy(localPointer[3])
+  ;
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.serial dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+#pragma acc kernels copy(localPointer[1:3])
+  ;
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[THREE_CAST]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.kernels dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+#pragma acc parallel copy(localPointer[:3])
+  ;
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[THREE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.parallel dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+#pragma acc serial copy(localPointer[localVar1:localVar2])
+  ;
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.serial dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+#pragma acc kernels copy(localPointer[:localVar2])
+  ;
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALPTR]] : 
!cir.ptr<!cir.ptr<!s16i>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.ptr<!s16i>> 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+  // CHECK-NEXT: acc.kernels dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.ptr<!s16i>>) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : !cir.ptr<!cir.ptr<!s16i>>) 
bounds(%[[BOUNDS]]) to varPtr(%[[LOCALPTR]] : !cir.ptr<!cir.ptr<!s16i>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localPointer"} loc
+
+  float *localArrayOfPtrs[5];
+#pragma acc parallel copy(localArrayOfPtrs[3])
+  ;
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.parallel dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+#pragma acc serial copy(localArrayOfPtrs[3][2])
+  ;
+  // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[TWO]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[TWO_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.serial dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc kernels copy(localArrayOfPtrs[localVar1:localVar2])
+  ;
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.kernels dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc parallel copy(localArrayOfPtrs[localVar1:])
+  ;
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[FOUR_CONST:.*]] = arith.constant 4 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
upperbound(%[[FOUR_CONST]] : i64) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.parallel dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc serial copy(localArrayOfPtrs[:localVar2])
+  ;
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.serial dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc kernels copy(localArrayOfPtrs[localVar1])
+  ;
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) -> 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.kernels dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]]) to 
varPtr(%[[LOCALARRAYOFPTRS]] : !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc parallel copy(localArrayOfPtrs[localVar1][localVar2])
+  ;
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV2_CAST]] : si16) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.parallel dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc serial copy(localArrayOfPtrs[localVar1][localVar2:parmVar])
+  ;
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[PV:.*]] = cir.load %[[PARM]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[PV_CAST:.*]] = builtin.unrealized_conversion_cast %[[PV]] 
: !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[LV2_CAST]] : si16) 
extent(%[[PV_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.serial dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc kernels copy(localArrayOfPtrs[localVar1][:parmVar])
+  ;
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[PV:.*]] = cir.load %[[PARM]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[PV_CAST:.*]] = builtin.unrealized_conversion_cast %[[PV]] 
: !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[PV_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.kernels dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc parallel copy(localArrayOfPtrs[localVar1:localVar2][:1])
+  ;
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ZERO_CONST]] : i64) 
extent(%[[ONE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.parallel  dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+#pragma acc serial copy(localArrayOfPtrs[localVar1:localVar2][1:1])
+  ;
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST2:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST2]] : i64) loc
+  // CHECK-NEXT: %[[LV1:.*]] = cir.load %[[LOCAL1]] : !cir.ptr<!s32i>, !s32i 
+  // CHECK-NEXT: %[[LV1_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV1]] : !s32i to si32
+  // CHECK-NEXT: %[[LV2:.*]] = cir.load %[[LOCAL2]] : !cir.ptr<!s16i>, !s16i 
+  // CHECK-NEXT: %[[LV2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[LV2]] : !s16i to si16
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[LV1_CAST]] : si32) 
extent(%[[LV2_CAST]] : si16) stride(%[[ONE_CONST]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) -> !cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>> {dataClause = 
#acc<data_clause acc_copy>, name = "localArrayOfPtrs"} loc
+  // CHECK-NEXT: acc.serial dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) bounds(%[[BOUNDS]], 
%[[BOUNDS2]]) to varPtr(%[[LOCALARRAYOFPTRS]] : 
!cir.ptr<!cir.array<!cir.ptr<!cir.float> x 5>>) {dataClause = #acc<data_clause 
acc_copy>, name = "localArrayOfPtrs"} loc
+
+  double threeDArray[5][6][7];
+#pragma acc kernels copy(threeDArray[1][2][3])
+  ;
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[TWO]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[TWO_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS3:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[THREEDARRAY]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
bounds(%[[BOUNDS]], %[[BOUNDS2]], %[[BOUNDS3]]) -> 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>> 
{dataClause = #acc<data_clause acc_copy>, name = "threeDArray"} loc
+  // CHECK-NEXT: acc.kernels dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
bounds(%[[BOUNDS]], %[[BOUNDS2]], %[[BOUNDS3]]) to varPtr(%[[THREEDARRAY]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "threeDArray"} loc
+
+#pragma acc parallel copy(threeDArray[1:1][2:1][3:1])
+  ;
+  // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[THREE_CAST]] : 
si32) extent(%[[ONE_CAST]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[TWO]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS2:.*]] = acc.bounds lowerbound(%[[TWO_CAST]] : si32) 
extent(%[[ONE_CAST]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast 
%[[ONE]] : !s32i to si32
+  // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
+  // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
+  // CHECK-NEXT: %[[BOUNDS3:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) 
extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) 
startIdx(%[[ZERO_CONST]] : i64) loc
+  // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[THREEDARRAY]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
bounds(%[[BOUNDS]], %[[BOUNDS2]], %[[BOUNDS3]]) -> 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>> 
{dataClause = #acc<data_clause acc_copy>, name = "threeDArray"} loc
+  // CHECK-NEXT: acc.parallel dataOperands(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+  // CHECK-NEXT: acc.copyout accPtr(%[[COPYIN1]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
bounds(%[[BOUNDS]], %[[BOUNDS2]], %[[BOUNDS3]]) to varPtr(%[[THREEDARRAY]] : 
!cir.ptr<!cir.array<!cir.array<!cir.array<!cir.double x 7> x 6> x 5>>) 
{dataClause = #acc<data_clause acc_copy>, name = "threeDArray"} loc
 }

>From ce0998b47eddc3aca2e5f8037a07ff6cb4dfa96c Mon Sep 17 00:00:00 2001
From: erichkeane <eke...@nvidia.com>
Date: Fri, 23 May 2025 05:57:52 -0700
Subject: [PATCH 2/2] Review comment fixes

---
 clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp 
b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
index 9e45ad8519e11..8b61d3fae3ad0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
@@ -196,12 +196,7 @@ class OpenACCClauseCIREmitter final
                           mlir::Value upperBound, mlir::Value extent) {
     // Arrays always have a start-idx of 0.
     mlir::Value startIdx = createConstantInt(boundLoc, 64, 0);
-    // TODO: OpenACC: It isn't clear that stride would ever be anything other
-    // than '1'?  We emit the type of the reference 'correctly' as far as I
-    // know, so it should just be 1 element each time. We could perhaps use
-    // the 'inBytes' variant here, but it isn't clear what value that gets us.
-    // We might need to revisit this once we try to opt this and see what is
-    // going to happen.
+    // Stride is always 1 in C/C++.
     mlir::Value stride = createConstantInt(boundLoc, 64, 1);
 
     auto bound = builder.create<mlir::acc::DataBoundsOp>(boundLoc, lowerBound,
@@ -261,7 +256,7 @@ class OpenACCClauseCIREmitter final
 
         curVarExpr = section->getBase()->IgnoreParenImpCasts();
       } else {
-        const auto *subscript = dyn_cast<ArraySubscriptExpr>(curVarExpr);
+        const auto *subscript = cast<ArraySubscriptExpr>(curVarExpr);
 
         lowerBound = emitIntExpr(subscript->getIdx());
         // Length of an array index is always 1.

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to