https://github.com/MattPD updated https://github.com/llvm/llvm-project/pull/123867
>From b69344ca228d3954d43c47a316a6e397ecfd8c64 Mon Sep 17 00:00:00 2001 From: "Matt P. Dziubinski" <matt-p.dziubin...@hpe.com> Date: Tue, 21 Jan 2025 19:42:41 -0600 Subject: [PATCH] [OpenMP][SIMD][FIX] Use conservative "omp simd ordered" lowering A proposed fix for #95611 [OpenMP][SIMD] ordered has no effect in a loop SIMD region as of LLVM 18.1.0 Changes: - Implement new lowering behavior: Conservatively serialize "omp simd" loops that have an `omp simd ordered` directive to prevent incorrect vectorization (which results in incorrect execution behavior of the miscompiled program). Implementation outline: - We start with the optimistic default initial value of `LoopStack.setParallel(/*Enable=*/true);` in `CodeGenFunction::EmitOMPSimdInit(const OMPLoopDirective &D)`. - We only disable the loop parallel memory access assumption with `if (HasOrderedDirective) LoopStack.setParallel(/*Enable=*/false);` using the `HasOrderedDirective` (which tests for the presence of an `OMPOrderedDirective`). - This results in no longer incorrectly vectorizing the loop when the `omp simd ordered` directive is present. Motivation: We'd like to prevent incorrect vectorization of the loops marked with the `#pragma omp ordered simd` directive which has previously resulted in miscompiled code. At the same time, we'd like the usage outside of the `#pragma omp ordered simd` context to remain unaffected: Note that in the test "clang/test/OpenMP/ordered_codegen.cpp" we only "lose" the `!llvm.access.group` metadata in `foo_simd` alone. This is a conservative approach, in that some of these loops may in fact be legal to vectorize, but we prefer to avoid miscompilation of the loops that are currently illegal to vectorize. 
A concrete example follows: ```cpp // "test.c" #include <float.h> #include <math.h> #include <omp.h> #include <stdio.h> #include <stdlib.h> #include <time.h> int compare_float(float x1, float x2, float scalar) { const float diff = fabsf(x1 - x2); x1 = fabsf(x1); x2 = fabsf(x2); const float l = (x2 > x1) ? x2 : x1; if (diff <= l * scalar * FLT_EPSILON) return 1; else return 0; } #define ARRAY_SIZE 256 __attribute__((noinline)) void initialization_loop( float X[ARRAY_SIZE][ARRAY_SIZE], float Y[ARRAY_SIZE][ARRAY_SIZE]) { const float max = 1000.0; srand(time(NULL)); for (int r = 0; r < ARRAY_SIZE; r++) { for (int c = 0; c < ARRAY_SIZE; c++) { X[r][c] = ((float)rand() / (float)(RAND_MAX)) * max; Y[r][c] = X[r][c]; } } } __attribute__((noinline)) void omp_simd_loop(float X[ARRAY_SIZE][ARRAY_SIZE]) { for (int r = 1; r < ARRAY_SIZE; ++r) { for (int c = 1; c < ARRAY_SIZE; ++c) { #pragma omp simd for (int k = 2; k < ARRAY_SIZE; ++k) { #pragma omp ordered simd X[r][k] = X[r][k - 2] + sinf((float)(r / c)); } } } } __attribute__((noinline)) int comparison_loop(float X[ARRAY_SIZE][ARRAY_SIZE], float Y[ARRAY_SIZE][ARRAY_SIZE]) { int totalErrors_simd = 0; const float scalar = 1.0; for (int r = 1; r < ARRAY_SIZE; ++r) { for (int c = 1; c < ARRAY_SIZE; ++c) { for (int k = 2; k < ARRAY_SIZE; ++k) { Y[r][k] = Y[r][k - 2] + sinf((float)(r / c)); } } // check row for simd update for (int k = 0; k < ARRAY_SIZE; ++k) { if (!compare_float(X[r][k], Y[r][k], scalar)) { ++totalErrors_simd; } } } return totalErrors_simd; } int main(void) { float X[ARRAY_SIZE][ARRAY_SIZE]; float Y[ARRAY_SIZE][ARRAY_SIZE]; initialization_loop(X, Y); omp_simd_loop(X); const int totalErrors_simd = comparison_loop(X, Y); if (totalErrors_simd) { fprintf(stdout, "totalErrors_simd: %d \n", totalErrors_simd); fprintf(stdout, "%s : %d - FAIL: error in ordered simd computation.\n", __FILE__, __LINE__); } else { fprintf(stdout, "Success!\n"); } return totalErrors_simd; } ``` Before: ``` $ clang -fopenmp-simd -O3 
-ffast-math -lm test.c -o test && ./test totalErrors_simd: 15408 test.c : 76 - FAIL: error in ordered simd computation. ``` clang 19.1.0: https://godbolt.org/z/6EvhxqEhe After: ``` $ clang -fopenmp-simd -O3 -ffast-math test.c -o test && ./test Success! ``` --- clang/lib/CodeGen/CGStmtOpenMP.cpp | 76 ++++++ clang/test/OpenMP/ordered_codegen.cpp | 232 +++++++++--------- clang/test/OpenMP/simd_conservative_ordered.c | 110 +++++++++ 3 files changed, 302 insertions(+), 116 deletions(-) create mode 100644 clang/test/OpenMP/simd_conservative_ordered.c diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 8e694b95dc7e73..3542e939678cf4 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -2457,10 +2457,86 @@ static void emitSimdlenSafelenClause(CodeGenFunction &CGF, } } +// Check for the presence of an `OMPOrderedDirective`, +// i.e., `ordered` in `#pragma omp ordered simd`. +// +// Consider the following source code: +// ``` +// __attribute__((noinline)) void omp_simd_loop(float X[ARRAY_SIZE][ARRAY_SIZE]) +// { +// for (int r = 1; r < ARRAY_SIZE; ++r) { +// for (int c = 1; c < ARRAY_SIZE; ++c) { +// #pragma omp simd +// for (int k = 2; k < ARRAY_SIZE; ++k) { +// #pragma omp ordered simd +// X[r][k] = X[r][k - 2] + sinf((float)(r / c)); +// } +// } +// } +// } +// ``` +// +// Suppose we are in `CodeGenFunction::EmitOMPSimdInit(const OMPLoopDirective +// &D)`. 
By examining `D.dump()` we have the following AST containing +// `OMPOrderedDirective`: +// +// ``` +// OMPSimdDirective 0x1c32950 +// `-CapturedStmt 0x1c32028 +// |-CapturedDecl 0x1c310e8 +// | |-ForStmt 0x1c31e30 +// | | |-DeclStmt 0x1c31298 +// | | | `-VarDecl 0x1c31208 used k 'int' cinit +// | | | `-IntegerLiteral 0x1c31278 'int' 2 +// | | |-<<<NULL>>> +// | | |-BinaryOperator 0x1c31308 'int' '<' +// | | | |-ImplicitCastExpr 0x1c312f0 'int' <LValueToRValue> +// | | | | `-DeclRefExpr 0x1c312b0 'int' lvalue Var 0x1c31208 'k' 'int' +// | | | `-IntegerLiteral 0x1c312d0 'int' 256 +// | | |-UnaryOperator 0x1c31348 'int' prefix '++' +// | | | `-DeclRefExpr 0x1c31328 'int' lvalue Var 0x1c31208 'k' 'int' +// | | `-CompoundStmt 0x1c31e18 +// | | `-OMPOrderedDirective 0x1c31dd8 +// | | |-OMPSimdClause 0x1c31380 +// | | `-CapturedStmt 0x1c31cd0 +// ``` +// +// Note the presence of `OMPOrderedDirective` above: +// It's (transitively) nested in a `CapturedStmt` representing the pragma +// annotated compound statement. Thus, we need to consider this nesting and +// include checking the `getCapturedStmt` in this case. +static bool hasOrderedDirective(const Stmt *S) { + if (isa<OMPOrderedDirective>(S)) + return true; + + if (const auto *CS = dyn_cast<CapturedStmt>(S)) + return hasOrderedDirective(CS->getCapturedStmt()); + + for (const Stmt *Child : S->children()) { + if (Child && hasOrderedDirective(Child)) + return true; + } + + return false; +} + +static void applyConservativeSimdOrderedDirective(const Stmt &AssociatedStmt, + LoopInfoStack &LoopStack) { + // Check for the presence of an `OMPOrderedDirective` + // i.e., `ordered` in `#pragma omp ordered simd` + bool HasOrderedDirective = hasOrderedDirective(&AssociatedStmt); + // If present then conservatively disable loop vectorization + // analogously to how `emitSimdlenSafelenClause` does. 
+ if (HasOrderedDirective) + LoopStack.setParallel(/*Enable=*/false); +} + void CodeGenFunction::EmitOMPSimdInit(const OMPLoopDirective &D) { // Walk clauses and process safelen/lastprivate. LoopStack.setParallel(/*Enable=*/true); LoopStack.setVectorizeEnable(); + const Stmt *AssociatedStmt = D.getAssociatedStmt(); + applyConservativeSimdOrderedDirective(*AssociatedStmt, LoopStack); emitSimdlenSafelenClause(*this, D); if (const auto *C = D.getSingleClause<OMPOrderClause>()) if (C->getKind() == OMPC_ORDER_concurrent) diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp index 67285cfaef34d5..5cd95f1927e5ce 100644 --- a/clang/test/OpenMP/ordered_codegen.cpp +++ b/clang/test/OpenMP/ordered_codegen.cpp @@ -572,30 +572,30 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-NEXT: [[ADD6:%.*]] = add i32 [[TMP9]], 1 // CHECK1-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP8]], [[ADD6]] // CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP11]], 1 // CHECK1-NEXT: [[ADD8:%.*]] = add i32 
[[TMP10]], [[MUL]] -// CHECK1-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4 // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-NEXT: call void @__captured_stmt(ptr [[I5]]), !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK1-NEXT: call void @__captured_stmt(ptr [[I5]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP13]], 1 -// CHECK1-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// CHECK1-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK1: omp.inner.for.end: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -645,31 +645,31 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: store i32 [[TMP27]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND29:%.*]] // CHECK1: omp.inner.for.cond29: -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group 
[[ACC_GRP7:![0-9]+]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[ADD30:%.*]] = add i32 [[TMP29]], 1 // CHECK1-NEXT: [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]] // CHECK1-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]] // CHECK1: omp.inner.for.body32: -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-NEXT: [[MUL33:%.*]] = mul i32 [[TMP31]], 1 // CHECK1-NEXT: [[ADD34:%.*]] = add i32 [[TMP30]], [[MUL33]] -// CHECK1-NEXT: store i32 [[ADD34]], ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: store i32 [[ADD34]], ptr [[I28]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[I28]], align 4 // CHECK1-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM35]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX36]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: call void @__captured_stmt.1(ptr [[I28]]), !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX36]], align 4 +// CHECK1-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE37:%.*]] // CHECK1: omp.body.continue37: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC38:%.*]] // CHECK1: omp.inner.for.inc38: -// CHECK1-NEXT: 
[[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-NEXT: [[ADD39:%.*]] = add i32 [[TMP33]], 1 -// CHECK1-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK1-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_IV16]], align 4 +// CHECK1-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK1: omp.inner.for.end40: // CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1: omp.dispatch.inc: @@ -1201,32 +1201,32 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1-IRBUILDER: omp.inner.for.cond: -// CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] -// CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD6:%.*]] = add i32 [[TMP8]], 1 // CHECK1-IRBUILDER-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]] // CHECK1-IRBUILDER-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body: -// CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] 
+// CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL:%.*]] = mul i32 [[TMP10]], 1 // CHECK1-IRBUILDER-NEXT: [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]] -// CHECK1-IRBUILDER-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4 // CHECK1-IRBUILDER-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64 // CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK1-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-IRBUILDER-NEXT: call void @__captured_stmt(ptr [[I5]]), !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK1-IRBUILDER-NEXT: call void @__captured_stmt(ptr [[I5]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1-IRBUILDER: omp.inner.for.inc: -// CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD9:%.*]] = add i32 [[TMP12]], 1 -// CHECK1-IRBUILDER-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// CHECK1-IRBUILDER-NEXT: store i32 
[[ADD9]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK1-IRBUILDER: omp.inner.for.end: // CHECK1-IRBUILDER-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-IRBUILDER-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -1278,34 +1278,34 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store i32 [[TMP26]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] // CHECK1-IRBUILDER: omp.inner.for.cond30: -// CHECK1-IRBUILDER-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]] -// CHECK1-IRBUILDER-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD31:%.*]] = add i32 [[TMP28]], 1 // CHECK1-IRBUILDER-NEXT: [[CMP32:%.*]] = icmp ult i32 [[TMP27]], [[ADD31]] // CHECK1-IRBUILDER-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END42:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body33: -// CHECK1-IRBUILDER-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-IRBUILDER-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL34:%.*]] = mul i32 [[TMP30]], 1 // CHECK1-IRBUILDER-NEXT: [[ADD35:%.*]] = add i32 [[TMP29]], [[MUL34]] -// CHECK1-IRBUILDER-NEXT: store i32 [[ADD35]], ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-IRBUILDER-NEXT: [[TMP31:%.*]] = load i32, ptr [[I28]], align 4, 
!llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: store i32 [[ADD35]], ptr [[I28]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP31:%.*]] = load i32, ptr [[I28]], align 4 // CHECK1-IRBUILDER-NEXT: [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64 // CHECK1-IRBUILDER-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM36]] -// CHECK1-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX37]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]), !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX37]], align 4 +// CHECK1-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body33.ordered.after: // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] // CHECK1-IRBUILDER: omp.body.continue38: // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC39:%.*]] // CHECK1-IRBUILDER: omp.inner.for.inc39: -// CHECK1-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 -// CHECK1-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) -// CHECK1-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]), !llvm.access.group [[ACC_GRP7]] -// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK1-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) +// 
CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK1-IRBUILDER: omp.inner.for.end42: // CHECK1-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1-IRBUILDER: omp.dispatch.inc: @@ -1812,30 +1812,30 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK3-NEXT: [[ADD6:%.*]] = add i32 [[TMP9]], 1 // CHECK3-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP8]], [[ADD6]] // CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[MUL:%.*]] = mul i32 [[TMP11]], 1 // CHECK3-NEXT: [[ADD8:%.*]] = add i32 [[TMP10]], [[MUL]] -// CHECK3-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4 // CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK3-NEXT: store float 
0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-NEXT: call void @__captured_stmt(ptr [[I5]]), !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK3-NEXT: call void @__captured_stmt(ptr [[I5]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[ADD9:%.*]] = add i32 [[TMP13]], 1 -// CHECK3-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// CHECK3-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK3: omp.inner.for.end: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -1885,31 +1885,31 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: store i32 [[TMP27]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND29:%.*]] // CHECK3: omp.inner.for.cond29: -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]] -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: [[ADD30:%.*]] = add i32 [[TMP29]], 1 // CHECK3-NEXT: [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]] // CHECK3-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]] // CHECK3: omp.inner.for.body32: 
-// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-NEXT: [[MUL33:%.*]] = mul i32 [[TMP31]], 1 // CHECK3-NEXT: [[ADD34:%.*]] = add i32 [[TMP30]], [[MUL33]] -// CHECK3-NEXT: store i32 [[ADD34]], ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: store i32 [[ADD34]], ptr [[I28]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, ptr [[I28]], align 4 // CHECK3-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP32]] to i64 // CHECK3-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM35]] -// CHECK3-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX36]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: call void @__captured_stmt.1(ptr [[I28]]), !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX36]], align 4 +// CHECK3-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE37:%.*]] // CHECK3: omp.body.continue37: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC38:%.*]] // CHECK3: omp.inner.for.inc38: -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-NEXT: [[ADD39:%.*]] = add i32 [[TMP33]], 1 -// CHECK3-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]] +// 
CHECK3-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_IV16]], align 4 +// CHECK3-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK3: omp.inner.for.end40: // CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK3: omp.dispatch.inc: @@ -2441,32 +2441,32 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3-IRBUILDER: omp.inner.for.cond: -// CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] -// CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD6:%.*]] = add i32 [[TMP8]], 1 // CHECK3-IRBUILDER-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]] // CHECK3-IRBUILDER-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body: -// CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL:%.*]] = mul i32 [[TMP10]], 1 // CHECK3-IRBUILDER-NEXT: [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]] -// CHECK3-IRBUILDER-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] +// 
CHECK3-IRBUILDER-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4 // CHECK3-IRBUILDER-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64 // CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK3-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-IRBUILDER-NEXT: call void @__captured_stmt(ptr [[I5]]), !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK3-IRBUILDER-NEXT: call void @__captured_stmt(ptr [[I5]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3-IRBUILDER: omp.inner.for.inc: -// CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD9:%.*]] = add i32 [[TMP12]], 1 -// CHECK3-IRBUILDER-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// CHECK3-IRBUILDER-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4 +// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK3-IRBUILDER: omp.inner.for.end: // CHECK3-IRBUILDER-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK3-IRBUILDER-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -2518,34 +2518,34 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store i32 [[TMP26]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: br 
label [[OMP_INNER_FOR_COND30:%.*]] // CHECK3-IRBUILDER: omp.inner.for.cond30: -// CHECK3-IRBUILDER-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]] -// CHECK3-IRBUILDER-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD31:%.*]] = add i32 [[TMP28]], 1 // CHECK3-IRBUILDER-NEXT: [[CMP32:%.*]] = icmp ult i32 [[TMP27]], [[ADD31]] // CHECK3-IRBUILDER-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END42:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body33: -// CHECK3-IRBUILDER-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-IRBUILDER-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL34:%.*]] = mul i32 [[TMP30]], 1 // CHECK3-IRBUILDER-NEXT: [[ADD35:%.*]] = add i32 [[TMP29]], [[MUL34]] -// CHECK3-IRBUILDER-NEXT: store i32 [[ADD35]], ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-IRBUILDER-NEXT: [[TMP31:%.*]] = load i32, ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: store i32 [[ADD35]], ptr [[I28]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP31:%.*]] = load i32, ptr [[I28]], align 4 // CHECK3-IRBUILDER-NEXT: [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64 // CHECK3-IRBUILDER-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM36]] -// CHECK3-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX37]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-IRBUILDER-NEXT: call 
void @__captured_stmt.1(ptr [[I28]]), !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX37]], align 4 +// CHECK3-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body33.ordered.after: // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] // CHECK3-IRBUILDER: omp.body.continue38: // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC39:%.*]] // CHECK3-IRBUILDER: omp.inner.for.inc39: -// CHECK3-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 -// CHECK3-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) -// CHECK3-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]), !llvm.access.group [[ACC_GRP7]] -// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK3-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) +// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK3-IRBUILDER: omp.inner.for.end42: // CHECK3-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK3-IRBUILDER: omp.dispatch.inc: @@ -2885,33 +2885,33 @@ void foo_simd(int low, int up) { // CHECK5-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group 
[[ACC_GRP9:![0-9]+]] -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK5-NEXT: [[ADD6:%.*]] = add i32 [[TMP8]], 1 // CHECK5-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]] // CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = mul i32 [[TMP10]], 1 // CHECK5-NEXT: [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]] -// CHECK5-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4 // CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64 // CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK5-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4 // CHECK5-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP12]] to i64 // CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM9]] -// CHECK5-NEXT: store float 1.000000e+00, ptr 
[[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: store float 1.000000e+00, ptr [[ARRAYIDX10]], align 4 // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: // CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK5-NEXT: [[ADD11:%.*]] = add i32 [[TMP13]], 1 -// CHECK5-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK5-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] // CHECK5: omp.inner.for.end: // CHECK5-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -2951,33 +2951,33 @@ void foo_simd(int low, int up) { // CHECK5-NEXT: store i32 [[TMP25]], ptr [[DOTOMP_IV30]], align 4 // CHECK5-NEXT: br label [[OMP_INNER_FOR_COND32:%.*]] // CHECK5: omp.inner.for.cond32: -// CHECK5-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4, !llvm.access.group [[ACC_GRP13:![0-9]+]] -// CHECK5-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK5-NEXT: [[ADD33:%.*]] = add i32 [[TMP27]], 1 // CHECK5-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP26]], [[ADD33]] // CHECK5-NEXT: br i1 [[CMP34]], label [[OMP_INNER_FOR_BODY35:%.*]], label [[OMP_INNER_FOR_END45:%.*]] // CHECK5: omp.inner.for.body35: -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK5-NEXT: [[TMP29:%.*]] = load 
i32, ptr [[DOTOMP_IV30]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4 +// CHECK5-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4 // CHECK5-NEXT: [[MUL36:%.*]] = mul i32 [[TMP29]], 1 // CHECK5-NEXT: [[ADD37:%.*]] = add i32 [[TMP28]], [[MUL36]] -// CHECK5-NEXT: store i32 [[ADD37]], ptr [[I31]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK5-NEXT: [[TMP30:%.*]] = load i32, ptr [[I31]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: store i32 [[ADD37]], ptr [[I31]], align 4 +// CHECK5-NEXT: [[TMP30:%.*]] = load i32, ptr [[I31]], align 4 // CHECK5-NEXT: [[IDXPROM38:%.*]] = sext i32 [[TMP30]] to i64 // CHECK5-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM38]] -// CHECK5-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX39]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK5-NEXT: [[TMP31:%.*]] = load i32, ptr [[I31]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX39]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = load i32, ptr [[I31]], align 4 // CHECK5-NEXT: [[IDXPROM40:%.*]] = sext i32 [[TMP31]] to i64 // CHECK5-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM40]] -// CHECK5-NEXT: store float 1.000000e+00, ptr [[ARRAYIDX41]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: store float 1.000000e+00, ptr [[ARRAYIDX41]], align 4 // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE42:%.*]] // CHECK5: omp.body.continue42: // CHECK5-NEXT: br label [[OMP_INNER_FOR_INC43:%.*]] // CHECK5: omp.inner.for.inc43: -// CHECK5-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4 // CHECK5-NEXT: [[ADD44:%.*]] = add i32 [[TMP32]], 1 -// CHECK5-NEXT: store i32 [[ADD44]], ptr [[DOTOMP_IV30]], align 4, !llvm.access.group 
[[ACC_GRP13]] -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND32]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK5-NEXT: store i32 [[ADD44]], ptr [[DOTOMP_IV30]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND32]], !llvm.loop [[LOOP11:![0-9]+]] // CHECK5: omp.inner.for.end45: // CHECK5-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4 // CHECK5-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_20]], align 4 diff --git a/clang/test/OpenMP/simd_conservative_ordered.c b/clang/test/OpenMP/simd_conservative_ordered.c new file mode 100644 index 00000000000000..15fb61c7263a37 --- /dev/null +++ b/clang/test/OpenMP/simd_conservative_ordered.c @@ -0,0 +1,110 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --functions "omp_simd_loop" --check-globals smart --filter "access|\%omp.inner.for.cond.*\!llvm.loop" +// +// RUN: %clang -g0 -fopenmp-simd -x c -S -emit-llvm %s -o - | FileCheck %s +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +int compare_float(float x1, float x2, float scalar) { + const float diff = fabsf(x1 - x2); + x1 = fabsf(x1); + x2 = fabsf(x2); + const float l = (x2 > x1) ? x2 : x1; + if (diff <= l * scalar * FLT_EPSILON) + return 1; + else + return 0; +} + +#define ARRAY_SIZE 256 + +__attribute__((noinline)) void initialization_loop( + float X[ARRAY_SIZE][ARRAY_SIZE], float Y[ARRAY_SIZE][ARRAY_SIZE]) { + const float max = 1000.0; + srand(time(NULL)); + for (int r = 0; r < ARRAY_SIZE; r++) { + for (int c = 0; c < ARRAY_SIZE; c++) { + X[r][c] = ((float)rand() / (float)(RAND_MAX)) * max; + Y[r][c] = X[r][c]; + } + } +} + +// CHECK-LABEL: @omp_simd_loop( +// do not emit llvm.access.group metadata due to usage of ordered clause. 
+// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: store i32 {{.+}}, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load ptr, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load float, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: call float @sinf(float noundef {{.+}}) #[[ATTR:[0-9]+]], !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load ptr, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: store float {{.+}}, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: load i32, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} +// CHECK-NOT: store i32 {{.+}}, ptr {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} + +// CHECK: br label [[OMP_INNER_FOR_COND:%.*]], !llvm.loop [[LOOP9:![0-9]+]] + +__attribute__((noinline)) void omp_simd_loop(float X[ARRAY_SIZE][ARRAY_SIZE]) { + for (int r = 1; r < ARRAY_SIZE; ++r) { + for (int c = 1; c < ARRAY_SIZE; ++c) { +#pragma omp simd + for (int k = 2; k < ARRAY_SIZE; ++k) { +#pragma omp ordered simd + X[r][k] = X[r][k - 2] + sinf((float)(r / c)); + } + } + } +} + +__attribute__((noinline)) int comparison_loop(float X[ARRAY_SIZE][ARRAY_SIZE], + float Y[ARRAY_SIZE][ARRAY_SIZE]) { + int totalErrors_simd = 0; + const float scalar = 1.0; + for (int r = 1; r < ARRAY_SIZE; ++r) { + for (int c = 1; c < ARRAY_SIZE; ++c) {
+ for (int k = 2; k < ARRAY_SIZE; ++k) { + Y[r][k] = Y[r][k - 2] + sinf((float)(r / c)); + } + } + // check row for simd update + for (int k = 0; k < ARRAY_SIZE; ++k) { + if (!compare_float(X[r][k], Y[r][k], scalar)) { + ++totalErrors_simd; + } + } + } + return totalErrors_simd; +} + +int main(void) { + float X[ARRAY_SIZE][ARRAY_SIZE]; + float Y[ARRAY_SIZE][ARRAY_SIZE]; + + initialization_loop(X, Y); + omp_simd_loop(X); + const int totalErrors_simd = comparison_loop(X, Y); + + if (totalErrors_simd) { + fprintf(stdout, "totalErrors_simd: %d \n", totalErrors_simd); + fprintf(stdout, "%s : %d - FAIL: error in ordered simd computation.\n", + __FILE__, __LINE__); + } else { + fprintf(stdout, "Success!\n"); + } + + return totalErrors_simd; +} +//. +// CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]]} +// CHECK-NOT: !{!"llvm.loop.parallel_accesses" +// CHECK: [[META10]] = !{!"llvm.loop.vectorize.enable", i1 true} +//. _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits