Author: Prateek Gupta Date: 2021-09-27T22:46:32+05:30 New Revision: 4f5c3045514341cd5e98bc7b84dd05d49ef37cb7
URL: https://github.com/llvm/llvm-project/commit/4f5c3045514341cd5e98bc7b84dd05d49ef37cb7 DIFF: https://github.com/llvm/llvm-project/commit/4f5c3045514341cd5e98bc7b84dd05d49ef37cb7.diff LOG: [MLIR] Add to enforce perfect nests while loop fusion This commit adds an option to the affine-cs pipeline to enforce perfect nests while loop fusion. Relevant test cases are also added. Signed-Off-By: Prateek Gupta <prat...@polymagelabs.com> Changes while porting to upstream by Vinayaka Bandishti: 1. Remove test case of reduction fusion into pointwise, since this was being prevented due to an incorrect reason. Added: mlir/test/Transforms/affine-cs-pipeline-perfect-nest.mlir Modified: mlir/include/mlir/Transforms/Passes.h mlir/include/mlir/Transforms/Passes.td mlir/lib/Transforms/AffineCSPipeline.cpp mlir/lib/Transforms/LoopFusion.cpp Removed: ################################################################################ diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h index 4296fcdc590c..3d402eb4f61c 100644 --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -26,6 +26,9 @@ class AffineForOp; class GreedyRewriteConfig; class OpPassManager; +// Cerebras specific options. +struct AffineCSPipelineOptions; + //===----------------------------------------------------------------------===// // Passes //===----------------------------------------------------------------------===// @@ -75,11 +78,11 @@ std::unique_ptr<Pass> createCSEPass(); /// Creates a loop fusion pass which fuses loops. Buffers of size less than or /// equal to `localBufSizeThreshold` are promoted to memory space -/// `fastMemorySpace'. -std::unique_ptr<OperationPass<FuncOp>> -createLoopFusionPass(unsigned fastMemorySpace = 0, - uint64_t localBufSizeThreshold = 0, - bool maximalFusion = false); +/// `fastMemorySpace'. If `enforcePerfectNest` is true, perfect nesting is +/// enforced while fusing. +std::unique_ptr<OperationPass<FuncOp>> createLoopFusionPass( + unsigned fastMemorySpace = 0, uint64_t localBufSizeThreshold = 0, + bool maximalFusion = false, bool enforcePerfectNest = false); /// Creates a loop invariant code motion pass that hoists loop invariant /// instructions out of the loop. @@ -143,7 +146,8 @@ std::unique_ptr<OperationPass<ModuleOp>> createNormalizeMemRefsPass(); /// Creates an affine optimization pipeline including fusion and other /// complementary passes. -void createAffineCSPipeline(OpPassManager &pm); +void createAffineCSPipeline(OpPassManager &pm, + const AffineCSPipelineOptions &options); } // end namespace mlir diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td index 91af2a2c56a9..479b811cbfc0 100644 --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -136,6 +136,8 @@ def AffineLoopFusion : FunctionPass<"affine-loop-fusion"> { "to fast memory space">, Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false", "Enables maximal loop fusion">, + Option<"enforcePerfectNest", "enforce-perfect-nest", "bool", /*default=*/"false", + "Enforces perfect nesting while performing loop fusion."> ]; let dependentDialects = ["memref::MemRefDialect"]; } diff --git a/mlir/lib/Transforms/AffineCSPipeline.cpp b/mlir/lib/Transforms/AffineCSPipeline.cpp index c6994188b0fd..340e4ad50c5b 100644 --- a/mlir/lib/Transforms/AffineCSPipeline.cpp +++ b/mlir/lib/Transforms/AffineCSPipeline.cpp @@ -17,19 +17,32 @@ #include "mlir/Transforms/Passes.h" namespace mlir { -void createAffineCSPipeline(OpPassManager &pm) { +struct AffineCSPipelineOptions + : public PassPipelineOptions<AffineCSPipelineOptions> { + Option<bool> enforce_perfect_nest{ + *this, "enforce-perfect-nest", + llvm::cl::desc("Enables enforcement of perfect nesting while performing " + "loop fusion."), + llvm::cl::init(false)}; +}; + +void createAffineCSPipeline(OpPassManager &pm, + const AffineCSPipelineOptions &options) { pm.addPass(mlir::createCanonicalizerPass()); - pm.addPass(mlir::createLoopFusionPass(/*fastMemorySpace=*/0, - /*localBufSizeThreshold=*/0, - /*maximalFusion=*/true)); + pm.addPass(mlir::createLoopFusionPass( + /*fastMemorySpace=*/0, + /*localBufSizeThreshold=*/0, + /*maximalFusion=*/true, + /*enforcePerfectNest=*/options.enforce_perfect_nest)); pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::createAffineScalarReplacementPass()); pm.addPass(mlir::createCanonicalizerPass()); } + void registerAffineCSPipeline() { - mlir::PassPipelineRegistration<>( + mlir::PassPipelineRegistration<AffineCSPipelineOptions>( "affine-cs-pipeline", - "runs all passes for performing affine loop fusion and other " + "runs passes to perform affine loop fusion and other " "complimentary optimizations.", createAffineCSPipeline); } diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index c19c887a593d..5bedaeafc99f 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -49,10 +49,11 @@ namespace { struct LoopFusion : public AffineLoopFusionBase<LoopFusion> { LoopFusion() = default; LoopFusion(unsigned fastMemorySpace, uint64_t localBufSizeThresholdBytes, - bool maximalFusion) { + bool maximalFusion, bool enforcePerfectNest) { this->fastMemorySpace = fastMemorySpace; this->localBufSizeThreshold = localBufSizeThresholdBytes / 1024; this->maximalFusion = maximalFusion; + this->enforcePerfectNest = enforcePerfectNest; } void runOnFunction() override; @@ -62,9 +63,10 @@ struct LoopFusion : public AffineLoopFusionBase<LoopFusion> { std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopFusionPass(unsigned fastMemorySpace, - uint64_t localBufSizeThreshold, bool maximalFusion) { + uint64_t localBufSizeThreshold, bool maximalFusion, + bool enforcePerfectNest) { return std::make_unique<LoopFusion>(fastMemorySpace, localBufSizeThreshold, - maximalFusion); + maximalFusion, enforcePerfectNest); } namespace { @@ -1397,16 +1399,18 @@ struct GreedyFusion { // unique consumer. // *) Second pass fuses sibling nodes which share no dependence edges. // *) Third pass fuses any remaining producer nodes into their users. - void run() { + void run(bool enforcePerfectNest = false) { // TODO: Run this repeatedly until a fixed-point is reached. - fuseProducerConsumerNodes(/*maxSrcUserCount=*/1); + fuseProducerConsumerNodes(/*maxSrcUserCount=*/1, enforcePerfectNest); fuseSiblingNodes(); fuseProducerConsumerNodes( - /*maxSrcUserCount=*/std::numeric_limits<unsigned>::max()); + /*maxSrcUserCount=*/std::numeric_limits<unsigned>::max(), + enforcePerfectNest); eraseUnusedMemRefAllocations(); } - void fuseProducerConsumerNodes(unsigned maxSrcUserCount) { + void fuseProducerConsumerNodes(unsigned maxSrcUserCount, + bool enforcePerfectNest) { LLVM_DEBUG(llvm::dbgs() << "--- Producer/Consumer Fusion ---\n"); init(); while (!worklist.empty()) { @@ -1606,10 +1610,18 @@ struct GreedyFusion { privateMemrefs.insert(memref); } + // To preserve perfect nesting, the required destination loop depth + // must be the depth of the load operation. + if (enforcePerfectNest) { + if (bestDstLoopDepth != dstLoopDepthTest) { + continue; + } + } + // Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'. - fuseLoops(srcAffineForOp, dstAffineForOp, bestSlice); + fuseLoops(srcAffineForOp, dstAffineForOp, + depthSliceUnions[bestDstLoopDepth - 1]); dstNodeChanged = true; - LLVM_DEBUG(llvm::dbgs() << "Fused src loop " << srcId << " into dst loop " << dstId << " at depth " << bestDstLoopDepth << ":\n" @@ -1971,5 +1983,5 @@ void LoopFusion::runOnFunction() { unsigned localBufSizeThresholdBytes = localBufSizeThreshold * 1024; GreedyFusion fusion(&g, localBufSizeThresholdBytes, fastMemorySpaceOpt, maximalFusion, computeToleranceThreshold); - fusion.run(); + fusion.run(this->enforcePerfectNest); } diff --git a/mlir/test/Transforms/affine-cs-pipeline-perfect-nest.mlir b/mlir/test/Transforms/affine-cs-pipeline-perfect-nest.mlir new file mode 100644 index 000000000000..a810d700a784 --- /dev/null +++ b/mlir/test/Transforms/affine-cs-pipeline-perfect-nest.mlir @@ -0,0 +1,78 @@ +// RUN: mlir-opt %s -affine-cs-pipeline="enforce-perfect-nest" | FileCheck %s + +// This test cases checks the enforcement of perfect nesting for simple matrix multiplication. +func @simple_matmul_one(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf32>, %arg2: memref<16x16xf32>) { + %cst = constant 0.000000e+00 : f32 + affine.for %arg3 = 0 to 16 { + affine.for %arg4 = 0 to 16 { + affine.store %cst, %arg2[%arg3, %arg4] : memref<16x16xf32> + } + } + affine.for %arg3 = 0 to 16 { + affine.for %arg4 = 0 to 16 { + affine.for %arg5 = 0 to 16 { + %0 = affine.load %arg0[%arg3, %arg5] : memref<16x16xf32> + %1 = affine.load %arg1[%arg5, %arg4] : memref<16x16xf32> + %2 = affine.load %arg2[%arg3, %arg4] : memref<16x16xf32> + %3 = mulf %0, %1 : f32 + %4 = addf %3, %2 : f32 + affine.store %4, %arg2[%arg3, %arg4] : memref<16x16xf32> + } + } + } + return +} + +// CHECK-LABEL: func @simple_matmul_one +// CHECK: (%[[LHS:.*]]: memref<16x16xf32>, %[[RHS:.*]]: memref<16x16xf32>, %[[OUT:.*]]: memref<16x16xf32>) { +// CHECK: %[[INIT:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: affine.for %[[i:.*]] = 0 to 16 { +// CHECK-NEXT: affine.for %[[j:.*]] = 0 to 16 { +// CHECK-NEXT: affine.store %[[INIT]], %[[OUT]][%[[i]], %[[j]]] +// CHECK: affine.for %[[i:.*]] = 0 to 16 { +// CHECK-NEXT: affine.for %[[j:.*]] = 0 to 16 { +// CHECK-NEXT: affine.for %[[k:.*]] = 0 to 16 { +// CHECK-NEXT: %[[LHS_VAL:.*]] = affine.load %[[LHS]][%[[i]], %[[k]]] +// CHECK-NEXT: %[[RHS_VAL:.*]] = affine.load %[[RHS]][%[[k]], %[[j]]] +// CHECK-NEXT: %[[OUT_VAL:.*]] = affine.load %[[OUT]][%[[i]], %[[j]]] +// CHECK-NEXT: %[[PROD:.*]] = mulf %[[LHS_VAL]], %[[RHS_VAL]] +// CHECK-NEXT: %[[RES:.*]] = addf %[[PROD]], %[[OUT_VAL]] +// CHECK-NEXT: affine.store %[[RES]], %[[OUT]][%[[i]], %[[j]]] + +func @should_fuse_pointwise(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf32>, %arg2: memref<16x16xf32>){ + affine.for %arg3 = 0 to 16 { + affine.for %arg4 = 0 to 16 { + %0 = affine.load %arg0[%arg3, %arg4] : memref<16x16xf32> + %1 = affine.load %arg1[%arg3, %arg4] : memref<16x16xf32> + %2 = addf %0, %1 : f32 + affine.store %2, %arg0[%arg3, %arg4] : memref<16x16xf32> + } + } + affine.for %arg3 = 0 to 16 { + affine.for %arg4 = 0 to 16 { + affine.for %arg5 = 0 to 16 { + %0 = affine.load %arg0[%arg3, %arg5] : memref<16x16xf32> + %1 = affine.load %arg1[%arg5, %arg4] : memref<16x16xf32> + %2 = affine.load %arg2[%arg3, %arg4] : memref<16x16xf32> + %3 = mulf %0, %1 : f32 + %4 = addf %3, %2 : f32 + affine.store %4, %arg2[%arg3, %arg4] : memref<16x16xf32> + } + } + } + return +} + +// CHECK-LABEL: func @should_fuse_pointwise +// CHECK: affine.for +// CHECK-NEXT: affine.for +// CHECK-NEXT: affine.for +// CHECK-NEXT: {{.*}} = affine.load +// CHECK-NEXT: {{.*}} = affine.load +// CHECK-NEXT: {{.*}} = addf +// CHECK-NEXT: affine.store +// CHECK-NEXT: {{.*}} = affine.load +// CHECK-NEXT: {{.*}} = affine.load +// CHECK-NEXT: {{.*}} = mulf +// CHECK-NEXT: {{.*}} = addf +// CHHECK-NEXT: affine.store _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits