https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/149293
From 5deba69e57d2e13927987de919fecd30a87ac33d Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzyn...@arm.com>
Date: Wed, 16 Jul 2025 17:08:55 +0000
Subject: [PATCH] [mlir][linalg] Enable scalable vectorization of linalg.unpack (WIP)

This patch updates `vectorizeAsTensorUnpackOp` to support scalable
vectorization by requiring user-specified vector sizes for both the
_read_ and _write_ operations involved in `linalg.unpack`. Detailed
rationale and an example are provided below.

Conceptually, `linalg.unpack` consists of the following high-level steps:
  1. _Read_ from the source tensor.
  2. Transpose the value read in step (1).
  3. _Write_ the value from step (2) into the destination tensor.

Currently, when vectorizing with user-provided vector sizes, only the
sizes for the _write_ operation (step 3) are required. Sizes for the
_read_ operation (step 1) are inferred from static shapes and inner tile
sizes.

This logic breaks when the input shapes or tile sizes are dynamic;
indeed, `vectorizeUnPackOpPrecondition` currently rejects such cases and
vectorization fails. This patch addresses the issue by requiring
explicit vector sizes for both the read and write sides, enabling
scalable vectorization in such cases.

Example:

```mlir
func.func @unpack(%in: tensor<1x1x8x?xf32>, %out: tensor<8x?xf32>) -> tensor<8x?xf32> {
  %vs = vector.vscale
  %c8 = arith.constant 8 : index
  %tile_size = arith.muli %vs, %c8 : index

  %unpack = linalg.unpack %in
    inner_dims_pos = [0, 1]
    inner_tiles = [8, %tile_size]
    into %out : tensor<1x1x8x?xf32> -> tensor<8x?xf32>
  return %unpack : tensor<8x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.structured.vectorize %0 vector_sizes [1, 1, 8, [8], 8, [8]] : !transform.any_op
    //                                              \           /  \    /
    //                                               read-sizes    write-sizes
    transform.yield
  }
}
```

Finally, this patch also extends `createReadOrMaskedRead` and
`createWriteOrMaskedWrite` to take scalable flags.
---
 .../mlir/Dialect/Vector/Utils/VectorUtils.h   |   4 +-
 .../Linalg/Transforms/Vectorization.cpp       | 130 ++++++++++++-----
 mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp |   7 +-
 .../Linalg/vectorization/linalg-ops.mlir      | 135 ++++++++++++++----
 4 files changed, 206 insertions(+), 70 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index cc8421b23a074..be71483410744 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -225,7 +225,9 @@ bool isLinearizableVector(VectorType type);
 ///
 /// Note: all read offsets are set to 0.
 Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source,
-                             ArrayRef<int64_t> inputVectorSizes, Value padValue,
+                             ArrayRef<int64_t> inputVectorSizes,
+                             ArrayRef<bool> inputScalableVecSizes,
+                             Value padValue,
                              bool useInBoundsInsteadOfMasking = false);
 
 /// Returns success if `inputVectorSizes` is a valid masking configuraion for
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 4add50f4b36e5..6d6ff645d5e29 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1709,7 +1709,8 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vecToStore,
     return write;
 
   // Compute the mask and mask the write Op.
-  auto writeMaskType = VectorType::get(vecToStoreShape, builder.getI1Type());
+  auto writeMaskType = VectorType::get(vecToStoreShape, builder.getI1Type(),
+                                       vecToStoreType.getScalableDims());
   SmallVector<OpFoldResult> destSizes =
       tensor::getMixedSizes(builder, loc, dest);
 
@@ -1801,8 +1802,8 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
   for (auto [idx, size] : enumerate(innerTiles))
     inputShape[innerDimsPos[idx]] *= size;
   auto maskedRead = vector::createReadOrMaskedRead(
-      rewriter, loc, packOp.getSource(), inputShape, padValue,
-      useInBoundsInsteadOfMasking);
+      rewriter, loc, packOp.getSource(), inputShape,
+      /*inputScalableVecSizes=*/{}, padValue, useInBoundsInsteadOfMasking);
 
   // Create ShapeCastOp.
   SmallVector<int64_t> destShape(inputVectorSizes);
@@ -1828,18 +1829,23 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
   return success();
 }
 
-/// Vectorize a `linalg::UnPackOp` to these 4 Ops:
-///   Vector::TransferReadOp - Reads a vector from the source tensor
-///   vector::TransposeOp - Transpose the Source tensor
-///   ShapeCastOp - Reshape the data based on the target.
-///   vector::TransferWriteOp. - Write the result vector back to the destination
-///   tensor.
-///   If the vector sizes are not provided:
+/// Vectorize `linalg.unpack %src into %dest` as:
+///   // Reads a vector from the source tensor
+///   %read = vector.transfer_read %src
+///   // Transpose %read as specified in `outer_dims_perm` attribute
+///   %tr = vector.transpose %read
+///   // Reshape the data based on the target
+///   %sc = vector.shape_cast %tr
+///   // Write the result vector to the destination tensor.
+///   vector.transfer_write %sc into %dest
+///
+/// If the vector sizes are not provided:
 /// * the vector sizes are determined by the input operand and attributes,
 /// * update the inBounds attribute instead of masking.
 static LogicalResult
 vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
                           ArrayRef<int64_t> inputVectorSizes,
+                          ArrayRef<bool> inputScalableVecDims,
                           SmallVectorImpl<Value> &newResults) {
 
   // TODO: Introduce a parent class that will handle the insertion point update.
@@ -1856,25 +1862,54 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
 
   auto destSize = unpackOp.getDestRank();
 
-  if (!inputVectorSizes.empty())
-    assert(inputVectorSizes.size() == destSize &&
+  if (!inputVectorSizes.empty()) {
+    assert(inputVectorSizes.size() == destSize + sourceShape.size() &&
            "Incorrect number of input vector sizes");
+  }
+
+  SmallVector<bool> readScalableVectorFlags;
+  SmallVector<bool> writeScalableVectorFlags;
+  SmallVector<int64_t> readVectorSizes;
+  SmallVector<int64_t> writeVectorSizes;
 
-  // vectorSizes is the shape of the vector that will be used to do final
+  // Split input-vector-sizes into vector sizes for the read and write
+  // operations.
+  if (!inputVectorSizes.empty()) {
+    readVectorSizes.append(inputVectorSizes.begin(),
+                           inputVectorSizes.begin() + sourceShape.size());
+    writeVectorSizes.append(inputVectorSizes.begin() + sourceShape.size(),
+                            inputVectorSizes.end());
+  }
+  if (!inputScalableVecDims.empty()) {
+    readScalableVectorFlags.append(inputScalableVecDims.begin(),
+                                   inputScalableVecDims.begin() +
+                                       sourceShape.size());
+    writeScalableVectorFlags.append(inputScalableVecDims.begin() +
+                                        sourceShape.size(),
+                                    inputScalableVecDims.end());
+  } else {
+    readScalableVectorFlags = SmallVector<bool>(sourceShape.size(), false);
+    writeScalableVectorFlags = SmallVector<bool>(destSize, false);
+  }
+
+  // writeVectorSizes is the shape of the vector that will be used to do final
   // write on the destination tensor. It is set like this: Let's say the
   // source tensor is rank 'M' and the dest tensor rank 'N', where N <= M.
   // Thus:
-  // 1. vectorSizes = sourceShape.take_front(N)
-  // 2. if outer_dims_perms is present: do that permutation on vectorSizes.
+  // 1. writeVectorSizes = sourceShape.take_front(N)
+  // 2. if outer_dims_perms is present: do that permutation on writeVectorSizes.
   // 3. multiply all the locations in vectorSize pointed by innerDimPos by the
   //    innerTiles attribute value.
-  SmallVector<int64_t> vectorSizes(inputVectorSizes);
-  if (vectorSizes.empty()) {
-    llvm::append_range(vectorSizes, sourceShape.take_front(destSize));
+  // SmallVector<int64_t> writeVectorSizes(inputVectorSizes);
+  if (writeVectorSizes.empty()) {
+    if (ShapedType::isDynamicShape(sourceShape))
+      return failure();
+
+    llvm::append_range(writeVectorSizes, sourceShape.take_front(destSize));
     if (!outerDimsPerm.empty())
-      applyPermutationToVector(vectorSizes, outerDimsPerm);
+      applyPermutationToVector(writeVectorSizes, outerDimsPerm);
     for (auto [i, pos] : llvm::enumerate(innerDimPos))
-      vectorSizes[pos] *= innerTiles[i];
+      writeVectorSizes[pos] *= innerTiles[i];
 
     useInBoundsInsteadOfMasking = true;
   }
@@ -1898,17 +1933,20 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   //   After applying outer_dims_perm: [8, 16]
   //   After appending the rest of the sourceShape: [8, 16, 32, 16]
 
-  SmallVector<int64_t> readVectorSizes(vectorSizes.begin(), vectorSizes.end());
-
-  for (auto [index, size] : enumerate(innerTiles)) {
-    readVectorSizes[innerDimPos[index]] =
-        llvm::divideCeil(readVectorSizes[innerDimPos[index]], size);
-  }
-  if (!outerDimsPerm.empty()) {
-    applyPermutationToVector(readVectorSizes, outerDimsPerm);
+  if (readVectorSizes.empty()) {
+    // Compute read-vector-sizes based on the write-vector-sizes and inner tile
+    // sizes. Note, this will only work when all sizes are static.
+    readVectorSizes = writeVectorSizes;
+    for (auto [index, size] : enumerate(innerTiles)) {
+      readVectorSizes[innerDimPos[index]] =
+          llvm::divideCeil(readVectorSizes[innerDimPos[index]], size);
+    }
+    if (!outerDimsPerm.empty()) {
+      applyPermutationToVector(readVectorSizes, outerDimsPerm);
+    }
+    readVectorSizes.append(sourceShape.begin() + writeVectorSizes.size(),
+                           sourceShape.end());
   }
-  readVectorSizes.append(sourceShape.begin() + vectorSizes.size(),
-                         sourceShape.end());
 
   ReifiedRankedShapedTypeDims reifiedRetShapes;
   LogicalResult status =
@@ -1926,7 +1964,8 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   // Read result, mask if necessary. If transferReadOp shape is not equal
   // to shape of source, then a mask is necessary.
   Value readResult = vector::createReadOrMaskedRead(
-      rewriter, loc, unpackOp.getSource(), readVectorSizes, padValue,
+      rewriter, loc, unpackOp.getSource(), readVectorSizes,
+      readScalableVectorFlags, padValue,
       /*useInBoundsInsteadOfMasking=*/false);
 
   PackingMetadata packMetadata;
@@ -1946,15 +1985,17 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp,
   RankedTensorType collapsedType = tensor::CollapseShapeOp::inferCollapsedType(
       stripMineTensorType, packMetadata.reassociations);
   mlir::VectorType vecCollapsedType =
-      VectorType::get(collapsedType.getShape(), collapsedType.getElementType());
+      VectorType::get(collapsedType.getShape(), collapsedType.getElementType(),
+                      writeScalableVectorFlags);
   vector::ShapeCastOp shapeCastOp = rewriter.create<vector::ShapeCastOp>(
       loc, vecCollapsedType, transposeOp->getResult(0));
 
-  // writeVectorSizes had to match the shapecast shape for dynamic sizes,
+  // writeVectorSizesFinal had to match the shapecast shape for dynamic sizes,
   // otherwise the validator complains that the mask size is invalid.
-  SmallVector<int64_t> writeVectorSizes(
+  // FIXME: We should not override write-vector-sizes like this.
+  SmallVector<int64_t> writeVectorSizesFinal(
       unpackOp.getDestType().hasStaticShape()
-          ? vectorSizes
+          ? writeVectorSizes
           : shapeCastOp.getResultVectorType().getShape());
   Operation *write = createWriteOrMaskedWrite(
       rewriter, loc, shapeCastOp.getResult(), unpackOp.getDest(),
@@ -1984,7 +2025,8 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
   (void)status; // prevent unused variable warning on non-assert builds
   assert(succeeded(status) && "failed to reify result shapes");
   auto maskedRead = vector::createReadOrMaskedRead(
-      rewriter, loc, padOp.getSource(), inputVectorSizes, padValue,
+      rewriter, loc, padOp.getSource(), inputVectorSizes,
+      /*inputScalableVecSizes=*/{}, padValue,
       /*useInBoundsInsteadOfMasking=*/false);
 
   // Create Xfer write Op
@@ -2069,6 +2111,9 @@ static LogicalResult
 vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp,
                               ArrayRef<int64_t> inputVectorSizes) {
 
+  // FIXME!!!
+  return success();
+
   if (llvm::any_of(unpackOp.getInnerTiles(), [](OpFoldResult res) {
         return !getConstantIntValue(res).has_value();
       })) {
@@ -2319,6 +2364,7 @@ vectorizePackOpPrecondition(linalg::PackOp packOp,
     LDBG("pad value is not constant: " << packOp << "\n");
     return failure();
   }
+
   ArrayRef<int64_t> resultTensorShape = packOp.getDestType().getShape();
   bool satisfyEmptyCond = true;
   if (inputVectorSizes.empty()) {
@@ -2397,6 +2443,10 @@ vectorizeScalableVectorPrecondition(Operation *op,
   if (numOfScalableDims == 0)
     return success();
 
+  // FIXME!!!
+  return success();
+
+  // TODO: Check the following!
   auto linalgOp = dyn_cast<LinalgOp>(op);
 
   // Cond 1: There's been no need for scalable vectorisation of
@@ -2498,7 +2548,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
           isa<linalg::MatmulTransposeAOp>(op) ||
           isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
           isa<linalg::MatvecOp>(op) || isa<linalg::Mmt4DOp>(op) ||
-          hasReductionIterator(linalgOp));
+          isa<linalg::UnPackOp>(op) || hasReductionIterator(linalgOp));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
@@ -2627,7 +2677,8 @@ FailureOr<VectorizationResult> mlir::linalg::vectorize(
         })
         .Case<linalg::UnPackOp>([&](auto unpackOp) {
           return vectorizeAsTensorUnpackOp(rewriter, unpackOp,
-                                           inputVectorSizes, results);
+                                           inputVectorSizes,
+                                           inputScalableVecDims, results);
         })
         .Case<tensor::InsertSliceOp>([&](auto sliceOp) {
           return vectorizeAsInsertSliceOp(rewriter, sliceOp, inputVectorSizes,
@@ -3017,7 +3068,8 @@ vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp,
   SmallVector<Value> readIndices(
       vecType.getRank(), rewriter.create<arith::ConstantIndexOp>(loc, 0));
   Value read = mlir::vector::createReadOrMaskedRead(
-      rewriter, loc, source, vecType.getShape(), padValue,
+      rewriter, loc, source, vecType.getShape(), /*inputScalableVecSizes=*/{},
+      padValue,
       /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty());
 
   // Create write
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index 7e4984582b373..0a8729c0e473e 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -319,6 +319,7 @@ bool vector::isLinearizableVector(VectorType type) {
 Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
                                      Value source,
                                      ArrayRef<int64_t> inputVectorSizes,
+                                     ArrayRef<bool> inputScalableVecSizes,
                                      Value padValue,
                                      bool useInBoundsInsteadOfMasking) {
   assert(!llvm::is_contained(inputVectorSizes, ShapedType::kDynamic) &&
@@ -327,7 +328,8 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
   auto sourceShape = sourceShapedType.getShape();
   assert(sourceShape.size() == inputVectorSizes.size() &&
          "expected same ranks.");
-  auto vectorType = VectorType::get(inputVectorSizes, padValue.getType());
+  auto vectorType = VectorType::get(inputVectorSizes, padValue.getType(),
+                                    inputScalableVecSizes);
   assert(padValue.getType() == sourceShapedType.getElementType() &&
          "expected same pad element type to match source element type");
   int64_t readRank = inputVectorSizes.size();
@@ -354,7 +356,8 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
 
   SmallVector<OpFoldResult> mixedSourceDims =
       tensor::getMixedSizes(builder, loc, source);
-  auto maskType = VectorType::get(inputVectorSizes, builder.getI1Type());
+  auto maskType = VectorType::get(inputVectorSizes, builder.getI1Type(),
+                                  inputScalableVecSizes);
   Value mask =
       builder.create<vector::CreateMaskOp>(loc, maskType, mixedSourceDims);
   return mlir::vector::maskOperation(builder, transferReadOp, mask)
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index 98e8f5079176c..b38d3bdedd52a 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -940,34 +940,113 @@ module attributes {transform.with_named_sequence} {
 ///----------------------------------------------------------------------------------------
 
 // CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack
-// CHECK-SAME:      %[[ARG_0:.*]]: tensor<?x?xf32>,
-func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
-// CHECK: %[[C0:.*]] = arith.constant 0
-// CHECK: %[[DIM:.*]] = tensor.dim %arg0, %[[C0]] : tensor<?x?xf32>
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
-// CHECK: %[[DIM0:.*]] = tensor.dim %arg0, %[[C1]] : tensor<?x?xf32>
-// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
-// CHECK: %[[C01:.*]] = arith.constant 0
-// CHECK: %[[C02:.*]] = arith.constant 0
-// CHECK: %[[DIM4:.*]] = tensor.dim %arg1, %[[C02]] : tensor<?x?x16x2xf32>
-// CHECK: %[[CNST14:.*]] = arith.constant 1
-// CHECK: %[[DIM6:.*]] = tensor.dim %arg1, %[[CNST14]] : tensor<?x?x16x2xf32>
-// CHECK: %[[CNST16:.*]] = arith.constant 16 : index
-// CHECK: %[[CNST2:.*]] = arith.constant 2 : index
-// CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
-// CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
-// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
-// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32>
-// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
-// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[ARG_0]]
-// CHECK: return %[[write0]]
-  %ret = linalg.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
-  return %ret : tensor<?x?xf32>
+// CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x16x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
+  // CHECK: %[[C0:.*]] = arith.constant 0
+  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
+  // CHECK: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
+  // CHECK: %[[C01:.*]] = arith.constant 0
+  // CHECK: %[[C02:.*]] = arith.constant 0
+  // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[CNST14:.*]] = arith.constant 1
+  // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[CNST14]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[CNST16:.*]] = arith.constant 16 : index
+  // CHECK: %[[CNST2:.*]] = arith.constant 2 : index
+  // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
+  // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
+  // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
+  // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x16xf32> to vector<4x16xf32>
+  // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
+  // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+  // CHECK: return %[[WRITE]]
+  %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+  return %ret : tensor<?x?xf32>
 }
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [2, 1, 16, 2, 4, 16] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack_scalable_vec
+// CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x16x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec(%dest: tensor<?x?xf32>, %src: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> {
+  // CHECK: %[[C0:.*]] = arith.constant 0
+  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
+  // CHECK: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
+  // CHECK: %[[C01:.*]] = arith.constant 0
+  // CHECK: %[[C02:.*]] = arith.constant 0
+  // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[CNST14:.*]] = arith.constant 1
+  // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[CNST14]] : tensor<?x?x16x2xf32>
+  // CHECK: %[[CNST16:.*]] = arith.constant 16 : index
+  // CHECK: %[[CNST2:.*]] = arith.constant 2 : index
+  // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x[16]x2xi1>
+  // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32>
+  // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32>
+  // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32>
+  // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1>
+  // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+  // CHECK: return %[[WRITE]]
+  %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
+  return %ret : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2, 4, [16]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size
+// CHECK-SAME:      %[[DEST:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:      %[[SRC:.*]]: tensor<?x?x?x2xf32>
+func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest: tensor<?x?xf32>, %src: tensor<?x?x?x2xf32>) -> tensor<?x?xf32> {
+  // CHECK: %[[C0:.*]] = arith.constant 0
+  // CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C0]] : tensor<?x?xf32>
+  // CHECK: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK: %[[DIM0:.*]] = tensor.dim %[[DEST]], %[[C1]] : tensor<?x?xf32>
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00
+  // CHECK: %[[C01:.*]] = arith.constant 0
+  // CHECK: %[[C02:.*]] = arith.constant 0
+  // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor<?x?x?x2xf32>
+  // CHECK: %[[C1_2:.*]] = arith.constant 1
+  // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1_2]] : tensor<?x?x?x2xf32>
+  // CHECK: %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: %[[DIM_2:.*]] = tensor.dim %[[SRC]], %[[C2]] : tensor<?x?x?x2xf32>
+  // CHECK: %[[C2_1:.*]] = arith.constant 2 : index
+  // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[DIM_2]], %[[C2_1]] : vector<2x1x[16]x2xi1>
+  // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x?x2xf32>, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32>
+  // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32>
+  // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32>
+  // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1>
+  // CHECK: %[[WRITE:.*]] = vector.mask %[[MASK_WRITE:.*]] {{.*}} vector.transfer_write %[[SC]], %[[DEST]]
+  // CHECK: return %[[WRITE]]
+
+  %vs = vector.vscale
+  %c16 = arith.constant 16 : index
+  %tile_size = arith.muli %vs, %c16 : index
+
+  %ret = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [%tile_size, 2] into %dest : tensor<?x?x?x2xf32> -> tensor<?x?xf32>
+  return %ret : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2, 4, [16]] : !transform.any_op
     transform.yield
   }
 }
@@ -1000,7 +1079,7 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [16, 8, 32, 16, 512, 128] : !transform.any_op
     transform.yield
   }
 }
@@ -1025,7 +1104,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16, 256, 128] : !transform.any_op
     transform.yield
   }
 }
@@ -1050,7 +1129,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [8, 8, 32, 16, 256, 128] : !transform.any_op
     transform.yield
  }
 }
@@ -1173,7 +1252,7 @@ module attributes {transform.with_named_sequence} {
 
 func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, [2]] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
 // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
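To make the updated calling convention concrete, here is a side-by-side of the `transform.structured.vectorize` usage before and after this patch, distilled from the `@test_vectorize_dynamic_shapes_unpack` changes above (a minimal sketch; `%0` stands for the matched `linalg.unpack` handle, and the shapes are those of that test):

```mlir
// Before: only the write-side sizes (destination rank 2) were supplied;
// the read-side sizes were inferred from static shapes and inner tiles.
transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op

// After: read-side sizes (source rank 4) followed by write-side sizes.
transform.structured.vectorize %0 vector_sizes [2, 1, 16, 2, 4, 16] : !transform.any_op

// Scalable variant: individual dimensions are marked scalable with [...].
transform.structured.vectorize %0 vector_sizes [2, 1, [16], 2, 4, [16]] : !transform.any_op
```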
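The read-size inference that `vectorizeAsTensorUnpackOp` retains as a static-shape fallback can be checked by hand against `@test_vectorize_unpack`. This is a sketch assuming that test's unpack uses `inner_dims_pos = [0, 1]` and `inner_tiles = [32, 16]` (the test body is elided from the hunk above); only the write sizes `[512, 128]` and the source shape `8x8x32x16` are taken from the diff:

```mlir
// Read sizes: divide each tiled write dimension by its inner tile size,
//   512 ceildiv 32 = 16,   128 ceildiv 16 = 8
// then append the remaining (tile) dims of the source: [16, 8] ++ [32, 16].
// The result, [16, 8, 32, 16], is the read portion of the updated test:
transform.structured.vectorize %0 vector_sizes [16, 8, 32, 16, 512, 128] : !transform.any_op
```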