Issue |
120733
|
Summary |
[mlir] tiling: Invalid slice when #map(0) != 0
|
Labels |
mlir:linalg,
mlir
|
Assignees |
|
Reporter |
mgehre-amd
|
In the [reproducer](https://godbolt.org/z/fhen1svYd),
```
func.func @test(%arg0 : tensor<9xf32>) -> tensor<6xf32> {
%empty = tensor.empty() : tensor<6xf32>
%generic = linalg.generic
{indexing_maps = [affine_map<(d0) -> (d0 + 3)>,
affine_map<(d0) -> (d0)>],
iterator_types = ["parallel"]} ins(%arg0: tensor<9xf32>) outs(%empty : tensor<6xf32>) {
^bb0(%in : f32, %out: f32):
linalg.yield %in : f32
} -> tensor<6xf32>
return %generic : tensor<6xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%1, %loop = transform.structured.tile_using_for %0 tile_sizes [3] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.yield
}
}
```
becomes
```
#map = affine_map<(d0) -> (d0 + 3)>
#map1 = affine_map<(d0) -> (d0)>
module {
func.func @test(%arg0: tensor<9xf32>) -> tensor<6xf32> {
%0 = tensor.empty() : tensor<6xf32>
%c0 = arith.constant 0 : index
%c6 = arith.constant 6 : index
%c3 = arith.constant 3 : index
%1 = scf.for %arg1 = %c0 to %c6 step %c3 iter_args(%arg2 = %0) -> (tensor<6xf32>) {
%2 = affine.apply #map(%arg1)
%extracted_slice = tensor.extract_slice %arg0[%2] [6] [1] : tensor<9xf32> to tensor<6xf32>
%extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [3] [1] : tensor<6xf32> to tensor<3xf32>
%3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel"]} ins(%extracted_slice : tensor<6xf32>) outs(%extracted_slice_0 : tensor<3xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<3xf32>
%inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [3] [1] : tensor<3xf32> into tensor<6xf32>
scf.yield %inserted_slice : tensor<6xf32>
}
return %1 : tensor<6xf32>
}
```
This accesses out-of-bounds. `%2 = affine.apply #map(%arg1)` is `%arg1 + 3` (which takes the values `3` and `6` for the two loop iterations),
and `%extracted_slice = tensor.extract_slice %arg0[%2] [6] [1] : tensor<9xf32> to tensor<6xf32>` will extract `6` elements from that offset,
which tries to extract elements `6` through `11` in the second iteration - but the tensor only has 9 elements (valid indices `0` through `8`).
It seems that the implementation computing the slices is only correct when `#map(0) = 0`. The correct offset for the extract slice would be `#map(%arg1) - #map(0)`, which here is `%arg1`.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs