Issue |
124872
|
Summary |
SROA can make a mess out of a sequence of memcpys, leading to poor optimization
|
Labels |
new issue
|
Assignees |
|
Reporter |
gbaraldi
|
The difference seems to be in https://github.com/JuliaLang/llvm-project/blob/2a7ed2c1aaf5c84280d947eea56daaf302eb83d1/llvm/lib/Transforms/Scalar/SROA.cpp#L3481-L3513, namely whether we take the extract + trunc path or the copyload + insert path. LLVM can subsequently optimize the copyload + insert form much better. The only difference between the two functions below is whether I copy the whole alloca or just half of it. I thought this could have been caused by undef handling, but it looks like SROA thinks it is generating good code while actually making things worse.
```llvm
source_filename = "pad"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-darwin24.3.0"
; Reproducer input, variant named "good" by the reporter.
; Spills the i64 argument %1 into an 8-byte alloca, copies it one byte at a time
; into a 16-byte alloca, then copies only the first 8 bytes of that buffer to the
; sret pointer %0.
define void @good(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, i64 signext %1) local_unnamed_addr {
; 8-byte source buffer holding the i64 argument.
%3 = alloca [8 x i8], align 1
store i64 %1, ptr %3, align 1
; 16-byte destination buffer; only bytes 0-7 are written below.
%4 = alloca [16 x i8], align 1
; Pointers to bytes 1-7 of the source buffer (byte 0 is %3 itself).
%5 = getelementptr inbounds i8, ptr %3, i32 1
%6 = getelementptr inbounds i8, ptr %3, i32 2
%7 = getelementptr inbounds i8, ptr %3, i32 3
%8 = getelementptr inbounds i8, ptr %3, i32 4
%9 = getelementptr inbounds i8, ptr %3, i32 5
%10 = getelementptr inbounds i8, ptr %3, i32 6
%11 = getelementptr inbounds i8, ptr %3, i32 7
; Eight 1-byte memcpys: source byte i -> destination byte i, for i = 0..7.
%12 = getelementptr inbounds i8, ptr %4, i32 0
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %12, ptr align 1 %3, i64 1, i1 false)
%13 = getelementptr inbounds i8, ptr %4, i32 1
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %13, ptr align 1 %5, i64 1, i1 false)
%14 = getelementptr inbounds i8, ptr %4, i32 2
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %14, ptr align 1 %6, i64 1, i1 false)
%15 = getelementptr inbounds i8, ptr %4, i32 3
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %15, ptr align 1 %7, i64 1, i1 false)
%16 = getelementptr inbounds i8, ptr %4, i32 4
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %16, ptr align 1 %8, i64 1, i1 false)
%17 = getelementptr inbounds i8, ptr %4, i32 5
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %17, ptr align 1 %9, i64 1, i1 false)
%18 = getelementptr inbounds i8, ptr %4, i32 6
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %18, ptr align 1 %10, i64 1, i1 false)
%19 = getelementptr inbounds i8, ptr %4, i32 7
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %19, ptr align 1 %11, i64 1, i1 false)
; Final copy to the sret pointer covers only 8 bytes, i.e. half of the 16-byte alloca.
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %0, ptr align 1 %4, i64 8, i1 false)
ret void
}
; Reproducer input, variant named "bad" by the reporter.
; Spills the i64 argument %1 into an 8-byte alloca, copies it one byte at a time
; into a 16-byte alloca, then copies all 16 bytes of that buffer to the sret
; pointer %0.
define void @bad(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, i64 signext %1) local_unnamed_addr {
; 8-byte source buffer holding the i64 argument.
%3 = alloca [8 x i8], align 1
store i64 %1, ptr %3, align 1
; 16-byte destination buffer; only bytes 0-7 are written below.
%4 = alloca [16 x i8], align 1
; Pointers to bytes 1-7 of the source buffer (byte 0 is %3 itself).
%5 = getelementptr inbounds i8, ptr %3, i32 1
%6 = getelementptr inbounds i8, ptr %3, i32 2
%7 = getelementptr inbounds i8, ptr %3, i32 3
%8 = getelementptr inbounds i8, ptr %3, i32 4
%9 = getelementptr inbounds i8, ptr %3, i32 5
%10 = getelementptr inbounds i8, ptr %3, i32 6
%11 = getelementptr inbounds i8, ptr %3, i32 7
; Eight 1-byte memcpys: source byte i -> destination byte i, for i = 0..7.
%12 = getelementptr inbounds i8, ptr %4, i32 0
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %12, ptr align 1 %3, i64 1, i1 false)
%13 = getelementptr inbounds i8, ptr %4, i32 1
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %13, ptr align 1 %5, i64 1, i1 false)
%14 = getelementptr inbounds i8, ptr %4, i32 2
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %14, ptr align 1 %6, i64 1, i1 false)
%15 = getelementptr inbounds i8, ptr %4, i32 3
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %15, ptr align 1 %7, i64 1, i1 false)
%16 = getelementptr inbounds i8, ptr %4, i32 4
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %16, ptr align 1 %8, i64 1, i1 false)
%17 = getelementptr inbounds i8, ptr %4, i32 5
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %17, ptr align 1 %9, i64 1, i1 false)
%18 = getelementptr inbounds i8, ptr %4, i32 6
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %18, ptr align 1 %10, i64 1, i1 false)
%19 = getelementptr inbounds i8, ptr %4, i32 7
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %19, ptr align 1 %11, i64 1, i1 false)
; Final copy to the sret pointer covers the whole 16-byte alloca; bytes 8-15 of
; the alloca are never written in this function.
call void @llvm.memcpy.p0.p0.i64(ptr align 1 %0, ptr align 1 %4, i64 16, i1 false)
ret void
}
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0
attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
```
Output
```llvm
source_filename = "pad"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-darwin24.3.0"
; SROA output for @good (annotations added; instructions are verbatim).
; Both allocas and all memcpys are gone: each byte of %1 is extracted, the bytes
; are re-inserted into a single i64, and that i64 is written to %0 with one store.
define void @good(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, i64 signext %1) local_unnamed_addr {
; Extract phase: byte k of %1 is obtained with lshr by 8*k followed by trunc to i8.
%.sroa.02.0.extract.trunc = trunc i64 %1 to i8
%.sroa.2.0.extract.shift = lshr i64 %1, 8
%.sroa.2.0.extract.trunc = trunc i64 %.sroa.2.0.extract.shift to i8
%.sroa.3.0.extract.shift = lshr i64 %1, 16
%.sroa.3.0.extract.trunc = trunc i64 %.sroa.3.0.extract.shift to i8
%.sroa.4.0.extract.shift = lshr i64 %1, 24
%.sroa.4.0.extract.trunc = trunc i64 %.sroa.4.0.extract.shift to i8
%.sroa.5.0.extract.shift = lshr i64 %1, 32
%.sroa.5.0.extract.trunc = trunc i64 %.sroa.5.0.extract.shift to i8
%.sroa.6.0.extract.shift = lshr i64 %1, 40
%.sroa.6.0.extract.trunc = trunc i64 %.sroa.6.0.extract.shift to i8
%.sroa.7.0.extract.shift = lshr i64 %1, 48
%.sroa.7.0.extract.trunc = trunc i64 %.sroa.7.0.extract.shift to i8
%.sroa.8.0.extract.shift = lshr i64 %1, 56
%.sroa.8.0.extract.trunc = trunc i64 %.sroa.8.0.extract.shift to i8
; Insert phase: starting from an undef i64, each byte is zero-extended, shifted
; into position, and merged in with and (clear the target byte) + or.
%.sroa.0.0.insert.ext = zext i8 %.sroa.02.0.extract.trunc to i64
%.sroa.0.0.insert.mask = and i64 undef, -256
%.sroa.0.0.insert.insert = or i64 %.sroa.0.0.insert.mask, %.sroa.0.0.insert.ext
%.sroa.0.1.insert.ext = zext i8 %.sroa.2.0.extract.trunc to i64
%.sroa.0.1.insert.shift = shl i64 %.sroa.0.1.insert.ext, 8
%.sroa.0.1.insert.mask = and i64 %.sroa.0.0.insert.insert, -65281
%.sroa.0.1.insert.insert = or i64 %.sroa.0.1.insert.mask, %.sroa.0.1.insert.shift
%.sroa.0.2.insert.ext = zext i8 %.sroa.3.0.extract.trunc to i64
%.sroa.0.2.insert.shift = shl i64 %.sroa.0.2.insert.ext, 16
%.sroa.0.2.insert.mask = and i64 %.sroa.0.1.insert.insert, -16711681
%.sroa.0.2.insert.insert = or i64 %.sroa.0.2.insert.mask, %.sroa.0.2.insert.shift
%.sroa.0.3.insert.ext = zext i8 %.sroa.4.0.extract.trunc to i64
%.sroa.0.3.insert.shift = shl i64 %.sroa.0.3.insert.ext, 24
%.sroa.0.3.insert.mask = and i64 %.sroa.0.2.insert.insert, -4278190081
%.sroa.0.3.insert.insert = or i64 %.sroa.0.3.insert.mask, %.sroa.0.3.insert.shift
%.sroa.0.4.insert.ext = zext i8 %.sroa.5.0.extract.trunc to i64
%.sroa.0.4.insert.shift = shl i64 %.sroa.0.4.insert.ext, 32
%.sroa.0.4.insert.mask = and i64 %.sroa.0.3.insert.insert, -1095216660481
%.sroa.0.4.insert.insert = or i64 %.sroa.0.4.insert.mask, %.sroa.0.4.insert.shift
%.sroa.0.5.insert.ext = zext i8 %.sroa.6.0.extract.trunc to i64
%.sroa.0.5.insert.shift = shl i64 %.sroa.0.5.insert.ext, 40
%.sroa.0.5.insert.mask = and i64 %.sroa.0.4.insert.insert, -280375465082881
%.sroa.0.5.insert.insert = or i64 %.sroa.0.5.insert.mask, %.sroa.0.5.insert.shift
%.sroa.0.6.insert.ext = zext i8 %.sroa.7.0.extract.trunc to i64
%.sroa.0.6.insert.shift = shl i64 %.sroa.0.6.insert.ext, 48
%.sroa.0.6.insert.mask = and i64 %.sroa.0.5.insert.insert, -71776119061217281
%.sroa.0.6.insert.insert = or i64 %.sroa.0.6.insert.mask, %.sroa.0.6.insert.shift
%.sroa.0.7.insert.ext = zext i8 %.sroa.8.0.extract.trunc to i64
%.sroa.0.7.insert.shift = shl i64 %.sroa.0.7.insert.ext, 56
%.sroa.0.7.insert.mask = and i64 %.sroa.0.6.insert.insert, 72057594037927935
%.sroa.0.7.insert.insert = or i64 %.sroa.0.7.insert.mask, %.sroa.0.7.insert.shift
; Single 8-byte store of the reassembled value to the sret pointer.
store i64 %.sroa.0.7.insert.insert, ptr %0, align 1
ret void
}
; SROA output for @bad (annotations added; instructions are verbatim).
; Both allocas and all memcpys are gone, but the bytes of %1 are never recombined:
; each one is extracted and written to %0 with its own i8 store, followed by an
; i64 store of undef at offset 8.
define void @bad(ptr noalias nocapture noundef nonnull sret([16 x i8]) align 1 dereferenceable(16) %0, i64 signext %1) local_unnamed_addr {
; Extract phase: byte k of %1 is obtained with lshr by 8*k followed by trunc to i8.
%.sroa.0.0.extract.trunc = trunc i64 %1 to i8
%.sroa.0.1.extract.shift = lshr i64 %1, 8
%.sroa.0.1.extract.trunc = trunc i64 %.sroa.0.1.extract.shift to i8
%.sroa.0.2.extract.shift = lshr i64 %1, 16
%.sroa.0.2.extract.trunc = trunc i64 %.sroa.0.2.extract.shift to i8
%.sroa.0.3.extract.shift = lshr i64 %1, 24
%.sroa.0.3.extract.trunc = trunc i64 %.sroa.0.3.extract.shift to i8
%.sroa.0.4.extract.shift = lshr i64 %1, 32
%.sroa.0.4.extract.trunc = trunc i64 %.sroa.0.4.extract.shift to i8
%.sroa.0.5.extract.shift = lshr i64 %1, 40
%.sroa.0.5.extract.trunc = trunc i64 %.sroa.0.5.extract.shift to i8
%.sroa.0.6.extract.shift = lshr i64 %1, 48
%.sroa.0.6.extract.trunc = trunc i64 %.sroa.0.6.extract.shift to i8
%.sroa.0.7.extract.shift = lshr i64 %1, 56
%.sroa.0.7.extract.trunc = trunc i64 %.sroa.0.7.extract.shift to i8
; Store phase: eight separate 1-byte stores to %0 at offsets 0..7.
store i8 %.sroa.0.0.extract.trunc, ptr %0, align 1
%.sroa.2.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 1
store i8 %.sroa.0.1.extract.trunc, ptr %.sroa.2.0..sroa_idx, align 1
%.sroa.3.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 2
store i8 %.sroa.0.2.extract.trunc, ptr %.sroa.3.0..sroa_idx, align 1
%.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 3
store i8 %.sroa.0.3.extract.trunc, ptr %.sroa.4.0..sroa_idx, align 1
%.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 4
store i8 %.sroa.0.4.extract.trunc, ptr %.sroa.5.0..sroa_idx, align 1
%.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 5
store i8 %.sroa.0.5.extract.trunc, ptr %.sroa.6.0..sroa_idx, align 1
%.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 6
store i8 %.sroa.0.6.extract.trunc, ptr %.sroa.7.0..sroa_idx, align 1
%.sroa.8.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 7
store i8 %.sroa.0.7.extract.trunc, ptr %.sroa.8.0..sroa_idx, align 1
; Bytes 8-15 of the destination receive an undef i64.
%.sroa.9.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 8
store i64 undef, ptr %.sroa.9.0..sroa_idx, align 1
ret void
}
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0
attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
```
SROA Output https://godbolt.org/z/e3hs5qcfr
O1 Output https://godbolt.org/z/ExbTxx1ra
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs