Issue |
83724
|
Summary |
[LoopVectorize] Missed optimization: fail to recognize `memset` pattern after vectorization
|
Labels |
new issue
|
Assignees |
|
Reporter |
XChy
|
Alive2 proof: https://alive2.llvm.org/ce/z/yK_USj (no unrolling, because verification is slow)
### Motivating example
For the source IR (similar to what `memset` does):
```llvm
define i1 @src(i64 %0, ptr %vla) {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%conv26 = phi i64 [ %conv, %for.body ], [ 0, %entry ]
%i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr i8, ptr %vla, i64 %conv26
store i8 0, ptr %arrayidx, align 1
%inc = add i32 %i.025, 1
%conv = zext i32 %inc to i64
%cmp = icmp ugt i64 %0, %conv
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.body
ret i1 false
}
```
With `opt -O3`, the loop is vectorized into the following (https://godbolt.org/z/6d5cxeh8K):
```llvm
define noundef i1 @src(i64 %0, ptr nocapture writeonly %vla) local_unnamed_addr #0 {
entry:
%umax1 = tail call i64 @llvm.umax.i64(i64 %0, i64 1)
%min.iters.check = icmp ult i64 %0, 12
br i1 %min.iters.check, label %for.body.preheader, label %vector.scevcheck
vector.scevcheck: ; preds = %entry
%1 = add i64 %0, -1
%2 = trunc i64 %1 to i32
%3 = icmp eq i32 %2, -1
%4 = icmp ugt i64 %1, 4294967295
%5 = or i1 %3, %4
br i1 %5, label %for.body.preheader, label %vector.ph
vector.ph: ; preds = %vector.scevcheck
%n.vec = and i64 %umax1, 8589934588
%ind.end = trunc i64 %n.vec to i32
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%6 = getelementptr i8, ptr %vla, i64 %index
store <4 x i8> zeroinitializer, ptr %6, align 1
%index.next = add nuw i64 %index, 4
%7 = icmp eq i64 %index.next, %n.vec
br i1 %7, label %middle.block, label %vector.body, !llvm.loop !0
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %umax1, %n.vec
br i1 %cmp.n, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %vector.scevcheck, %entry, %middle.block
%conv26.ph = phi i64 [ 0, %vector.scevcheck ], [ 0, %entry ], [ %n.vec, %middle.block ]
%i.025.ph = phi i32 [ 0, %vector.scevcheck ], [ 0, %entry ], [ %ind.end, %middle.block ]
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%conv26 = phi i64 [ %conv, %for.body ], [ %conv26.ph, %for.body.preheader ]
%i.025 = phi i32 [ %inc, %for.body ], [ %i.025.ph, %for.body.preheader ]
%arrayidx = getelementptr i8, ptr %vla, i64 %conv26
store i8 0, ptr %arrayidx, align 1
%inc = add i32 %i.025, 1
%conv = zext i32 %inc to i64
%cmp = icmp ult i64 %conv, %0
br i1 %cmp, label %for.body, label %for.end, !llvm.loop !3
for.end: ; preds = %for.body, %middle.block
ret i1 false
}
```
The `vector.body` loop is obviously equivalent to `@llvm.memset.p0.i64(ptr align 1 %vla, i8 0, i64 %n.vec, i1 false)`.
It seems to be a phase-ordering problem, since LoopIdiomRecognize runs before LoopVectorize.
### Real-world motivation
This snippet of IR is derived from [jemalloc/src/background_thread.c@background_threads_enable](https://github.com/jemalloc/jemalloc/blob/373884ab482ad1de4b839e40bd38fd154f324707/src/background_thread.c#L576) (after the O3 pipeline).
The example above is reduced from a much larger real-world IR module. If you're interested in the original suboptimal IR and the optimal IR, please email me.
**Let me know if you can confirm that it's an optimization opportunity, thanks.**
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs