Issue 83724
Summary [LoopVectorize] Missed optimization: fail to recognize `memset` pattern after vectorization
Labels new issue
Assignees
Reporter XChy
    Alive2 proof: https://alive2.llvm.org/ce/z/yK_USj (No unrolling due to the slow verification)

### Motivating example 

For the source IR (similar to what `memset` does):
```llvm
; Reduced test case: a byte-at-a-time zeroing loop over the buffer %vla.
;   %0   - unsigned bound compared against the zero-extended i32 counter.
;   %vla - base pointer of the buffer being written.
; The exit test sits at the bottom of the loop, so the store runs at least once.
; Always returns false.
define i1 @src(i64 %0, ptr %vla) {
entry:
  br label %for.body

for.body: ; preds = %for.body, %entry
  ; %conv26 is the i64 byte offset (zero-extended counter); %i.025 is the i32 counter.
  %conv26 = phi i64 [ %conv, %for.body ], [ 0, %entry ]
  %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  ; Write a single zero byte at %vla + %conv26.
  %arrayidx = getelementptr i8, ptr %vla, i64 %conv26
  store i8 0, ptr %arrayidx, align 1
  ; The increment is done in i32 without nuw/nsw, so it may wrap before being widened.
  %inc = add i32 %i.025, 1
  %conv = zext i32 %inc to i64
  ; Keep looping while %0 is unsigned-greater-than the widened counter.
  %cmp = icmp ugt i64 %0, %conv
  br i1 %cmp, label %for.body, label %for.end

for.end: ; preds = %for.body
  ret i1 false
}
```

With `opt -O3`, we vectorize it to https://godbolt.org/z/6d5cxeh8K:
```llvm
; Output of `opt -O3` for the reduced test case: the byte-zeroing loop has been
; split into a 4-bytes-per-iteration vector loop (vector.body) and the original
; scalar loop (for.body), which also serves as the fallback and remainder loop.
; Always returns false.
define noundef i1 @src(i64 %0, ptr nocapture writeonly %vla) local_unnamed_addr #0 {
entry:
  ; %umax1 = max(%0, 1); it is later rounded down to form %n.vec and compared
  ; against %n.vec in middle.block.
  %umax1 = tail call i64 @llvm.umax.i64(i64 %0, i64 1)
  ; Bounds below 12 skip the vector path and go straight to the scalar loop.
 %min.iters.check = icmp ult i64 %0, 12
  br i1 %min.iters.check, label %for.body.preheader, label %vector.scevcheck

vector.scevcheck: ; preds = %entry
  ; Runtime check on %0 - 1: fall back to the scalar loop if its low 32 bits are
  ; all ones (%3) or if it exceeds 4294967295 = 2^32 - 1 (%4).
  ; NOTE(review): presumably this guards against the i32 counter wrapping - confirm.
  %1 = add i64 %0, -1
  %2 = trunc i64 %1 to i32
  %3 = icmp eq i32 %2, -1
  %4 = icmp ugt i64 %1, 4294967295
  %5 = or i1 %3, %4
  br i1 %5, label %for.body.preheader, label %vector.ph

vector.ph:                                        ; preds = %vector.scevcheck
  ; 8589934588 = 0x1FFFFFFFC: clears the low two bits, i.e. rounds %umax1 down to
  ; a multiple of 4. %n.vec is the number of bytes the vector loop will write.
  %n.vec = and i64 %umax1, 8589934588
  ; %ind.end is the i32 counter value at which the scalar loop resumes.
 %ind.end = trunc i64 %n.vec to i32
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  ; Stores four zero bytes at %vla + %index and advances %index by 4 until it
  ; reaches %n.vec. This is the loop the report describes as equivalent to a
  ; memset of %n.vec bytes.
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = getelementptr i8, ptr %vla, i64 %index
  store <4 x i8> zeroinitializer, ptr %6, align 1
  %index.next = add nuw i64 %index, 4
  %7 = icmp eq i64 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body, !llvm.loop !0

middle.block:                                     ; preds = %vector.body
  ; If rounding down removed nothing, all bytes are written; otherwise run the
  ; scalar loop for the remainder.
  %cmp.n = icmp eq i64 %umax1, %n.vec
  br i1 %cmp.n, label %for.end, label %for.body.preheader

for.body.preheader: ; preds = %vector.scevcheck, %entry, %middle.block
  ; Starting offset/counter for the scalar loop: 0 when the vector loop was
  ; skipped, or %n.vec / %ind.end when continuing after it.
 %conv26.ph = phi i64 [ 0, %vector.scevcheck ], [ 0, %entry ], [ %n.vec, %middle.block ]
  %i.025.ph = phi i32 [ 0, %vector.scevcheck ], [ 0, %entry ], [ %ind.end, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  ; Scalar loop: one zero byte per iteration at %vla + %conv26, with an i32
  ; counter that is zero-extended and compared (unsigned) against %0.
  %conv26 = phi i64 [ %conv, %for.body ], [ %conv26.ph, %for.body.preheader ]
  %i.025 = phi i32 [ %inc, %for.body ], [ %i.025.ph, %for.body.preheader ]
  %arrayidx = getelementptr i8, ptr %vla, i64 %conv26
  store i8 0, ptr %arrayidx, align 1
  %inc = add i32 %i.025, 1
  %conv = zext i32 %inc to i64
  %cmp = icmp ult i64 %conv, %0
  br i1 %cmp, label %for.body, label %for.end, !llvm.loop !3

for.end:                                          ; preds = %for.body, %middle.block
  ret i1 false
}
```

The `vector.body` loop is obviously equivalent to `@llvm.memset.p0.i64(ptr align 1 %vla, i8 0, i64 %n.vec, i1 false)`.
It seems to be a phase-ordering problem, since LoopIdiomRecognize runs before LoopVectorize.

### Real-world motivation

This snippet of IR is derived from [jemalloc/src/background_thread.c@background_threads_enable](https://github.com/jemalloc/jemalloc/blob/373884ab482ad1de4b839e40bd38fd154f324707/src/background_thread.c#L576) (after O3 pipeline).
The example above is reduced from a large piece of real-world IR. If you're interested in the original suboptimal IR and the optimal IR, please email me.

**Let me know if you can confirm that it's an optimization opportunity, thanks.**
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to