Issue |
131097
|
Summary |
Missed Optimization: Failing to coalesce multiple stores from `memcpy()`
|
Labels |
new issue
|
Assignees |
|
Reporter |
pan-0
|
Given:
```c
/* Baseline reproducer: packs x and y into a two-byte local array and copies
 * it to out with a plain memcpy(), so out must have room for 2 bytes. */
void f(uint8_t *out, uint8_t x, uint8_t y)
{
const uint8_t in[] = {x, y};
memcpy(out, in, sizeof in);
}
```
Clang generates:
```llvm
define dso_local void @f(ptr noundef writeonly captures(none) initializes((0, 2)) %out, i8 noundef zeroext %x, i8 noundef zeroext %y) local_unnamed_addr {
entry:
store i8 %x, ptr %out, align 1
%in.sroa.4.0..sroa_idx = getelementptr inbounds nuw i8, ptr %out, i64 1
store i8 %y, ptr %in.sroa.4.0..sroa_idx, align 1
ret void
}
```
With a curious workaround:
```c
/* Copies exactly 2 bytes from in to out via __builtin_memcpy_inline and
 * returns out. Both pointers are restrict-qualified, so the two regions
 * must not overlap. */
inline static void *memcpy2(void *restrict out, const void *restrict in)
{
__builtin_memcpy_inline(out, in, 2);
return out;
}
/* Dispatches a copy of exactly 2 bytes to memcpy2() and any other size to
 * memcpy(). Yields the destination pointer either way.
 * Note: size is evaluated twice when the memcpy() branch is taken, so do not
 * pass an expression with side effects. */
#define xmemcpy(out, in, size) \
((size) == 2 ? memcpy2((out), (in)) : memcpy((out), (in), (size)))
/* Workaround variant of f(): same two-byte local array, but copied through
 * xmemcpy(); sizeof in is 2 here, so the macro selects the memcpy2() branch. */
void g(uint8_t *out, uint8_t x, uint8_t y)
{
const uint8_t in[] = {x, y};
xmemcpy(out, in, sizeof in);
}
```
It generates:
```llvm
define dso_local void @g(ptr noundef writeonly captures(none) initializes((0, 2)) %out, i8 noundef zeroext %x, i8 noundef zeroext %y) local_unnamed_addr {
entry:
%in.sroa.4.0.insert.ext = zext i8 %y to i16
%in.sroa.4.0.insert.shift = shl nuw i16 %in.sroa.4.0.insert.ext, 8
%in.sroa.0.0.insert.ext = zext i8 %x to i16
%in.sroa.0.0.insert.insert = or disjoint i16 %in.sroa.4.0.insert.shift, %in.sroa.0.0.insert.ext
store i16 %in.sroa.0.0.insert.insert, ptr %out, align 1
ret void
}
```
Which affects codegen; `x86_64`:
```
f:
mov byte ptr [rdi], sil
mov byte ptr [rdi + 1], dl
ret
g:
shl edx, 8
or edx, esi
mov word ptr [rdi], dx
ret
```
`armv8-a`:
```
f:
strb w1, [x0]
strb w2, [x0, #1]
ret
g:
bfi w1, w2, #8, #24
strh w1, [x0]
ret
```
[Godbolt](https://godbolt.org/z/3rW7Maxrs).
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs