Issue |
136574
|
Summary |
[X86] Unnecessary sequences of 8 GPR `mov`s back and forth
|
Labels |
new issue
|
Assignees |
|
Reporter |
dzaima
|
This C code (heavily reduced from real-world code implementing SIMD transpose), compiled via `-O3 -march=haswell`:
```c
#include <immintrin.h>
#include <stdint.h>
#define load(x) _mm256_loadu_si256((void *)(x))
#define store(x, v) _mm256_storeu_si256((void *)(x), v)
// Reduced reproducer: repeatedly loads 32-byte vectors from p2/p3 at offsets
// built from i, x and y, adds them pairwise, and stores two lane-permuted
// results into p1. The exact statement shape matters for reproducing the
// reported codegen, so it should not be "cleaned up".
//   p1      - destination base pointer (written via unaligned 256-bit stores)
//   p2, p3  - source base pointers (read via unaligned 256-bit loads)
//   x       - upper bound for the inner counter i; also the fallback value of j
//   y       - multiplier used in the load offsets
//   z       - multiplier used in the store offsets
// The outer loop has no exit, so this function never returns.
void f(char *p1, char *p2, char *p3, uint64_t x, uint64_t y, uint64_t z) {
while (1) {
// Each outer iteration restarts the inner counter at 0.
uint64_t i = 0;
while (1) {
// Sole exit from the inner loop; control then falls back into the outer loop.
if (i >= x)
break;
// j is 4*i, except when 4*i evaluates to zero (e.g. i == 0), where it is x.
uint64_t j = 4 * i ? 4 * i : x;
// The addresses of a, l0, l2 and l4 do not depend on i or j.
__m256i a = load(p2 + y * 5);
__m256i l0 = load(p2);
__m256i l1 = load(p2 + j + 3 * y);
// `+` on __m256i relies on the GCC/Clang vector extension (element-wise add).
__m256i b = l0 + l1;
__m256i l2 = load(p2 + y);
__m256i l3 = load(p3 + y + j);
__m256i c = l2 + l3;
__m256i l4 = load(p3 + 6 * y);
__m256i l5 = load(p2 + j + 7 * y);
__m256i d = l4 + l5;
// Immediate 49 (0x31) selects the upper 128-bit half of the first operand as
// the low half of the result and the upper 128-bit half of the second operand
// as the high half.
store(p1 + j * z + 16 * z, _mm256_permute2x128_si256(a, b, 49));
store(p1 + j * z, _mm256_permute2x128_si256(c, d, 49));
i++;
}
}
}
```
results in this segment of assembly:
```asm
...
mov rcx, rdi
mov rdi, r8
mov r8, r15
mov r15, r14
mov r14, r11
mov r11, r9
mov r9, rdx
mov rdx, r10
mov r10, qword ptr [rsp - 8]
vmovdqu ymm1, ymmword ptr [r10 + rbp]
mov r10, rdx
mov rdx, r9
mov r9, r11
mov r11, r14
mov r14, r15
mov r15, r8
mov r8, rdi
mov rdi, rcx
...
```
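which could be just the following, since the eight register-to-register moves before the load are exactly undone by the eight moves after it: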
```asm
mov rcx, qword ptr [rsp - 8]
vmovdqu ymm1, ymmword ptr [rcx + rbp]
```
https://godbolt.org/z/P66GdGzaM
Similar to #81391, but for GPRs, not SIMD registers (though SIMD is still involved).
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs