Issue 136574
Summary [X86] Unnecessary sequences of 8 GPR `mov`s back and forth
Labels new issue
Assignees
Reporter dzaima
    This C code (heavily reduced from real-world code implementing SIMD transpose), compiled via `-O3 -march=haswell`:
```c
#include <immintrin.h>
#include <stdint.h>
// Convenience wrappers around the AVX unaligned 256-bit load/store intrinsics.
// The (void *) cast lets call sites pass plain `char *` pointer arithmetic
// without spelling out a vector-pointer cast each time.
#define load(x) _mm256_loadu_si256((void *)(x))
#define store(x, v) _mm256_storeu_si256((void *)(x), v)
// Reduced reproducer for the codegen issue described in this report; it is not
// meant to compute anything useful on its own.
//
// p1      - base address written by the two stores in the inner loop
// p2, p3  - base addresses read by the seven vector loads
// x       - inner-loop trip count (i runs over [0, x))
// y       - byte stride used to form the load addresses
// z       - byte stride used to form the store addresses
//
// The outer loop has no exit, so the function never returns; it simply
// re-runs the inner loop from i = 0 each time the inner loop breaks.
void f(char *p1, char *p2, char *p3, uint64_t x, uint64_t y, uint64_t z) {
  while (1) {
    uint64_t i = 0;
 while (1) {
      if (i >= x)
        break;
      // j is 4*i, except that a zero product (e.g. i == 0) is replaced by x.
      uint64_t j = 4 * i ? 4 * i : x;
      __m256i a = load(p2 + y * 5);

      // `+` on __m256i relies on the compiler's vector-extension support.
      __m256i l0 = load(p2);
      __m256i l1 = load(p2 + j + 3 * y);
      __m256i b = l0 + l1;

      __m256i l2 = load(p2 + y);
      __m256i l3 = load(p3 + y + j);
      __m256i c = l2 + l3;

      __m256i l4 = load(p3 + 6 * y);
 __m256i l5 = load(p2 + j + 7 * y);
      __m256i d = l4 + l5;

      // Immediate 49 (0x31): the result's low 128 bits are the high half of
      // the first operand and its high 128 bits are the high half of the
      // second operand.
 store(p1 + j * z + 16 * z, _mm256_permute2x128_si256(a, b, 49));
 store(p1 + j * z, _mm256_permute2x128_si256(c, d, 49));

      i++;
 }
  }
}
```

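results in this segment of assembly, in which eight register-to-register moves shift values along a chain of GPRs to free up `r10` for a single load, and eight more moves then shift everything back: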
```asm
        ...
 mov     rcx, rdi                        # start of outbound chain: each value moves one register along
        mov     rdi, r8
        mov     r8, r15
 mov     r15, r14
        mov     r14, r11
        mov     r11, r9
 mov     r9, rdx
        mov     rdx, r10                 # old r10 parked in rdx; r10 is now free
        mov     r10, qword ptr [rsp - 8] # reload a spilled value from the stack into r10
        vmovdqu ymm1, ymmword ptr [r10 + rbp] # the only use of the reloaded r10
        mov r10, rdx                     # start of return chain: undo the moves in reverse order
        mov     rdx, r9
        mov     r9, r11
        mov r11, r14
        mov     r14, r15
        mov     r15, r8
        mov r8, rdi
        mov     rdi, rcx                 # rdi restored; rcx was only used as the temporary
        ...
```
which could be just:
```asm
        mov     rcx, qword ptr [rsp - 8]
        vmovdqu ymm1, ymmword ptr [rcx + rbp]
```

https://godbolt.org/z/P66GdGzaM

Similar to #81391, but for GPRs, not SIMD registers (though SIMD is still involved).
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to