[Bug middle-end/124697] Unnecessary stack realignment

rguenth at gcc dot gnu.org via Gcc-bugs Tue, 31 Mar 2026 02:11:27 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124697


Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Keywords|                            |missed-optimization
                 CC|                            |rguenth at gcc dot gnu.org
             Target|                            |x86_64-*-*

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
But the alignment is required for

typedef double v4df __attribute__((vector_size(32)));
typedef struct {
  double a[4];
} c __attribute__((aligned(32)));
extern v4df d;
void
e (float a1, float a2, float a3, float a4, float a5, float a6, c f)
{
  d = *(v4df *)f.a;
}

when building with -O0 -mavx.  We correctly generate

e:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-32, %rsp
        vmovss  %xmm0, -4(%rsp)
        vmovss  %xmm1, -8(%rsp)
        vmovss  %xmm2, -12(%rsp)
        vmovss  %xmm3, -16(%rsp)
        vmovss  %xmm4, -20(%rsp)
        vmovss  %xmm5, -24(%rsp)
        vmovdqu 16(%rbp), %ymm0
        vmovdqa %ymm0, -64(%rsp)
        leaq    -64(%rsp), %rax
        vmovapd (%rax), %ymm0
        vmovapd %ymm0, d(%rip)
        nop
        leave
        .cfi_def_cfa 7, 8
        ret

With -O0 -msse2, only 16-byte alignment is needed (hence my idea to rely
on BIGGEST_ALIGNMENT).

You cannot rely on the fact that, for the simple testcase above, we are
able to elide the stack copy, as we do with -O2 -mavx2.  Consider:

typedef double v4df __attribute__((vector_size(32)));
typedef struct {
  double a[4];
} c __attribute__((aligned(32)));
extern v4df d;
void
e (float a1, float a2, float a3, float a4, float a5, float a6, c f, double *p)
{
  d = *(v4df *)(p ? p : f.a);
}

which, with optimization enabled, currently correctly generates

e:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-32, %rsp
        testq   %rdi, %rdi
        leaq    -32(%rsp), %rax
        cmovne  %rdi, %rax
        vmovdqu 16(%rbp), %ymm0
        vmovdqa %ymm0, -32(%rsp)
        vmovapd (%rax), %ymm0
        vmovapd %ymm0, d(%rip)
        vzeroupper
        leave
        .cfi_def_cfa 7, 8
        ret

[Bug middle-end/124697] Unnecessary stack realignment

Reply via email to