https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124697
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Keywords| |missed-optimization
CC| |rguenth at gcc dot gnu.org
Target| |x86_64-*-*
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
But the alignment is required for
typedef double v4df __attribute__((vector_size(32)));
typedef struct {
double a[4];
} c __attribute__((aligned(32)));
extern v4df d;
void
e (float a1, float a2, float a3, float a4, float a5, float a6, c f)
{
d = *(v4df *)f.a;
}
when building with -O0 -mavx. We correctly generate:
e:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-32, %rsp
vmovss %xmm0, -4(%rsp)
vmovss %xmm1, -8(%rsp)
vmovss %xmm2, -12(%rsp)
vmovss %xmm3, -16(%rsp)
vmovss %xmm4, -20(%rsp)
vmovss %xmm5, -24(%rsp)
vmovdqu 16(%rbp), %ymm0
vmovdqa %ymm0, -64(%rsp)
leaq -64(%rsp), %rax
vmovapd (%rax), %ymm0
vmovapd %ymm0, d(%rip)
nop
leave
.cfi_def_cfa 7, 8
ret
With -O0 -msse2, only 16-byte alignment is needed (thus my idea to rely
on BIGGEST_ALIGNMENT).
You cannot rely on the fact that, for the simple testcase above, we are
able to elide the stack copy, as we do with -O2 -mavx2. Consider:
typedef double v4df __attribute__((vector_size(32)));
typedef struct {
double a[4];
} c __attribute__((aligned(32)));
extern v4df d;
void
e (float a1, float a2, float a3, float a4, float a5, float a6, c f, double *p)
{
d = *(v4df *)(p ? p : f.a);
}
which, with optimization, currently correctly generates:
e:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
andq $-32, %rsp
testq %rdi, %rdi
leaq -32(%rsp), %rax
cmovne %rdi, %rax
vmovdqu 16(%rbp), %ymm0
vmovdqa %ymm0, -32(%rsp)
vmovapd (%rax), %ymm0
vmovapd %ymm0, d(%rip)
vzeroupper
leave
.cfi_def_cfa 7, 8
ret