https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97428
/*
 * --- Comment #5 from Michael_S <already5chosen at yahoo dot com> ---
 * (In reply to Richard Biener from comment #4)
 * > I have a fix that, with -mavx512f generates just
 * >
 * > .L3:
 * >         vmovupd (%rcx,%rax), %zmm0
 * >         vpermpd (%rsi,%rax), %zmm1, %zmm2
 * >         vpermpd %zmm0, %zmm1, %zmm0
 * >         vmovupd %zmm2, (%rdi,%rax,2)
 * >         vmovupd %zmm0, 64(%rdi,%rax,2)
 * >         addq    $64, %rax
 * >         cmpq    %rax, %rdx
 * >         jne     .L3
 * >
 *
 * This particular kernel on AVX512 is less interesting, because under AVX512
 * a natural AoSoA layout is different.
 */

/* Number of complex values packed into one dcmlx8_t block. */
enum { DCMLX_LANES = 8 };

/* One complex number stored as an interleaved (re, im) pair. */
typedef struct { double re, im; } dcmlx_t;

/* DCMLX_LANES complex numbers stored as all real parts, then all imaginary parts. */
typedef struct { double re[DCMLX_LANES], im[DCMLX_LANES]; } dcmlx8_t;

/* Read DCMLX_LANES consecutive interleaved values from `in` into `tmp`. */
static void dcmlx_load_group(dcmlx_t tmp[], const dcmlx_t in[])
{
    for (int k = 0; k < DCMLX_LANES; ++k) {
        tmp[k] = in[k];
    }
}

/* Write `tmp` into one split block: every real part first, then every imaginary part. */
static void dcmlx_store_group(dcmlx8_t *out, const dcmlx_t tmp[])
{
    for (int k = 0; k < DCMLX_LANES; ++k) {
        out->re[k] = tmp[k].re;
    }
    for (int k = 0; k < DCMLX_LANES; ++k) {
        out->im[k] = tmp[k].im;
    }
}

/*
 * Convert interleaved complex values into split blocks of DCMLX_LANES.
 *
 * dst  receives n blocks; dst[i] holds src[i*8 .. i*8+7].
 * src  must provide at least 8*n elements.
 * n    number of blocks; nothing is read or written when n <= 0.
 *
 * Each iteration reads its whole group of inputs before writing anything,
 * and the cursors are advanced as pointers so that no element index is ever
 * computed in `int` (the index i*8+7 would overflow for large n).
 */
void foo512(dcmlx8_t dst[], const dcmlx_t src[], int n)
{
    const dcmlx_t *in = src;
    dcmlx8_t *out = dst;

    for (int i = 0; i < n; ++i, in += DCMLX_LANES, ++out) {
        dcmlx_t tmp[DCMLX_LANES];

        dcmlx_load_group(tmp, in);
        dcmlx_store_group(out, tmp);
    }
}

/*
 * And, respectively:
 *
 * Same conversion for two input planes stored back to back in src, with the
 * resulting blocks interleaved in dst.
 *
 * dst  receives 2*n blocks; dst[2*i] holds src[i*8 .. i*8+7] and
 *      dst[2*i+1] holds src[n*8+i*8 .. n*8+i*8+7].
 * src  must provide at least 16*n elements (plane 0, then plane 1).
 * n    number of blocks per plane; nothing is read or written when n <= 0.
 *
 * Both groups of an iteration are read before either block is written.
 * The offset of the second plane is widened to long long before the
 * multiplication so that n*8 cannot overflow `int`.
 */
void foo512_i2(dcmlx8_t dst[], const dcmlx_t src[], int n)
{
    if (n <= 0) {
        return; /* also avoids forming src + n*8 for a negative n */
    }

    const dcmlx_t *plane0 = src;
    const dcmlx_t *plane1 = src + (long long)n * DCMLX_LANES;
    dcmlx8_t *out = dst;

    for (int i = 0; i < n; ++i) {
        dcmlx_t tmp0[DCMLX_LANES];
        dcmlx_t tmp1[DCMLX_LANES];

        dcmlx_load_group(tmp0, plane0);
        dcmlx_load_group(tmp1, plane1);

        dcmlx_store_group(&out[0], tmp0);
        dcmlx_store_group(&out[1], tmp1);

        plane0 += DCMLX_LANES;
        plane1 += DCMLX_LANES;
        out += 2;
    }
}