https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95125
Bug ID: 95125
Summary: Unoptimal code for vectorized conversions
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: ubizjak at gmail dot com
Target Milestone: ---

The following testcase

--cut here--
float f[4];
double d[4];
int i[4];

void
float_truncate (void)
{
  for (int n = 0; n < 4; n++)
    f[n] = d[n];
}

void
float_extend (void)
{
  for (int n = 0; n < 4; n++)
    d[n] = f[n];
}

void
float_float (void)
{
  for (int n = 0; n < 4; n++)
    f[n] = i[n];
}

void
fix_float (void)
{
  for (int n = 0; n < 4; n++)
    i[n] = f[n];
}

void
float_double (void)
{
  for (int n = 0; n < 4; n++)
    d[n] = i[n];
}

void
fix_double (void)
{
  for (int n = 0; n < 4; n++)
    i[n] = d[n];
}
--cut here--

when compiled with "-O3 -mavx" should result in a single conversion instruction for each function. Instead, GCC currently emits the following:

float_truncate:
        vxorps          %xmm0, %xmm0, %xmm0
        vcvtsd2ss       d+8(%rip), %xmm0, %xmm2
        vmovaps         %xmm2, %xmm3
        vcvtsd2ss       d(%rip), %xmm0, %xmm1
        vcvtsd2ss       d+16(%rip), %xmm0, %xmm2
        vcvtsd2ss       d+24(%rip), %xmm0, %xmm0
        vunpcklps       %xmm0, %xmm2, %xmm2
        vunpcklps       %xmm3, %xmm1, %xmm0
        vmovlhps        %xmm2, %xmm0, %xmm0
        vmovaps         %xmm0, f(%rip)
        ret

float_extend:
        vcvtps2pd       f(%rip), %xmm0
        vmovapd         %xmm0, d(%rip)
        vxorps          %xmm0, %xmm0, %xmm0
        vmovlps         f+8(%rip), %xmm0, %xmm0
        vcvtps2pd       %xmm0, %xmm0
        vmovapd         %xmm0, d+16(%rip)
        ret

float_float:
        vcvtdq2ps       i(%rip), %xmm0
        vmovaps         %xmm0, f(%rip)
        ret

fix_float:
        vcvttps2dq      f(%rip), %xmm0
        vmovdqa         %xmm0, i(%rip)
        ret

float_double:
        vcvtdq2pd       i(%rip), %xmm0
        vmovapd         %xmm0, d(%rip)
        vpshufd         $238, i(%rip), %xmm0
        vcvtdq2pd       %xmm0, %xmm0
        vmovapd         %xmm0, d+16(%rip)
        ret

fix_double:
        pushq           %rbp
        vmovapd         d(%rip), %xmm1
        vinsertf128     $0x1, d+16(%rip), %ymm1, %ymm0
        movq            %rsp, %rbp
        vcvttpd2dqy     %ymm0, %xmm0
        vmovdqa         %xmm0, i(%rip)
        vzeroupper
        popq            %rbp
        ret

Clang manages to emit optimal code.