https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
Bug ID: 100257 Summary: poor codegen with vcvtph2ps / stride of 6 Product: gcc Version: 12.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: witold.baryluk+gcc at gmail dot com Target Milestone: --- gcc (Compiler-Explorer-Build) 12.0.0 20210424 (experimental) https://godbolt.org/z/n6ooMdnz8 This C code: ``` #include <stdint.h> #include <string.h> #include <immintrin.h> struct float3 { float f1; float f2; float f3; }; struct util_format_r16g16b16_float { uint16_t r; uint16_t g; uint16_t b; }; static inline struct float3 _mesa_half3_to_float3(uint16_t val_0, uint16_t val_1, uint16_t val_2) { #if defined(__F16C__) //const __m128i in = {val_0, val_1, val_2}; //__m128 out; //__asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in)); const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0); const __m128 out = _mm_cvtph_ps(in); const struct float3 r = {out[0], out[1], out[2]}; return r; #endif } void util_format_r16g16b16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width) { float *dst = dst_row; for (unsigned x = 0; x < width; x += 1) { const struct util_format_r16g16b16_float pixel; memcpy(&pixel, src, sizeof pixel); struct float3 r = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b); dst[0] = r.f1; /* r */ dst[1] = r.f2; /* g */ dst[2] = r.f3; /* b */ dst[3] = 1; /* a */ src += 6; dst += 4; } } ``` is compiled "poorly" by gcc, and even worse when compiled for i386 (with -mf16c enabled) using -fPIE.
Example: gcc -O3 -m32 -march=znver2 -mfpmath=sse -fPIE util_format_r16g16b16_float_unpack_rgba_float: push ebp push edi push esi push ebx sub esp, 28 mov ecx, DWORD PTR 56[esp] mov edx, DWORD PTR 48[esp] call __x86.get_pc_thunk.ax add eax, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_ mov ebx, DWORD PTR 52[esp] test ecx, ecx je .L8 vmovss xmm3, DWORD PTR .LC0@GOTOFF[eax] xor esi, esi xor ebp, ebp vpxor xmm2, xmm2, xmm2 .L3: mov eax, DWORD PTR [ebx] vmovss DWORD PTR 12[edx], xmm3 add ebx, 6 add edx, 16 inc esi mov ecx, eax vmovd xmm0, eax shr ecx, 16 mov edi, ecx movzx ecx, WORD PTR -2[ebx] vpinsrw xmm0, xmm0, edi, 1 vmovd xmm1, ecx vpinsrw xmm1, xmm1, ebp, 1 vpunpckldq xmm0, xmm0, xmm1 vpunpcklqdq xmm0, xmm0, xmm2 vcvtph2ps xmm0, xmm0 vmovss DWORD PTR -16[edx], xmm0 vextractps DWORD PTR -12[edx], xmm0, 1 vextractps DWORD PTR -8[edx], xmm0, 2 cmp DWORD PTR 56[esp], esi jne .L3 .L8: add esp, 28 pop ebx pop esi pop edi pop ebp ret .LC0: .long 1065353216 __x86.get_pc_thunk.ax: mov eax, DWORD PTR [esp] ret clang: util_format_r16g16b16_float_unpack_rgba_float: # @util_format_r16g16b16_float_unpack_rgba_float mov eax, dword ptr [esp + 12] test eax, eax je .LBB0_3 mov ecx, dword ptr [esp + 8] mov edx, dword ptr [esp + 4] .LBB0_2: # =>This Inner Loop Header: Depth=1 vmovd xmm0, dword ptr [ecx] # xmm0 = mem[0],zero,zero,zero vpinsrw xmm0, xmm0, word ptr [ecx + 4], 2 add ecx, 6 vcvtph2ps xmm0, xmm0 vmovss dword ptr [edx], xmm0 vextractps dword ptr [edx + 4], xmm0, 1 vextractps dword ptr [edx + 8], xmm0, 2 mov dword ptr [edx + 12], 1065353216 add edx, 16 dec eax jne .LBB0_2 .LBB0_3: ret The clang code is essentially optimal. The issue persists whether I use `vcvtph2ps` directly via asm or via intrinsics. The cause might be the src stride of 6, instead of 8, which may be confusing gcc.
Additionally, the constant 1065353216 (which is weird, I would expect it to be 0) [note: 1065353216 is 0x3F800000, the IEEE-754 single-precision bit pattern of 1.0f, i.e. the alpha value stored to dst[3]] is stored in the data section instead of being inlined as an immediate. This makes the code actually larger, and in PIE mode it requires extra pointer trickery and, on -m32, even a call to an extra function. Even without -fPIE, the main loop has poor codegen even on x86-64 / amd64 compared to clang or to what I would consider good code. gcc -m64 -O3 -march=native util_format_r16g16b16_float_unpack_rgba_float: test edx, edx je .L8 mov edx, edx sal rdx, 4 vmovss xmm3, DWORD PTR .LC0[rip] lea rcx, [rdi+rdx] xor r9d, r9d vpxor xmm2, xmm2, xmm2 .L3: mov eax, DWORD PTR [rsi] vmovss DWORD PTR 12[rdi], xmm3 mov edx, eax shr edx, 16 mov r8d, edx movzx edx, WORD PTR 4[rsi] vmovd xmm0, eax vmovd xmm1, edx vpinsrw xmm0, xmm0, r8d, 1 vpinsrw xmm1, xmm1, r9d, 1 vpunpckldq xmm0, xmm0, xmm1 vpunpcklqdq xmm0, xmm0, xmm2 vcvtph2ps xmm0, xmm0 add rdi, 16 vmovlps QWORD PTR -16[rdi], xmm0 vextractps DWORD PTR -8[rdi], xmm0, 2 add rsi, 6 cmp rdi, rcx jne .L3 .L8: ret .LC0: .long 1065353216 If you know what is going on, please rename this bug more accurately and reassign it to the proper component.