https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
Bug ID: 100257
Summary: poor codegen with vcvtph2ps / stride of 6
Product: gcc
Version: 12.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: witold.baryluk+gcc at gmail dot com
Target Milestone: ---
gcc (Compiler-Explorer-Build) 12.0.0 20210424 (experimental)
https://godbolt.org/z/n6ooMdnz8
This C code:
```
#include <stdint.h>
#include <string.h>
#include <immintrin.h>
struct float3 {
float f1;
float f2;
float f3;
};
struct util_format_r16g16b16_float {
uint16_t r;
uint16_t g;
uint16_t b;
};
static inline struct float3 _mesa_half3_to_float3(uint16_t val_0, uint16_t
val_1, uint16_t val_2) {
#if defined(__F16C__)
//const __m128i in = {val_0, val_1, val_2};
//__m128 out;
//__asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in));
const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0);
const __m128 out = _mm_cvtph_ps(in);
const struct float3 r = {out[0], out[1], out[2]};
return r;
#endif
}
void
util_format_r16g16b16_float_unpack_rgba_float(void *restrict dst_row, const
uint8_t *restrict src, unsigned width)
{
float *dst = dst_row;
for (unsigned x = 0; x < width; x += 1) {
const struct util_format_r16g16b16_float pixel;
memcpy(&pixel, src, sizeof pixel);
struct float3 r = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b);
dst[0] = r.f1; /* r */
dst[1] = r.f2; /* g */
dst[2] = r.f3; /* b */
dst[3] = 1; /* a */
src += 6;
dst += 4;
}
}
```
Is compiled "poorly" by gcc, even worse when compiled on i386 (with -mf16c
enabled) when using -FPIE.
Example:
gcc -O3 -m32 -march=znver2 -mfpmath=sse -fPIE
util_format_r16g16b16_float_unpack_rgba_float:
push ebp
push edi
push esi
push ebx
sub esp, 28
mov ecx, DWORD PTR 56[esp]
mov edx, DWORD PTR 48[esp]
call __x86.get_pc_thunk.ax
add eax, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_
mov ebx, DWORD PTR 52[esp]
test ecx, ecx
je .L8
vmovss xmm3, DWORD PTR .LC0@GOTOFF[eax]
xor esi, esi
xor ebp, ebp
vpxor xmm2, xmm2, xmm2
.L3:
mov eax, DWORD PTR [ebx]
vmovss DWORD PTR 12[edx], xmm3
add ebx, 6
add edx, 16
inc esi
mov ecx, eax
vmovd xmm0, eax
shr ecx, 16
mov edi, ecx
movzx ecx, WORD PTR -2[ebx]
vpinsrw xmm0, xmm0, edi, 1
vmovd xmm1, ecx
vpinsrw xmm1, xmm1, ebp, 1
vpunpckldq xmm0, xmm0, xmm1
vpunpcklqdq xmm0, xmm0, xmm2
vcvtph2ps xmm0, xmm0
vmovss DWORD PTR -16[edx], xmm0
vextractps DWORD PTR -12[edx], xmm0, 1
vextractps DWORD PTR -8[edx], xmm0, 2
cmp DWORD PTR 56[esp], esi
jne .L3
.L8:
add esp, 28
pop ebx
pop esi
pop edi
pop ebp
ret
.LC0:
.long 1065353216
__x86.get_pc_thunk.ax:
mov eax, DWORD PTR [esp]
ret
clang:
util_format_r16g16b16_float_unpack_rgba_float: #
@util_format_r16g16b16_float_unpack_rgba_float
mov eax, dword ptr [esp + 12]
test eax, eax
je .LBB0_3
mov ecx, dword ptr [esp + 8]
mov edx, dword ptr [esp + 4]
.LBB0_2: # =>This Inner Loop Header: Depth=1
vmovd xmm0, dword ptr [ecx] # xmm0 = mem[0],zero,zero,zero
vpinsrw xmm0, xmm0, word ptr [ecx + 4], 2
add ecx, 6
vcvtph2ps xmm0, xmm0
vmovss dword ptr [edx], xmm0
vextractps dword ptr [edx + 4], xmm0, 1
vextractps dword ptr [edx + 8], xmm0, 2
mov dword ptr [edx + 12], 1065353216
add edx, 16
dec eax
jne .LBB0_2
.LBB0_3:
ret
clang code is essentially optimal.
The issue persist if I use `vcvtph2ps` directly via asm, or via intrinsics.
The issue might be the src stride, of 6, instead 8, that is confusing gcc.
Additionally, constant 1065353216 (which is weird, I would expect it to be 0),
is stored in data section, instead inline as immediate, this makes code
actually larger, and in PIE mode, requires extra pointer trickery, and on -m32,
even calling extra function.
Even without -fPIE the main loop has poor codegen even on x86-64 / amd64
compared to clang or what I would considered good code.
gcc -m64 -O3 -march=native
util_format_r16g16b16_float_unpack_rgba_float:
test edx, edx
je .L8
mov edx, edx
sal rdx, 4
vmovss xmm3, DWORD PTR .LC0[rip]
lea rcx, [rdi+rdx]
xor r9d, r9d
vpxor xmm2, xmm2, xmm2
.L3:
mov eax, DWORD PTR [rsi]
vmovss DWORD PTR 12[rdi], xmm3
mov edx, eax
shr edx, 16
mov r8d, edx
movzx edx, WORD PTR 4[rsi]
vmovd xmm0, eax
vmovd xmm1, edx
vpinsrw xmm0, xmm0, r8d, 1
vpinsrw xmm1, xmm1, r9d, 1
vpunpckldq xmm0, xmm0, xmm1
vpunpcklqdq xmm0, xmm0, xmm2
vcvtph2ps xmm0, xmm0
add rdi, 16
vmovlps QWORD PTR -16[rdi], xmm0
vextractps DWORD PTR -8[rdi], xmm0, 2
add rsi, 6
cmp rdi, rcx
jne .L3
.L8:
ret
.LC0:
.long 1065353216
If you know what is going on, please rename more accurately and reassign to
proper component.