stride of 6

witold.baryluk+gcc at gmail dot com via Gcc-bugs Sun, 25 Apr 2021 15:08:22 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257


            Bug ID: 100257
           Summary: poor codegen with vcvtph2ps / stride of 6
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

gcc (Compiler-Explorer-Build) 12.0.0 20210424 (experimental)


https://godbolt.org/z/n6ooMdnz8


This C code:

```
#include <stdint.h>
#include <string.h>
#include <immintrin.h>

/* Three single-precision floats; the return type of _mesa_half3_to_float3(). */
struct float3 {
    float f1;
    float f2;
    float f3;
};

/*
 * One source pixel: three 16-bit channels. The unpack loop copies
 * sizeof(struct) bytes per pixel and advances src by 6. The channels are fed
 * to _mm_cvtph_ps(), i.e. they are treated as half-precision bit patterns.
 */
struct util_format_r16g16b16_float {
   uint16_t r;
   uint16_t g;
   uint16_t b;
};

/*
 * Convert three half-precision values (passed as raw 16-bit patterns) to
 * single precision and return them as a struct float3.
 *
 * NOTE(review): the whole body is guarded by __F16C__; when that macro is not
 * defined the function falls off the end without returning a value.
 */
static inline struct float3 _mesa_half3_to_float3(uint16_t val_0, uint16_t
val_1, uint16_t val_2) {
#if defined(__F16C__)
      /* Alternative kept for reference: issue vcvtph2ps through inline asm. */
      //const __m128i in = {val_0, val_1, val_2};
      //__m128 out;
      //__asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in));

      /* 16-bit lanes 0..2 carry the inputs; the other five lanes are zero. */
      const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0);
      /* Half -> float conversion (the vcvtph2ps instruction in the listings). */
      const __m128 out = _mm_cvtph_ps(in);

      /* Only the first three converted lanes are returned; out[i] relies on
       * the compiler's vector subscripting extension. */
      const struct float3 r = {out[0], out[1], out[2]};
      return r;
#endif
}


/*
 * Unpack `width` pixels of three 16-bit half-float channels into RGBA floats.
 *
 * dst_row: output, written as 4 floats per pixel (r, g, b, then a constant 1).
 * src:     input, read 6 bytes per pixel.
 * width:   number of pixels to convert; 0 means the loop does not run.
 */
void
util_format_r16g16b16_float_unpack_rgba_float(void *restrict dst_row, const
uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
        /* NOTE(review): `pixel` is declared const but is then written by
         * memcpy(); modifying a const-qualified object is undefined behavior
         * in C, and passing &pixel discards the qualifier. Left untouched
         * here because this is the exact reproducer for the listings below. */
        const struct util_format_r16g16b16_float pixel;
        /* memcpy avoids any alignment assumption on the byte pointer src. */
        memcpy(&pixel, src, sizeof pixel);

        struct float3 r = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b);
        dst[0] = r.f1; /* r */
        dst[1] = r.f2; /* g */
        dst[2] = r.f3; /* b */
        dst[3] = 1; /* a */

        /* Advance one pixel: 6 source bytes in, 4 floats out. */
        src += 6;
        dst += 4;
   }
}

```

is compiled "poorly" by gcc, and even worse when compiled for i386 (with -mf16c
enabled) using -fPIE.

Example:


gcc -O3 -m32 -march=znver2 -mfpmath=sse -fPIE

# GCC output (32-bit, PIC) for the C function above.
# Register roles as used below:
#   edx  = dst, advanced by 16 bytes per pixel
#   ebx  = src, advanced by 6 bytes per pixel
#   esi  = pixel counter x
#   ebp  = constant 0 (source of the zero word inserted into xmm1)
#   xmm2 = all zero, xmm3 = the 32-bit constant at .LC0
util_format_r16g16b16_float_unpack_rgba_float:
        # Save four registers and reserve 28 bytes. Together with the return
        # address this places the three stack arguments at 48[esp], 52[esp]
        # and 56[esp].
        push    ebp
        push    edi
        push    esi
        push    ebx
        sub     esp, 28
        # ecx = width, edx = dst_row.
        mov     ecx, DWORD PTR 56[esp]
        mov     edx, DWORD PTR 48[esp]
        # PIC setup: the thunk returns the address of the following add, which
        # is then turned into the GOT address so .LC0 can be addressed GOTOFF.
        call    __x86.get_pc_thunk.ax
        add     eax, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_
        # ebx = src.
        mov     ebx, DWORD PTR 52[esp]
        # width == 0: skip the loop entirely.
        test    ecx, ecx
        je      .L8
        # Loop-invariant setup: load the alpha constant, zero the counter,
        # the zero word source and the zero vector.
        vmovss  xmm3, DWORD PTR .LC0@GOTOFF[eax]
        xor     esi, esi
        xor     ebp, ebp
        vpxor   xmm2, xmm2, xmm2
.L3:
        # eax = first 4 bytes of the pixel: r in the low word, g in the high.
        mov     eax, DWORD PTR [ebx]
        # dst[3] = constant from .LC0.
        vmovss  DWORD PTR 12[edx], xmm3
        # Pointers and counter are advanced early; later accesses in this
        # iteration therefore use negative offsets.
        add     ebx, 6
        add     edx, 16
        inc     esi
        # edi = g, extracted from the high half of eax.
        mov     ecx, eax
        vmovd   xmm0, eax
        shr     ecx, 16
        mov     edi, ecx
        # ecx = b (pixel offset 4, i.e. -2 from the already advanced src).
        movzx   ecx, WORD PTR -2[ebx]
        # Re-inserts g into word 1 of xmm0, where vmovd had already put it.
        vpinsrw xmm0, xmm0, edi, 1
        # xmm1 = {b, 0} in its low two words.
        vmovd   xmm1, ecx
        vpinsrw xmm1, xmm1, ebp, 1
        # Interleave low dwords: low qword of xmm0 becomes words {r, g, b, 0}.
        vpunpckldq      xmm0, xmm0, xmm1
        # Upper qword of xmm0 = 0 (taken from xmm2).
        vpunpcklqdq     xmm0, xmm0, xmm2
        vcvtph2ps       xmm0, xmm0
        # Store converted lanes 0..2 to dst[0..2] of the current pixel.
        vmovss  DWORD PTR -16[edx], xmm0
        vextractps      DWORD PTR -12[edx], xmm0, 1
        vextractps      DWORD PTR -8[edx], xmm0, 2
        # Loop while x != width; width is re-read from its stack slot.
        cmp     DWORD PTR 56[esp], esi
        jne     .L3
.L8:
        # Release the frame and restore the saved registers in reverse order.
        add     esp, 28
        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret
# 1065353216 = 0x3F800000, the IEEE-754 single-precision bit pattern of 1.0;
# the function above loads it into xmm3 and stores it as dst[3].
.LC0:
        .long   1065353216
# PIC helper: returns its own return address in eax, i.e. the address of the
# instruction that follows the call to it.
__x86.get_pc_thunk.ax:
        mov     eax, DWORD PTR [esp]
        ret



clang:

# clang output (32-bit) for the same C function.
# Register roles as used below:
#   eax = remaining pixel count, ecx = src (+6 per pixel),
#   edx = dst (+16 per pixel).
util_format_r16g16b16_float_unpack_rgba_float: #
@util_format_r16g16b16_float_unpack_rgba_float
        # eax = width (third stack argument); return immediately when it is 0.
        mov     eax, dword ptr [esp + 12]
        test    eax, eax
        je      .LBB0_3
        # ecx = src, edx = dst_row.
        mov     ecx, dword ptr [esp + 8]
        mov     edx, dword ptr [esp + 4]
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        # Words 0 and 1 (r, g) come from one 32-bit load, which also zeroes
        # the rest of xmm0; word 2 (b) is inserted straight from memory.
        vmovd   xmm0, dword ptr [ecx]           # xmm0 = mem[0],zero,zero,zero
        vpinsrw xmm0, xmm0, word ptr [ecx + 4], 2
        add     ecx, 6
        vcvtph2ps       xmm0, xmm0
        # Store converted lanes 0..2 to dst[0..2].
        vmovss  dword ptr [edx], xmm0
        vextractps      dword ptr [edx + 4], xmm0, 1
        vextractps      dword ptr [edx + 8], xmm0, 2
        # dst[3] = 0x3F800000 (bit pattern of 1.0f), written as an immediate.
        mov     dword ptr [edx + 12], 1065353216
        add     edx, 16
        # Count down the remaining pixels.
        dec     eax
        jne     .LBB0_2
.LBB0_3:
        ret


clang code is essentially optimal.


The issue persists whether I use `vcvtph2ps` directly via asm or via intrinsics.

The issue might be the src stride of 6, instead of 8, that is confusing gcc.

Additionally, the constant 1065353216 (which looked weird to me, as I would
expect it to be 0; note that it is 0x3F800000, the bit pattern of the float 1.0
stored to dst[3]) is stored in the data section instead of inline as an
immediate. This makes the code actually larger, and in PIE mode requires extra
pointer trickery and, on -m32, even calling an extra function.

Even without -fPIE the main loop has poor codegen, even on x86-64 / amd64,
compared to clang or to what I would consider good code.

gcc -m64 -O3 -march=native

# GCC output (64-bit) for the same C function.
# Register roles as used below:
#   rdi  = dst (+16 per pixel), rsi = src (+6 per pixel),
#   rcx  = end-of-output pointer (dst_row + width * 16),
#   r9d  = constant 0, xmm2 = all zero, xmm3 = the 32-bit constant at .LC0.
util_format_r16g16b16_float_unpack_rgba_float:
        # width (edx) == 0: nothing to do.
        test    edx, edx
        je      .L8
        # Zero-extend width to 64 bits and scale it by 16 bytes per pixel.
        mov     edx, edx
        sal     rdx, 4
        vmovss  xmm3, DWORD PTR .LC0[rip]
        # rcx = address one past the last output pixel (loop bound).
        lea     rcx, [rdi+rdx]
        xor     r9d, r9d
        vpxor   xmm2, xmm2, xmm2
.L3:
        # eax = first 4 bytes of the pixel: r in the low word, g in the high.
        mov     eax, DWORD PTR [rsi]
        # dst[3] = constant from .LC0.
        vmovss  DWORD PTR 12[rdi], xmm3
        # r8d = g, extracted from the high half of eax.
        mov     edx, eax
        shr     edx, 16
        mov     r8d, edx
        # edx = b (pixel offset 4).
        movzx   edx, WORD PTR 4[rsi]
        vmovd   xmm0, eax
        vmovd   xmm1, edx
        # Re-inserts g into word 1 of xmm0, where vmovd had already put it.
        vpinsrw xmm0, xmm0, r8d, 1
        # Word 1 of xmm1 = 0, so xmm1's low dword is {b, 0}.
        vpinsrw xmm1, xmm1, r9d, 1
        # Interleave low dwords: low qword of xmm0 becomes words {r, g, b, 0}.
        vpunpckldq      xmm0, xmm0, xmm1
        # Upper qword of xmm0 = 0 (taken from xmm2).
        vpunpcklqdq     xmm0, xmm0, xmm2
        vcvtph2ps       xmm0, xmm0
        # dst is advanced before the stores, hence the negative offsets:
        # one 64-bit store writes dst[0] and dst[1], then lane 2 goes to dst[2].
        add     rdi, 16
        vmovlps QWORD PTR -16[rdi], xmm0
        vextractps      DWORD PTR -8[rdi], xmm0, 2
        add     rsi, 6
        # Loop until dst reaches the precomputed end pointer.
        cmp     rdi, rcx
        jne     .L3
.L8:
        ret
# 1065353216 = 0x3F800000, the IEEE-754 single-precision bit pattern of 1.0;
# the function above loads it into xmm3 and stores it as dst[3].
.LC0:
        .long   1065353216


If you know what is going on, please rename the bug more accurately and reassign
it to the proper component.

[Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6

Reply via email to