https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118019

            Bug ID: 118019
           Summary: RISC-V: Performance regression in hottest function of
                    X264
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

I recently noticed a big performance regression between GCC 14.2 and
GCC 15.

https://godbolt.org/z/v87KxPddd

#include <stdint.h>
#include <math.h>
/*
 * HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3)
 *
 * Four-input add/subtract butterfly. After expansion:
 *     d0 = s0 + s1 + s2 + s3
 *     d1 = s0 - s1 + s2 - s3
 *     d2 = s0 + s1 - s2 - s3
 *     d3 = s0 - s1 - s2 + s3
 *
 * The pairwise sums/differences are held in int temporaries t0..t3.
 * Each s-argument is expanded twice and none of the arguments are
 * parenthesized, so callers should pass plain, side-effect-free operands.
 * The macro expands to a bare { ... } block (a statement, not an expression).
 */
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    int t0 = s0 + s1;\
    int t1 = s0 - s1;\
    int t2 = s2 + s3;\
    int t3 = s2 - s3;\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}
/*
 * Branch-free operation on a 32-bit word treated as two 16-bit halves.
 *
 * (a>>15)&0x10001 moves bit 15 and bit 31 of a (the top bit of each half)
 * down to bits 0 and 16; multiplying by 0xffff turns each set bit into an
 * all-ones 16-bit mask. The result is (a + s) ^ s, i.e. the add-mask-then-xor
 * form of a two's-complement absolute value applied to both halves at once.
 *
 * NOTE(review): the addition is a full 32-bit add, so a carry out of the low
 * half propagates into the high half; presumably this is intended to undo the
 * borrow introduced when the caller packed two signed values into one word --
 * confirm against the packing code in the caller.
 */
uint32_t abs2( uint32_t a )
{
    uint32_t s = ((a>>15)&0x10001)*0xffff;
    return (a+s)^s;
}
/*
 * Computes a transformed-difference metric over an 8-column by 4-row region
 * of two byte buffers (the name suggests a sum of absolute transformed
 * differences, SATD).
 *
 *   pix1, i_pix1  first buffer and the byte offset added to reach its next row
 *   pix2, i_pix2  second buffer and the byte offset added to reach its next row
 *
 * Each of the 4 rows reads bytes [0..7] from both buffers, so the caller must
 * provide at least 3*stride + 8 readable bytes behind each pointer.
 *
 * Returns the low 16 bits of the accumulated sum plus its high 16 bits,
 * shifted right by one.
 */
int x264_pixel_satd_8x4( uint8_t * pix1, 
int i_pix1, uint8_t * pix2, int i_pix2 )
{
    uint32_t tmp[4][4];
    uint32_t a0, a1, a2, a3;
    int sum = 0;
    /* Pass 1: per row, pack the difference of column k into the low 16 bits
     * and the difference of column k+4 into the high 16 bits of one word, then
     * run the butterfly across the four packed words and store it in tmp[i].
     * NOTE(review): the differences can be negative, and left-shifting a
     * negative int is formally undefined in ISO C. */
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
        a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
        a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
        a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
    }
    /* Pass 2: run the same butterfly down each column of tmp and accumulate
     * abs2() of the four outputs. */
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]
);
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* Fold the two 16-bit halves of the accumulator together and halve. */
    return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
}

/*
 * Driver for x264_pixel_satd_8x4().
 *
 * The strides and pixel buffers are mutable globals, presumably so the
 * compiler cannot constant-fold the call (TODO confirm intent).
 *
 * The kernel reads 8 bytes from each of 4 rows, i.e. up to byte offset
 * 3*stride + 7 behind each pointer. g_pix1 (64 bytes, stride 16) already
 * covers that. g_pix2 previously held only 64 bytes while its stride is 1344,
 * so rows 1..3 were read out of bounds; it is now sized to cover the last
 * row. Elements without an explicit initializer are zero.
 */
enum {
    PIX2_STRIDE = 1344,                 /* row stride of g_pix2, in bytes   */
    PIX2_BYTES  = 3 * PIX2_STRIDE + 8   /* bytes touched: 4 rows of 8 bytes */
};

int g_i_pix1 = 16;
int g_i_pix2 = PIX2_STRIDE;
uint64_t g_pix1[8] = { 111, 112, 113, 114, 115, 116, 117, 118 };
uint64_t g_pix2[(PIX2_BYTES + 7) / 8] = { 211, 212, 213, 214, 215, 216, 217, 218 };
int result;

/* Runs the kernel once and stores the value in the global `result` so the
 * call has an observable effect. Always returns 0. */
int main(void) {
  result = x264_pixel_satd_8x4((uint8_t *)g_pix1, g_i_pix1,
                               (uint8_t *)g_pix2, g_i_pix2);
  return 0;
}

GCC 15 appears to vectorize this function inefficiently:


        vsetivli        zero,16,e8,mf4,ta,ma
        lbu     a1,7(a0)
        or      a4,a4,t4
        lbu     a0,0(a2)
        lbu     t4,2(a2)
        vid.v   v1
        slli    t5,t5,8
        vsrl.vi v1,v1,2
        or      t5,t5,a0
        slli    t4,t4,16
        or      t5,t4,t5
        slli    t1,t1,56
        lbu     t4,3(a2)
        or      t1,t1,a5
        slli    a1,a1,56
        lbu     a5,4(a2)
        vsll.vi v1,v1,3
        or      a1,a1,a4
        vsetivli        zero,2,e64,m1,ta,ma
        add     a4,a2,a3
        vmv.v.x v6,a1
        slli    t3,t3,56
        add     a1,a4,a3
        slli    t4,t4,24
        or      t3,t3,t6
        or      t4,t4,t5
        lbu     t6,1(a1)
        slli    a0,t2,40
        slli    a5,a5,32
        or      a0,a0,t0
        or      a5,a5,t4
        lbu     t0,0(a1)
        lbu     t4,2(a1)
        slli    t6,t6,8
        lbu     t5,1(a4)
        or      t6,t6,t0
        slli    t4,t4,16
        lbu     t0,0(a4)
        lbu     s0,5(a2)
        or      t4,t4,t6
        lbu     t6,2(a4)
        slli    t5,t5,8
        lbu     t2,3(a1)
        or      t5,t5,t0
        slli    t6,t6,16
        slli    s0,s0,40
        or      t6,t6,t5
        or      s0,s0,a5
        lbu     t5,6(a2)
        lbu     a5,3(a4)
        slli    t2,t2,24
        or      t2,t2,t4
        slli    a5,a5,24
        lbu     t4,7(a2)
        slli    t5,t5,48
        or      t0,a5,t6
        or      a5,t5,s0
        lbu     t5,4(a4)
        add     a3,a1,a3
        slli    t4,t4,56
        lbu     t6,1(a3)
        or      t4,t4,a5
        slli    t5,t5,32
        vmv.v.x v5,t4
        or      t5,t5,t0
        lbu     t4,2(a3)
        lbu     t0,0(a3)
        lbu     a2,4(a1)
        slli    t6,t6,8
        or      t6,t6,t0
        slli    t4,t4,16
        lbu     t0,5(a1)
        or      t4,t4,t6
        lbu     t6,5(a4)
        slli    a2,a2,32
        or      a2,a2,t2
        slli    t0,t0,40
        or      t0,t0,a2
        slli    t6,t6,40
        lbu     a2,3(a3)
        or      t6,t6,t5
        lbu     t5,4(a3)
        lui     a5,%hi(.LANCHOR0)
        slli    a2,a2,24
        addi    a5,a5,%lo(.LANCHOR0)
        vslide1down.vx  v6,v6,t1
        vmv.v.x v4,t3
        or      a2,a2,t4
        vsetivli        zero,16,e8,mf4,ta,ma

The overall dynamic instruction count is
1643991376767 (GCC 14.2) vs 174484959091 (GCC 15). I guess it is a cost model
issue in the RISC-V target, but I am not sure about that.

Reply via email to