https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102483
--- Comment #4 from Hongtao.liu <crazylht at gmail dot com> --- (In reply to Hongtao.liu from comment #3) > Also for reduc_umin/umax/smin/smax_scal_v4qi. After providing expanders for reduc_umin/umax/smin/smax_scal_v4qi, the performance of the functions below is slightly better than before for -O2 -march=haswell, -O2 -march=skylake-avx512 and -Ofast -march=skylake-avx512: char __attribute__((noipa, optimize("Ofast"),target("sse4.1"))) reduce_add (char* p) { char sum = 0; for (int i = 0; i != 4; i++) sum += p[i]; return sum; } #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) > (b) ? (b) : (a)) unsigned char __attribute__((noipa)) reduce_umax (unsigned char* p) { unsigned char sum = p[0]; for (int i = 0; i != 4; i++) sum = MAX(sum, p[i]); return sum; } unsigned char __attribute__((noipa)) reduce_umin (unsigned char* p) { unsigned char sum = p[0]; for (int i = 0; i != 4; i++) sum = MIN(sum, p[i]); return sum; } char __attribute__((noipa)) reduce_smax (char* p) { char sum = p[0]; for (int i = 0; i != 4; i++) sum = MAX(sum, p[i]); return sum; } char __attribute__((noipa)) reduce_smin (char* p) { char sum = p[0]; for (int i = 0; i != 4; i++) sum = MIN(sum, p[i]); return sum; }