https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102483

--- Comment #4 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Hongtao.liu from comment #3)
> Also for reduc_umin/umax/smin/smax_scal_v4qi.


After providing expanders for reduc_umin/umax/smin/smax_scal_v4qi, performance
of the functions below is a little better than before with -O2 -march=haswell,
-O2 -march=skylake-avx512 and -Ofast -march=skylake-avx512.

/* Add up the four bytes p[0] .. p[3] and return the total.
   The running total is a char, so it is stored back into a char after
   every addition.  */
char
__attribute__((noipa, optimize("Ofast"),target("sse4.1")))
reduce_add (char* p)
{
  char total = 0;
  int idx = 0;

  while (idx < 4)
    {
      total = total + p[idx];
      ++idx;
    }

  return total;
}

/* Two-operand maximum and minimum.  Each operand appears twice in the
   expansion, so do not pass expressions that have side effects.  */
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) > (y) ? (y) : (x))

/* Return the largest of the four unsigned bytes p[0] .. p[3].
   The result is seeded with p[0] and the loop still visits index 0, so
   all four elements pass through the comparison.  */
unsigned char
__attribute__((noipa))
reduce_umax (unsigned char* p)
{
  unsigned char best = p[0];

  for (int k = 0; k < 4; ++k)
    {
      unsigned char cur = p[k];
      best = best > cur ? best : cur;
    }

  return best;
}

/* Return the smallest of the four unsigned bytes p[0] .. p[3].
   The result is seeded with p[0] and the loop still visits index 0, so
   all four elements pass through the comparison.  */
unsigned char
__attribute__((noipa))
reduce_umin (unsigned char* p)
{
  unsigned char best = p[0];

  for (int k = 0; k < 4; ++k)
    {
      unsigned char cur = p[k];
      best = best > cur ? cur : best;
    }

  return best;
}

/* Return the largest of the four plain-char values p[0] .. p[3].
   The result is seeded with p[0] and the loop still visits index 0, so
   all four elements pass through the comparison.  */
char
__attribute__((noipa))
reduce_smax (char* p)
{
  char best = p[0];

  for (int k = 0; k < 4; ++k)
    {
      char cur = p[k];
      best = best > cur ? best : cur;
    }

  return best;
}

/* Return the smallest of the four plain-char values p[0] .. p[3].
   The result is seeded with p[0] and the loop still visits index 0, so
   all four elements pass through the comparison.  */
char
__attribute__((noipa))
reduce_smin (char* p)
{
  char best = p[0];

  for (int k = 0; k < 4; ++k)
    {
      char cur = p[k];
      best = best > cur ? cur : best;
    }

  return best;
}

Reply via email to