https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88547

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |jakub at gcc dot gnu.org

--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
More complete testcase:
/* 16-byte GCC vector types (vector_size(16)): a signed and an unsigned
   variant for each of the char, short, int and long long element types.  */
typedef signed char v16qi __attribute__((vector_size(16)));
typedef unsigned char v16uqi __attribute__((vector_size(16)));
typedef short v8hi __attribute__((vector_size(16)));
typedef unsigned short v8uhi __attribute__((vector_size(16)));
typedef int v4si __attribute__((vector_size(16)));
typedef unsigned v4usi __attribute__((vector_size(16)));
typedef long long v2di __attribute__((vector_size(16)));
typedef unsigned long long v2udi __attribute__((vector_size(16)));

/* Signed char vectors (v16qi): returns the element-wise x <= y result.  */
v16qi
f1 (v16qi x, v16qi y)
{
  return x <= y;
}

/* Signed char vectors (v16qi): returns the element-wise strict x < y
   result.  */
v16qi
f1a (v16qi x, v16qi y)
{
  return x < y;
}

/* Unsigned char vectors (v16uqi): returns the element-wise x <= y result,
   typed as v16uqi.  */
v16uqi
f2 (v16uqi x, v16uqi y)
{
  return x <= y;
}

/* Signed char vectors (v16qi): returns the element-wise x >= y result.  */
v16qi
f3 (v16qi x, v16qi y)
{
  return x >= y;
}

/* Unsigned char vectors (v16uqi): returns the element-wise x >= y result,
   typed as v16uqi.  */
v16uqi
f4 (v16uqi x, v16uqi y)
{
  return x >= y;
}

/* Signed short vectors (v8hi): returns the element-wise x <= y result.  */
v8hi
f5 (v8hi x, v8hi y)
{
  return x <= y;
}

/* Unsigned short vectors (v8uhi): returns the element-wise x <= y result,
   typed as v8uhi.  */
v8uhi
f6 (v8uhi x, v8uhi y)
{
  return x <= y;
}

/* Signed short vectors (v8hi): returns the element-wise x >= y result.  */
v8hi
f7 (v8hi x, v8hi y)
{
  return x >= y;
}

/* Unsigned short vectors (v8uhi): returns the element-wise x >= y result,
   typed as v8uhi.  */
v8uhi
f8 (v8uhi x, v8uhi y)
{
  return x >= y;
}

/* Signed int vectors (v4si): returns the element-wise x <= y result.  */
v4si
f9 (v4si x, v4si y)
{
  return x <= y;
}

/* Unsigned int vectors (v4usi): returns the element-wise x <= y result,
   typed as v4usi.  */
v4usi
f10 (v4usi x, v4usi y)
{
  return x <= y;
}

/* Signed int vectors (v4si): returns the element-wise x >= y result.  */
v4si
f11 (v4si x, v4si y)
{
  return x >= y;
}

/* Unsigned int vectors (v4usi): returns the element-wise x >= y result,
   typed as v4usi.  */
v4usi
f12 (v4usi x, v4usi y)
{
  return x >= y;
}

/* Signed long long vectors (v2di): returns the element-wise x <= y
   result.  */
v2di
f13 (v2di x, v2di y)
{
  return x <= y;
}

/* Unsigned long long vectors (v2udi): returns the element-wise x <= y
   result, typed as v2udi.  */
v2udi
f14 (v2udi x, v2udi y)
{
  return x <= y;
}

/* Signed long long vectors (v2di): returns the element-wise x >= y
   result.  */
v2di
f15 (v2di x, v2di y)
{
  return x >= y;
}

/* Unsigned long long vectors (v2udi): returns the element-wise x >= y
   result, typed as v2udi.  */
v2udi
f16 (v2udi x, v2udi y)
{
  return x >= y;
}

plus of course we need a 32-byte and 64-byte vector variant, and test with
-msse4.1 (the first one to have pmin{s,u}b), -mavx, -mavx2, -mavx512*.

I think it could be done in ix86_expand_int_sse_cmp or in ix86_expand_int_vcond,
perhaps only for the cases where one of the vcond operands is all ones and
the other one is zero.  Note that, depending on which one is which, the negation
is 2 instructions (though only if we don't hoist the constant load, e.g. before
a loop), and that for TARGET_SSE4_1 we can use the minimum or maximum.

Reply via email to