Hi! The attached testcase ICEs because the vectorizer assumes that if vcond* is available, it supports all comparisons, not just a subset of them. With -mavx, vcmpd etc. already support all the needed comparisons (and several more - we wouldn't even need to swap the arguments); for SSE, the only missing ones (LTGT and UNEQ) can be handled as ORDERED & NE and UNORDERED | EQ, respectively.
Bootstrapped/regtested on x86_64-linux and i686-linux (on non-AVX host), plus regtested on x86_64-linux on AVX box. Ok for trunk and 4.6? 2011-09-07 Jakub Jelinek <ja...@redhat.com> PR target/50310 * config/i386/i386.c (ix86_prepare_sse_fp_compare_args): For TARGET_AVX return code for LTGT and UNEQ. (ix86_expand_fp_vcond): Handle LTGT and UNEQ. * gcc.c-torture/execute/ieee/pr50310.c: New test. * gcc.dg/pr50310-2.c: New test. --- gcc/config/i386/i386.c.jj 2011-09-02 16:29:38.000000000 +0200 +++ gcc/config/i386/i386.c 2011-09-07 13:34:17.000000000 +0200 @@ -18308,6 +18308,10 @@ ix86_prepare_sse_fp_compare_args (rtx de { case LTGT: case UNEQ: + /* With AVX these are supported directly. */ + if (TARGET_AVX) + break; + /* We have no LTGT as an operator. We could implement it with NE & ORDERED, but this requires an extra temporary. It's not clear that it's worth it. */ @@ -18559,7 +18563,32 @@ ix86_expand_fp_vcond (rtx operands[]) code = ix86_prepare_sse_fp_compare_args (operands[0], code, &operands[4], &operands[5]); if (code == UNKNOWN) - return false; + { + rtx temp; + switch (GET_CODE (operands[3])) + { + case LTGT: + temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], + operands[5], operands[1], operands[2]); + code = AND; + break; + case UNEQ: + temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], + operands[5], operands[1], operands[2]); + code = IOR; + break; + default: + gcc_unreachable (); + } + cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, + OPTAB_DIRECT); + ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); + return true; + } if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], operands[5], operands[1], operands[2])) --- gcc/testsuite/gcc.c-torture/execute/ieee/pr50310.c.jj 2011-09-07 
14:16:12.000000000 +0200 +++ gcc/testsuite/gcc.c-torture/execute/ieee/pr50310.c 2011-09-07 14:40:57.000000000 +0200 @@ -0,0 +1,73 @@ +/* PR target/50310 */ + +extern void abort (void); +double s1[4], s2[4], s3[64]; + +void +foo (void) +{ + int i; + for (i = 0; i < 4; i++) + s3[0 * 4 + i] = __builtin_isgreater (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[1 * 4 + i] = (!__builtin_isgreater (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[2 * 4 + i] = __builtin_isgreaterequal (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[3 * 4 + i] = (!__builtin_isgreaterequal (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[4 * 4 + i] = __builtin_isless (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[5 * 4 + i] = (!__builtin_isless (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[6 * 4 + i] = __builtin_islessequal (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[7 * 4 + i] = (!__builtin_islessequal (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[8 * 4 + i] = __builtin_islessgreater (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[9 * 4 + i] = (!__builtin_islessgreater (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[10 * 4 + i] = __builtin_isunordered (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[11 * 4 + i] = (!__builtin_isunordered (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[12 * 4 + i] = s1[i] > s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[13 * 4 + i] = s1[i] <= s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[14 * 4 + i] = s1[i] < s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[15 * 4 + i] = s1[i] >= s2[i] ? 
-1.0 : 0.0; +} + +int +main () +{ + int i; + s1[0] = 5.0; + s1[1] = 6.0; + s1[2] = 5.0; + s1[3] = __builtin_nan (""); + s2[0] = 6.0; + s2[1] = 5.0; + s2[2] = 5.0; + s2[3] = 5.0; + asm volatile ("" : : : "memory"); + foo (); + asm volatile ("" : : : "memory"); + for (i = 0; i < 16 * 4; i++) + if (i >= 12 * 4 && (i & 3) == 3) + { + if (s3[i] != 0.0) abort (); + } + else + { + static int masks[] = { 2, 2|4, 1, 1|4, 1|2, 8, 2, 1 }; + if (s3[i] + != (((1 << (i & 3)) & ((i & 4) ? ~masks[i / 8] : masks[i / 8])) + ? -1.0 : 0.0)) + abort (); + } + return 0; +} --- gcc/testsuite/gcc.dg/pr50310-2.c.jj 2011-09-07 12:53:43.000000000 +0200 +++ gcc/testsuite/gcc.dg/pr50310-2.c 2011-09-07 12:53:16.000000000 +0200 @@ -0,0 +1,47 @@ +/* PR target/50310 */ +/* { dg-do run } */ +/* { dg-options "-O3" } */ +/* { dg-options "-O3 -mavx" { target avx_runtime } } */ + +double s1[4], s2[4], s3[64]; + +int +main (void) +{ + int i; + asm volatile ("" : : : "memory"); + for (i = 0; i < 4; i++) + s3[0 * 4 + i] = __builtin_isgreater (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[1 * 4 + i] = (!__builtin_isgreater (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[2 * 4 + i] = __builtin_isgreaterequal (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[3 * 4 + i] = (!__builtin_isgreaterequal (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[4 * 4 + i] = __builtin_isless (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[5 * 4 + i] = (!__builtin_isless (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[6 * 4 + i] = __builtin_islessequal (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[7 * 4 + i] = (!__builtin_islessequal (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[8 * 4 + i] = __builtin_islessgreater (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[9 * 4 + i] = (!__builtin_islessgreater (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[10 * 4 + i] = __builtin_isunordered (s1[i], s2[i]) ? 
-1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[11 * 4 + i] = (!__builtin_isunordered (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[12 * 4 + i] = s1[i] > s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[13 * 4 + i] = s1[i] >= s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[14 * 4 + i] = s1[i] < s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[15 * 4 + i] = s1[i] <= s2[i] ? -1.0 : 0.0; + asm volatile ("" : : : "memory"); + return 0; +} Jakub