On Wed, Sep 07, 2011 at 09:54:03PM +0200, Uros Bizjak wrote:
> > 2011-09-07 Jakub Jelinek <ja...@redhat.com>
> >
> > PR target/50310
> > * config/i386/i386.c (ix86_prepare_sse_fp_compare_args): For
> > TARGET_AVX return code for LTGT and UNEQ.
> > (ix86_expand_fp_vcond): Handle LTGT and UNEQ.
> >
> > * gcc.c-torture/execute/ieee/pr50310.c: New test.
> > * gcc.dg/pr50310-2.c: New test.
>
> Please put early exit for TARGET_SSE at the beginning of
> ix86_prepare_sse_fp_compare_args function. There is really no need to
You mean for TARGET_AVX, right? > swap operands - and to help reload, since AVX instructions are > three-operand instructions. > > OK for mainline with this change. Here is the updated patch, I'll bootstrap/regtest it now. 2011-09-07 Jakub Jelinek <ja...@redhat.com> PR target/50310 * config/i386/i386.c (ix86_prepare_sse_fp_compare_args): Return code early if TARGET_AVX. (ix86_expand_fp_vcond): Handle LTGT and UNEQ. * gcc.c-torture/execute/ieee/pr50310.c: New test. * gcc.dg/pr50310-2.c: New test. --- gcc/config/i386/i386.c.jj 2011-09-02 16:29:38.000000000 +0200 +++ gcc/config/i386/i386.c 2011-09-07 21:57:52.000000000 +0200 @@ -18304,6 +18304,11 @@ ix86_prepare_sse_fp_compare_args (rtx de { rtx tmp; + /* AVX supports all the needed comparisons, no need to swap arguments + nor help reload. */ + if (TARGET_AVX) + return code; + switch (code) { case LTGT: @@ -18559,7 +18564,32 @@ ix86_expand_fp_vcond (rtx operands[]) code = ix86_prepare_sse_fp_compare_args (operands[0], code, &operands[4], &operands[5]); if (code == UNKNOWN) - return false; + { + rtx temp; + switch (GET_CODE (operands[3])) + { + case LTGT: + temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], + operands[5], operands[1], operands[2]); + code = AND; + break; + case UNEQ: + temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], + operands[5], operands[1], operands[2]); + code = IOR; + break; + default: + gcc_unreachable (); + } + cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, + OPTAB_DIRECT); + ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); + return true; + } if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], operands[5], operands[1], operands[2])) --- gcc/testsuite/gcc.c-torture/execute/ieee/pr50310.c.jj 2011-09-07 14:16:12.000000000 
+0200 +++ gcc/testsuite/gcc.c-torture/execute/ieee/pr50310.c 2011-09-07 14:40:57.000000000 +0200 @@ -0,0 +1,73 @@ +/* PR target/50310 */ + +extern void abort (void); +double s1[4], s2[4], s3[64]; + +void +foo (void) +{ + int i; + for (i = 0; i < 4; i++) + s3[0 * 4 + i] = __builtin_isgreater (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[1 * 4 + i] = (!__builtin_isgreater (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[2 * 4 + i] = __builtin_isgreaterequal (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[3 * 4 + i] = (!__builtin_isgreaterequal (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[4 * 4 + i] = __builtin_isless (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[5 * 4 + i] = (!__builtin_isless (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[6 * 4 + i] = __builtin_islessequal (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[7 * 4 + i] = (!__builtin_islessequal (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[8 * 4 + i] = __builtin_islessgreater (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[9 * 4 + i] = (!__builtin_islessgreater (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[10 * 4 + i] = __builtin_isunordered (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[11 * 4 + i] = (!__builtin_isunordered (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[12 * 4 + i] = s1[i] > s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[13 * 4 + i] = s1[i] <= s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[14 * 4 + i] = s1[i] < s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[15 * 4 + i] = s1[i] >= s2[i] ? 
-1.0 : 0.0; +} + +int +main () +{ + int i; + s1[0] = 5.0; + s1[1] = 6.0; + s1[2] = 5.0; + s1[3] = __builtin_nan (""); + s2[0] = 6.0; + s2[1] = 5.0; + s2[2] = 5.0; + s2[3] = 5.0; + asm volatile ("" : : : "memory"); + foo (); + asm volatile ("" : : : "memory"); + for (i = 0; i < 16 * 4; i++) + if (i >= 12 * 4 && (i & 3) == 3) + { + if (s3[i] != 0.0) abort (); + } + else + { + static int masks[] = { 2, 2|4, 1, 1|4, 1|2, 8, 2, 1 }; + if (s3[i] + != (((1 << (i & 3)) & ((i & 4) ? ~masks[i / 8] : masks[i / 8])) + ? -1.0 : 0.0)) + abort (); + } + return 0; +} --- gcc/testsuite/gcc.dg/pr50310-2.c.jj 2011-09-07 12:53:43.000000000 +0200 +++ gcc/testsuite/gcc.dg/pr50310-2.c 2011-09-07 12:53:16.000000000 +0200 @@ -0,0 +1,47 @@ +/* PR target/50310 */ +/* { dg-do run } */ +/* { dg-options "-O3" } */ +/* { dg-options "-O3 -mavx" { target avx_runtime } } */ + +double s1[4], s2[4], s3[64]; + +int +main (void) +{ + int i; + asm volatile ("" : : : "memory"); + for (i = 0; i < 4; i++) + s3[0 * 4 + i] = __builtin_isgreater (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[1 * 4 + i] = (!__builtin_isgreater (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[2 * 4 + i] = __builtin_isgreaterequal (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[3 * 4 + i] = (!__builtin_isgreaterequal (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[4 * 4 + i] = __builtin_isless (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[5 * 4 + i] = (!__builtin_isless (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[6 * 4 + i] = __builtin_islessequal (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[7 * 4 + i] = (!__builtin_islessequal (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[8 * 4 + i] = __builtin_islessgreater (s1[i], s2[i]) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[9 * 4 + i] = (!__builtin_islessgreater (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[10 * 4 + i] = __builtin_isunordered (s1[i], s2[i]) ? 
-1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[11 * 4 + i] = (!__builtin_isunordered (s1[i], s2[i])) ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[12 * 4 + i] = s1[i] > s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[13 * 4 + i] = s1[i] >= s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[14 * 4 + i] = s1[i] < s2[i] ? -1.0 : 0.0; + for (i = 0; i < 4; i++) + s3[15 * 4 + i] = s1[i] <= s2[i] ? -1.0 : 0.0; + asm volatile ("" : : : "memory"); + return 0; +} Jakub