On Tue, Oct 22, 2013 at 8:11 PM, <pins...@gmail.com> wrote: > > > Sent from my iPad > >> On Oct 22, 2013, at 7:23 PM, Cong Hou <co...@google.com> wrote: >> >> This patch aims at PR58762. >> >> Currently GCC could not vectorize abs() operation for integers on x86 >> with only SSE2 support. For int type, the reason is that the expand on >> abs() is not defined for vector type. This patch defines such an >> expand so that abs(int) will be vectorized with only SSE2. >> >> For abs(char/short), type conversions are needed as the current abs() >> function/operation does not accept argument of char/short type. >> Therefore when we want to get the absolute value of a char_val using >> abs (char_val), it will be converted into abs ((int) char_val). It >> then can be vectorized, but the generated code is not efficient as >> lots of packings and unpackings are envolved. But if we convert >> (char) abs ((int) char_val) to abs (char_val), the vectorizer will be >> able to generate better code. Same for short. >> >> This conversion also enables vectorizing abs(char/short) operation >> with PABSB and PABSW instructions in SSE3. >> >> With only SSE2 support, I developed three methods to expand >> abs(char/short/int) seperately: >> >> 1. For 32 bit int value x, we can get abs (x) from (((signed) x >> >> (W-1)) ^ x) - ((signed) x >> (W-1)). This is better than max (x, -x), >> which needs bit masking. >> >> 2. For 16 bit int value x, we can get abs (x) from max (x, -x), as >> SSE2 provides PMAXSW instruction. >> >> 3. For 8 bit int value x, we can get abs (x) from min ((unsigned char) >> x, (unsigned char) (-x)), as SSE2 provides PMINUB instruction. >> >> >> The patch is pasted below. Please point out any problem in my patch >> and analysis. >> >> >> thanks, >> Cong >> >> >> >> >> diff --git a/gcc/ChangeLog b/gcc/ChangeLog >> index 8a38316..e0f33ee 100644 >> --- a/gcc/ChangeLog >> +++ b/gcc/ChangeLog >> @@ -1,3 +1,13 @@ >> +2013-10-22 Cong Hou <co...@google.com> >> + >> + PR target/58762 >> + * convert.c (convert_to_integer): Convert (char) abs ((int) char_val) >> + into abs (char_val). Also convert (short) abs ((int) short_val) >> + into abs (short_val). > > I don't like this optimization in convert. I think it should be submitted > separately and should be done in tree-ssa-forwprop.
Yes. This patch can be split into two: one for vectorization and one for abs conversion. The reason why I put abs conversion to convert.c is because fabs conversion is also done there. > > Also I think you should have a generic (non x86) test case for the above > optimization. For vectorization I need to do it on x86 since the define_expand is only for it. But for abs conversion, yes, I should make a generic test case. Thank you for your comments! Cong > > Thanks, > Andrew > > >> + * config/i386/i386-protos.h (ix86_expand_sse2_absvxsi2): New function. >> + * config/i386/i386.c (ix86_expand_sse2_absvxsi2): New function. >> + * config/i386/sse.md: Add SSE2 support to abs (char/int/short). > > > >> + >> 2013-10-14 David Malcolm <dmalc...@redhat.com> >> >> * dumpfile.h (gcc::dump_manager): New class, to hold state >> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h >> index 3ab2f3a..e85f663 100644 >> --- a/gcc/config/i386/i386-protos.h >> +++ b/gcc/config/i386/i386-protos.h >> @@ -238,6 +238,7 @@ extern void ix86_expand_mul_widen_evenodd (rtx, >> rtx, rtx, bool, bool); >> extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool); >> extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx); >> extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); >> +extern void ix86_expand_sse2_absvxsi2 (rtx, rtx); >> >> /* In i386-c.c */ >> extern void ix86_target_macros (void); >> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c >> index 02cbbbd..8050e02 100644 >> --- a/gcc/config/i386/i386.c >> +++ b/gcc/config/i386/i386.c >> @@ -41696,6 +41696,53 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx >> op2) >> gen_rtx_MULT (mode, op1, op2)); >> } >> >> +void >> +ix86_expand_sse2_absvxsi2 (rtx op0, rtx op1) >> +{ >> + enum machine_mode mode = GET_MODE (op0); >> + rtx tmp0, tmp1; >> + >> + switch (mode) >> + { >> + /* For 32-bit signed integer X, the best way to calculate the absolute >> + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ >> + case V4SImode: >> + tmp0 = expand_simple_binop (mode, ASHIFTRT, op1, >> + GEN_INT (GET_MODE_BITSIZE >> + (GET_MODE_INNER (mode)) - 1), >> + NULL, 0, OPTAB_DIRECT); >> + if (tmp0) >> + tmp1 = expand_simple_binop (mode, XOR, op1, tmp0, >> + NULL, 0, OPTAB_DIRECT); >> + if (tmp0 && tmp1) >> + expand_simple_binop (mode, MINUS, tmp1, tmp0, >> + op0, 0, OPTAB_DIRECT); >> + break; >> + >> + /* For 16-bit signed integer X, the best way to calculate the absolute >> + value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ >> + case V8HImode: >> + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); >> + if (tmp0) >> + expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0, >> + OPTAB_DIRECT); >> + break; >> + >> + /* For 8-bit signed integer X, the best way to calculate the absolute >> + value of X is min ((unsigned char) X, (unsigned char) (-X)), >> + as SSE2 provides the PMINUB insn. */ >> + case V16QImode: >> + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); >> + if (tmp0) >> + expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0, >> + OPTAB_DIRECT); >> + break; >> + >> + default: >> + break; >> + } >> +} >> + >> /* Expand an insert into a vector register through pinsr insn. >> Return true if successful. */ >> >> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md >> index c3f6c94..bd90f2d 100644 >> --- a/gcc/config/i386/sse.md >> +++ b/gcc/config/i386/sse.md >> @@ -8721,7 +8721,7 @@ >> (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p >> (insn)")) >> (set_attr "mode" "DI")]) >> >> -(define_insn "abs<mode>2" >> +(define_insn "*abs<mode>2" >> [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v") >> (abs:VI124_AVX2_48_AVX512F >> (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))] >> @@ -8733,6 +8733,20 @@ >> (set_attr "prefix" "maybe_vex") >> (set_attr "mode" "<sseinsnmode>")]) >> >> +(define_expand "abs<mode>2" >> + [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand") >> + (abs:VI124_AVX2_48_AVX512F >> + (match_operand:VI124_AVX2_48_AVX512F 1 "register_operand")))] >> + "TARGET_SSE2" >> +{ >> + if (TARGET_SSE2 && !TARGET_SSSE3) >> + ix86_expand_sse2_absvxsi2 (operands[0], operands[1]); >> + else if (TARGET_SSSE3) >> + emit_insn (gen_rtx_SET (VOIDmode, operands[0], >> + gen_rtx_ABS (<MODE>mode, operands[1]))); >> + DONE; >> +}) >> + >> (define_insn "abs<mode>2" >> [(set (match_operand:MMXMODEI 0 "register_operand" "=y") >> (abs:MMXMODEI >> diff --git a/gcc/convert.c b/gcc/convert.c >> index b07f0ef..8c60038 100644 >> --- a/gcc/convert.c >> +++ b/gcc/convert.c >> @@ -798,6 +798,20 @@ convert_to_integer (tree type, tree expr) >> ? TREE_OPERAND (expr, 2) >> : convert (type, TREE_OPERAND (expr, 2))); >> >> + case ABS_EXPR: >> + { >> + /* Convert (char) abs ((int) char_val) into abs (char_val). >> + Convert (short) abs ((int) short_val) into abs (short_val). */ >> + tree op = TREE_OPERAND (expr, 0); >> + if (optimize && CONVERT_EXPR_P (op)) >> + { >> + tree op2 = TREE_OPERAND (op, 0); >> + if (TREE_TYPE (op2) == type) >> + return build1 (TREE_CODE (expr), type, op2); >> + } >> + break; >> + } >> + >> default: >> break; >> } >> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog >> index 075d071..cf5b942 100644 >> --- a/gcc/testsuite/ChangeLog >> +++ b/gcc/testsuite/ChangeLog >> @@ -1,3 +1,8 @@ >> +2013-10-22 Cong Hou <co...@google.com> >> + >> + PR target/58762 >> + * gcc.dg/vect/pr58762.c: New test. >> + >> 2013-10-14 Tobias Burnus <bur...@net-b.de> >> >> PR fortran/58658 >> diff --git a/gcc/testsuite/gcc.dg/vect/pr58762.c >> b/gcc/testsuite/gcc.dg/vect/pr58762.c >> new file mode 100644 >> index 0000000..4e62de6 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr58762.c >> @@ -0,0 +1,27 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -ftree-vectorize" } */ >> + >> +void test1 (char* a, char* b) >> +{ >> + int i; >> + for (i = 0; i < 10000; ++i) >> + a[i] = abs (b[i]); >> +} >> + >> +void test2 (short* a, short* b) >> +{ >> + int i; >> + for (i = 0; i < 10000; ++i) >> + a[i] = abs (b[i]); >> +} >> + >> +void test3 (int* a, int* b) >> +{ >> + int i; >> + for (i = 0; i < 10000; ++i) >> + a[i] = abs (b[i]); >> +} >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" >> + { target i?86-*-* x86_64-*-* ia64-*-* } } } */ >> +/* { dg-final { cleanup-tree-dump "vect" } } */ >> <patch-abs.txt>