This patch aims at PR58762. Currently GCC could not vectorize abs() operation for integers on x86 with only SSE2 support. For int type, the reason is that the expand on abs() is not defined for vector type. This patch defines such an expand so that abs(int) will be vectorized with only SSE2.
For abs(char/short), type conversions are needed as the current abs() function/operation does not accept argument of char/short type. Therefore when we want to get the absolute value of a char_val using abs (char_val), it will be converted into abs ((int) char_val). It then can be vectorized, but the generated code is not efficient as lots of packings and unpackings are envolved. But if we convert (char) abs ((int) char_val) to abs (char_val), the vectorizer will be able to generate better code. Same for short. This conversion also enables vectorizing abs(char/short) operation with PABSB and PABSW instructions in SSE3. With only SSE2 support, I developed three methods to expand abs(char/short/int) seperately: 1. For 32 bit int value x, we can get abs (x) from (((signed) x >> (W-1)) ^ x) - ((signed) x >> (W-1)). This is better than max (x, -x), which needs bit masking. 2. For 16 bit int value x, we can get abs (x) from max (x, -x), as SSE2 provides PMAXSW instruction. 3. For 8 bit int value x, we can get abs (x) from min ((unsigned char) x, (unsigned char) (-x)), as SSE2 provides PMINUB instruction. The patch is pasted below. Please point out any problem in my patch and analysis. thanks, Cong diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8a38316..e0f33ee 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2013-10-22 Cong Hou <co...@google.com> + + PR target/58762 + * convert.c (convert_to_integer): Convert (char) abs ((int) char_val) + into abs (char_val). Also convert (short) abs ((int) short_val) + into abs (short_val). + * config/i386/i386-protos.h (ix86_expand_sse2_absvxsi2): New function. + * config/i386/i386.c (ix86_expand_sse2_absvxsi2): New function. + * config/i386/sse.md: Add SSE2 support to abs (char/int/short). + 2013-10-14 David Malcolm <dmalc...@redhat.com> * dumpfile.h (gcc::dump_manager): New class, to hold state diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 3ab2f3a..e85f663 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -238,6 +238,7 @@ extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool); extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool); extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx); extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); +extern void ix86_expand_sse2_absvxsi2 (rtx, rtx); /* In i386-c.c */ extern void ix86_target_macros (void); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 02cbbbd..8050e02 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -41696,6 +41696,53 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) gen_rtx_MULT (mode, op1, op2)); } +void +ix86_expand_sse2_absvxsi2 (rtx op0, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + rtx tmp0, tmp1; + + switch (mode) + { + /* For 32-bit signed integer X, the best way to calculate the absolute + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ + case V4SImode: + tmp0 = expand_simple_binop (mode, ASHIFTRT, op1, + GEN_INT (GET_MODE_BITSIZE + (GET_MODE_INNER (mode)) - 1), + NULL, 0, OPTAB_DIRECT); + if (tmp0) + tmp1 = expand_simple_binop (mode, XOR, op1, tmp0, + NULL, 0, OPTAB_DIRECT); + if (tmp0 && tmp1) + expand_simple_binop (mode, MINUS, tmp1, tmp0, + op0, 0, OPTAB_DIRECT); + break; + + /* For 16-bit signed integer X, the best way to calculate the absolute + value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ + case V8HImode: + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); + if (tmp0) + expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0, + OPTAB_DIRECT); + break; + + /* For 8-bit signed integer X, the best way to calculate the absolute + value of X is min ((unsigned char) X, (unsigned char) (-X)), + as SSE2 provides the PMINUB insn. */ + case V16QImode: + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); + if (tmp0) + expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0, + OPTAB_DIRECT); + break; + + default: + break; + } +} + /* Expand an insert into a vector register through pinsr insn. Return true if successful. */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index c3f6c94..bd90f2d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -8721,7 +8721,7 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "abs<mode>2" +(define_insn "*abs<mode>2" [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v") (abs:VI124_AVX2_48_AVX512F (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))] @@ -8733,6 +8733,20 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "abs<mode>2" + [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand") + (abs:VI124_AVX2_48_AVX512F + (match_operand:VI124_AVX2_48_AVX512F 1 "register_operand")))] + "TARGET_SSE2" +{ + if (TARGET_SSE2 && !TARGET_SSSE3) + ix86_expand_sse2_absvxsi2 (operands[0], operands[1]); + else if (TARGET_SSSE3) + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_ABS (<MODE>mode, operands[1]))); + DONE; +}) + (define_insn "abs<mode>2" [(set (match_operand:MMXMODEI 0 "register_operand" "=y") (abs:MMXMODEI diff --git a/gcc/convert.c b/gcc/convert.c index b07f0ef..8c60038 100644 --- a/gcc/convert.c +++ b/gcc/convert.c @@ -798,6 +798,20 @@ convert_to_integer (tree type, tree expr) ? TREE_OPERAND (expr, 2) : convert (type, TREE_OPERAND (expr, 2))); + case ABS_EXPR: + { + /* Convert (char) abs ((int) char_val) into abs (char_val). + Convert (short) abs ((int) short_val) into abs (short_val). */ + tree op = TREE_OPERAND (expr, 0); + if (optimize && CONVERT_EXPR_P (op)) + { + tree op2 = TREE_OPERAND (op, 0); + if (TREE_TYPE (op2) == type) + return build1 (TREE_CODE (expr), type, op2); + } + break; + } + default: break; } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 075d071..cf5b942 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-10-22 Cong Hou <co...@google.com> + + PR target/58762 + * gcc.dg/vect/pr58762.c: New test. + 2013-10-14 Tobias Burnus <bur...@net-b.de> PR fortran/58658 diff --git a/gcc/testsuite/gcc.dg/vect/pr58762.c b/gcc/testsuite/gcc.dg/vect/pr58762.c new file mode 100644 index 0000000..4e62de6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr58762.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +void test1 (char* a, char* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +void test2 (short* a, short* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +void test3 (int* a, int* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" + { target i?86-*-* x86_64-*-* ia64-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8a38316..e0f33ee 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2013-10-22 Cong Hou <co...@google.com> + + PR target/58762 + * convert.c (convert_to_integer): Convert (char) abs ((int) char_val) + into abs (char_val). Also convert (short) abs ((int) short_val) + into abs (short_val). + * config/i386/i386-protos.h (ix86_expand_sse2_absvxsi2): New function. + * config/i386/i386.c (ix86_expand_sse2_absvxsi2): New function. + * config/i386/sse.md: Add SSE2 support to abs (char/int/short). + 2013-10-14 David Malcolm <dmalc...@redhat.com> * dumpfile.h (gcc::dump_manager): New class, to hold state diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 3ab2f3a..e85f663 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -238,6 +238,7 @@ extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool); extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool); extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx); extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); +extern void ix86_expand_sse2_absvxsi2 (rtx, rtx); /* In i386-c.c */ extern void ix86_target_macros (void); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 02cbbbd..8050e02 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -41696,6 +41696,53 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) gen_rtx_MULT (mode, op1, op2)); } +void +ix86_expand_sse2_absvxsi2 (rtx op0, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + rtx tmp0, tmp1; + + switch (mode) + { + /* For 32-bit signed integer X, the best way to calculate the absolute + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ + case V4SImode: + tmp0 = expand_simple_binop (mode, ASHIFTRT, op1, + GEN_INT (GET_MODE_BITSIZE + (GET_MODE_INNER (mode)) - 1), + NULL, 0, OPTAB_DIRECT); + if (tmp0) + tmp1 = expand_simple_binop (mode, XOR, op1, tmp0, + NULL, 0, OPTAB_DIRECT); + if (tmp0 && tmp1) + expand_simple_binop (mode, MINUS, tmp1, tmp0, + op0, 0, OPTAB_DIRECT); + break; + + /* For 16-bit signed integer X, the best way to calculate the absolute + value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ + case V8HImode: + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); + if (tmp0) + expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0, + OPTAB_DIRECT); + break; + + /* For 8-bit signed integer X, the best way to calculate the absolute + value of X is min ((unsigned char) X, (unsigned char) (-X)), + as SSE2 provides the PMINUB insn. */ + case V16QImode: + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); + if (tmp0) + expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0, + OPTAB_DIRECT); + break; + + default: + break; + } +} + /* Expand an insert into a vector register through pinsr insn. Return true if successful. */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index c3f6c94..bd90f2d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -8721,7 +8721,7 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "abs<mode>2" +(define_insn "*abs<mode>2" [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v") (abs:VI124_AVX2_48_AVX512F (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))] @@ -8733,6 +8733,20 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "abs<mode>2" + [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand") + (abs:VI124_AVX2_48_AVX512F + (match_operand:VI124_AVX2_48_AVX512F 1 "register_operand")))] + "TARGET_SSE2" +{ + if (TARGET_SSE2 && !TARGET_SSSE3) + ix86_expand_sse2_absvxsi2 (operands[0], operands[1]); + else if (TARGET_SSSE3) + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_ABS (<MODE>mode, operands[1]))); + DONE; +}) + (define_insn "abs<mode>2" [(set (match_operand:MMXMODEI 0 "register_operand" "=y") (abs:MMXMODEI diff --git a/gcc/convert.c b/gcc/convert.c index b07f0ef..8c60038 100644 --- a/gcc/convert.c +++ b/gcc/convert.c @@ -798,6 +798,20 @@ convert_to_integer (tree type, tree expr) ? TREE_OPERAND (expr, 2) : convert (type, TREE_OPERAND (expr, 2))); + case ABS_EXPR: + { + /* Convert (char) abs ((int) char_val) into abs (char_val). + Convert (short) abs ((int) short_val) into abs (short_val). */ + tree op = TREE_OPERAND (expr, 0); + if (optimize && CONVERT_EXPR_P (op)) + { + tree op2 = TREE_OPERAND (op, 0); + if (TREE_TYPE (op2) == type) + return build1 (TREE_CODE (expr), type, op2); + } + break; + } + default: break; } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 075d071..cf5b942 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-10-22 Cong Hou <co...@google.com> + + PR target/58762 + * gcc.dg/vect/pr58762.c: New test. + 2013-10-14 Tobias Burnus <bur...@net-b.de> PR fortran/58658 diff --git a/gcc/testsuite/gcc.dg/vect/pr58762.c b/gcc/testsuite/gcc.dg/vect/pr58762.c new file mode 100644 index 0000000..4e62de6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr58762.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +void test1 (char* a, char* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +void test2 (short* a, short* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +void test3 (int* a, int* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" + { target i?86-*-* x86_64-*-* ia64-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */