Hi All, This turns an inversion of the sign bit + arithmetic right shift into a comparison with 0.
i.e. void fun1(int32_t *x, int n) { for (int i = 0; i < (n & -16); i++) x[i] = (-x[i]) >> 31; } now generates: .L3: ldr q0, [x0] cmgt v0.4s, v0.4s, #0 str q0, [x0], 16 cmp x0, x1 bne .L3 instead of: .L3: ldr q0, [x0] neg v0.4s, v0.4s sshr v0.4s, v0.4s, 31 str q0, [x0], 16 cmp x0, x1 bne .L3 Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu and no regressions. Ok for master? Thanks, Tamar gcc/ChangeLog: * match.pd: New negate+shift pattern. gcc/testsuite/ChangeLog: * gcc.dg/signbit-2.c: New test. * gcc.dg/signbit-3.c: New test. * gcc.target/aarch64/signbit-1.c: New test. --- inline copy of patch -- diff --git a/gcc/match.pd b/gcc/match.pd index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..581436fe36dbacdcb0c2720b7190c96d14398143 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -826,6 +826,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) { tree utype = unsigned_type_for (type); } (convert (rshift (lshift (convert:utype @0) @2) @3)))))) +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1. */ +(for cst (INTEGER_CST VECTOR_CST) + (simplify + (rshift (negate:s @0) cst@1) + (with { tree ctype = TREE_TYPE (@0); + tree stype = TREE_TYPE (@1); + tree bt = truth_type_for (ctype); } + (switch + /* Handle scalar case. */ + (if (INTEGRAL_TYPE_P (ctype) + && !VECTOR_TYPE_P (ctype) + && !TYPE_UNSIGNED (ctype) + && canonicalize_math_after_vectorization_p () + && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1)) + (convert:bt (gt:bt @0 { build_zero_cst (stype); }))) + /* Handle vector case with a scalar immediate. */ + (if (VECTOR_INTEGER_TYPE_P (ctype) + && !VECTOR_TYPE_P (stype) + && !TYPE_UNSIGNED (ctype) + && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1)) + (convert:bt (gt:bt @0 { build_zero_cst (ctype); }))) + /* Handle vector case with a vector immediate. */ + (if (VECTOR_INTEGER_TYPE_P (ctype) + && VECTOR_TYPE_P (stype) + && !TYPE_UNSIGNED (ctype) + && uniform_vector_p (@1)) + (with { tree cst = vector_cst_elt (@1, 0); + tree t = TREE_TYPE (cst); } + (if (wi::eq_p (wi::to_wide (cst), TYPE_PRECISION (t) - 1)) + (convert:bt (gt:bt @0 { build_zero_cst (ctype); }))))))))) + /* Fold (C1/X)*C2 into (C1*C2)/X. */ (simplify (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2) diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c new file mode 100644 index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb --- /dev/null +++ b/gcc/testsuite/gcc.dg/signbit-2.c @@ -0,0 +1,19 @@ +/* { dg-do assemble } */ +/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */ + +#include <stdint.h> + +void fun1(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 31; +} + +void fun2(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 30; +} + +/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */ +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */ diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c new file mode 100644 index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/signbit-3.c @@ -0,0 +1,13 @@ +/* { dg-do assemble } */ +/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */ + +#include <stdint.h> + +void fun1(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 31; +} + +/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */ +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c new file mode 100644 index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c @@ -0,0 +1,18 @@ +/* { dg-do assemble } */ +/* { dg-options "-O3 --save-temps" } */ + +#include <stdint.h> + +void fun1(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 31; +} + +void fun2(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 30; +} + +/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */ --
diff --git a/gcc/match.pd b/gcc/match.pd index 7d2a24dbc5e9644a09968f877e12a824d8ba1caa..581436fe36dbacdcb0c2720b7190c96d14398143 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -826,6 +826,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) { tree utype = unsigned_type_for (type); } (convert (rshift (lshift (convert:utype @0) @2) @3)))))) +/* Fold (-x >> C) into x > 0 where C = precision(type) - 1. */ +(for cst (INTEGER_CST VECTOR_CST) + (simplify + (rshift (negate:s @0) cst@1) + (with { tree ctype = TREE_TYPE (@0); + tree stype = TREE_TYPE (@1); + tree bt = truth_type_for (ctype); } + (switch + /* Handle scalar case. */ + (if (INTEGRAL_TYPE_P (ctype) + && !VECTOR_TYPE_P (ctype) + && !TYPE_UNSIGNED (ctype) + && canonicalize_math_after_vectorization_p () + && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1)) + (convert:bt (gt:bt @0 { build_zero_cst (stype); }))) + /* Handle vector case with a scalar immediate. */ + (if (VECTOR_INTEGER_TYPE_P (ctype) + && !VECTOR_TYPE_P (stype) + && !TYPE_UNSIGNED (ctype) + && wi::eq_p (wi::to_wide (@1), TYPE_PRECISION (stype) - 1)) + (convert:bt (gt:bt @0 { build_zero_cst (ctype); }))) + /* Handle vector case with a vector immediate. */ + (if (VECTOR_INTEGER_TYPE_P (ctype) + && VECTOR_TYPE_P (stype) + && !TYPE_UNSIGNED (ctype) + && uniform_vector_p (@1)) + (with { tree cst = vector_cst_elt (@1, 0); + tree t = TREE_TYPE (cst); } + (if (wi::eq_p (wi::to_wide (cst), TYPE_PRECISION (t) - 1)) + (convert:bt (gt:bt @0 { build_zero_cst (ctype); }))))))))) + /* Fold (C1/X)*C2 into (C1*C2)/X. */ (simplify (mult (rdiv@3 REAL_CST@0 @1) REAL_CST@2) diff --git a/gcc/testsuite/gcc.dg/signbit-2.c b/gcc/testsuite/gcc.dg/signbit-2.c new file mode 100644 index 0000000000000000000000000000000000000000..fc0157cbc5c7996b481f2998bc30176c96a669bb --- /dev/null +++ b/gcc/testsuite/gcc.dg/signbit-2.c @@ -0,0 +1,19 @@ +/* { dg-do assemble } */ +/* { dg-options "-O3 --save-temps -fdump-tree-optimized" } */ + +#include <stdint.h> + +void fun1(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 31; +} + +void fun2(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 30; +} + +/* { dg-final { scan-tree-dump-times {\s+>\s+\{ 0, 0, 0, 0 \}} 1 optimized } } */ +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */ diff --git a/gcc/testsuite/gcc.dg/signbit-3.c b/gcc/testsuite/gcc.dg/signbit-3.c new file mode 100644 index 0000000000000000000000000000000000000000..19e9c06c349b3287610f817628f00938ece60bf7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/signbit-3.c @@ -0,0 +1,13 @@ +/* { dg-do assemble } */ +/* { dg-options "-O1 --save-temps -fdump-tree-optimized" } */ + +#include <stdint.h> + +void fun1(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 31; +} + +/* { dg-final { scan-tree-dump-times {\s+>\s+0;} 1 optimized } } */ +/* { dg-final { scan-tree-dump-not {\s+>>\s+31} optimized } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/signbit-1.c b/gcc/testsuite/gcc.target/aarch64/signbit-1.c new file mode 100644 index 0000000000000000000000000000000000000000..3ebfb0586f37de29cf58635b27fe48503714447e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/signbit-1.c @@ -0,0 +1,18 @@ +/* { dg-do assemble } */ +/* { dg-options "-O3 --save-temps" } */ + +#include <stdint.h> + +void fun1(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 31; +} + +void fun2(int32_t *x, int n) +{ + for (int i = 0; i < (n & -16); i++) + x[i] = (-x[i]) >> 30; +} + +/* { dg-final { scan-assembler-times {\tcmgt\t} 1 } } */