The easiest way to motivate these additions to match.pd is with the
following example:
unsigned int foo(unsigned char i) {
return i | (i<<8) | (i<<16) | (i<<24);
}
which mainline with -O2 on x86_64 currently generates:
foo: movzbl %dil, %edi
movl %edi, %eax
movl %edi, %edx
sall $8, %eax
sall $16, %edx
orl %edx, %eax
orl %edi, %eax
sall $24, %edi
orl %edi, %eax
ret
but with this patch now becomes:
foo: movzbl %dil, %eax
imull $16843009, %eax, %eax
ret
Interestingly, this transformation is already applied when using
addition, allowing synth_mult to select an optimal sequence, but
not when using the equivalent bit-wise ior or xor operators.
The solution is to use tree_nonzero_bits to check that the
potentially non-zero bits of each operand don't overlap, which
ensures that BIT_IOR_EXPR and BIT_XOR_EXPR produce the same
results as PLUS_EXPR, which effectively generalizes the old
fold_plusminus_mult_expr. Technically, the transformation
is to canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to
X*(C1+C2) where X and X<<C are considered special cases.
I have a related follow-up patch, but these are all of the
match.pd changes. The one aspect that's a little odd is
that each transform is paired with a convert@1 variant,
using the efficient match machinery to expose any zero
extension to fold-const.c's tree_nonzero_bits functionality.
The following patch has been tested on x86_64-pc-linux-gnu
with a "make bootstrap" and "make -k check" with no new
failures.
Ok for mainline?
2021-07-26 Roger Sayle <[email protected]>
gcc/ChangeLog
* match.pd (bit_ior, bit_xor): Canonicalize (X*C1)|(X*C2) and
(X*C1)^(X*C2) as X*(C1+C2), and related variants, using
tree_nonzero_bits to ensure that operands are bit-wise disjoint.
gcc/testsuite/ChangeLog
* gcc.dg/fold-ior-4.c: New test.
Roger
--
Roger Sayle
NextMove Software
Cambridge, UK
diff --git a/gcc/match.pd b/gcc/match.pd
index beb8d27..0347bea 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2833,6 +2833,113 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(convert (mult (convert:t @0) { cst; })))))
#endif
+/* Canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to (C1+C2)*X when
+ tree_nonzero_bits allows IOR and XOR to be treated like PLUS.
+ Likewise, handle (X<<C3) and X as legitimate variants of X*C. */
+(for op (bit_ior bit_xor)
+ (simplify
+ (op (mult:s@0 @1 INTEGER_CST@2)
+ (mult:s@3 @1 INTEGER_CST@4))
+ (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+ (mult @1
+ { wide_int_to_tree (type, wi::to_wide (@2) + wi::to_wide (@4)); })))
+ (simplify
+ (op (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+ (mult:s@4 (convert@1 @2) INTEGER_CST@5))
+ (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+ (mult @1
+ { wide_int_to_tree (type, wi::to_wide (@3) + wi::to_wide (@5)); })))
+ (simplify
+ (op:c (mult:s@0 @1 INTEGER_CST@2)
+ (lshift:s@3 @1 INTEGER_CST@4))
+ (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+ && tree_int_cst_sgn (@4) > 0
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+ (with { wide_int wone = wi::one (TYPE_PRECISION (type));
+ wide_int c = wi::add (wi::to_wide (@2),
+ wi::lshift (wone, wi::to_wide (@4))); }
+ (mult @1 { wide_int_to_tree (type, c); }))))
+ (simplify
+ (op:c (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+ (lshift:s@4 (convert@1 @2) INTEGER_CST@5))
+ (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+ && tree_int_cst_sgn (@5) > 0
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+ (with { wide_int wone = wi::one (TYPE_PRECISION (type));
+ wide_int c = wi::add (wi::to_wide (@3),
+ wi::lshift (wone, wi::to_wide (@5))); }
+ (mult @1 { wide_int_to_tree (type, c); }))))
+ (simplify
+ (op:c (mult:s@0 @1 INTEGER_CST@2)
+ @1)
+ (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+ (mult @1
+ { wide_int_to_tree (type,
+ wi::add (wi::to_wide (@2), 1)); })))
+ (simplify
+ (op:c (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+ (convert@1 @2))
+ (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+ (mult @1
+ { wide_int_to_tree (type,
+ wi::add (wi::to_wide (@3), 1)); })))
+ (simplify
+ (op (lshift:s@0 @1 INTEGER_CST@2)
+ (lshift:s@3 @1 INTEGER_CST@4))
+ (if (INTEGRAL_TYPE_P (type)
+ && tree_int_cst_sgn (@2) > 0
+ && tree_int_cst_sgn (@4) > 0
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+ (with { tree t = type;
+ if (!TYPE_OVERFLOW_WRAPS (t))
+ t = unsigned_type_for (t);
+ wide_int wone = wi::one (TYPE_PRECISION (t));
+ wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)),
+ wi::lshift (wone, wi::to_wide (@4))); }
+ (convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); })))))
+ (simplify
+ (op (lshift:s@0 (convert@1 @2) INTEGER_CST@3)
+ (lshift:s@4 (convert@1 @2) INTEGER_CST@5))
+ (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+ && tree_int_cst_sgn (@3) > 0
+ && tree_int_cst_sgn (@5) > 0
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+ (with { tree t = type;
+ if (!TYPE_OVERFLOW_WRAPS (t))
+ t = unsigned_type_for (t);
+ wide_int wone = wi::one (TYPE_PRECISION (t));
+ wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@3)),
+ wi::lshift (wone, wi::to_wide (@5))); }
+ (convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); })))))
+ (simplify
+ (op:c (lshift:s@0 @1 INTEGER_CST@2)
+ @1)
+ (if (INTEGRAL_TYPE_P (type)
+ && tree_int_cst_sgn (@2) > 0
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+ (with { tree t = type;
+ if (!TYPE_OVERFLOW_WRAPS (t))
+ t = unsigned_type_for (t);
+ wide_int wone = wi::one (TYPE_PRECISION (t));
+ wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)), wone); }
+ (convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); })))))
+ (simplify
+ (op:c (lshift:s@0 (convert@1 @2) INTEGER_CST@3)
+ (convert@1 @2))
+ (if (INTEGRAL_TYPE_P (type)
+ && tree_int_cst_sgn (@3) > 0
+ && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+ (with { tree t = type;
+ if (!TYPE_OVERFLOW_WRAPS (t))
+ t = unsigned_type_for (t);
+ wide_int wone = wi::one (TYPE_PRECISION (t));
+ wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@3)), wone); }
+ (convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); }))))))
+
/* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */
(for minmax (min max FMIN_ALL FMAX_ALL)
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-optimized" } */
unsigned int test_ior(unsigned char i)
{
return i | (i<<8) | (i<<16) | (i<<24);
}
unsigned int test_xor(unsigned char i)
{
return i ^ (i<<8) ^ (i<<16) ^ (i<<24);
}
unsigned int test_ior_1s(unsigned char i)
{
return i | (i<<8);
}
unsigned int test_ior_1u(unsigned char i)
{
unsigned int t = i;
return t | (t<<8);
}
unsigned int test_xor_1s(unsigned char i)
{
return i ^ (i<<8);
}
unsigned int test_xor_1u(unsigned char i)
{
unsigned int t = i;
return t ^ (t<<8);
}
unsigned int test_ior_2s(unsigned char i)
{
return (i<<8) | (i<<16);
}
unsigned int test_ior_2u(unsigned char i)
{
unsigned int t = i;
return (t<<8) | (t<<16);
}
unsigned int test_xor_2s(unsigned char i)
{
return (i<<8) ^ (i<<16);
}
unsigned int test_xor_2u(unsigned char i)
{
unsigned int t = i;
return (t<<8) ^ (t<<16);
}
/* { dg-final { scan-tree-dump-not " \\^ " "optimized" } } */
/* { dg-final { scan-tree-dump-not " \\| " "optimized" } } */
/* { dg-final { scan-tree-dump-times " \\* 16843009" 2 "optimized" } } */