The easiest way to motivate these additions to match.pd is with the following example:
unsigned int foo(unsigned char i)
{
  return i | (i<<8) | (i<<16) | (i<<24);
}

which mainline with -O2 on x86_64 currently generates:

foo:	movzbl	%dil, %edi
	movl	%edi, %eax
	movl	%edi, %edx
	sall	$8, %eax
	sall	$16, %edx
	orl	%edx, %eax
	orl	%edi, %eax
	sall	$24, %edi
	orl	%edi, %eax
	ret

but with this patch now becomes:

foo:	movzbl	%dil, %eax
	imull	$16843009, %eax, %eax
	ret

Interestingly, this transformation is already applied when using
addition, allowing synth_mult to select an optimal multiplication
sequence, but not when using the equivalent bit-wise ior or xor
operators.  The solution is to use tree_nonzero_bits to check that
the potentially non-zero bits of each operand don't overlap, which
guarantees that BIT_IOR_EXPR and BIT_XOR_EXPR produce the same
results as PLUS_EXPR; this effectively generalizes the old
fold_plusminus_mult_expr.  Technically, the transformation is to
canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to X*(C1+C2), where
X and X<<C are handled as the special cases X*1 and X*(1<<C)
respectively.

I have a related follow-up patch, but these are all of the match.pd
changes.  The one aspect that's a little odd is that each transform
is paired with a convert@1 variant, using the efficient match
machinery to expose any zero extension to fold-const.c's
tree_nonzero_bits functionality.

The following patch has been tested on x86_64-pc-linux-gnu with a
"make bootstrap" and "make -k check" with no new failures.
Ok for mainline?

2021-07-26  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
	* match.pd (bit_ior, bit_xor): Canonicalize (X*C1)|(X*C2) and
	(X*C1)^(X*C2) as X*(C1+C2), and related variants, using
	tree_nonzero_bits to ensure that operands are bit-wise disjoint.

gcc/testsuite/ChangeLog
	* gcc.dg/fold-ior-4.c: New test.

Roger
--
Roger Sayle
NextMove Software
Cambridge, UK
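P.S. The invariant that the new transforms rely upon, that IOR and
XOR behave exactly like PLUS whenever the potentially non-zero bits
of the two operands don't overlap, can be sanity checked with the
stand-alone program below (my illustration only, not part of the
patch):

    #include <assert.h>

    int main(void)
    {
      for (unsigned int i = 0; i < 256; i++)
        {
          unsigned int lo = i;       /* non-zero bits confined to 0..7.  */
          unsigned int hi = i << 8;  /* non-zero bits confined to 8..15.  */
          assert ((lo & hi) == 0);        /* operands are bit-wise disjoint,  */
          assert ((lo | hi) == lo + hi);  /* so IOR behaves like PLUS,  */
          assert ((lo ^ hi) == lo + hi);  /* and so does XOR.  */
        }
      return 0;
    }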
diff --git a/gcc/match.pd b/gcc/match.pd
index beb8d27..0347bea 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2833,6 +2833,113 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     (convert (mult (convert:t @0) { cst; })))))
 #endif
 
+/* Canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to (C1+C2)*X when
+   tree_nonzero_bits allows IOR and XOR to be treated like PLUS.
+   Likewise, handle (X<<C3) and X as legitimate variants of X*C.  */
+(for op (bit_ior bit_xor)
+ (simplify
+  (op (mult:s@0 @1 INTEGER_CST@2)
+      (mult:s@3 @1 INTEGER_CST@4))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+   (mult @1
+         { wide_int_to_tree (type, wi::to_wide (@2) + wi::to_wide (@4)); })))
+ (simplify
+  (op (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+      (mult:s@4 (convert@1 @2) INTEGER_CST@5))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+   (mult @1
+         { wide_int_to_tree (type, wi::to_wide (@3) + wi::to_wide (@5)); })))
+ (simplify
+  (op:c (mult:s@0 @1 INTEGER_CST@2)
+        (lshift:s@3 @1 INTEGER_CST@4))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+       && tree_int_cst_sgn (@4) > 0
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+   (with { wide_int wone = wi::one (TYPE_PRECISION (type));
+           wide_int c = wi::add (wi::to_wide (@2),
+                                 wi::lshift (wone, wi::to_wide (@4))); }
+    (mult @1 { wide_int_to_tree (type, c); }))))
+ (simplify
+  (op:c (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+        (lshift:s@4 (convert@1 @2) INTEGER_CST@5))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+       && tree_int_cst_sgn (@5) > 0
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+   (with { wide_int wone = wi::one (TYPE_PRECISION (type));
+           wide_int c = wi::add (wi::to_wide (@3),
+                                 wi::lshift (wone, wi::to_wide (@5))); }
+    (mult @1 { wide_int_to_tree (type, c); }))))
+ (simplify
+  (op:c (mult:s@0 @1 INTEGER_CST@2)
+        @1)
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+   (mult @1
+         { wide_int_to_tree (type,
+                             wi::add (wi::to_wide (@2), 1)); })))
+ (simplify
+  (op:c (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+        (convert@1 @2))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+   (mult @1
+         { wide_int_to_tree (type,
+                             wi::add (wi::to_wide (@3), 1)); })))
+ (simplify
+  (op (lshift:s@0 @1 INTEGER_CST@2)
+      (lshift:s@3 @1 INTEGER_CST@4))
+  (if (INTEGRAL_TYPE_P (type)
+       && tree_int_cst_sgn (@2) > 0
+       && tree_int_cst_sgn (@4) > 0
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+   (with { tree t = type;
+           if (!TYPE_OVERFLOW_WRAPS (t))
+             t = unsigned_type_for (t);
+           wide_int wone = wi::one (TYPE_PRECISION (t));
+           wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)),
+                                 wi::lshift (wone, wi::to_wide (@4))); }
+    (convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); })))))
+ (simplify
+  (op (lshift:s@0 (convert@1 @2) INTEGER_CST@3)
+      (lshift:s@4 (convert@1 @2) INTEGER_CST@5))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+       && tree_int_cst_sgn (@3) > 0
+       && tree_int_cst_sgn (@5) > 0
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+   (with { tree t = type;
+           if (!TYPE_OVERFLOW_WRAPS (t))
+             t = unsigned_type_for (t);
+           wide_int wone = wi::one (TYPE_PRECISION (t));
+           wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@3)),
+                                 wi::lshift (wone, wi::to_wide (@5))); }
+    (convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); })))))
+ (simplify
+  (op:c (lshift:s@0 @1 INTEGER_CST@2)
+        @1)
+  (if (INTEGRAL_TYPE_P (type)
+       && tree_int_cst_sgn (@2) > 0
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+   (with { tree t = type;
+           if (!TYPE_OVERFLOW_WRAPS (t))
+             t = unsigned_type_for (t);
+           wide_int wone = wi::one (TYPE_PRECISION (t));
+           wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)), wone); }
+    (convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); })))))
+ (simplify
+  (op:c (lshift:s@0 (convert@1 @2) INTEGER_CST@3)
+        (convert@1 @2))
+  (if (INTEGRAL_TYPE_P (type)
+       && tree_int_cst_sgn (@3) > 0
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+   (with { tree t = type;
+           if (!TYPE_OVERFLOW_WRAPS (t))
+             t = unsigned_type_for (t);
+           wide_int wone = wi::one (TYPE_PRECISION (t));
+           wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@3)), wone); }
+    (convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); }))))))
+
 /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax().  */
 
 (for minmax (min max FMIN_ALL FMAX_ALL)
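[Aside, my worked example rather than part of the patch: treating X
as X*1 and X<<C as X*(1<<C), repeated application of the rules above
folds the motivating example down to a single multiplication:

    i | (i<<8) | (i<<16) | (i<<24)
  = i * (1 + (1<<8) + (1<<16) + (1<<24))
  = i * (1 + 256 + 65536 + 16777216)
  = i * 16843009

i.e. i * 0x01010101, which is exactly the imull constant seen in the
assembly above, and which synth_mult is then free to expand however
is cheapest for the target.]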
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-optimized" } */

unsigned int test_ior(unsigned char i)
{
  return i | (i<<8) | (i<<16) | (i<<24);
}

unsigned int test_xor(unsigned char i)
{
  return i ^ (i<<8) ^ (i<<16) ^ (i<<24);
}

unsigned int test_ior_1s(unsigned char i)
{
  return i | (i<<8);
}

unsigned int test_ior_1u(unsigned char i)
{
  unsigned int t = i;
  return t | (t<<8);
}

unsigned int test_xor_1s(unsigned char i)
{
  return i ^ (i<<8);
}

unsigned int test_xor_1u(unsigned char i)
{
  unsigned int t = i;
  return t ^ (t<<8);
}

unsigned int test_ior_2s(unsigned char i)
{
  return (i<<8) | (i<<16);
}

unsigned int test_ior_2u(unsigned char i)
{
  unsigned int t = i;
  return (t<<8) | (t<<16);
}

unsigned int test_xor_2s(unsigned char i)
{
  return (i<<8) ^ (i<<16);
}

unsigned int test_xor_2u(unsigned char i)
{
  unsigned int t = i;
  return (t<<8) ^ (t<<16);
}

/* { dg-final { scan-tree-dump-not " \\^ " "optimized" } } */
/* { dg-final { scan-tree-dump-not " \\| " "optimized" } } */
/* { dg-final { scan-tree-dump-times " \\* 16843009" 2 "optimized" } } */
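To try the new testcase on its own, the usual single-test invocation
should work from the gcc/ subdirectory of the build tree (adjust
paths to taste):

    make check-gcc RUNTESTFLAGS="dg.exp=fold-ior-4.c"

The two expected "* 16843009" matches come from test_ior and test_xor;
the remaining functions should fold to multiplications by 257 or 65792.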