On Thu, Nov 25, 2021 at 09:21:37AM +0100, Richard Biener wrote:
> OK if you can add a testcase that exercises this "feature".
Sure, that is easy.  Here is what I've committed.  f2 tests
the x | x = x handling in it, f3 tests x | y = unknown instead of
punting, f4 tests x ^ x = 0 and f5 tests x ^ y = unknown.
Without the patch only f2 is optimized to __builtin_bswap32; with the
patch all of them are.

2021-11-25  Jakub Jelinek  <ja...@redhat.com>

	PR tree-optimization/103376
	* gimple-ssa-store-merging.c (perform_symbolic_merge): For
	BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
	punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
	For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
	byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte
	to 0.

	* gcc.dg/optimize-bswapsi-7.c: New test.

--- gcc/gimple-ssa-store-merging.c.jj	2021-11-24 09:54:37.684365460 +0100
+++ gcc/gimple-ssa-store-merging.c	2021-11-24 11:18:54.422226266 +0100
@@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
   n->bytepos = n_start->bytepos;
   n->type = n_start->type;
   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
+  uint64_t res_n = n1->n | n2->n;
 
   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
     {
@@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
 
       masked1 = n1->n & mask;
       masked2 = n2->n & mask;
-      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
-	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
-      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
-	return NULL;
+      /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x.  */
+      if (masked1 && masked2)
+	{
+	  /* + can carry into upper bits, just punt.  */
+	  if (code == PLUS_EXPR)
+	    return NULL;
+	  /* x | x is still x.  */
+	  if (code == BIT_IOR_EXPR && masked1 == masked2)
+	    continue;
+	  if (code == BIT_XOR_EXPR)
+	    {
+	      /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
+		 unknown values and unknown ^ unknown is unknown.  */
+	      if (masked1 == masked2
+		  && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
+				 << i * BITS_PER_MARKER))
+		{
+		  res_n &= ~mask;
+		  continue;
+		}
+	    }
+	  /* Otherwise set the byte to unknown, it might still be
+	     later masked off.  */
+	  res_n |= mask;
+	}
     }
-  n->n = n1->n | n2->n;
+  n->n = res_n;
   n->n_ops = n1->n_ops + n2->n_ops;
 
   return source_stmt;
--- gcc/testsuite/gcc.dg/optimize-bswapsi-7.c.jj	2021-11-25 10:36:03.847529686 +0100
+++ gcc/testsuite/gcc.dg/optimize-bswapsi-7.c	2021-11-25 10:35:46.522778192 +0100
@@ -0,0 +1,37 @@
+/* PR tree-optimization/103376 */
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap } */
+/* { dg-options "-O2 -fno-tree-vectorize -fdump-tree-optimized" } */
+/* { dg-additional-options "-march=z900" { target s390-*-* } } */
+
+static unsigned int
+f1 (unsigned int x)
+{
+  return (x << 24) | (x >> 8);
+}
+
+unsigned int
+f2 (unsigned *p)
+{
+  return ((f1 (p[0]) | (p[0] >> 8)) & 0xff000000U) | (p[0] >> 24) | ((p[0] & 0xff00U) << 8) | ((p[0] & 0xff0000U) >> 8);
+}
+
+unsigned int
+f3 (unsigned *p)
+{
+  return ((f1 (p[0]) | (p[0] & 0x00ff00ffU)) & 0xff00ff00U) | (f1 (f1 (f1 (p[0]))) & 0x00ff00ffU);
+}
+
+unsigned int
+f4 (unsigned *p)
+{
+  return (f1 (p[0]) ^ (p[0] >> 8)) ^ (p[0] >> 24) ^ ((p[0] & 0xff00U) << 8) ^ ((p[0] & 0xff0000U) >> 8);
+}
+
+unsigned int
+f5 (unsigned *p)
+{
+  return (((f1 (p[0]) | (p[0] >> 16)) ^ (p[0] >> 8)) & 0xffff0000U) ^ (p[0] >> 24) ^ ((p[0] & 0xff00U) << 8) ^ ((p[0] & 0xff0000U) >> 8);
+}
+
+/* { dg-final { scan-tree-dump-times "= __builtin_bswap32 \\\(" 4 "optimized" } } */

	Jakub
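
For anyone who wants to experiment with the merge rule outside of GCC, below
is a hypothetical standalone sketch of the per-byte decision the patch makes.
Everything in it is an illustrative assumption rather than the pass's own
code; it only mirrors the idea that each byte of the symbolic number carries
a marker, where 0 stands for a known zero byte and MARKER_BYTE_UNKNOWN stands
for an unknown byte.

/* Hypothetical sketch, not the GCC sources: merge one per-byte marker pair
   the way the patched perform_symbolic_merge decides it.  */
#include <stdio.h>

#define MARKER_BYTE_UNKNOWN 0xff

enum op { OP_IOR, OP_XOR, OP_PLUS };

/* Return the merged marker for one byte, or -1 when the merge must punt.  */
static int
merge_marker (enum op code, unsigned m1, unsigned m2)
{
  if (m1 == 0 || m2 == 0)
    return m1 | m2;            /* 0 | x == 0 ^ x == 0 + x == x.  */
  if (code == OP_PLUS)
    return -1;                 /* + can carry into upper bytes, punt.  */
  if (code == OP_IOR && m1 == m2)
    return m1;                 /* x | x is still x.  */
  if (code == OP_XOR && m1 == m2 && m1 != MARKER_BYTE_UNKNOWN)
    return 0;                  /* x ^ x is 0; unknown ^ unknown stays unknown.  */
  return MARKER_BYTE_UNKNOWN;  /* Different bytes: unknown, maybe masked off later.  */
}

int
main (void)
{
  printf ("%02x\n", merge_marker (OP_IOR, 0x03, 0x02));  /* x | y -> ff */
  printf ("%02x\n", merge_marker (OP_XOR, 0x03, 0x03));  /* x ^ x -> 00 */
  printf ("%02x\n", merge_marker (OP_IOR, 0x03, 0x03));  /* x | x -> 03 */
  return 0;
}

That is also why f3 and f5 in the testcase mask the merged value afterwards:
the merge itself only marks the clobbered bytes unknown, matching the patch
comment about the unknown byte possibly being "later masked off".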