Hi! The following patch is first step towards fixing PR93582. vn_reference_lookup_3 right now punts on anything that isn't byte aligned, so to be able to lookup a constant bitfield store, one needs to use the exact same COMPONENT_REF, otherwise it isn't found.
This patch lifts up that that restriction if the bits to be loaded are covered by a single store of a constant (keeps the restriction so far for the multiple store case, can tweak that incrementally, but I think for bisection etc. it is worth to do it one step at a time). Bootstrapped/regtested on x86_64-linux, i686-linux, powerpc64le-linux and powerpc64-linux (the last one regtested with \{-m32,-m64\}). Additionally, I've bootstrapped/regtested it on those with statistics gathering on when val was non-NULL (i.e. we managed to see something through) when any of offseti, offset2i, maxsizei or size2i wasn't multiple of BITS_PER_UNIT (i.e. when this optimization triggered). Below is a list of the unique cases where it triggered, the first column says on which of the 5 ABIs it triggered, qmath stands for those that enable libquadmath, i.e. ia32,x86_64,ppc64le. Ok for trunk? all ../../gcc/gimple-ssa-backprop.c {anonymous}::backprop::process_var all gcc/gcc/testsuite/gcc.c-torture/compile/20191015-1.c f all gcc/gcc/testsuite/gcc.c-torture/compile/20191015-2.c f all gcc/gcc/testsuite/gcc.c-torture/compile/20200105-1.c g all gcc/gcc/testsuite/gcc.c-torture/compile/20200105-2.c g all gcc/gcc/testsuite/gcc.c-torture/compile/20200105-3.c g all gcc/gcc/testsuite/gcc.c-torture/execute/20181120-1.c main all gcc/gcc/testsuite/gcc.c-torture/execute/921204-1.c main all gcc/gcc/testsuite/gcc.c-torture/execute/pr58726.c main all gcc/gcc/testsuite/gcc.c-torture/execute/pr86492.c foo all gcc/gcc/testsuite/gcc.dg/store_merging_24.c foo all gcc/gcc/testsuite/gcc.dg/store_merging_25.c foo all gcc/gcc/testsuite/gcc.dg/tree-ssa/pr93582-1.c foo all gcc/gcc/testsuite/gcc.dg/tree-ssa/pr93582-2.c foo all gcc/gcc/testsuite/gcc.dg/tree-ssa/pr93582-3.c foo all gcc/gcc/testsuite/g++.dg/other/pr89692.C foo all ../../../libgcc/unwind-dw2-fde-dip.c init_object all ../../../libgcc/unwind-dw2-fde-dip.c __register_frame_info all ../../../libgcc/unwind-dw2-fde-dip.c __register_frame_info_bases all ../../../libgcc/unwind-dw2-fde-dip.c __register_frame_info_table all ../../../libgcc/unwind-dw2-fde-dip.c __register_frame_info_table_bases all ../../../libgcc/unwind-dw2-fde-dip.c __register_frame_table all ../../../libgcc/unwind-dw2-fde-dip.c search_object all ../../../libgcc/unwind-dw2-fde-dip.c _Unwind_IteratePhdrCallback ia32 /tmp/921204-1.exe.NdZMiZ.ltrans0.o main ia32 /tmp/ccK7HiiN.o main ia32 /tmp/ccWtcuID.o foo ia32 /tmp/pr86492.exe.IKs2SA.ltrans0.o foo lp64 gcc/gcc/testsuite/gnat.dg/pack22.adb pack22 ppc32 /tmp/921204-1.exe.1GoDpE.ltrans0.o main ppc32 /tmp/ccivx4sg.o foo ppc32 /tmp/ccLFNqjQ.o main ppc32 /tmp/pr86492.exe.VkJDwY.ltrans0.o foo ppc32 /tmp/pr88739.exe.y33n0X.ltrans0.o main ppc64le cd2a32a.adb cd2a32a ppc64le /tmp/921204-1.exe.la923O.ltrans0.o main ppc64le /tmp/ccH3NcwE.o main ppc64le /tmp/ccmHysWx.o foo ppc64le /tmp/pr86492.exe.EYd24l.ltrans0.o foo ppc64le /tmp/pr88739.exe.uAT6pA.ltrans0.o main ppc64 /tmp/921204-1.exe.vtoTSo.ltrans0.o main ppc64 /tmp/ccm4RGtK.o main ppc64 /tmp/cczZpRCD.o foo ppc64 /tmp/pr86492.exe.UdbN8I.ltrans0.o foo ppc64 /tmp/pr88739.exe.DtQe1H.ltrans0.o main qmath ../../../libquadmath/math/expq.c expq qmath ../../../libquadmath/math/fmaq.c fmaq qmath ../../../libquadmath/math/nanq.c nanq qmath ../../../libquadmath/strtod/strtoflt128.c ____strtoflt128_internal x86_64 cd2a32a.adb cd2a32a x86_64 /tmp/921204-1.exe.0059fB.ltrans0.o main x86_64 /tmp/cci9lHhD.o main x86_64 /tmp/ccTlWSbV.o foo x86_64 /tmp/pr86492.exe.NE0yez.ltrans0.o foo x86_64 /tmp/pr88739.exe.PKCG2M.ltrans0.o main x86 gcc/gcc/testsuite/gcc.dg/torture/pr45017.c main x86 gcc/gcc/testsuite/gcc.target/i386/pr37870.c main 2020-02-13 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/93582 * fold-const.h (shift_bytes_in_array_left, shift_bytes_in_array_right): Declare. * fold-const.c (shift_bytes_in_array_left, shift_bytes_in_array_right): New function, moved from gimple-ssa-store-merging.c, no longer static. * gimple-ssa-store-merging.c (shift_bytes_in_array): Move to gimple-ssa-store-merging.c and rename to shift_bytes_in_array_left. (shift_bytes_in_array_right): Move to gimple-ssa-store-merging.c. (encode_tree_to_bitpos): Use shift_bytes_in_array_left instead of shift_bytes_in_array. (verify_shift_bytes_in_array): Rename to ... (verify_shift_bytes_in_array_left): ... this. Use shift_bytes_in_array_left instead of shift_bytes_in_array. (store_merging_c_tests): Call verify_shift_bytes_in_array_left instead of verify_shift_bytes_in_array. * tree-ssa-sccvn.c (vn_reference_lookup_3): For native_encode_expr / native_interpret_expr where the store covers all needed bits, punt on PDP-endian, otherwise allow all involved offsets and sizes not to be byte-aligned. * gcc.dg/tree-ssa/pr93582-1.c: New test. * gcc.dg/tree-ssa/pr93582-2.c: New test. * gcc.dg/tree-ssa/pr93582-3.c: New test. --- gcc/fold-const.h.jj 2020-02-12 11:43:57.533684956 +0100 +++ gcc/fold-const.h 2020-02-12 15:19:38.737897631 +0100 @@ -30,6 +30,10 @@ extern int native_encode_initializer (tr int off = -1); extern tree native_interpret_expr (tree, const unsigned char *, int); extern bool can_native_interpret_type_p (tree); +extern void shift_bytes_in_array_left (unsigned char *, unsigned int, + unsigned int); +extern void shift_bytes_in_array_right (unsigned char *, unsigned int, + unsigned int); /* Fold constants as much as possible in an expression. Returns the simplified expression. --- gcc/fold-const.c.jj 2020-02-12 11:43:57.532684971 +0100 +++ gcc/fold-const.c 2020-02-12 15:19:38.740897586 +0100 @@ -8354,6 +8354,70 @@ can_native_interpret_type_p (tree type) } } +/* Routines for manipulation of native_encode_expr encoded data if the encoded + or extracted constant positions and/or sizes aren't byte aligned. */ + +/* Shift left the bytes in PTR of SZ elements by AMNT bits, carrying over the + bits between adjacent elements. AMNT should be within + [0, BITS_PER_UNIT). + Example, AMNT = 2: + 00011111|11100000 << 2 = 01111111|10000000 + PTR[1] | PTR[0] PTR[1] | PTR[0]. */ + +void +shift_bytes_in_array_left (unsigned char *ptr, unsigned int sz, + unsigned int amnt) +{ + if (amnt == 0) + return; + + unsigned char carry_over = 0U; + unsigned char carry_mask = (~0U) << (unsigned char) (BITS_PER_UNIT - amnt); + unsigned char clear_mask = (~0U) << amnt; + + for (unsigned int i = 0; i < sz; i++) + { + unsigned prev_carry_over = carry_over; + carry_over = (ptr[i] & carry_mask) >> (BITS_PER_UNIT - amnt); + + ptr[i] <<= amnt; + if (i != 0) + { + ptr[i] &= clear_mask; + ptr[i] |= prev_carry_over; + } + } +} + +/* Like shift_bytes_in_array_left but for big-endian. + Shift right the bytes in PTR of SZ elements by AMNT bits, carrying over the + bits between adjacent elements. AMNT should be within + [0, BITS_PER_UNIT). + Example, AMNT = 2: + 00011111|11100000 >> 2 = 00000111|11111000 + PTR[0] | PTR[1] PTR[0] | PTR[1]. */ + +void +shift_bytes_in_array_right (unsigned char *ptr, unsigned int sz, + unsigned int amnt) +{ + if (amnt == 0) + return; + + unsigned char carry_over = 0U; + unsigned char carry_mask = ~(~0U << amnt); + + for (unsigned int i = 0; i < sz; i++) + { + unsigned prev_carry_over = carry_over; + carry_over = ptr[i] & carry_mask; + + carry_over <<= (unsigned char) BITS_PER_UNIT - amnt; + ptr[i] >>= amnt; + ptr[i] |= prev_carry_over; + } +} + /* Try to view-convert VECTOR_CST EXPR to VECTOR_TYPE TYPE by operating directly on the VECTOR_CST encoding, in a way that works for variable- length vectors. Return the resulting VECTOR_CST on success or null --- gcc/gimple-ssa-store-merging.c.jj 2020-02-12 11:43:57.558684581 +0100 +++ gcc/gimple-ssa-store-merging.c 2020-02-12 15:19:38.742897557 +0100 @@ -1475,66 +1475,6 @@ dump_char_array (FILE *fd, unsigned char fprintf (fd, "\n"); } -/* Shift left the bytes in PTR of SZ elements by AMNT bits, carrying over the - bits between adjacent elements. AMNT should be within - [0, BITS_PER_UNIT). - Example, AMNT = 2: - 00011111|11100000 << 2 = 01111111|10000000 - PTR[1] | PTR[0] PTR[1] | PTR[0]. */ - -static void -shift_bytes_in_array (unsigned char *ptr, unsigned int sz, unsigned int amnt) -{ - if (amnt == 0) - return; - - unsigned char carry_over = 0U; - unsigned char carry_mask = (~0U) << (unsigned char) (BITS_PER_UNIT - amnt); - unsigned char clear_mask = (~0U) << amnt; - - for (unsigned int i = 0; i < sz; i++) - { - unsigned prev_carry_over = carry_over; - carry_over = (ptr[i] & carry_mask) >> (BITS_PER_UNIT - amnt); - - ptr[i] <<= amnt; - if (i != 0) - { - ptr[i] &= clear_mask; - ptr[i] |= prev_carry_over; - } - } -} - -/* Like shift_bytes_in_array but for big-endian. - Shift right the bytes in PTR of SZ elements by AMNT bits, carrying over the - bits between adjacent elements. AMNT should be within - [0, BITS_PER_UNIT). - Example, AMNT = 2: - 00011111|11100000 >> 2 = 00000111|11111000 - PTR[0] | PTR[1] PTR[0] | PTR[1]. */ - -static void -shift_bytes_in_array_right (unsigned char *ptr, unsigned int sz, - unsigned int amnt) -{ - if (amnt == 0) - return; - - unsigned char carry_over = 0U; - unsigned char carry_mask = ~(~0U << amnt); - - for (unsigned int i = 0; i < sz; i++) - { - unsigned prev_carry_over = carry_over; - carry_over = ptr[i] & carry_mask; - - carry_over <<= (unsigned char) BITS_PER_UNIT - amnt; - ptr[i] >>= amnt; - ptr[i] |= prev_carry_over; - } -} - /* Clear out LEN bits starting from bit START in the byte array PTR. This clears the bits to the *right* from START. START must be within [0, BITS_PER_UNIT) and counts starting from @@ -1793,7 +1733,7 @@ encode_tree_to_bitpos (tree expr, unsign /* Create the shifted version of EXPR. */ if (!BYTES_BIG_ENDIAN) { - shift_bytes_in_array (tmpbuf, byte_size, shift_amnt); + shift_bytes_in_array_left (tmpbuf, byte_size, shift_amnt); if (shift_amnt == 0) byte_size--; } @@ -5092,11 +5032,11 @@ verify_array_eq (unsigned char *x, unsig } } -/* Test shift_bytes_in_array and that it carries bits across between +/* Test shift_bytes_in_array_left and that it carries bits across between bytes correctly. */ static void -verify_shift_bytes_in_array (void) +verify_shift_bytes_in_array_left (void) { /* byte 1 | byte 0 00011111 | 11100000. */ @@ -5105,13 +5045,13 @@ verify_shift_bytes_in_array (void) memcpy (in, orig, sizeof orig); unsigned char expected[2] = { 0x80, 0x7f }; - shift_bytes_in_array (in, sizeof (in), 2); + shift_bytes_in_array_left (in, sizeof (in), 2); verify_array_eq (in, expected, sizeof (in)); memcpy (in, orig, sizeof orig); memcpy (expected, orig, sizeof orig); /* Check that shifting by zero doesn't change anything. */ - shift_bytes_in_array (in, sizeof (in), 0); + shift_bytes_in_array_left (in, sizeof (in), 0); verify_array_eq (in, expected, sizeof (in)); } @@ -5196,7 +5136,7 @@ verify_clear_bit_region_be (void) void store_merging_c_tests (void) { - verify_shift_bytes_in_array (); + verify_shift_bytes_in_array_left (); verify_shift_bytes_in_array_right (); verify_clear_bit_region (); verify_clear_bit_region_be (); --- gcc/tree-ssa-sccvn.c.jj 2020-02-12 11:43:57.604683889 +0100 +++ gcc/tree-ssa-sccvn.c 2020-02-12 18:35:14.311359233 +0100 @@ -2586,13 +2586,13 @@ vn_reference_lookup_3 (ao_ref *ref, tree && is_gimple_reg_type (vr->type) && !contains_storage_order_barrier_p (vr->operands) && gimple_assign_single_p (def_stmt) - && CHAR_BIT == 8 && BITS_PER_UNIT == 8 + && CHAR_BIT == 8 + && BITS_PER_UNIT == 8 + && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN /* native_encode and native_decode operate on arrays of bytes and so fundamentally need a compile-time size and offset. */ && maxsize.is_constant (&maxsizei) - && maxsizei % BITS_PER_UNIT == 0 && offset.is_constant (&offseti) - && offseti % BITS_PER_UNIT == 0 && (is_gimple_min_invariant (gimple_assign_rhs1 (def_stmt)) || (TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME && is_gimple_min_invariant (SSA_VAL (gimple_assign_rhs1 (def_stmt)))))) @@ -2617,8 +2617,6 @@ vn_reference_lookup_3 (ao_ref *ref, tree && !reverse && !storage_order_barrier_p (lhs) && known_eq (maxsize2, size2) - && multiple_p (size2, BITS_PER_UNIT) - && multiple_p (offset2, BITS_PER_UNIT) && adjust_offsets_for_equal_base_address (base, &offset, base2, &offset2) && offset.is_constant (&offseti) @@ -2629,37 +2627,80 @@ vn_reference_lookup_3 (ao_ref *ref, tree && known_subrange_p (offseti, maxsizei, offset2, size2)) { /* We support up to 512-bit values (for V8DFmode). */ - unsigned char buffer[64]; + unsigned char buffer[65]; int len; tree rhs = gimple_assign_rhs1 (def_stmt); if (TREE_CODE (rhs) == SSA_NAME) rhs = SSA_VAL (rhs); - unsigned pad = 0; - if (BYTES_BIG_ENDIAN - && is_a <scalar_mode> (TYPE_MODE (TREE_TYPE (rhs)))) - { - /* On big-endian the padding is at the 'front' so - just skip the initial bytes. */ - fixed_size_mode mode - = as_a <fixed_size_mode> (TYPE_MODE (TREE_TYPE (rhs))); - pad = GET_MODE_SIZE (mode) - size2i / BITS_PER_UNIT; - } len = native_encode_expr (rhs, - buffer, sizeof (buffer), - ((offseti - offset2i) / BITS_PER_UNIT - + pad)); + buffer, sizeof (buffer) - 1, + (offseti - offset2i) / BITS_PER_UNIT); if (len > 0 && len * BITS_PER_UNIT >= maxsizei) { tree type = vr->type; + unsigned char *buf = buffer; + unsigned int amnt = 0; /* Make sure to interpret in a type that has a range covering the whole access size. */ if (INTEGRAL_TYPE_P (vr->type) && maxsizei != TYPE_PRECISION (vr->type)) type = build_nonstandard_integer_type (maxsizei, TYPE_UNSIGNED (type)); - tree val = native_interpret_expr (type, buffer, - maxsizei / BITS_PER_UNIT); + if (BYTES_BIG_ENDIAN) + { + /* For big-endian native_encode_expr stored the rhs + such that the LSB of it is the LSB of buffer[len - 1]. + That bit is stored into memory at position + offset2 + size2 - 1, i.e. in byte + base + (offset2 + size2 - 1) / BITS_PER_UNIT. + E.g. for offset2 1 and size2 14, rhs -1 and memory + previously cleared that is: + 0 1 + 01111111|11111110 + Now, if we want to extract offset 2 and size 12 from + it using native_interpret_expr (which actually works + for integral bitfield types in terms of byte size of + the mode), the native_encode_expr stored the value + into buffer as + XX111111|11111111 + and returned len 2 (the X bits are outside of + precision). + Let sz be maxsize / BITS_PER_UNIT if not extracting + a bitfield, and GET_MODE_SIZE otherwise. + We need to align the LSB of the value we want to + extract as the LSB of buf[sz - 1]. + The LSB from memory we need to read is at position + offset + maxsize - 1. */ + HOST_WIDE_INT sz = maxsizei / BITS_PER_UNIT; + if (INTEGRAL_TYPE_P (type)) + sz = GET_MODE_SIZE (SCALAR_INT_TYPE_MODE (type)); + amnt = ((unsigned HOST_WIDE_INT) offset2i + size2i + - offseti - maxsizei) % BITS_PER_UNIT; + if (amnt) + shift_bytes_in_array_right (buffer, len, amnt); + amnt = ((unsigned HOST_WIDE_INT) offset2i + size2i + - offseti - maxsizei - amnt) / BITS_PER_UNIT; + if ((unsigned HOST_WIDE_INT) sz + amnt > (unsigned) len) + len = 0; + else + { + buf = buffer + len - sz - amnt; + len -= (buf - buffer); + } + } + else + { + amnt = ((unsigned HOST_WIDE_INT) offset2i + - offseti) % BITS_PER_UNIT; + if (amnt) + { + buffer[len] = 0; + shift_bytes_in_array_left (buffer, len + 1, amnt); + buf = buffer + 1; + } + } + tree val = native_interpret_expr (type, buf, len); /* If we chop off bits because the types precision doesn't match the memory access size this is ok when optimizing reads but not when called from the DSE code during @@ -2677,7 +2718,12 @@ vn_reference_lookup_3 (ao_ref *ref, tree return data->finish (get_alias_set (lhs), val); } } - else if (ranges_known_overlap_p (offseti, maxsizei, offset2i, size2i)) + else if (ranges_known_overlap_p (offseti, maxsizei, offset2i, + size2i) + && maxsizei % BITS_PER_UNIT == 0 + && offseti % BITS_PER_UNIT == 0 + && size2i % BITS_PER_UNIT == 0 + && offset2i % BITS_PER_UNIT == 0) { pd_data pd; tree rhs = gimple_assign_rhs1 (def_stmt); --- gcc/testsuite/gcc.dg/tree-ssa/pr93582-1.c.jj 2020-02-12 18:23:02.522285857 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr93582-1.c 2020-02-12 18:24:51.284661882 +0100 @@ -0,0 +1,17 @@ +/* PR tree-optimization/93582 */ +/* { dg-do compile { target int32 } } */ +/* { dg-options "-O2 -fdump-tree-fre1" } */ +/* { dg-final { scan-tree-dump "return 1;" "fre1" } } */ + +union U { + struct S { int a : 1, b : 4, c : 27; } s; + struct T { int d : 2; int e : 2; int f : 28; } t; +}; + +int +foo (void) +{ + union U u; + u.s.b = 10; + return u.t.e; +} --- gcc/testsuite/gcc.dg/tree-ssa/pr93582-2.c.jj 2020-02-12 18:23:05.608239781 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr93582-2.c 2020-02-12 18:31:12.443970643 +0100 @@ -0,0 +1,17 @@ +/* PR tree-optimization/93582 */ +/* { dg-do compile { target int32 } } */ +/* { dg-options "-O2 -fdump-tree-fre1" } */ +/* { dg-final { scan-tree-dump "return 593;" "fre1" } } */ + +union U { + struct S { int a : 1, b : 14, c : 17; } s; + struct T { int d : 2; int e : 12; int f : 18; } t; +}; + +int +foo (void) +{ + union U u; + u.s.b = -7005; + return u.t.e; +} --- gcc/testsuite/gcc.dg/tree-ssa/pr93582-3.c.jj 2020-02-12 18:30:55.372225545 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr93582-3.c 2020-02-12 18:31:20.136855767 +0100 @@ -0,0 +1,18 @@ +/* PR tree-optimization/93582 */ +/* { dg-do compile { target int32 } } */ +/* { dg-options "-O2 -fdump-tree-fre1" } */ +/* { dg-final { scan-tree-dump "return 1;" "fre1" { target be } } } */ +/* { dg-final { scan-tree-dump "return 2;" "fre1" { target le } } } */ + +union U { + struct S { int a : 1, b : 14, c : 17; } s; + struct T { int d : 10; int e : 4; int f : 18; } t; +}; + +int +foo (void) +{ + union U u; + u.s.b = -7005; + return u.t.e; +} Jakub