On Wed, 16 Sep 2020, Jakub Jelinek wrote:

> Hi!
>
> As the testcases show, if we have something like:
> MEM <char[12]> [&b + 8B] = {};
> MEM[(short *) &b] = 5;
> _5 = *x_4(D);
> MEM <long long unsigned int> [&b + 2B] = _5;
> MEM[(char *)&b + 16B] = 88;
> MEM[(int *)&b + 20B] = 1;
> then in sort_by_bitpos order the stores are almost in the given order,
> except that the first store comes after the = _5; store.
> We can't coalesce the = 5; store with = _5;, because the latter is a
> MEM_REF while the former is an INTEGER_CST, and we can't coalesce the
> = _5 store with the = {} store because the former is a MEM_REF and the
> latter an INTEGER_CST.  But we happily coalesce the remaining 3 stores,
> which is wrong, because the = _5; store overlaps them and is in between
> them in program order.  We already have code to deal with similar cases
> in check_no_overlap, but we only deal with the following stores in
> sort_by_bitpos order, not the earlier ones.
>
> The following patch checks the earlier ones as well.  In
> coalesce_immediate_stores it computes the first store that needs to be
> checked (all the ones whose bitpos + bitsize is smaller than or equal to
> merged_store->start don't need to be checked, not even for any following
> attempts, because of the sort_by_bitpos sorting) and the end of that
> range (that is, the first store in the merged_store).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

Thanks,
Richard.

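For anyone reading along, here is a stand-alone sketch of the new
earlier-stores check, with a simplified store_info and a made-up driver
(earlier_stores_ok_p is not a function in the patch; the real check sits
inside check_no_overlap below).  It replays the store layout from the
quoted example and shows why grouping the = {}, = 88 and = 1 stores has
to be rejected:

/* Illustrative only: a stand-alone model of the overlap check the patch
   adds, not code from gimple-ssa-store-merging.c.  */

#include <stdbool.h>
#include <stdio.h>

struct store_info
{
  unsigned int order;		/* Position in program order.  */
  unsigned long long bitpos;	/* First bit written.  */
  unsigned long long bitsize;	/* Number of bits written.  */
};

static bool
earlier_stores_ok_p (struct store_info *stores, unsigned int first_earlier,
		     unsigned int end_earlier, unsigned int first_order,
		     unsigned int last_order, unsigned long long start)
{
  /* Reject the group if a store that sorts before it by bitpos is
     executed between the group's first and last store and still reaches
     into the group's bitrange; merging would move the group past it.  */
  for (unsigned int j = first_earlier; j < end_earlier; j++)
    if (stores[j].order > first_order
	&& stores[j].order < last_order
	&& stores[j].bitpos + stores[j].bitsize > start)
      return false;
  return true;
}

int
main (void)
{
  /* The stores from the example above, sorted by bitpos (units are bits);
     order is the position in program order.  */
  struct store_info sorted[] = {
    { 1,   0, 16 },	/* MEM[(short *) &b] = 5;  */
    { 3,  16, 64 },	/* MEM <long long unsigned int> [&b + 2B] = _5;  */
    { 0,  64, 96 },	/* MEM <char[12]> [&b + 8B] = {};  */
    { 4, 128,  8 },	/* MEM[(char *)&b + 16B] = 88;  */
    { 5, 160, 32 }	/* MEM[(int *)&b + 20B] = 1;  */
  };

  /* Attempted group: the = {}, = 88 and = 1 stores (indexes 2..4),
     so start is bit 64, first_order 0, last_order 5.  */
  unsigned long long start = 64;
  unsigned int end_earlier = 2;	/* Index of the group's first store.  */

  /* Stores ending at or before START can never overlap this group (nor
     any later one, thanks to the bitpos sorting) and are skipped up
     front; here that skips the = 5 store.  */
  unsigned int first_earlier = 0;
  while (first_earlier < end_earlier
	 && (sorted[first_earlier].bitpos + sorted[first_earlier].bitsize
	     <= start))
    first_earlier++;

  /* The = _5 store remains: order 3 lies between 0 and 5 and it reaches
     bit 80 > 64, so the merge is rejected and "no" is printed.  */
  bool ok = earlier_stores_ok_p (sorted, first_earlier, end_earlier,
				 0, 5, start);
  printf ("merge allowed: %s\n", ok ? "yes" : "no");
  return 0;
}
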
> 2020-09-16  Jakub Jelinek  <ja...@redhat.com>
>
> 	PR tree-optimization/97053
> 	* gimple-ssa-store-merging.c (check_no_overlap): Add FIRST_ORDER,
> 	START, FIRST_EARLIER and LAST_EARLIER arguments.  Return false if
> 	any store between FIRST_EARLIER (inclusive) and LAST_EARLIER
> 	(exclusive) has order in between FIRST_ORDER and LAST_ORDER and
> 	overlaps the to-be-merged store.
> 	(imm_store_chain_info::try_coalesce_bswap): Add FIRST_EARLIER
> 	argument.  Adjust check_no_overlap caller.
> 	(imm_store_chain_info::coalesce_immediate_stores): Add first_earlier
> 	and last_earlier variables, adjust them during iterations.  Adjust
> 	check_no_overlap callers, call check_no_overlap even when extending
> 	overlapping stores by extra INTEGER_CST stores.
>
> 	* gcc.dg/store_merging_31.c: New test.
> 	* gcc.dg/store_merging_32.c: New test.
>
> --- gcc/gimple-ssa-store-merging.c.jj	2020-08-12 12:45:46.000000000 +0200
> +++ gcc/gimple-ssa-store-merging.c	2020-09-15 16:51:11.393453396 +0200
> @@ -2116,7 +2116,8 @@ public:
>        }
>    }
>    bool terminate_and_process_chain ();
> -  bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int);
> +  bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int,
> +			   unsigned int);
>    bool coalesce_immediate_stores ();
>    bool output_merged_store (merged_store_group *);
>    bool output_merged_stores ();
> @@ -2443,14 +2444,39 @@ gather_bswap_load_refs (vec<tree> *refs,
>     into the group.  That way it will be its own store group and will
>     not be touched.  If ALL_INTEGER_CST_P and there are overlapping
>     INTEGER_CST stores, those are mergeable using merge_overlapping,
> -   so don't return false for those.  */
> +   so don't return false for those.
> +
> +   Similarly, check stores from FIRST_EARLIER (inclusive) to END_EARLIER
> +   (exclusive), whether they don't overlap the bitrange START to END
> +   and have order in between FIRST_ORDER and LAST_ORDER.  This is to
> +   prevent merging in cases like:
> +     MEM <char[12]> [&b + 8B] = {};
> +     MEM[(short *) &b] = 5;
> +     _5 = *x_4(D);
> +     MEM <long long unsigned int> [&b + 2B] = _5;
> +     MEM[(char *)&b + 16B] = 88;
> +     MEM[(int *)&b + 20B] = 1;
> +   The = {} store comes in sort_by_bitpos before the = 88 store, and can't
> +   be merged with it, because the = _5 store overlaps these and is in
> +   between them in sort_by_order ordering.  If it was merged, the merged
> +   store would go after the = _5 store and thus change behavior.  */
>
>  static bool
>  check_no_overlap (vec<store_immediate_info *> m_store_info, unsigned int i,
> -		  bool all_integer_cst_p, unsigned int last_order,
> -		  unsigned HOST_WIDE_INT end)
> +		  bool all_integer_cst_p, unsigned int first_order,
> +		  unsigned int last_order, unsigned HOST_WIDE_INT start,
> +		  unsigned HOST_WIDE_INT end, unsigned int first_earlier,
> +		  unsigned end_earlier)
>  {
>    unsigned int len = m_store_info.length ();
> +  for (unsigned int j = first_earlier; j < end_earlier; j++)
> +    {
> +      store_immediate_info *info = m_store_info[j];
> +      if (info->order > first_order
> +	  && info->order < last_order
> +	  && info->bitpos + info->bitsize > start)
> +	return false;
> +    }
>    for (++i; i < len; ++i)
>      {
>        store_immediate_info *info = m_store_info[i];
> @@ -2471,7 +2497,8 @@ check_no_overlap (vec<store_immediate_in
>  bool
>  imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store,
>  					  unsigned int first,
> -					  unsigned int try_size)
> +					  unsigned int try_size,
> +					  unsigned int first_earlier)
>  {
>    unsigned int len = m_store_info.length (), last = first;
>    unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize;
> @@ -2611,7 +2638,8 @@ imm_store_chain_info::try_coalesce_bswap
>    if (n.base_addr == NULL_TREE && !is_gimple_val (n.src))
>      return false;
>
> -  if (!check_no_overlap (m_store_info, last, false, last_order, end))
> +  if (!check_no_overlap (m_store_info, last, false, first_order, last_order,
> +			 merged_store->start, end, first_earlier, first))
>      return false;
>
>    /* Don't handle memory copy this way if normal non-bswap processing
> @@ -2703,6 +2731,8 @@ imm_store_chain_info::coalesce_immediate
>
>    store_immediate_info *info;
>    unsigned int i, ignore = 0;
> +  unsigned int first_earlier = 0;
> +  unsigned int end_earlier = 0;
>
>    /* Order the stores by the bitposition they write to.  */
>    m_store_info.qsort (sort_by_bitpos);
> @@ -2719,6 +2749,12 @@ imm_store_chain_info::coalesce_immediate
>        if (i <= ignore)
>  	goto done;
>
> +      while (first_earlier < end_earlier
> +	     && (m_store_info[first_earlier]->bitpos
> +		 + m_store_info[first_earlier]->bitsize
> +		 <= merged_store->start))
> +	first_earlier++;
> +
>        /* First try to handle group of stores like:
>  	 p[0] = data >> 24;
>  	 p[1] = data >> 16;
> @@ -2733,7 +2769,8 @@ imm_store_chain_info::coalesce_immediate
>  	{
>  	  unsigned int try_size;
>  	  for (try_size = 64; try_size >= 16; try_size >>= 1)
> -	    if (try_coalesce_bswap (merged_store, i - 1, try_size))
> +	    if (try_coalesce_bswap (merged_store, i - 1, try_size,
> +				    first_earlier))
>  	      break;
>
>  	  if (try_size >= 16)
> @@ -2741,7 +2778,10 @@ imm_store_chain_info::coalesce_immediate
>  	      ignore = i + merged_store->stores.length () - 1;
>  	      m_merged_store_groups.safe_push (merged_store);
>  	      if (ignore < m_store_info.length ())
> -		merged_store = new merged_store_group (m_store_info[ignore]);
> +		{
> +		  merged_store = new merged_store_group (m_store_info[ignore]);
> +		  end_earlier = ignore;
> +		}
>  	      else
>  		merged_store = NULL;
>  	      goto done;
> @@ -2776,12 +2816,16 @@ imm_store_chain_info::coalesce_immediate
>  	  && merged_store->only_constants
>  	  && info->lp_nr == merged_store->lp_nr)
>  	{
> +	  unsigned int first_order
> +	    = MIN (merged_store->first_order, info->order);
>  	  unsigned int last_order
>  	    = MAX (merged_store->last_order, info->order);
>  	  unsigned HOST_WIDE_INT end
>  	    = MAX (merged_store->start + merged_store->width,
>  		   info->bitpos + info->bitsize);
> -	  if (check_no_overlap (m_store_info, i, true, last_order, end))
> +	  if (check_no_overlap (m_store_info, i, true, first_order,
> +				last_order, merged_store->start, end,
> +				first_earlier, end_earlier))
>  	    {
>  	      /* check_no_overlap call above made sure there are no
>  		 overlapping stores with non-INTEGER_CST rhs_code
> @@ -2810,6 +2854,7 @@ imm_store_chain_info::coalesce_immediate
>  	      do
>  		{
>  		  unsigned int max_order = 0;
> +		  unsigned int min_order = first_order;
>  		  unsigned first_nonmergeable_int_order = ~0U;
>  		  unsigned HOST_WIDE_INT this_end = end;
>  		  k = i;
> @@ -2836,6 +2881,7 @@ imm_store_chain_info::coalesce_immediate
>  			  break;
>  			}
>  		      k = j;
> +		      min_order = MIN (min_order, info2->order);
>  		      this_end = MAX (this_end,
>  				      info2->bitpos + info2->bitsize);
>  		    }
> @@ -2852,6 +2898,12 @@ imm_store_chain_info::coalesce_immediate
>  		      first_nonmergeable_order
>  			= MIN (first_nonmergeable_order, info2->order);
>  		    }
> +		  if (k > i
> +		      && !check_no_overlap (m_store_info, len - 1, true,
> +					    min_order, try_order,
> +					    merged_store->start, this_end,
> +					    first_earlier, end_earlier))
> +		    k = 0;
>  		  if (k == 0)
>  		    {
>  		      if (last_order == try_order)
> @@ -2937,9 +2989,12 @@ imm_store_chain_info::coalesce_immediate
>  	      info->ops_swapped_p = true;
>  	    }
>  	  if (check_no_overlap (m_store_info, i, false,
> +				MIN (merged_store->first_order, info->order),
>  				MAX (merged_store->last_order, info->order),
> +				merged_store->start,
>  				MAX (merged_store->start + merged_store->width,
> -				     info->bitpos + info->bitsize)))
> +				     info->bitpos + info->bitsize),
> +				first_earlier, end_earlier))
>  	    {
>  	      /* Turn MEM_REF into BIT_INSERT_EXPR for bit-field stores.  */
>  	      if (info->rhs_code == MEM_REF && infof->rhs_code != MEM_REF)
> @@ -2985,6 +3040,7 @@ imm_store_chain_info::coalesce_immediate
>  	delete merged_store;
>
>        merged_store = new merged_store_group (info);
> +      end_earlier = i;
>        if (dump_file && (dump_flags & TDF_DETAILS))
>  	fputs ("New store group\n", dump_file);
>
> --- gcc/testsuite/gcc.dg/store_merging_31.c.jj	2020-09-15 16:18:47.451623297 +0200
> +++ gcc/testsuite/gcc.dg/store_merging_31.c	2020-09-15 16:18:31.460846430 +0200
> @@ -0,0 +1,27 @@
> +/* PR tree-optimization/97053 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +struct S { short a; char b[9]; int c; char d; int e; };
> +
> +__attribute__((noipa)) void
> +foo (char *x, char *y)
> +{
> +  if (__builtin_strcmp (x, "ABCDXXXX") != 0
> +      || __builtin_strcmp (y, "ABCDXXXX") != 0)
> +    __builtin_abort ();
> +}
> +
> +int
> +main ()
> +{
> +  char a[9] = "XXXXXXXX";
> +  struct S b = {};
> +  __builtin_memcpy (a, "ABCD", 4);
> +  b.a = 5;
> +  __builtin_memcpy (b.b, a, 8);
> +  b.d = 'X';
> +  b.e = 1;
> +  foo (a, b.b);
> +  return 0;
> +}
> --- gcc/testsuite/gcc.dg/store_merging_32.c.jj	2020-09-15 16:18:51.381568453 +0200
> +++ gcc/testsuite/gcc.dg/store_merging_32.c	2020-09-15 15:38:27.827420970 +0200
> @@ -0,0 +1,129 @@
> +/* PR tree-optimization/97053 */
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fno-tree-dse" } */
> +
> +struct __attribute__((packed, may_alias)) S { long long s; };
> +struct __attribute__((packed, may_alias)) T { short t; };
> +
> +__attribute__((noipa)) void
> +test (char *p, char *q, int s)
> +{
> +  if ((s & 1) == 0)
> +    {
> +      if (*(short __attribute__((may_alias)) *) &p[sizeof (short)]
> +	  != *(short __attribute__((may_alias)) *) &q[sizeof (short)]
> +	  || (((struct S __attribute__((may_alias)) *) &p[1])->s
> +	      != ((struct S __attribute__((may_alias)) *) &q[1])->s)
> +	  || (*(short __attribute__((may_alias)) *) &p[2 * sizeof (short)]
> +	      != *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)]))
> +	__builtin_abort ();
> +    }
> +  else
> +    {
> +      if (*(short __attribute__((may_alias)) *) &p[sizeof (short)]
> +	  != *(short __attribute__((may_alias)) *) &q[sizeof (short)]
> +	  || (((struct S __attribute__((may_alias)) *) &p[1])->s
> +	      != ((struct S __attribute__((may_alias)) *) &q[1])->s)
> +	  || (((struct T __attribute__((may_alias)) *) &p[2 * sizeof (short) - 1])->t
> +	      != ((struct T __attribute__((may_alias)) *) &q[2 * sizeof (short) - 1])->t)
> +	  || p[3 * sizeof (short) - 2] != q[3 * sizeof (short) - 2])
> +	__builtin_abort ();
> +    }
> +}
> +
> +__attribute__((noipa)) void
> +foo (long long *p, char *q, char *r, char *s)
> +{
> +  char a[64] __attribute__((aligned (__alignof (short))));
> +  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
> +  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> +  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = 2;
> +  *(short __attribute__((may_alias)) *) &q[sizeof (short)] = 1;
> +  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> +  *(short __attribute__((may_alias)) *) &s[2 * sizeof (short)] = 2;
> +  test (a, q, 0);
> +}
> +
> +__attribute__((noipa)) void
> +bar (long long *p, char *q, char *r, char *s, char *t)
> +{
> +  char a[64] __attribute__((aligned (__alignof (short))));
> +  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
> +  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> +  ((struct T __attribute__((may_alias)) *) &a[2 * sizeof (short) - 1])->t = 2;
> +  a[3 * sizeof (short) - 2] = 3;
> +  *(short __attribute__((may_alias)) *) &q[sizeof (short)] = 1;
> +  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> +  ((struct T __attribute__((may_alias)) *) &s[2 * sizeof (short) - 1])->t = 2;
> +  t[3 * sizeof (short) - 2] = 3;
> +  test (a, q, 1);
> +}
> +
> +__attribute__((noipa)) void
> +baz (long long *p, char *q, char *r, char *s)
> +{
> +  char a[64] __attribute__((aligned (__alignof (short))));
> +  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = 2;
> +  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> +  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
> +  *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = 2;
> +  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> +  *(short __attribute__((may_alias)) *) &s[sizeof (short)] = 1;
> +  test (a, q, 2);
> +}
> +
> +__attribute__((noipa)) void
> +qux (long long *p, char *q, char *r, char *s, char *t)
> +{
> +  char a[64] __attribute__((aligned (__alignof (short))));
> +  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short) - 1] = 2;
> +  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> +  a[3 * sizeof (short) - 2] = 3;
> +  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
> +  ((struct T __attribute__((may_alias)) *) &q[2 * sizeof (short) - 1])->t = 2;
> +  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> +  s[3 * sizeof (short) - 2] = 3;
> +  ((struct T __attribute__((may_alias)) *) &t[sizeof (short)])->t = 1;
> +  test (a, q, 3);
> +}
> +
> +__attribute__((noipa)) void
> +corge (long long *p, char *q, char *r, char *s, short u[3])
> +{
> +  char a[64] __attribute__((aligned (__alignof (short))));
> +  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = u[2];
> +  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> +  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = u[1];
> +  *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = u[2];
> +  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> +  *(short __attribute__((may_alias)) *) &s[sizeof (short)] = u[1];
> +  test (a, q, 4);
> +}
> +
> +__attribute__((noipa)) void
> +garply (long long *p, char *q, char *r, char *s, short u[3])
> +{
> +  char a[64] __attribute__((aligned (__alignof (short))));
> +  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = u[1];
> +  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> +  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = u[2];
> +  *(short __attribute__((may_alias)) *) &s[sizeof (short)] = u[1];
> +  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> +  *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = u[2];
> +  test (a, q, 6);
> +}
> +
> +int
> +main ()
> +{
> +  char a[64] __attribute__((aligned (__alignof (short))));
> +  long long p = -1LL;
> +  short u[] = { 1, 2, 3 };
> +  foo (&p, &a[0], &a[0], &a[0]);
> +  bar (&p, &a[0], &a[0], &a[0], &a[0]);
> +  baz (&p, &a[0], &a[0], &a[0]);
> +  qux (&p, &a[0], &a[0], &a[0], &a[0]);
> +  corge (&p, &a[0], &a[0], &a[0], u);
> +  garply (&p, &a[0], &a[0], &a[0], u);
> +  return 0;
> +}
>
> 	Jakub
>

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)