And the patch.

On Fri, Jun 23, 2017 at 11:24 AM, Bin.Cheng <amker.ch...@gmail.com> wrote:
> On Tue, Jun 20, 2017 at 12:34 PM, Richard Biener
> <richard.guent...@gmail.com> wrote:
>> On Tue, Jun 20, 2017 at 11:18 AM, Bin.Cheng <amker.ch...@gmail.com> wrote:
>>> On Fri, Jun 16, 2017 at 11:10 AM, Richard Biener
>>> <richard.guent...@gmail.com> wrote:
>>>> On Mon, Jun 12, 2017 at 7:03 PM, Bin Cheng <bin.ch...@arm.com> wrote:
>>>>> Hi,
>>>>> This patch checks and records whether a partition can be executed in
>>>>> parallel by looking for data dependence cycles.  The information is
>>>>> needed for distribution because the idea is to distribute parallel-type
>>>>> partitions away from sequential ones.  I believe the current
>>>>> distribution doesn't work very well because it does blind
>>>>> distribution/fusion.
>>>>> Bootstrapped and tested on x86_64 and AArch64.  Is it OK?
>>>>
>>>> +  /* In case of no data dependence.  */
>>>> +  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
>>>> +    return false;
>>>> +  /* Or the data dependence can be resolved by compilation time alias
>>>> +     check.  */
>>>> +  else if (!alias_sets_conflict_p (get_alias_set (DR_REF (dr1)),
>>>> +                                   get_alias_set (DR_REF (dr2))))
>>>> +    return false;
>>>>
>>>> Dependence analysis should use TBAA already; in which cases do you need
>>>> this?  It seems to fall foul of the easy mistake of not honoring GCC's
>>>> memory model as well ... see dr_may_alias_p.
>>> I see.  Patch updated with this branch removed.
>>>
>>>> +  /* Further check if any data dependence prevents us from executing the
>>>> +     partition parallelly.  */
>>>> +  EXECUTE_IF_SET_IN_BITMAP (partition->reads, 0, i, bi)
>>>> +    {
>>>> +      dr1 = (*datarefs_vec)[i];
>>>> +      EXECUTE_IF_SET_IN_BITMAP (partition->writes, 0, j, bj)
>>>> +        {
>>>>
>>>> what about write-write dependences?
>>>>
>>>> +  EXECUTE_IF_SET_IN_BITMAP (partition->reads, 0, i, bi)
>>>> +    {
>>>> +      dr1 = (*datarefs_vec)[i];
>>>> +      EXECUTE_IF_SET_IN_BITMAP (partition->writes, i + 1, j, bj)
>>>> +        {
>>>> +          dr2 = (*datarefs_vec)[j];
>>>> +          /* Partition can only be executed sequentially if there is any
>>>> +             data dependence cycle.  */
>>>>
>>>> An exact copy of the loop nest follows?!  Maybe you meant to iterate
>>>> over writes in the first loop.
>>> Yes, this is a copy-paste typo.  The patch is also simplified because
>>> reads/writes are recorded together now.  Is it OK?
>>
>> Ok.
> Sorry, I have to update this patch because of a mistake of mine: I didn't
> update the partition type when fusing partitions.  For some partition
> fusions the update is necessary, otherwise we end up with an inaccurate
> type and inaccurate fusion later.  Is it OK?
>
> Thanks,
> bin
> 2017-06-20  Bin Cheng  <bin.ch...@arm.com>
>
>         * tree-loop-distribution.c (enum partition_type): New.
>         (struct partition): New field type.
>         (partition_merge_into): Add parameter.  Update partition type.
>         (data_dep_in_cycle_p, update_type_for_merge): New functions.
>         (build_rdg_partition_for_vertex): Compute partition type.
>         (rdg_build_partitions): Dump partition type.
>         (distribute_loop): Update calls to partition_merge_into.
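To illustrate the two partition types before the patch itself, consider a
sketch like the following (illustration only, not from the patch; the
function and array names are made up, and the arrays are assumed not to
alias):

void
example (int n, int *a, int *b, int *c, int *d)
{
  for (int i = 0; i < n; i++)
    {
      /* No cross-iteration dependence: this statement on its own forms
         a PTYPE_PARALLEL partition.  */
      a[i] = b[i] + c[i];
      /* Recurrence on d with dependence distance 1, i.e. a data
         dependence cycle: this statement forms a PTYPE_SEQUENTIAL
         partition.  */
      d[i + 1] = d[i] * 2;
    }
}

Distributing the first statement into its own loop yields one loop that can
be executed in parallel and one that must remain sequential, while blind
fusion would force the combined partition to PTYPE_SEQUENTIAL.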
From 3a00323b1773eaeab368d29fd5995d09afc0cb4e Mon Sep 17 00:00:00 2001
From: Bin Cheng <binch...@e108451-lin.cambridge.arm.com>
Date: Fri, 23 Jun 2017 10:43:05 +0100
Subject: [PATCH 10/13] partition-type-20170608.txt
---
 gcc/tree-loop-distribution.c | 139 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 123 insertions(+), 16 deletions(-)

diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index 516d5f7..87fdc15 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -528,11 +528,19 @@ build_rdg (struct loop *loop, control_dependences *cd)
 }
 
 
-
+/* Kind of distributed loop.  */
 enum partition_kind {
     PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
 };
 
+/* Type of distributed loop.  */
+enum partition_type {
+    /* The distributed loop can be executed parallelly.  */
+    PTYPE_PARALLEL = 0,
+    /* The distributed loop has to be executed sequentially.  */
+    PTYPE_SEQUENTIAL
+};
+
 /* Partition for loop distribution.  */
 struct partition
 {
@@ -546,6 +554,7 @@ struct partition
      number of loop (latch) iterations.  */
   bool plus_one;
   enum partition_kind kind;
+  enum partition_type type;
   /* data-references a kind != PKIND_NORMAL partition is about.  */
   data_reference_p main_dr;
   data_reference_p secondary_dr;
@@ -615,18 +624,16 @@ static const char *fuse_message[] = {
   "they are in the same dependence scc",
   "there is no point to distribute loop"};
 
-/* Merge PARTITION into the partition DEST.  */
-
 static void
-partition_merge_into (partition *dest, partition *partition, enum fuse_type ft)
-{
-  dest->kind = PKIND_NORMAL;
-  bitmap_ior_into (dest->stmts, partition->stmts);
-  if (partition_reduction_p (partition))
-    dest->reduction_p = true;
+update_type_for_merge (struct graph *, partition *, partition *);
 
-  bitmap_ior_into (dest->datarefs, partition->datarefs);
+/* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
+   graph and we update type for result partition if it is non-NULL.  */
 
+static void
+partition_merge_into (struct graph *rdg, partition *dest,
+		      partition *partition, enum fuse_type ft)
+{
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
       fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
@@ -635,6 +642,21 @@ partition_merge_into (partition *dest, partition *partition, enum fuse_type ft)
       fprintf (dump_file, "  Part 2: ");
       dump_bitmap (dump_file, partition->stmts);
     }
+
+  dest->kind = PKIND_NORMAL;
+  if (dest->type == PTYPE_PARALLEL)
+    dest->type = partition->type;
+
+  bitmap_ior_into (dest->stmts, partition->stmts);
+  if (partition_reduction_p (partition))
+    dest->reduction_p = true;
+
+  /* Further check if any data dependence prevents us from executing the
+     new partition parallelly.  */
+  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
+    update_type_for_merge (rdg, dest, partition);
+
+  bitmap_ior_into (dest->datarefs, partition->datarefs);
 }
 
 
@@ -1117,6 +1139,75 @@ get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
   return *slot;
 }
 
+/* In reduced dependence graph RDG for loop distribution, return true if
+   dependence between references DR1 and DR2 leads to a dependence cycle
+   and such dependence cycle can't be resolved by runtime alias check.  */
+
+static bool
+data_dep_in_cycle_p (struct graph *rdg,
+		     data_reference_p dr1, data_reference_p dr2)
+{
+  struct data_dependence_relation *ddr;
+
+  /* Re-shuffle data-refs to be in topological order.  */
+  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
+      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
+    std::swap (dr1, dr2);
+
+  ddr = get_data_dependence (rdg, dr1, dr2);
+
+  /* In case of no data dependence.  */
+  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+    return false;
+  /* For unknown data dependence or known data dependence which can't be
+     expressed in classic distance vector, we check if it can be resolved
+     by runtime alias check.  If yes, we still consider data dependence
+     as won't introduce data dependence cycle.  */
+  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
+	   || DDR_NUM_DIST_VECTS (ddr) == 0)
+    return !runtime_alias_check_p (ddr, NULL, true);
+  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
+    return true;
+  else if (DDR_REVERSED_P (ddr)
+	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
+    return false;
+
+  return true;
+}
+
+/* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
+   PARTITION1's type after merging PARTITION2 into PARTITION1.  */
+
+static void
+update_type_for_merge (struct graph *rdg,
+		       partition *partition1, partition *partition2)
+{
+  unsigned i, j;
+  bitmap_iterator bi, bj;
+  data_reference_p dr1, dr2;
+
+  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
+    {
+      unsigned start = (partition1 == partition2) ? i + 1 : 0;
+
+      dr1 = datarefs_vec[i];
+      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
+	{
+	  dr2 = datarefs_vec[j];
+	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
+	    continue;
+
+	  /* Partition can only be executed sequentially if there is any
+	     data dependence cycle.  */
+	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
+	    {
+	      partition1->type = PTYPE_SEQUENTIAL;
+	      return;
+	    }
+	}
+    }
+}
+
 /* Returns a partition with all the statements needed for computing
    the vertex V of the RDG, also including the loop exit conditions.  */
 
@@ -1142,10 +1233,23 @@ build_rdg_partition_for_vertex (struct graph *rdg, int v)
 	  unsigned idx = (unsigned) DR_INDEX (dr);
 	  gcc_assert (idx < datarefs_vec.length ());
 
+	  /* Partition can only be executed sequentially if there is any
+	     unknown data reference.  */
+	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
+	      || !DR_INIT (dr) || !DR_STEP (dr))
+	    partition->type = PTYPE_SEQUENTIAL;
+
 	  bitmap_set_bit (partition->datarefs, idx);
 	}
     }
 
+  if (partition->type == PTYPE_SEQUENTIAL)
+    return partition;
+
+  /* Further check if any data dependence prevents us from executing the
+     partition parallelly.  */
+  update_type_for_merge (rdg, partition, partition);
+
   return partition;
 }
 
@@ -1388,8 +1492,9 @@ rdg_build_partitions (struct graph *rdg,
 
       if (dump_file && (dump_flags & TDF_DETAILS))
 	{
-	  fprintf (dump_file, "ldist useful partition:\n");
-	  dump_bitmap (dump_file, partition->stmts);
+	  fprintf (dump_file, "ldist creates useful %s partition:\n",
+		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
+	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
 	}
 
       partitions->safe_push (partition);
@@ -1655,7 +1760,7 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
     for (++i; partitions.iterate (i, &partition); ++i)
       if (!partition_builtin_p (partition))
 	{
-	  partition_merge_into (into, partition, FUSE_NON_BUILTIN);
+	  partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
 	  partitions.unordered_remove (i);
 	  partition_free (partition);
 	  i--;
@@ -1671,7 +1776,7 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
       for (i = i + 1; partitions.iterate (i, &partition); ++i)
 	if (partition_reduction_p (partition))
 	  {
-	    partition_merge_into (into, partition, FUSE_REDUCTION);
+	    partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
 	    partitions.unordered_remove (i);
 	    partition_free (partition);
 	    i--;
@@ -1689,7 +1794,7 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
 	{
 	  if (share_memory_accesses (rdg, into, partition))
 	    {
-	      partition_merge_into (into, partition, FUSE_SHARE_REF);
+	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
 	      partitions.unordered_remove (j);
 	      partition_free (partition);
 	      j--;
@@ -1759,7 +1864,9 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
 	  for (j = j + 1; partitions.iterate (j, &partition); ++j)
 	    if (pg->vertices[j].component == i)
 	      {
-		partition_merge_into (first, partition, FUSE_SAME_SCC);
+		partition_merge_into (NULL, first,
+				      partition, FUSE_SAME_SCC);
+		first->type = PTYPE_SEQUENTIAL;
 		partitions[j] = NULL;
 		partition_free (partition);
 		PGDATA (j)->partition = NULL;
--
1.9.1