Ping?
> -----Original Message-----
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-ow...@gcc.gnu.org] On Behalf Of Thomas Preud'homme
> Sent: Thursday, August 07, 2014 6:56 PM
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH] Cancel bswap opt when intermediate stmts are reused
>
> Hi all,
>
> Currently, when an expression performing a manual load or bswap is
> detected, the bswap optimization replaces it with an actual load and/or
> bswap instruction without considering whether the intermediate values
> computed in the expression are reused later. If they are, the statements
> constructing those values have to be retained, and the combined cost of
> the load and/or bswap instruction plus the instructions constructing
> those values might exceed that of the initial fully manual expression.
> This patch detects such cases and cancels the bswap optimization. In
> addition, it takes advantage of the infrastructure needed for that
> detection to also clean up the stmts that become useless when the bswap
> optimization is performed.
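>
> As an illustration, consider a function like the following (a minimal
> sketch modelled on the new gcc.dg/optimize-bswapsi-3.c testcase below;
> the identifiers are made up for the example). Because the intermediate
> value low escapes through low_out, the two stmts computing it must be
> kept even if the byte-wise accesses are replaced by a single 32-bit
> load, so the replacement may cost more than it saves:
>
>   #include <stdint.h>
>
>   uint32_t
>   read_le32_reuse (unsigned char *data, uint32_t *low_out)
>   {
>     uint32_t low = data[0] | (data[1] << 8);   /* intermediate value */
>     uint32_t ret = low | (data[2] << 16) | (data[3] << 24);
>     *low_out = low;   /* reuse outside the load expression */
>     return ret;
>   }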
>
> *** gcc/ChangeLog ***
>
> 2014-08-01  Thomas Preud'homme  <thomas.preudho...@arm.com>
>
>         * tree-ssa-math-opts.c (struct usedtree): New.
>         (find_bswap_or_nop_1): Change prototype to take a hashtable and a
>         list of struct usedtree. Fill these respectively with the visited
>         stmts and with the trees (along with the stmts where they are
>         defined) that would not be defined if the bswap optimization is
>         applied. Adapt recursion call to prototype change.
>         (find_bswap_or_nop): Adapt to find_bswap_or_nop_1 prototype change.
>         Pass the hashtable and the list of struct usedtree from
>         pass_optimize_bswap::execute ().
>         (do_bswap_p): New.
>         (bswap_replace): Fix typo in heading comment. Stop returning whether
>         the bswap optimization was performed since this is now handled by
>         do_bswap_p (). Move alignment check to do_bswap_p ().
>         (free_usedtrees_list): New.
>         (pass_optimize_bswap::execute): Add allocation and deallocation
>         handling of the hashtable of browsed stmts. Free the list of struct
>         usedtree at the end of each iteration using free_usedtrees_list ()
>         and the new bswap_check_end_iter label. Move some of the logic to
>         perform the bswap optimization to do_bswap_p (). Set gsi after
>         performing the bswap optimization and loop manually via the new
>         bswap_check_start_iter label so that all stmts are checked for
>         load/bswap even when cur_stmt is moved around by bswap_replace ().
>
> *** gcc/testsuite/ChangeLog ***
>
> 2014-08-01  Thomas Preud'homme  <thomas.preudho...@arm.com>
>
>         * gcc.dg/optimize-bswapsi-2.c (read_le32_1): Add an intermediate
>         variable in the mix to check that it is optimized when there is no
>         use outside the expression computing a load/bswap.
>         (read_be32_1): Likewise.
>         * gcc.dg/optimize-bswapsi-3.c: New. Create read_le32_1 () and
>         read_be32_1 () based on the identically named functions in
>         gcc.dg/optimize-bswapsi-2.c but reusing the intermediate variable,
>         so as to check that the bswap optimization is not performed in
>         these cases.
>
> diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
> index de6e697..df7856b 100644
> --- a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
> +++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
> @@ -14,7 +14,9 @@ struct uint32_st {
>
> uint32_t read_le32_1 (void)
> {
> - return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
> + uint32_t low = data[0] | (data[1] << 8);
> + uint32_t ret = low | (data[2] << 16) | (data[3] << 24);
> + return ret;
> }
>
> uint32_t read_le32_2 (struct uint32_st data)
> @@ -30,7 +32,9 @@ uint32_t read_le32_3 (unsigned char *data)
>
> uint32_t read_be32_1 (void)
> {
> - return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
> + uint32_t low = data[3] | (data[2] << 8);
> + uint32_t ret = low | (data[1] << 16) | (data[0] << 24);
> + return ret;
> }
>
> uint32_t read_be32_2 (struct uint32_st data)
> diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c
> new file mode 100644
> index 0000000..dc48d9d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target bswap32 } */
> +/* { dg-require-effective-target stdint_types } */
> +/* { dg-options "-O2 -fdump-tree-bswap" } */
> +/* { dg-additional-options "-march=z900" { target s390-*-* } } */
> +
> +#include <stdint.h>
> +
> +extern unsigned char data[4];
> +
> +/* No "bswap" optimization as low is reused */
> +uint32_t read_le32_1 (unsigned char *data, uint32_t *low_neg)
> +{
> + uint32_t low = data[0] | (data[1] << 8);
> + uint32_t ret = low | (data[2] << 16) | (data[3] << 24);
> + *low_neg = low;
> + return ret;
> +}
> +
> +/* No "bswap" optimization as low is reused */
> +uint32_t read_be32_1 (unsigned char *data, uint32_t *low_neg)
> +{
> + uint32_t low = data[3] | (data[2] << 8);
> + uint32_t ret = low | (data[1] << 16) | (data[0] << 24);
> + *low_neg = low;
> + return ret;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "32 bit load in target endianness found
> at" "bswap" } } */
> +/* { dg-final { scan-tree-dump-not "32 bit bswap implementation found at"
> "bswap" } } */
> +/* { dg-final { cleanup-tree-dump "bswap" } } */
> diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
> index 705793b..9f71145 100644
> --- a/gcc/tree-ssa-math-opts.c
> +++ b/gcc/tree-ssa-math-opts.c
> @@ -1641,6 +1641,18 @@ struct symbolic_number {
> #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
> (uint64_t)0x01020304 << 32 | 0x05060708)
>
> +/* Structure to memorize all the intermediate trees created to compute a
> +   bswap or a load along with the stmt where they are used. This allows to
> +   check whether some trees are reused outside the bswap or load performed
> +   and avoid doing the optimization in that case since it might lead to more
> +   computation. It also allows to remove all the stmts that create these
> +   trees in case the optimization is performed. */
> +struct usedtree {
> +  tree t;
> +  gimple stmt;
> +  struct usedtree *next;
> +};
> +
> /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
> number N. Return false if the requested operation is not permitted
> on a symbolic number. */
> @@ -1803,19 +1815,26 @@ find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
> return true;
> }
>
> -/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
> -   the operation given by the rhs of STMT on the result. If the operation
> -   could successfully be executed the function returns a gimple stmt whose
> -   rhs's first tree is the expression of the source operand and NULL
> -   otherwise. */
> +/* find_bswap_or_nop_1 invokes itself recursively trying to perform the
> +   operation given by the rhs of STMT on the symbolic number represented by N
> +   until LIMIT recursions have been done or the variable that is the source of
> +   the successive expressions has been found. The function also keeps track of
> +   the intermediate trees defined in the expression with the stmt where they
> +   are defined (in USED_TREES) and records the stmts that were visited (in
> +   BROWSED_STMTS_HTAB). If the operation could successfully be executed the
> +   function returns a gimple stmt whose rhs's first tree is the expression of
> +   the source operand and NULL otherwise. */
>
> static gimple
> -find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
> +find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit,
> +                     htab_t browsed_stmts_htab, struct usedtree **used_trees)
> {
> + void **slot;
> enum tree_code code;
> tree rhs1, rhs2 = NULL;
> gimple rhs1_stmt, rhs2_stmt, source_stmt1;
> enum gimple_rhs_class rhs_class;
> + struct usedtree *new_used;
>
> if (!limit || !is_gimple_assign (stmt))
> return NULL;
> @@ -1823,7 +1842,20 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
> rhs1 = gimple_assign_rhs1 (stmt);
>
> if (find_bswap_or_nop_load (stmt, rhs1, n))
> - return stmt;
> + {
> + slot = htab_find_slot (browsed_stmts_htab, stmt, INSERT);
> + gcc_assert (slot);
> + if (!*slot)
> + {
> + *slot = stmt;
> + new_used = (struct usedtree *) xmalloc (sizeof (*new_used));
> + new_used->t = gimple_assign_lhs (stmt);
> + new_used->stmt = stmt;
> + new_used->next = *used_trees;
> + *used_trees = new_used;
> + }
> + return stmt;
> + }
>
> if (TREE_CODE (rhs1) != SSA_NAME)
> return NULL;
> @@ -1835,6 +1867,18 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
> if (rhs_class == GIMPLE_BINARY_RHS)
> rhs2 = gimple_assign_rhs2 (stmt);
>
> + slot = htab_find_slot (browsed_stmts_htab, stmt, INSERT);
> + gcc_assert (slot);
> + if (!*slot)
> + {
> + *slot = stmt;
> + new_used = (struct usedtree *) xmalloc (sizeof (*new_used));
> + new_used->t = gimple_assign_lhs (stmt);
> + new_used->stmt = stmt;
> + new_used->next = *used_trees;
> + *used_trees = new_used;
> + }
> +
> /* Handle unary rhs and binary rhs with integer constants as second
> operand. */
>
> @@ -1851,12 +1895,14 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
> && code != CONVERT_EXPR)
> return NULL;
>
> - source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
> + source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1,
> + browsed_stmts_htab, used_trees);
>
> /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
> we have to initialize the symbolic number. */
> if (!source_stmt1)
> {
> +
> if (gimple_assign_load_p (stmt)
> || !init_symbolic_number (n, rhs1))
> return NULL;
> @@ -1942,12 +1988,14 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
> switch (code)
> {
> case BIT_IOR_EXPR:
> -      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
> +      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1,
> +                                          browsed_stmts_htab, used_trees);
>
> if (!source_stmt1)
> return NULL;
>
> -      source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
> +      source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1,
> +                                          browsed_stmts_htab, used_trees);
>
> if (!source_stmt2)
> return NULL;
> @@ -2044,16 +2092,19 @@ find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
> return NULL;
> }
>
> -/* Check if STMT completes a bswap implementation or a read in a given
> - endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
> - accordingly. It also sets N to represent the kind of operations
> - performed: size of the resulting expression and whether it works on
> - a memory source, and if so alias-set and vuse. At last, the
> - function returns a stmt whose rhs's first tree is the source
> - expression. */
> +/* Check if STMT completes a bswap implementation or a load in a given
> +   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP accordingly.
> +   It also sets N to represent the kind of operations performed: size of the
> +   resulting expression and whether it works on a memory source, and if so
> +   alias-set and vuse. All the intermediate trees defined in such an
> +   implementation are recorded along with the stmt where they are defined (in
> +   USED_TREES) as well as the stmts that are part of that implementation (in
> +   BROWSED_STMTS_HTAB). At last, the function returns a stmt whose rhs's first
> +   tree is the source expression. */
>
> static gimple
> -find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
> +find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap,
> +                   htab_t browsed_stmts_htab, struct usedtree **used_trees)
> {
>    /* The number which the find_bswap_or_nop_1 result should match in order
>       to have a full byte swap. The number is shifted to the right
> @@ -2071,7 +2122,8 @@ find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
> in libgcc, and for initial shift/and operation of the src operand. */
> limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
> limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
> -  source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
> +  source_stmt = find_bswap_or_nop_1 (stmt, n, limit, browsed_stmts_htab,
> +                                     used_trees);
>
> if (!source_stmt)
> return NULL;
> @@ -2146,16 +2198,73 @@ public:
>
> }; // class pass_optimize_bswap
>
> +/* Tests whether the bswap optimization can be performed and is likely to
> +   bring performance improvements. This decision is made based on:
> +   - whether the operation performed is a BSWAP or a simple load
> +   - whether the operation can be replaced by an instruction (FNDECL is non
> +     NULL)
> +   - the stmt reading/loading from the source operand (SRC_STMT)
> +   - whether the source is in memory (given by N->base_addr)
> +   - whether intermediate trees defined in the expression being optimized are
> +     reused elsewhere. This is determined from the list of trees defined along
> +     with the stmt where they are defined (in USED_TREES), a hashtable of the
> +     stmts that are part of the expression (in BROWSED_STMTS_HTAB) and the
> +     outermost stmt of the expression (in STMT). */
> +
> +static bool
> +do_bswap_p (gimple stmt, gimple src_stmt, tree fndecl,
> +            tree load_type ATTRIBUTE_UNUSED, struct symbolic_number *n,
> +            bool bswap, htab_t browsed_stmts_htab, struct usedtree *used_trees)
> +{
> +  unsigned align ATTRIBUTE_UNUSED;
> +  struct usedtree *used;
> +  bool ret;
> +
> +  if (bswap && !fndecl)
> +    return false;
> +
> +  if (n->base_addr)
> +    {
> +      tree src = gimple_assign_rhs1 (src_stmt);
> +      align = get_object_alignment (src);
> +      if (bswap
> +          && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
> +          && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
> +        return false;
> +    }
> +
> +  for (ret = true, used = used_trees; ret && used; used = used->next)
> +    {
> +      gimple using_stmt;
> +      imm_use_iterator imm_use;
> +
> +      if (used->stmt == stmt)
> +        continue;
> +
> +      FOR_EACH_IMM_USE_STMT (using_stmt, imm_use, used->t)
> +        {
> +          if (!is_gimple_debug (using_stmt)
> +              && !htab_find_slot (browsed_stmts_htab, using_stmt, NO_INSERT))
> +            {
> +              ret = false;
> +              BREAK_FROM_IMM_USE_STMT (imm_use);
> +            }
> +        }
> +    }
> +
> +  return ret;
> +}
> +
> /* Perform the bswap optimization: replace the statement CUR_STMT at
> GSI with a load of type, VUSE and set-alias as described by N if a
> memory source is involved (N->base_addr is non null), followed by
> the builtin bswap invocation in FNDECL if BSWAP is true. SRC_STMT
> gives where should the replacement be made. It also gives the
> - source on which CUR_STMT is operating via its rhs's first tree nad
> + source on which CUR_STMT is operating via its rhs's first tree and
> N->range gives the size of the expression involved for maintaining
> some statistics. */
>
> -static bool
> +static void
> bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
> tree fndecl, tree bswap_type, tree load_type,
> struct symbolic_number *n, bool bswap)
> @@ -2175,12 +2284,6 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
> gimple addr_stmt, load_stmt;
> unsigned align;
>
> - align = get_object_alignment (src);
> - if (bswap
> - && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
> - && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
> - return false;
> -
> gsi_move_before (&gsi, &gsi_ins);
> gsi = gsi_for_stmt (cur_stmt);
>
> @@ -2198,6 +2301,7 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
> }
>
> /* Perform the load. */
> + align = get_object_alignment (src);
> aligned_load_type = load_type;
> if (align < TYPE_ALIGN (load_type))
> aligned_load_type = build_aligned_type (load_type, align);
> @@ -2243,7 +2347,7 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
> (int)n->range);
> print_gimple_stmt (dump_file, cur_stmt, 0, 0);
> }
> - return true;
> + return;
> }
> else
> {
> @@ -2300,7 +2404,32 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt,
>
> gsi_insert_after (&gsi, call, GSI_SAME_STMT);
> gsi_remove (&gsi, true);
> - return true;
> +}
> +
> +/* Free the struct usedtree elements from the list USED_TREES. If
> +   REMOVE_STMTS is true, the stmts they reference (CUR_STMT excepted) are
> +   removed and the ssa names of their lhs are released from the function
> +   FUN. */
> +
> +static void
> +free_usedtrees_list (struct usedtree *used_trees, bool remove_stmts,
> +                     gimple cur_stmt, function *fun)
> +{
> +  gimple_stmt_iterator gsi_rm;
> +  struct usedtree *used;
> +
> +  while ((used = used_trees))
> +    {
> +      /* Do not mark stmt for removal if it's the replaced one. */
> +      if (remove_stmts && used->stmt != cur_stmt)
> +        {
> +          tree lhs = gimple_assign_lhs (used->stmt);
> +          gsi_rm = gsi_for_stmt (used->stmt);
> +          gsi_remove (&gsi_rm, true);
> +          release_ssa_name_fn (fun, lhs);
> +        }
> +      used_trees = used->next;
> +      free (used);
> +    }
> }
>
> /* Find manual byte swap implementations as well as load in a given
> @@ -2315,6 +2444,7 @@ pass_optimize_bswap::execute (function *fun)
> bool bswap16_p, bswap32_p, bswap64_p;
> bool changed = false;
>    tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE,
>         bswap64_type = NULL_TREE;
> + htab_t browsed_stmts_htab;
>
> if (BITS_PER_UNIT != 8)
> return 0;
> @@ -2350,6 +2480,9 @@ pass_optimize_bswap::execute (function *fun)
> memset (&nop_stats, 0, sizeof (nop_stats));
> memset (&bswap_stats, 0, sizeof (bswap_stats));
>
> +  browsed_stmts_htab = htab_create_alloc (16, htab_hash_pointer,
> +                                          htab_eq_pointer, 0, xcalloc, free);
> +
> FOR_EACH_BB_FN (bb, fun)
> {
> gimple_stmt_iterator gsi;
> @@ -2360,19 +2493,29 @@ pass_optimize_bswap::execute (function *fun)
> patterns, the wider variant wouldn't be detected. */
> for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
> {
> - gimple src_stmt, cur_stmt = gsi_stmt (gsi);
> + gimple_stmt_iterator gsi_cont;
> + gimple src_stmt, cur_stmt;
> tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
> struct symbolic_number n;
> - bool bswap;
> + struct usedtree *used_trees;
> + bool bswap, rm_stmts;
> +
> +bswap_check_start_iter:
> + rm_stmts = false;
> + cur_stmt = gsi_stmt (gsi);
>
> if (!is_gimple_assign (cur_stmt)
> || gimple_assign_rhs_code (cur_stmt) != BIT_IOR_EXPR)
> continue;
>
> - src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
> + htab_empty (browsed_stmts_htab);
> + used_trees = NULL;
> +
> + src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap,
> + browsed_stmts_htab, &used_trees);
>
> if (!src_stmt)
> - continue;
> + goto bswap_check_end_iter;
>
> switch (n.range)
> {
> @@ -2401,15 +2544,37 @@ pass_optimize_bswap::execute (function *fun)
> }
> break;
> default:
> - continue;
> + goto bswap_check_end_iter;
> }
>
> - if (bswap && !fndecl)
> - continue;
> -
> - if (bswap_replace (cur_stmt, gsi, src_stmt, fndecl, bswap_type,
> - load_type, &n, bswap))
> - changed = true;
> + if (!do_bswap_p (cur_stmt, src_stmt, fndecl, load_type, &n, bswap,
> + browsed_stmts_htab, used_trees))
> + goto bswap_check_end_iter;
> +
> + changed = true;
> + gsi_cont = gsi_for_stmt (cur_stmt);
> + gsi_next (&gsi_cont);
> + bswap_replace (cur_stmt, gsi, src_stmt, fndecl, bswap_type,
> + load_type, &n, bswap);
> + rm_stmts = true;
> +
> +bswap_check_end_iter:
> + free_usedtrees_list (used_trees, rm_stmts, cur_stmt, fun);
> +
> + /* cur_stmt may have been moved in bswap_replace (in case of a
> + load) and/or replaced (in case of bswap). Ideally, we want the
> + next iteration to check the first stmt that was before cur_stmt
> + initially that is not an unused stmt. However it is easier and
> + simpler to allow the next iteration to check the changed stmt. */
> + if (rm_stmts)
> + {
> + if (gsi_end_p (gsi_cont))
> + gsi_cont = gsi_last_bb (bb);
> + else
> + gsi_prev (&gsi_cont);
> + gsi = gsi_cont;
> + goto bswap_check_start_iter;
> + }
> }
> }
>
> Tested via bootstrap on x86_64-linux-gnu with no regression in the
> testsuite (new tests passing), as well as by running the testsuite
> compiled with the arm-none-eabi-gcc cross compiler under qemu emulating
> Cortex-M3, again without any regression (new tests passing).
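>
> For reference (not part of the patch), the new behaviour can also be
> observed manually by compiling the new testcase with the bswap dump
> enabled, using the same flags as its dg-options line:
>
>   gcc -O2 -fdump-tree-bswap -c optimize-bswapsi-3.c
>
> and checking that the resulting .bswap dump file contains neither
> "32 bit load in target endianness found at" nor "32 bit bswap
> implementation found at".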
>
> Ok for trunk?
>
> Best regards,
>
> Thomas
>
>