Hi Segher,

Thanks for the review!
>> 	* config/rs6000/rs6000-p8swap.c (insn_rtx_pair_t): New type.
>
> Please don't do that.  The "first" and "second" are completely
> meaningless.  Also, keeping it separate arrays can very well result in
> better machine code, and certainly makes easier to read source code.

OK, updated to use separate arrays instead.  Here the first element was
the AND rtx_insn while the second was its fully-expanded rtx; I had
thought it was better to bundle them together, and make_pair was an
easy way to do that.

>
>> +static bool
>> +find_alignment_op (rtx_insn *insn, rtx base_reg,
>> +		   vec<insn_rtx_pair_t> *and_insn_vec)
>
> Don't name vecs "_vec" (just keep it "and_insn" here, or sometimes
> and_insns is clearer).

Done, also for the other occurrences omitted below.  Thanks!

>
>> -  rtx and_operation = 0;
>> +  rtx and_operation = NULL_RTX;
>
> Don't change code randomly (to something arguably worse, even).

Done.  I may have been overthinking it; I thought NULL_RTX might be
preferred since it could later be redefined as nullptr in the current
C++11 context.

Bootstrapped/regtested on powerpc64le-linux-gnu P8 again.

Does the attached version look better?

BR,
Kewen
-----
gcc/ChangeLog:

	* config/rs6000/rs6000-p8swap.c (find_alignment_op): Adjust to
	support multiple definitions which are all AND operations with
	the mask -16B.
	(recombine_lvx_pattern): Adjust to handle multiple AND
	operations from find_alignment_op.
	(recombine_stvx_pattern): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/pr97019.c: New test.
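As an aside, to make the intent concrete: the rule the new
find_alignment_op enforces is that the lvx/stvx recombination is safe
only if *every* reaching definition of the base register is an AND with
-16.  Below is a minimal standalone sketch of just that rule, for
illustration only -- it is not the patch code; the names (toy_def,
all_defs_align) are made up, and std::vector stands in for GCC's vec:

  #include <cstdint>
  #include <iostream>
  #include <vector>

  /* Toy stand-in for a defining insn: UID identifies it; MASK is the
     constant of the AND feeding the base register, or 0 if there is
     no such AND.  */
  struct toy_def
  {
    int uid;
    int64_t mask;
  };

  /* Succeed only if every definition aligns on a 16-byte boundary,
     filling two parallel vectors (one entry per definition) as we go,
     mirroring and_insns/and_ops in the patch.  */
  static bool
  all_defs_align (const std::vector<toy_def> &defs,
		  std::vector<int> *and_insns,
		  std::vector<int64_t> *and_ops)
  {
    if (defs.empty ())
      return false;
    for (const toy_def &d : defs)
      {
	/* Stop if we find any one which doesn't align.  */
	if (d.mask != -16)
	  return false;
	and_insns->push_back (d.uid);
	and_ops->push_back (d.mask);
      }
    /* On success the two vectors have equal lengths, which is the
       invariant the gcc_assert in the patch checks.  */
    return true;
  }

  int
  main ()
  {
    std::vector<int> insns;
    std::vector<int64_t> ops;
    std::cout << all_defs_align ({{1, -16}, {2, -16}}, &insns, &ops); /* 1 */
    insns.clear ();
    ops.clear ();
    std::cout << all_defs_align ({{3, -16}, {4, 0}}, &insns, &ops);   /* 0 */
    std::cout << '\n';
  }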
diff --git a/gcc/config/rs6000/rs6000-p8swap.c b/gcc/config/rs6000/rs6000-p8swap.c
index 3d5dc7d8aae..be863aa479e 100644
--- a/gcc/config/rs6000/rs6000-p8swap.c
+++ b/gcc/config/rs6000/rs6000-p8swap.c
@@ -2095,11 +2095,15 @@ alignment_mask (rtx_insn *insn)
   return alignment_with_canonical_addr (SET_SRC (body));
 }
 
-/* Given INSN that's a load or store based at BASE_REG, look for a
-   feeding computation that aligns its address on a 16-byte boundary.
-   Return the rtx and its containing AND_INSN.  */
-static rtx
-find_alignment_op (rtx_insn *insn, rtx base_reg, rtx_insn **and_insn)
+/* Given INSN that's a load or store based at BASE_REG, check if
+   all of its feeding computations align its address on a 16-byte
+   boundary.  If so, return true and add all definition insns into
+   AND_INSNS and their corresponding fully-expanded rtxes for the
+   masking operations into AND_OPS.  */
+
+static bool
+find_alignment_op (rtx_insn *insn, rtx base_reg, vec<rtx_insn *> *and_insns,
+                   vec<rtx> *and_ops)
 {
   df_ref base_use;
   struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
@@ -2111,19 +2115,28 @@ find_alignment_op (rtx_insn *insn, rtx base_reg, rtx_insn **and_insn)
         continue;
 
       struct df_link *base_def_link = DF_REF_CHAIN (base_use);
-      if (!base_def_link || base_def_link->next)
-        break;
+      if (!base_def_link)
+        return false;
 
-      /* With stack-protector code enabled, and possibly in other
-         circumstances, there may not be an associated insn for
-         the def.  */
-      if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
-        break;
+      while (base_def_link)
+        {
+          /* With stack-protector code enabled, and possibly in other
+             circumstances, there may not be an associated insn for
+             the def.  */
+          if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
+            return false;
 
-      *and_insn = DF_REF_INSN (base_def_link->ref);
-      and_operation = alignment_mask (*and_insn);
-      if (and_operation != 0)
-        break;
+          rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref);
+          and_operation = alignment_mask (and_insn);
+
+          /* Stop if we find any one which doesn't align.  */
+          if (!and_operation)
+            return false;
+
+          and_insns->safe_push (and_insn);
+          and_ops->safe_push (and_operation);
+          base_def_link = base_def_link->next;
+        }
     }
 
   return and_operation;
@@ -2143,11 +2156,14 @@ recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete)
   rtx mem = XEXP (SET_SRC (body), 0);
   rtx base_reg = XEXP (mem, 0);
 
-  rtx_insn *and_insn;
-  rtx and_operation = find_alignment_op (insn, base_reg, &and_insn);
+  auto_vec<rtx_insn *> and_insns;
+  auto_vec<rtx> and_ops;
+  bool is_any_def_and
+    = find_alignment_op (insn, base_reg, &and_insns, &and_ops);
 
-  if (and_operation != 0)
+  if (is_any_def_and)
     {
+      gcc_assert (and_insns.length () == and_ops.length ());
       df_ref def;
       struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
       FOR_EACH_INSN_INFO_DEF (def, insn_info)
@@ -2168,25 +2184,35 @@ recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete)
           to_delete[INSN_UID (swap_insn)].replace = true;
           to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
 
-          /* However, first we must be sure that we make the
-             base register from the AND operation available
-             in case the register has been overwritten.  Copy
-             the base register to a new pseudo and use that
-             as the base register of the AND operation in
-             the new LVX instruction.  */
-          rtx and_base = XEXP (and_operation, 0);
-          rtx new_reg = gen_reg_rtx (GET_MODE (and_base));
-          rtx copy = gen_rtx_SET (new_reg, and_base);
-          rtx_insn *new_insn = emit_insn_after (copy, and_insn);
-          set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
-          df_insn_rescan (new_insn);
-
-          XEXP (mem, 0) = gen_rtx_AND (GET_MODE (and_base), new_reg,
-                                       XEXP (and_operation, 1));
+          rtx new_reg = 0;
+          rtx and_mask = 0;
+          for (unsigned i = 0; i < and_insns.length (); ++i)
+            {
+              /* However, first we must be sure that we make the
+                 base register from the AND operation available
+                 in case the register has been overwritten.  Copy
+                 the base register to a new pseudo and use that
+                 as the base register of the AND operation in
+                 the new LVX instruction.  */
+              rtx_insn *and_insn = and_insns[i];
+              rtx and_op = and_ops[i];
+              rtx and_base = XEXP (and_op, 0);
+              if (!new_reg)
+                {
+                  new_reg = gen_reg_rtx (GET_MODE (and_base));
+                  and_mask = XEXP (and_op, 1);
+                }
+              rtx copy = gen_rtx_SET (new_reg, and_base);
+              rtx_insn *new_insn = emit_insn_after (copy, and_insn);
+              set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
+              df_insn_rescan (new_insn);
+            }
+
+          XEXP (mem, 0) = gen_rtx_AND (GET_MODE (new_reg), new_reg, and_mask);
           SET_SRC (body) = mem;
           INSN_CODE (insn) = -1; /* Force re-recognition.  */
           df_insn_rescan (insn);
-          
+
           if (dump_file)
             fprintf (dump_file, "lvx opportunity found at %d\n",
                      INSN_UID (insn));
@@ -2205,11 +2231,14 @@ recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete)
   rtx mem = SET_DEST (body);
   rtx base_reg = XEXP (mem, 0);
 
-  rtx_insn *and_insn;
-  rtx and_operation = find_alignment_op (insn, base_reg, &and_insn);
+  auto_vec<rtx_insn *> and_insns;
+  auto_vec<rtx> and_ops;
+  bool is_any_def_and
+    = find_alignment_op (insn, base_reg, &and_insns, &and_ops);
 
-  if (and_operation != 0)
+  if (is_any_def_and)
     {
+      gcc_assert (and_insns.length () == and_ops.length ());
       rtx src_reg = XEXP (SET_SRC (body), 0);
       df_ref src_use;
       struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
@@ -2234,25 +2263,35 @@ recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete)
           to_delete[INSN_UID (swap_insn)].replace = true;
           to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
 
-          /* However, first we must be sure that we make the
-             base register from the AND operation available
-             in case the register has been overwritten.  Copy
-             the base register to a new pseudo and use that
-             as the base register of the AND operation in
-             the new STVX instruction.  */
-          rtx and_base = XEXP (and_operation, 0);
-          rtx new_reg = gen_reg_rtx (GET_MODE (and_base));
-          rtx copy = gen_rtx_SET (new_reg, and_base);
-          rtx_insn *new_insn = emit_insn_after (copy, and_insn);
-          set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
-          df_insn_rescan (new_insn);
-
-          XEXP (mem, 0) = gen_rtx_AND (GET_MODE (and_base), new_reg,
-                                       XEXP (and_operation, 1));
+          rtx new_reg = 0;
+          rtx and_mask = 0;
+          for (unsigned i = 0; i < and_insns.length (); ++i)
+            {
+              /* However, first we must be sure that we make the
+                 base register from the AND operation available
+                 in case the register has been overwritten.  Copy
+                 the base register to a new pseudo and use that
+                 as the base register of the AND operation in
+                 the new STVX instruction.  */
+              rtx_insn *and_insn = and_insns[i];
+              rtx and_op = and_ops[i];
+              rtx and_base = XEXP (and_op, 0);
+              if (!new_reg)
+                {
+                  new_reg = gen_reg_rtx (GET_MODE (and_base));
+                  and_mask = XEXP (and_op, 1);
+                }
+              rtx copy = gen_rtx_SET (new_reg, and_base);
+              rtx_insn *new_insn = emit_insn_after (copy, and_insn);
+              set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
+              df_insn_rescan (new_insn);
+            }
+
+          XEXP (mem, 0) = gen_rtx_AND (GET_MODE (new_reg), new_reg, and_mask);
           SET_SRC (body) = src_reg;
           INSN_CODE (insn) = -1; /* Force re-recognition.  */
           df_insn_rescan (insn);
-          
+
           if (dump_file)
             fprintf (dump_file, "stvx opportunity found at %d\n",
                      INSN_UID (insn));
diff --git a/gcc/testsuite/gcc.target/powerpc/pr97019.c b/gcc/testsuite/gcc.target/powerpc/pr97019.c
new file mode 100644
index 00000000000..cb4cba4a284
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr97019.c
@@ -0,0 +1,82 @@
+/* This issue can only exist on little-endian P8 targets, since
+   the built-in functions vec_ld/vec_st will use lxvd2x/stxvd2x
+   (P8 big-endian) or lxv/stxv (P9 and later).  */
+/* { dg-do compile { target { powerpc_p8vector_ok && le } } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test there are no useless instructions "rldicr x,y,0,59"
+   to align the addresses for lvx/stvx.  */
+
+extern int a, b, c;
+extern vector unsigned long long ev5, ev6, ev7, ev8;
+extern int dummy (vector unsigned long long);
+
+int test_vec_ld(unsigned char *pe) {
+
+  vector unsigned long long v1, v2, v3, v4, v9;
+  vector unsigned long long v5 = ev5;
+  vector unsigned long long v6 = ev6;
+  vector unsigned long long v7 = ev7;
+  vector unsigned long long v8 = ev8;
+
+  unsigned char *e = pe;
+
+  do {
+    if (a) {
+      v1 = __builtin_vec_ld(16, (unsigned long long *)e);
+      v2 = __builtin_vec_ld(32, (unsigned long long *)e);
+      v3 = __builtin_vec_ld(48, (unsigned long long *)e);
+      e = e + 8;
+      for (int i = 0; i < a; i++) {
+        v4 = v5;
+        v5 = __builtin_crypto_vpmsumd(v1, v6);
+        v6 = __builtin_crypto_vpmsumd(v2, v7);
+        v7 = __builtin_crypto_vpmsumd(v3, v8);
+        e = e + 8;
+      }
+    }
+    v5 = __builtin_vec_ld(16, (unsigned long long *)e);
+    v6 = __builtin_vec_ld(32, (unsigned long long *)e);
+    v7 = __builtin_vec_ld(48, (unsigned long long *)e);
+    if (c)
+      b = 1;
+  } while (b);
+
+  return dummy(v4);
+}
+
+int test_vec_st(unsigned char *pe) {
+
+  vector unsigned long long v1, v2, v3, v4;
+  vector unsigned long long v5 = ev5;
+  vector unsigned long long v6 = ev6;
+  vector unsigned long long v7 = ev7;
+  vector unsigned long long v8 = ev8;
+
+  unsigned char *e = pe;
+
+  do {
+    if (a) {
+      __builtin_vec_st(v1, 16, (unsigned long long *)e);
+      __builtin_vec_st(v2, 32, (unsigned long long *)e);
+      __builtin_vec_st(v3, 48, (unsigned long long *)e);
+      e = e + 8;
+      for (int i = 0; i < a; i++) {
+        v4 = v5;
+        v5 = __builtin_crypto_vpmsumd(v1, v6);
+        v6 = __builtin_crypto_vpmsumd(v2, v7);
+        v7 = __builtin_crypto_vpmsumd(v3, v8);
+        e = e + 8;
+      }
+    }
+    __builtin_vec_st(v5, 16, (unsigned long long *)e);
+    __builtin_vec_st(v6, 32, (unsigned long long *)e);
+    __builtin_vec_st(v7, 48, (unsigned long long *)e);
+    if (c)
+      b = 1;
+  } while (b);
+
+  return dummy(v4);
+}
+
+/* { dg-final { scan-assembler-not "rldicr\[ \t\]+\[0-9\]+,\[0-9\]+,0,59" } } */
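For context on the final scan-assembler-not pattern (my reading of the
ISA, not something the patch itself states): rldicr x,y,0,59 rotates y
left by zero bits and clears bits 60..63 (IBM bit numbering), i.e. it
computes y & -16, and lvx/stvx already apply that 16-byte alignment to
their effective address, which is exactly why the explicit masking
instruction is useless here.  A tiny standalone check of that
arithmetic, for illustration only:

  #include <cassert>
  #include <cstdint>

  int
  main ()
  {
    uint64_t y = 0x123456789abcdef7ULL;
    /* What "rldicr x,y,0,59" computes: keep bits 0..59, clear bits
       60..63, i.e. y & -16.  */
    uint64_t x = y & ~UINT64_C (15);
    assert (x == 0x123456789abcdef0ULL);
    assert ((x & 15) == 0); /* 16-byte aligned, as lvx/stvx require.  */
    return 0;
  }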