Re: [PATCH,rs6000] Replace swap of a loaded vector constant with load of a swapped vector constant

Bill Schmidt Tue, 19 Sep 2017 14:17:30 -0700

Hi Kelvin,

This is in quite good shape.  In addition to Segher's comments, I have a few
questions/requests below.


> On Sep 15, 2017, at 4:04 PM, Kelvin Nilsen <kdnil...@linux.vnet.ibm.com> 
> wrote:
> 
> @@ -385,6 +396,25 @@ const_load_sequence_p (swap_web_entry *insn_entry,
>         split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
>         if (GET_CODE (base) != SYMBOL_REF || !CONSTANT_POOL_ADDRESS_P (base))
>           return false;
> +       else
> +         {
> +           /* FIXME: The conditions under which
> +            *  ((GET_CODE (const_vector) == SYMBOL_REF) &&
> +            *   !CONSTANT_POOL_ADDRESS_P (const_vector))
> +            * are not well understood.  This code prevents
> +            * an internal compiler error which will occur in
> +            * replace_swapped_load_constant () if we were to return
> +            * true.  Some day, we should figure out how to properly
> +            * handle this condition in
> +            * replace_swapped_load_constant () and then we can
> +            * remove this special test.  */
> +           rtx const_vector = get_pool_constant (base);
> +           if (GET_CODE (const_vector) == SYMBOL_REF)
> +             {
> +               if (!CONSTANT_POOL_ADDRESS_P (const_vector))
> +                 return false;
> +             }

Rewrite as

        if (GET_CODE (const_vector) == SYMBOL_REF
            && !CONSTANT_POOL_ADDRESS_P (const_vector))
          return false;

> +         }
>       }
>     }
>   return true;
> @@ -1281,6 +1311,189 @@ replace_swap_with_copy (swap_web_entry *insn_entry
>   insn->set_deleted ();
> }
> 
> +/* Given that swap_insn represents a swap of a load of a constant
> +   vector value, replace with a single instruction that loads a
> +   swapped variant of the original constant. 
> +
> +   The "natural" representation of a byte array in memory is the same
> +   for big endian and little endian.
> +
> +   unsigned char byte_array[] =
> +     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f };
> +
> +   However, when loaded into a vector register, the representation
> +   depends on endian conventions.
> +
> +   In big-endian mode, the register holds:
> +
> +     MSB                                            LSB
> +     [ f, e, d, c, b, a, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]
> +
> +   In little-endian mode, the register holds:
> +
> +     MSB                                            LSB
> +     [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f ]
> +
> +
> +   Word arrays require different handling.  Consider the word array:
> +
> +   unsigned int word_array[] =
> +     { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
> +
> +   The in-memory representation depends on endian configuration.  The
> +   equivalent array, declared as a byte array, in memory would be:
> +
> +   unsigned char big_endian_word_array_data[] =
> +     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f }
> +
> +   unsigned char little_endian_word_array_data[] =
> +     { 3, 2, 1, 0, 7, 6, 5, 4, b, a, 9, 8, f, e, d, c }
> +
> +   In big-endian mode, the register holds:
> +
> +     MSB                                            LSB
> +     [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f ]
> +
> +   In little-endian mode, the register holds:
> +
> +     MSB                                            LSB
> +     [ c, d, e, f, 8, 9, a, b, 4, 5, 6, 7, 0, 1, 2, 3 ]
> +
> +
> +  Similar transformations apply to the vector of half-word and vector
> +  of double-word representations.

Thanks for providing these examples.

> +
> +  For now, don't handle vectors of quad-precision values.  Just return.
> +  A better solution is to fix the code generator to emit lvx/stvx for
> +  those.  */
> +static void
> +replace_swapped_load_constant (swap_web_entry *insn_entry, rtx swap_insn)
> +{
> +  /* Find the load.  */
> +  struct df_insn_info *insn_info = DF_INSN_INFO_GET (swap_insn);
> +  rtx_insn *load_insn = 0;
> +  df_ref use;
> +
> +  FOR_EACH_INSN_INFO_USE (use, insn_info)
> +    {
> +      struct df_link *def_link = DF_REF_CHAIN (use);
> +      gcc_assert (def_link && !def_link->next);
> +      load_insn = DF_REF_INSN (def_link->ref);
> +      break;
> +    }
> +  gcc_assert (load_insn);

I agree with Segher, please rewrite the above to not use a loop.  The
code here is correct, just awkward.  You are assuming that the first
use in a swap instruction will be the register you're looking for, and
that is correct.

> +
> +  /* Find the TOC-relative symbol access.  */
> +  insn_info = DF_INSN_INFO_GET (load_insn);
> +  rtx_insn *tocrel_insn = 0;
> +  FOR_EACH_INSN_INFO_USE (use, insn_info)
> +    {
> +      struct df_link *def_link = DF_REF_CHAIN (use);
> +      gcc_assert (def_link && !def_link->next);
> +      tocrel_insn = DF_REF_INSN (def_link->ref);
> +      break;
> +    }
> +  gcc_assert (tocrel_insn);

Please check all the uses here.  I am a little nervous about making
a similar assumption about a base register being the first use in
the load.  While this is likely provably true, it isn't so obvious to me
that it will always stay true.

> +
> +  /* Find the embedded CONST_VECTOR.  We have to call toc_relative_expr_p
> +     to set tocrel_base; otherwise it would be unnecessary as we've
> +     already established it will return true.  */
> +  rtx base, offset;
> +  rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
> +  const_rtx tocrel_base;
> +  /* There is an extra level of indirection for small/large code models.  */
> +  if (GET_CODE (tocrel_expr) == MEM)
> +    tocrel_expr = XEXP (tocrel_expr, 0);
> +  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
> +    gcc_unreachable ();
> +  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
> +  rtx const_vector = get_pool_constant (base);
> +  /* With the extra indirection, get_pool_constant will produce the
> +     real constant from the reg_equal expression, so get the real
> +     constant.  */
> +  if (GET_CODE (const_vector) == SYMBOL_REF)
> +    const_vector = get_pool_constant (const_vector);
> +  gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);
> +
> +  rtx new_mem;
> +  enum machine_mode mode = GET_MODE (const_vector);
> +  /* Create an adjusted constant from the original constant.  */
> +
> +  if (mode == V1TImode)
> +    /* Leave this code as is.  */
> +    return;
> +  else if (mode == V16QImode)
> +    {
> +      rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (16));
> +      int i;
> +      for (i = 0; i < 16; i++)
> +     XVECEXP (vals, 0, ((i+8) % 16)) = XVECEXP (const_vector, 0, i);
> +      rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
> +      new_mem = force_const_mem (mode, new_const_vector);
> +    }
> +  else if ((mode == V4SImode) || (mode == V4SFmode))
> +    {
> +      rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (4));
> +      int i;
> +      for (i = 0; i < 4; i++)
> +     XVECEXP (vals, 0, ((i+2) % 4)) = XVECEXP (const_vector, 0, i);
> +      rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
> +      new_mem = force_const_mem (mode, new_const_vector);
> +    }

Please move the following V8HI code above the V4SF/V4SI code.

> +  else if ((mode == V8HImode)
> +#ifdef HAVE_V8HFmode
> +        || (mode == V8HFmode)
> +#endif

Anticipating possible _Float16?  I guess that doesn't hurt, we can leave that 
in.

> +        )
> +    {
> +      rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (8));
> +      int i;
> +      for (i = 0; i < 8; i++)
> +     XVECEXP (vals, 0, ((i+4) % 8)) = XVECEXP (const_vector, 0, i);
> +      rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
> +      new_mem = force_const_mem (mode, new_const_vector);
> +    }
> +  else if ((mode == V2DImode) || (mode == V2DFmode))
> +    {
> +      rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (2));
> +      int i;
> +      /* Sometimes, load of a V1TImode vector is represented as a load
> +      of two double words with a swap on top of it.  */
> +      for (i = 0; i < 2; i++)
> +     XVECEXP (vals, 0, ((i+1) % 2)) = XVECEXP (const_vector, 0, i);
> +      rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
> +      new_mem = force_const_mem (mode, new_const_vector);
> +    }
> +  else
> +    {
> +      /* We do not expect other modes to be constant-load-swapped.  */
> +      gcc_assert (0);
> +    }
> +
> +  /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
> +     can't recognize.  Force the SYMBOL_REF into a register.  */
> +  if (!REG_P (XEXP (new_mem, 0))) {
> +    rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
> +    XEXP (new_mem, 0) = base_reg;
> +    /* Move the newly created insn ahead of the load insn.  */
> +    rtx_insn *force_insn = get_last_insn ();
> +    remove_insn (force_insn);

Uh, I don't understand what remove_insn is doing here.  The force_insn is still
"dangling" (it hasn't been inserted anywhere), so there's no reason to remove
it.  I guess this must be harmless, but please remove the call, as it is 
confusing...

> +    rtx_insn *before_load_insn = PREV_INSN (load_insn);
> +    add_insn_after (force_insn, before_load_insn, BLOCK_FOR_INSN 
> (load_insn));
> +    df_insn_rescan (before_load_insn);
> +    df_insn_rescan (force_insn);
> +  }
> +
> +  /* Replace the MEM in the load instruction and rescan it.  */
> +  XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
> +  INSN_CODE (load_insn) = -1; /* Force re-recognition.  */
> +  df_insn_rescan (load_insn);
> +
> +  unsigned int uid = INSN_UID (swap_insn);
> +  mark_swaps_for_removal (insn_entry, uid);
> +  replace_swap_with_copy (insn_entry, uid);
> +}
> +
> /* Dump the swap table to DUMP_FILE.  */
> static void
> dump_swap_insn_table (swap_web_entry *insn_entry)
> @@ -1873,6 +2086,47 @@ rs6000_analyze_swaps (function *fun)
> 
>   /* Clean up.  */
>   free (insn_entry);
> +
> +  /* Use additional pass over rtl to replace swap(load(vector constant))
> +     with load(swapped vector constant). */
> +  swap_web_entry *pass2_insn_entry;
> +  pass2_insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
> +
> +  /* Walk the insns to gather basic data.  */
> +  FOR_ALL_BB_FN (bb, fun)
> +    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
> +    {
> +      unsigned int uid = INSN_UID (insn);
> +      if (NONDEBUG_INSN_P (insn))
> +     {
> +       pass2_insn_entry[uid].insn = insn;
> +
> +       pass2_insn_entry[uid].is_relevant = 1;
> +       pass2_insn_entry[uid].is_load = insn_is_load_p (insn);
> +       pass2_insn_entry[uid].is_store = insn_is_store_p (insn);
> +
> +       /* Determine if this is a doubleword swap.  If not,
> +          determine whether it can legally be swapped.  */
> +       if (insn_is_swap_p (insn))
> +         pass2_insn_entry[uid].is_swap = 1;
> +     }
> +    }
> +
> +  {
> +    unsigned e = get_max_uid (), i;
> +    for (i = 0; i < e; ++i)
> +      {
> +     if (pass2_insn_entry[i].is_swap && !pass2_insn_entry[i].is_load)

  && !pass2_insn_entry[i].is_store

As is, you are letting through the insns representing stxvd2x.  I assume
that const_load_sequence_p is able to silently ignore this, but it's better
not to give it a bomb to play with in the first place.

> +       {
> +         insn = pass2_insn_entry[i].insn;
> +         if (const_load_sequence_p (pass2_insn_entry, insn))
> +           replace_swapped_load_constant (pass2_insn_entry, insn);
> +       }
> +      }
> +  }
> +
> +  /* Clean up.  */
> +  free (pass2_insn_entry);
>   return 0;
> }
> 
Below tests look good modulo Segher's comments.  Please add V8HI, V4SI, V4SF,
V2DI, and V2DF variants inside each test so we get coverage for those cases.

Thanks for fixing this longstanding issue!

Bill

> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-28.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-28.c    (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-28.c    (working copy)
> @@ -0,0 +1,30 @@
> +/* { dg-require-effective-target powerpc_p8vector_ok } */
> +/* { dg-do run { target { powerpc64*-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3 " } */
> +
> +#include <altivec.h>
> +
> +extern void abort (void);
> +
> +vector char y = { 0, 1, 2, 3,
> +               4, 5, 6, 7,
> +               8, 9, 10, 11,
> +               12, 13, 14, 15 };
> +
> +vector char
> +foo (void)
> +{
> +  vector char x = vec_xl (0, &y);
> +  return y;
> +}
> +
> +int
> +main (int argc, char *argv[])
> +{
> +  vector char fetched_value = foo ();
> +  if (fetched_value[0] != 0 || fetched_value[15] != 15)
> +    abort ();
> +  else
> +    return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swaps-p8-29.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swaps-p8-29.c    (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swaps-p8-29.c    (working copy)
> @@ -0,0 +1,30 @@
> +/* { dg-require-effective-target powerpc_p8vector_ok } */
> +/* { dg-do run { target { powerpc64*-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3 " } */
> +
> +#include <altivec.h>
> +
> +extern void abort (void);
> +
> +const vector char y = { 0, 1, 2, 3,
> +                     4, 5, 6, 7,
> +                     8, 9, 10, 11,
> +                     12, 13, 14, 15 };
> +
> +vector char
> +foo (void)
> +{
> +  vector char x = vec_xl (0, &y);
> +  return y;
> +}
> +
> +int
> +main (int argc, char *argv[])
> +{
> +  vector char fetched_value = foo ();
> +  if (fetched_value[0] != 0 || fetched_value[15] != 15)
> +    abort ();
> +  else
> +    return 0;
> +}
> Index: gcc/testsuite/gcc.target/powerpc/swps-p8-30.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/swps-p8-30.c     (revision 0)
> +++ gcc/testsuite/gcc.target/powerpc/swps-p8-30.c     (working copy)
> @@ -0,0 +1,36 @@
> +/* This file's name was changed from swaps-p8-30.c so that I could
> +   search the assembler for "not swap", since swap is an alias for
> +   xxpermdi.  With the original file name, the assembler search would
> +   get a false positive on the name of the file.  */
> +/* { dg-require-effective-target powerpc_p8vector_ok } */
> +/* { dg-do compile { target { powerpc64le-*-* } } } */
> +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
> "-mcpu=power8" } } */
> +/* { dg-options "-mcpu=power8 -O3 " } */
> +/* { dg-final { scan-assembler-not "xxpermdi" } } */
> +/* { dg-final { scan-assembler-not "swap" } } */
> +
> +#include <altivec.h>
> +
> +extern void abort (void);
> +
> +const vector char y = { 0, 1, 2, 3,
> +                     4, 5, 6, 7,
> +                     8, 9, 10, 11,
> +                     12, 13, 14, 15 };
> +
> +vector char
> +foo (void)
> +{
> +  vector char x = vec_xl (0, &y);
> +  return y;
> +}
> +
> +int
> +main (int argc, char *argv[])
> +{
> +  vector char fetched_value = foo ();
> +  if (fetched_value[0] != 0 || fetched_value[15] != 15)
> +    abort ();
> +  else
> +    return 0;
> +}
>

Re: [PATCH,rs6000] Replace swap of a loaded vector constant with load of a swapped vector constant

Reply via email to