Re: [PATCH v4] LoongArch: Optimize immediate load.

Xi Ruoyao via Gcc-patches Tue, 22 Nov 2022 08:44:40 -0800

On Tue, 2022-11-22 at 22:03 +0800, Xi Ruoyao via Gcc-patches wrote:
> While I still can't fully understand the immediate load issue and how
> this patch fixes it, I've tested this patch (alongside the prefetch
> instruction patch) with bootstrap-ubsan.  And the compiled result of
> imm-load1.c seems OK.


And it's doing the correct thing for the Glibc "improved generic string
functions" patch, producing some really tight loops now.

> 
> On Thu, 2022-11-17 at 17:59 +0800, Lulu Cheng wrote:
> > v1 -> v2:
> > 1. Change the code format.
> > 2. Fix bugs in the code.
> > 
> > v2 -> v3:
> > Modify a code implementation that had undefined behavior.
> > 
> > v3 -> v4:
> > Move the immediate number decomposition from the expand pass to the
> > split pass.
> > 
> > Both regression tests and spec2006 passed.
> > 
> > The problem mentioned in the link below was that the four immediate
> > load instructions were not moved out of the loop.  This has been
> > fixed: now, as in the test case, the four immediate load instructions
> > are generated outside the loop.
> > (
> > https://sourceware.org/pipermail/libc-alpha/2022-September/142202.html
> > )
> > 
> > --------------------------------------------------------------------
> > The loop2_invariant pass hoists instructions that do not change
> > inside a loop out of the loop.  If the immediate is decomposed during
> > the expand pass, some of the resulting instructions do not meet the
> > hoisting conditions, so the immediate decomposition is moved to the
> > split pass.
> > 
> > gcc/ChangeLog:
> > 
> >         * config/loongarch/loongarch.cc (enum
> > loongarch_load_imm_method):
> >         Remove the member METHOD_INSV that is not currently used.
> >         (struct loongarch_integer_op): Define a new member
> > curr_value,
> >         that records the value of the number stored in the
> > destination
> >         register immediately after the current instruction has run.
> >         (loongarch_build_integer): Assign a value to the curr_value
> > member variable.
> >         (loongarch_move_integer): Add REG_EQUAL notes recording the
> > intermediate value to the immediate load instructions.
> >         * config/loongarch/loongarch.md (*movdi_32bit): Redefine as
> > define_insn_and_split.
> >         (*movdi_64bit): Likewise.
> >         (*movsi_internal): Likewise.
> >         (*movhi_internal): Likewise.
> >         * config/loongarch/predicates.md: Return true for any
> >         CONST_INT, to ensure that the immediate number is not
> >         decomposed during the expand pass.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> >         * gcc.target/loongarch/imm-load.c: New test.
> >         * gcc.target/loongarch/imm-load1.c: New test.
> > ---
> >  gcc/config/loongarch/loongarch.cc             | 62 ++++++++++------
> > --
> > -
> >  gcc/config/loongarch/loongarch.md             | 44 +++++++++++--
> >  gcc/config/loongarch/predicates.md            |  2 +-
> >  gcc/testsuite/gcc.target/loongarch/imm-load.c | 10 +++
> >  .../gcc.target/loongarch/imm-load1.c          | 26 ++++++++
> >  5 files changed, 110 insertions(+), 34 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c
> >  create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load1.c
> > 
> > diff --git a/gcc/config/loongarch/loongarch.cc
> > b/gcc/config/loongarch/loongarch.cc
> > index 8ee32c90573..9e0d6c7c3ea 100644
> > --- a/gcc/config/loongarch/loongarch.cc
> > +++ b/gcc/config/loongarch/loongarch.cc
> > @@ -139,22 +139,21 @@ struct loongarch_address_info
> >  
> >     METHOD_LU52I:
> >       Load 52-63 bit of the immediate number.
> > -
> > -   METHOD_INSV:
> > -     immediate like 0xfff00000fffffxxx
> > -   */
> > +*/
> >  enum loongarch_load_imm_method
> >  {
> >    METHOD_NORMAL,
> >    METHOD_LU32I,
> > -  METHOD_LU52I,
> > -  METHOD_INSV
> > +  METHOD_LU52I
> >  };
> >  
> >  struct loongarch_integer_op
> >  {
> >    enum rtx_code code;
> >    HOST_WIDE_INT value;
> > +  /* Represent the result of the immediate count of the load
> > instruction at
> > +     each step.  */
> > +  HOST_WIDE_INT curr_value;
> >    enum loongarch_load_imm_method method;
> >  };
> >  
> > @@ -1475,24 +1474,27 @@ loongarch_build_integer (struct
> > loongarch_integer_op *codes,
> >      {
> >        /* The value of the lower 32 bit be loaded with one
> > instruction.
> >          lu12i.w.  */
> > -      codes[0].code = UNKNOWN;
> > -      codes[0].method = METHOD_NORMAL;
> > -      codes[0].value = low_part;
> > +      codes[cost].code = UNKNOWN;
> > +      codes[cost].method = METHOD_NORMAL;
> > +      codes[cost].value = low_part;
> > +      codes[cost].curr_value = low_part;
> >        cost++;
> >      }
> >    else
> >      {
> >        /* lu12i.w + ior.  */
> > -      codes[0].code = UNKNOWN;
> > -      codes[0].method = METHOD_NORMAL;
> > -      codes[0].value = low_part & ~(IMM_REACH - 1);
> > +      codes[cost].code = UNKNOWN;
> > +      codes[cost].method = METHOD_NORMAL;
> > +      codes[cost].value = low_part & ~(IMM_REACH - 1);
> > +      codes[cost].curr_value = codes[cost].value;
> >        cost++;
> >        HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1);
> >        if (iorv != 0)
> >         {
> > -         codes[1].code = IOR;
> > -         codes[1].method = METHOD_NORMAL;
> > -         codes[1].value = iorv;
> > +         codes[cost].code = IOR;
> > +         codes[cost].method = METHOD_NORMAL;
> > +         codes[cost].value = iorv;
> > +         codes[cost].curr_value = low_part;
> >           cost++;
> >         }
> >      }
> > @@ -1515,11 +1517,14 @@ loongarch_build_integer (struct
> > loongarch_integer_op *codes,
> >         {
> >           codes[cost].method = METHOD_LU52I;
> >           codes[cost].value = value & LU52I_B;
> > +         codes[cost].curr_value = value;
> >           return cost + 1;
> >         }
> >  
> >        codes[cost].method = METHOD_LU32I;
> >        codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B :
> > 0);
> > +      codes[cost].curr_value = (value & 0xfffffffffffff)
> > +       | (sign51 ? LU52I_B : 0);
> >        cost++;
> >  
> >        /* Determine whether the 52-61 bits are sign-extended from
> > the
> > low order,
> > @@ -1528,6 +1533,7 @@ loongarch_build_integer (struct
> > loongarch_integer_op *codes,
> >         {
> >           codes[cost].method = METHOD_LU52I;
> >           codes[cost].value = value & LU52I_B;
> > +         codes[cost].curr_value = value;
> >           cost++;
> >         }
> >      }
> > @@ -2911,6 +2917,9 @@ loongarch_move_integer (rtx temp, rtx dest,
> > unsigned HOST_WIDE_INT value)
> >        else
> >         x = force_reg (mode, x);
> >  
> > +      set_unique_reg_note (get_last_insn (), REG_EQUAL,
> > +                          GEN_INT (codes[i-1].curr_value));
> > +
> >        switch (codes[i].method)
> >         {
> >         case METHOD_NORMAL:
> > @@ -2918,22 +2927,17 @@ loongarch_move_integer (rtx temp, rtx dest,
> > unsigned HOST_WIDE_INT value)
> >                               GEN_INT (codes[i].value));
> >           break;
> >         case METHOD_LU32I:
> > -         emit_insn (
> > -           gen_rtx_SET (x,
> > -                        gen_rtx_IOR (DImode,
> > -                                     gen_rtx_ZERO_EXTEND (
> > -                                       DImode, gen_rtx_SUBREG
> > (SImode, x, 0)),
> > -                                     GEN_INT (codes[i].value))));
> > +         gcc_assert (mode == DImode);
> > +         x = gen_rtx_IOR (DImode,
> > +                          gen_rtx_ZERO_EXTEND (DImode,
> > +                                               gen_rtx_SUBREG
> > (SImode, x, 0)),
> > +                          GEN_INT (codes[i].value));
> >           break;
> >         case METHOD_LU52I:
> > -         emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff),
> > -                                 GEN_INT (codes[i].value)));
> > -         break;
> > -       case METHOD_INSV:
> > -         emit_insn (
> > -           gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT
> > (20),
> > -                                              GEN_INT (32)),
> > -                        gen_rtx_REG (DImode, 0)));
> > +         gcc_assert (mode == DImode);
> > +         x = gen_rtx_IOR (DImode,
> > +                          gen_rtx_AND (DImode, x, GEN_INT
> > (0xfffffffffffff)),
> > +                          GEN_INT (codes[i].value));
> >           break;
> >         default:
> >           gcc_unreachable ();
> > diff --git a/gcc/config/loongarch/loongarch.md
> > b/gcc/config/loongarch/loongarch.md
> > index 2fda5381904..f61db66d535 100644
> > --- a/gcc/config/loongarch/loongarch.md
> > +++ b/gcc/config/loongarch/loongarch.md
> > @@ -1718,23 +1718,41 @@ (define_expand "movdi"
> >      DONE;
> >  })
> >  
> > -(define_insn "*movdi_32bit"
> > +(define_insn_and_split "*movdi_32bit"
> >    [(set (match_operand:DI 0 "nonimmediate_operand"
> > "=r,r,r,w,*f,*f,*r,*m")
> >         (match_operand:DI 1 "move_operand"
> > "r,i,w,r,*J*r,*m,*f,*f"))]
> >    "!TARGET_64BIT
> >     && (register_operand (operands[0], DImode)
> >         || reg_or_0_operand (operands[1], DImode))"
> >    { return loongarch_output_move (operands[0], operands[1]); }
> > +  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P
> > (REGNO
> > +  (operands[0]))"
> > +  [(const_int 0)]
> > +  "
> > +{
> > +  loongarch_move_integer (operands[0], operands[0], INTVAL
> > (operands[1]));
> > +  DONE;
> > +}
> > +  "
> >    [(set_attr "move_type"
> > "move,const,load,store,mgtf,fpload,mftg,fpstore")
> >     (set_attr "mode" "DI")])
> >  
> > -(define_insn "*movdi_64bit"
> > +(define_insn_and_split "*movdi_64bit"
> >    [(set (match_operand:DI 0 "nonimmediate_operand"
> > "=r,r,r,w,*f,*f,*r,*m")
> >         (match_operand:DI 1 "move_operand"
> > "r,Yd,w,rJ,*r*J,*m,*f,*f"))]
> >    "TARGET_64BIT
> >     && (register_operand (operands[0], DImode)
> >         || reg_or_0_operand (operands[1], DImode))"
> >    { return loongarch_output_move (operands[0], operands[1]); }
> > +  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P
> > (REGNO
> > +  (operands[0]))"
> > +  [(const_int 0)]
> > +  "
> > +{
> > +  loongarch_move_integer (operands[0], operands[0], INTVAL
> > (operands[1]));
> > +  DONE;
> > +}
> > +  "
> >    [(set_attr "move_type"
> > "move,const,load,store,mgtf,fpload,mftg,fpstore")
> >     (set_attr "mode" "DI")])
> >  
> > @@ -1749,12 +1767,21 @@ (define_expand "movsi"
> >      DONE;
> >  })
> >  
> > -(define_insn "*movsi_internal"
> > +(define_insn_and_split "*movsi_internal"
> >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > "=r,r,r,w,*f,*f,*r,*m,*r,*z")
> >         (match_operand:SI 1 "move_operand"
> > "r,Yd,w,rJ,*r*J,*m,*f,*f,*z,*r"))]
> >    "(register_operand (operands[0], SImode)
> >      || reg_or_0_operand (operands[1], SImode))"
> >    { return loongarch_output_move (operands[0], operands[1]); }
> > +  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P
> > (REGNO
> > +  (operands[0]))"
> > +  [(const_int 0)]
> > +  "
> > +{
> > +  loongarch_move_integer (operands[0], operands[0], INTVAL
> > (operands[1]));
> > +  DONE;
> > +}
> > +  "
> >    [(set_attr "move_type"
> > "move,const,load,store,mgtf,fpload,mftg,fpstore,mftg,mgtf")
> >     (set_attr "mode" "SI")])
> >  
> > @@ -1774,12 +1801,21 @@ (define_expand "movhi"
> >      DONE;
> >  })
> >  
> > -(define_insn "*movhi_internal"
> > +(define_insn_and_split "*movhi_internal"
> >    [(set (match_operand:HI 0 "nonimmediate_operand"
> > "=r,r,r,r,m,r,k")
> >         (match_operand:HI 1 "move_operand" "r,Yd,I,m,rJ,k,rJ"))]
> >    "(register_operand (operands[0], HImode)
> >         || reg_or_0_operand (operands[1], HImode))"
> >    { return loongarch_output_move (operands[0], operands[1]); }
> > +  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P
> > (REGNO
> > +  (operands[0]))"
> > +  [(const_int 0)]
> > +  "
> > +{
> > +  loongarch_move_integer (operands[0], operands[0], INTVAL
> > (operands[1]));
> > +  DONE;
> > +}
> > +  "
> >    [(set_attr "move_type" "move,const,const,load,store,load,store")
> >     (set_attr "mode" "HI")])
> >  
> > diff --git a/gcc/config/loongarch/predicates.md
> > b/gcc/config/loongarch/predicates.md
> > index 8bd0c1376c9..58c3dc2261c 100644
> > --- a/gcc/config/loongarch/predicates.md
> > +++ b/gcc/config/loongarch/predicates.md
> > @@ -226,7 +226,7 @@ (define_predicate "move_operand"
> >    switch (GET_CODE (op))
> >      {
> >      case CONST_INT:
> > -      return !splittable_const_int_operand (op, mode);
> > +      return true;
> >  
> >      case CONST:
> >      case SYMBOL_REF:
> > diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load.c
> > b/gcc/testsuite/gcc.target/loongarch/imm-load.c
> > new file mode 100644
> > index 00000000000..c04ca33996f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/loongarch/imm-load.c
> > @@ -0,0 +1,10 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mabi=lp64d -O2 -fdump-rtl-split1" } */
> > +
> > +long int
> > +test (void)
> > +{
> > +  return 0x1234567890abcdef;
> > +}
> > +/* { dg-final { scan-rtl-dump-times "scanning new insn with uid" 6
> > "split1" } } */
> > +
> > diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load1.c
> > b/gcc/testsuite/gcc.target/loongarch/imm-load1.c
> > new file mode 100644
> > index 00000000000..2ff02971239
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/loongarch/imm-load1.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mabi=lp64d -O2" } */
> > +/* { dg-final { scan-assembler
> > "test:.*lu52i\.d.*\n\taddi\.w.*\n\.L2:" } } */
> > +
> > +
> > +extern long long b[10];
> > +static inline long long
> > +repeat_bytes (void)
> > +{
> > +  long long r = 0x0101010101010101;
> > +
> > +  return r;
> > +}
> > +
> > +static inline long long
> > +highbit_mask (long long m)
> > +{
> > +  return m & repeat_bytes ();
> > +}
> > +
> > +void test(long long *a)
> > +{
> > +  for (int i = 0; i < 10; i++)
> > +    b[i] = highbit_mask (a[i]);
> > +
> > +}
> 

-- 
Xi Ruoyao <xry...@xry111.site>
School of Aerospace Science and Technology, Xidian University

Re: [PATCH v4] LoongArch: Optimize immediate load.

Reply via email to