On Wed, 2023-04-19 at 11:03 +0800, Lulu Cheng wrote:

/* snip */

> > +loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT
> > length,
> > +                              HOST_WIDE_INT delta)
> >   {
> > -  HOST_WIDE_INT offset, delta;
> > -  unsigned HOST_WIDE_INT bits;
> > +  HOST_WIDE_INT offs, delta_cur;
> >     int i;
> >     machine_mode mode;
> >     rtx *regs;
> >   
> > -  bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN
> > (dest)));
> > -
> > -  mode = int_mode_for_size (bits, 0).require ();
> > -  delta = bits / BITS_PER_UNIT;
> > +  HOST_WIDE_INT num_reg = length / delta;
> 
> I think comments need to be added here; without closely tracing the
> code, it is not easy to understand.

Pushed r14-70 with the following addition:

+  /* Calculate how many registers we'll need for the block move.
+     We'll emit length / delta move operations with delta as the size
+     first.  Then we may still have length % delta bytes not copied.
+     We handle these remaining bytes by move operations with smaller
+     (halved) sizes.  For example, if length = 21 and delta = 8, we'll
+     emit two ld.d/st.d pairs, one ld.w/st.w pair, and one ld.b/st.b
+     pair.  For each load/store pair we use a dedicated register to keep
+     the pipeline as populated as possible.  */

> Otherwise LGTM!
> 
> Thanks!
> 
> > +  for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2)
> > +    num_reg += !!(length & delta_cur);
> >   
> >     /* Allocate a buffer for the temporary registers.  */
> > -  regs = XALLOCAVEC (rtx, length / delta);
> > +  regs = XALLOCAVEC (rtx, num_reg);
> >   
> > -  /* Load as many BITS-sized chunks as possible.  Use a normal load
> > if
> > -     the source has enough alignment, otherwise use left/right
> > pairs.  */
> > -  for (offset = 0, i = 0; offset + delta <= length; offset +=
> > delta, i++)
> > +  for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur
> > /= 2)
> >       {
> > -      regs[i] = gen_reg_rtx (mode);
> > -      loongarch_emit_move (regs[i], adjust_address (src, mode,
> > offset));
> > -    }
> > +      mode = int_mode_for_size (delta_cur * BITS_PER_UNIT,
> > 0).require ();
> >   
> > -  for (offset = 0, i = 0; offset + delta <= length; offset +=
> > delta, i++)
> > -    loongarch_emit_move (adjust_address (dest, mode, offset),
> > regs[i]);
> > +      for (; offs + delta_cur <= length; offs += delta_cur, i++)
> > +       {
> > +         regs[i] = gen_reg_rtx (mode);
> > +         loongarch_emit_move (regs[i], adjust_address (src, mode,
> > offs));
> > +       }
> > +    }
> >   
> > -  /* Mop up any left-over bytes.  */
> > -  if (offset < length)
> > +  for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur
> > /= 2)
> >       {
> > -      src = adjust_address (src, BLKmode, offset);
> > -      dest = adjust_address (dest, BLKmode, offset);
> > -      move_by_pieces (dest, src, length - offset,
> > -                     MIN (MEM_ALIGN (src), MEM_ALIGN (dest)),
> > -                     (enum memop_ret) 0);
> > +      mode = int_mode_for_size (delta_cur * BITS_PER_UNIT,
> > 0).require ();
> > +
> > +      for (; offs + delta_cur <= length; offs += delta_cur, i++)
> > +       loongarch_emit_move (adjust_address (dest, mode, offs),
> > regs[i]);
> >       }
> >   }
> >   
> > @@ -4523,10 +4520,11 @@ loongarch_adjust_block_mem (rtx mem,
> > HOST_WIDE_INT length, rtx *loop_reg,
> >   
> >   static void
> >   loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT
> > length,
> > -                          HOST_WIDE_INT bytes_per_iter)
> > +                          HOST_WIDE_INT align)
> >   {
> >     rtx_code_label *label;
> >     rtx src_reg, dest_reg, final_src, test;
> > +  HOST_WIDE_INT bytes_per_iter = align *
> > LARCH_MAX_MOVE_OPS_PER_LOOP_ITER;
> >     HOST_WIDE_INT leftover;
> >   
> >     leftover = length % bytes_per_iter;
> > @@ -4546,7 +4544,7 @@ loongarch_block_move_loop (rtx dest, rtx src,
> > HOST_WIDE_INT length,
> >     emit_label (label);
> >   
> >     /* Emit the loop body.  */
> > -  loongarch_block_move_straight (dest, src, bytes_per_iter);
> > +  loongarch_block_move_straight (dest, src, bytes_per_iter, align);
> >   
> >     /* Move on to the next block.  */
> >     loongarch_emit_move (src_reg,
> > @@ -4563,7 +4561,7 @@ loongarch_block_move_loop (rtx dest, rtx src,
> > HOST_WIDE_INT length,
> >   
> >     /* Mop up any left-over bytes.  */
> >     if (leftover)
> > -    loongarch_block_move_straight (dest, src, leftover);
> > +    loongarch_block_move_straight (dest, src, leftover, align);
> >     else
> >       /* Temporary fix for PR79150.  */
> >       emit_insn (gen_nop ());
> > @@ -4573,25 +4571,32 @@ loongarch_block_move_loop (rtx dest, rtx
> > src, HOST_WIDE_INT length,
> >      memory reference SRC to memory reference DEST.  */
> >   
> >   bool
> > -loongarch_expand_block_move (rtx dest, rtx src, rtx length)
> > +loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx
> > r_align)
> >   {
> > -  int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT;
> > +  if (!CONST_INT_P (r_length))
> > +    return false;
> > +
> > +  HOST_WIDE_INT length = INTVAL (r_length);
> > +  if (length > loongarch_max_inline_memcpy_size)
> > +    return false;
> > +
> > +  HOST_WIDE_INT align = INTVAL (r_align);
> > +
> > +  if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD)
> > +    align = UNITS_PER_WORD;
> >   
> > -  if (CONST_INT_P (length)
> > -      && INTVAL (length) <= loongarch_max_inline_memcpy_size)
> > +  if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT)
> >       {
> > -      if (INTVAL (length) <= max_move_bytes)
> > -       {
> > -         loongarch_block_move_straight (dest, src, INTVAL
> > (length));
> > -         return true;
> > -       }
> > -      else if (optimize)
> > -       {
> > -         loongarch_block_move_loop (dest, src, INTVAL (length),
> > -                                   
> > LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER);
> > -         return true;
> > -       }
> > +      loongarch_block_move_straight (dest, src, length, align);
> > +      return true;
> > +    }
> > +
> > +  if (optimize)
> > +    {
> > +      loongarch_block_move_loop (dest, src, length, align);
> > +      return true;
> >       }
> > +
> >     return false;
> >   }
> >   
> > diff --git a/gcc/config/loongarch/loongarch.h
> > b/gcc/config/loongarch/loongarch.h
> > index 7151d5cabb3..1bcd144a5d9 100644
> > --- a/gcc/config/loongarch/loongarch.h
> > +++ b/gcc/config/loongarch/loongarch.h
> > @@ -1063,13 +1063,13 @@ typedef struct {
> >   
> >   /* The maximum number of bytes that can be copied by one iteration
> > of
> >      a cpymemsi loop; see loongarch_block_move_loop.  */
> > -#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4)
> > +#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4
> >   
> >   /* The maximum number of bytes that can be copied by a straight-
> > line
> >      implementation of cpymemsi; see loongarch_block_move_straight. 
> > We want
> >      to make sure that any loop-based implementation will iterate at
> >      least twice.  */
> > -#define LARCH_MAX_MOVE_BYTES_STRAIGHT
> > (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2)
> > +#define LARCH_MAX_MOVE_OPS_STRAIGHT
> > (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2)
> >   
> >   /* The base cost of a memcpy call, for MOVE_RATIO and friends. 
> > These
> >      values were determined experimentally by benchmarking with
> > CSiBE.
> > @@ -1077,7 +1077,7 @@ typedef struct {
> >   #define LARCH_CALL_RATIO 8
> >   
> >   /* Any loop-based implementation of cpymemsi will have at least
> > -   LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory
> > +   LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory
> >      moves, so allow individual copies of fewer elements.
> >   
> >      When cpymemsi is not available, use a value approximating
> > @@ -1088,9 +1088,7 @@ typedef struct {
> >      value of LARCH_CALL_RATIO to take that into account.  */
> >   
> >   #define MOVE_RATIO(speed) \
> > -  (HAVE_cpymemsi \
> > -   ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \
> > -   : CLEAR_RATIO (speed) / 2)
> > +  (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO
> > (speed) / 2)
> >   
> >   /* For CLEAR_RATIO, when optimizing for size, give a better
> > estimate
> >      of the length of a memset call, but use the default otherwise. 
> > */
> > diff --git a/gcc/config/loongarch/loongarch.md
> > b/gcc/config/loongarch/loongarch.md
> > index 628ecc78088..816a943d155 100644
> > --- a/gcc/config/loongarch/loongarch.md
> > +++ b/gcc/config/loongarch/loongarch.md
> > @@ -2488,7 +2488,8 @@ (define_expand "cpymemsi"
> >     ""
> >   {
> >     if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P
> > -      && loongarch_expand_block_move (operands[0], operands[1],
> > operands[2]))
> > +      && loongarch_expand_block_move (operands[0], operands[1],
> > +                                     operands[2], operands[3]))
> >       DONE;
> >     else
> >       FAIL;
> > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-1.c
> > b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c
> > new file mode 100644
> > index 00000000000..4cd35d13904
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c
> > @@ -0,0 +1,9 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mabi=lp64d -mno-strict-align" } */
> > +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
> > +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
> > +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
> > +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
> > +
> > +extern char a[], b[];
> > +void test() { __builtin_memcpy(a, b, 15); }
> > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-2.c
> > b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c
> > new file mode 100644
> > index 00000000000..703eb951c6d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c
> > @@ -0,0 +1,9 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */
> > +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
> > +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
> > +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
> > +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
> > +
> > +extern long a[], b[];
> > +void test() { __builtin_memcpy(a, b, 15); }
> > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-3.c
> > b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c
> > new file mode 100644
> > index 00000000000..d6a80659b31
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */
> > +
> > +/* Three loop iterations each contains 4 st.b, and 3 st.b after the
> > loop */
> > +/* { dg-final { scan-assembler-times "st\\.b" 7 } } */
> > +
> > +/* { dg-final { scan-assembler-not "st\\.h" } } */
> > +/* { dg-final { scan-assembler-not "st\\.w|stptr\\.w" } } */
> > +/* { dg-final { scan-assembler-not "st\\.d|stptr\\.d" } } */
> > +
> > +extern char a[], b[];
> > +void test() { __builtin_memcpy(a, b, 15); }
> 

-- 
Xi Ruoyao <xry...@xry111.site>
School of Aerospace Science and Technology, Xidian University

Reply via email to