+ for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2)
+ num_reg += !!(length & delta_cur);
/* Allocate a buffer for the temporary registers. */
- regs = XALLOCAVEC (rtx, length / delta);
+ regs = XALLOCAVEC (rtx, num_reg);
- /* Load as many BITS-sized chunks as possible. Use a normal load if
- the source has enough alignment, otherwise use left/right pairs. */
- for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
+ for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2)
{
- regs[i] = gen_reg_rtx (mode);
- loongarch_emit_move (regs[i], adjust_address (src, mode, offset));
- }
+ mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require ();
- for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
- loongarch_emit_move (adjust_address (dest, mode, offset), regs[i]);
+ for (; offs + delta_cur <= length; offs += delta_cur, i++)
+ {
+ regs[i] = gen_reg_rtx (mode);
+ loongarch_emit_move (regs[i], adjust_address (src, mode, offs));
+ }
+ }
- /* Mop up any left-over bytes. */
- if (offset < length)
+ for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2)
{
- src = adjust_address (src, BLKmode, offset);
- dest = adjust_address (dest, BLKmode, offset);
- move_by_pieces (dest, src, length - offset,
- MIN (MEM_ALIGN (src), MEM_ALIGN (dest)),
- (enum memop_ret) 0);
+ mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require ();
+
+ for (; offs + delta_cur <= length; offs += delta_cur, i++)
+ loongarch_emit_move (adjust_address (dest, mode, offs), regs[i]);
}
}
@@ -4523,10 +4520,11 @@ loongarch_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
static void
loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
- HOST_WIDE_INT bytes_per_iter)
+ HOST_WIDE_INT align)
{
rtx_code_label *label;
rtx src_reg, dest_reg, final_src, test;
+ HOST_WIDE_INT bytes_per_iter = align * LARCH_MAX_MOVE_OPS_PER_LOOP_ITER;
HOST_WIDE_INT leftover;
leftover = length % bytes_per_iter;
@@ -4546,7 +4544,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
emit_label (label);
/* Emit the loop body. */
- loongarch_block_move_straight (dest, src, bytes_per_iter);
+ loongarch_block_move_straight (dest, src, bytes_per_iter, align);
/* Move on to the next block. */
loongarch_emit_move (src_reg,
@@ -4563,7 +4561,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
/* Mop up any left-over bytes. */
if (leftover)
- loongarch_block_move_straight (dest, src, leftover);
+ loongarch_block_move_straight (dest, src, leftover, align);
else
/* Temporary fix for PR79150. */
emit_insn (gen_nop ());
@@ -4573,25 +4571,32 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
memory reference SRC to memory reference DEST. */
bool
-loongarch_expand_block_move (rtx dest, rtx src, rtx length)
+loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx r_align)
{
- int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT;
+ if (!CONST_INT_P (r_length))
+ return false; /* Only constant lengths are expanded inline.  */
+
+ HOST_WIDE_INT length = INTVAL (r_length);
+ if (length > loongarch_max_inline_memcpy_size)
+ return false; /* Exceeds loongarch_max_inline_memcpy_size: not expanded here.  */
+
+ HOST_WIDE_INT align = INTVAL (r_align); /* NOTE(review): r_align is read with INTVAL without a CONST_INT_P check; confirm the cpymemsi expander always passes a constant.  */
+
+ if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD)
+ align = UNITS_PER_WORD; /* Without strict alignment always use the word size; otherwise cap ALIGN at the word size.  */
- if (CONST_INT_P (length)
- && INTVAL (length) <= loongarch_max_inline_memcpy_size)
+ if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT) /* At most LARCH_MAX_MOVE_OPS_STRAIGHT ALIGN-sized chunks: expand straight-line.  */
{
- if (INTVAL (length) <= max_move_bytes)
- {
- loongarch_block_move_straight (dest, src, INTVAL (length));
- return true;
- }
- else if (optimize)
- {
- loongarch_block_move_loop (dest, src, INTVAL (length),
- LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER);
- return true;
- }
+ loongarch_block_move_straight (dest, src, length, align);
+ return true;
+ }
+
+ if (optimize) /* Larger copies are expanded as a loop, but only when optimizing.  */
+ {
+ loongarch_block_move_loop (dest, src, length, align);
+ return true;
}
+
return false;
}
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index 7151d5cabb3..1bcd144a5d9 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -1063,13 +1063,13 @@ typedef struct {
/* The maximum number of bytes that can be copied by one iteration of
a cpymemsi loop; see loongarch_block_move_loop. */
-#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4)
+#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4
/* The maximum number of bytes that can be copied by a straight-line
implementation of cpymemsi; see loongarch_block_move_straight. We want
to make sure that any loop-based implementation will iterate at
least twice. */
-#define LARCH_MAX_MOVE_BYTES_STRAIGHT (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2)
+#define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2)
/* The base cost of a memcpy call, for MOVE_RATIO and friends. These
values were determined experimentally by benchmarking with CSiBE.
@@ -1077,7 +1077,7 @@ typedef struct {
#define LARCH_CALL_RATIO 8
/* Any loop-based implementation of cpymemsi will have at least
- LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory
+ LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory
moves, so allow individual copies of fewer elements.
When cpymemsi is not available, use a value approximating
@@ -1088,9 +1088,7 @@ typedef struct {
value of LARCH_CALL_RATIO to take that into account. */
#define MOVE_RATIO(speed) \
- (HAVE_cpymemsi \
- ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \
- : CLEAR_RATIO (speed) / 2)
+ (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO (speed) / 2)
/* For CLEAR_RATIO, when optimizing for size, give a better estimate
of the length of a memset call, but use the default otherwise. */
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 628ecc78088..816a943d155 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -2488,7 +2488,8 @@ (define_expand "cpymemsi"
""
{
if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P
- && loongarch_expand_block_move (operands[0], operands[1], operands[2]))
+ && loongarch_expand_block_move (operands[0], operands[1],
+ operands[2], operands[3]))
DONE;
else
FAIL;
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-1.c b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c
new file mode 100644
index 00000000000..4cd35d13904
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -mno-strict-align" } */
+/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
+
+extern char a[], b[];
+void test() { __builtin_memcpy(a, b, 15); } /* 15 = 8 + 4 + 2 + 1: the dg-final directives expect exactly one store of each width.  */
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-2.c b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c
new file mode 100644
index 00000000000..703eb951c6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */
+/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
+
+extern long a[], b[]; /* NOTE(review): presumably long arrays are word-aligned under lp64d, so -mstrict-align does not force narrower accesses; confirm.  */
+void test() { __builtin_memcpy(a, b, 15); } /* 15 = 8 + 4 + 2 + 1: the dg-final directives expect exactly one store of each width.  */
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-3.c b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c
new file mode 100644
index 00000000000..d6a80659b31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */
+
+/* The loop runs three iterations and its body contains 4 st.b; 3 more st.b follow the loop, so 7 st.b appear in the assembly.  */
+/* { dg-final { scan-assembler-times "st\\.b" 7 } } */
+
+/* { dg-final { scan-assembler-not "st\\.h" } } */
+/* { dg-final { scan-assembler-not "st\\.w|stptr\\.w" } } */
+/* { dg-final { scan-assembler-not "st\\.d|stptr\\.d" } } */
+
+extern char a[], b[];
+void test() { __builtin_memcpy(a, b, 15); } /* 15 = 3 * 4 + 3: the dg-final directives expect byte stores only, 4 in the loop body and 3 after it.  */