Now that we have proper alignment determination for block moves in place, the case of copying a block of longword-aligned data has become real.  Implement the merging of loaded data from pairs of SImode registers into single DImode registers so that it can be stored efficiently with unaligned stores, as suggested by a comment in `alpha_expand_block_move', and discard the comment.  Provide test cases accordingly.
gcc/ * config/alpha/alpha.cc (alpha_expand_block_move): Merge loaded data from pairs of SImode registers into single DImode registers if to be used with unaligned stores. gcc/testsuite/ * gcc.target/alpha/memcpy-si-aligned.c: New file. * gcc.target/alpha/memcpy-si-unaligned.c: New file. * gcc.target/alpha/memcpy-si-unaligned-dst.c: New file. * gcc.target/alpha/memcpy-si-unaligned-src.c: New file. * gcc.target/alpha/memcpy-si-unaligned-src-bwx.c: New file. --- gcc/config/alpha/alpha.cc | 45 +++++++-- gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c | 16 +++ gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c | 16 +++ gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c | 11 ++ gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c | 15 +++ gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c | 51 +++++++++++ 6 files changed, 146 insertions(+), 8 deletions(-) gcc-alpha-block-move-si-unaligned.diff Index: gcc/gcc/config/alpha/alpha.cc =================================================================== --- gcc.orig/gcc/config/alpha/alpha.cc +++ gcc/gcc/config/alpha/alpha.cc @@ -3930,14 +3930,44 @@ alpha_expand_block_move (rtx operands[]) { words = bytes / 4; - for (i = 0; i < words; ++i) - data_regs[nregs + i] = gen_reg_rtx (SImode); + /* Load an even quantity of SImode data pieces only. */ + unsigned int hwords = words / 2; + for (i = 0; i / 2 < hwords; ++i) + { + data_regs[nregs + i] = gen_reg_rtx (SImode); + emit_move_insn (data_regs[nregs + i], + adjust_address (orig_src, SImode, ofs + i * 4)); + } - for (i = 0; i < words; ++i) - emit_move_insn (data_regs[nregs + i], - adjust_address (orig_src, SImode, ofs + i * 4)); + /* If we'll be using unaligned stores, merge data from pairs + of SImode registers into DImode registers so that we can + store it more efficiently via quadword unaligned stores. 
*/ + unsigned int j; + if (dst_align < 32) + for (i = 0, j = 0; i < words / 2; ++i, j = i * 2) + { + rtx hi = expand_simple_binop (DImode, ASHIFT, + data_regs[nregs + j + 1], + GEN_INT (32), NULL_RTX, + 1, OPTAB_WIDEN); + data_regs[nregs + i] = expand_simple_binop (DImode, IOR, hi, + data_regs[nregs + j], + NULL_RTX, + 1, OPTAB_WIDEN); + } + else + j = i; - nregs += words; + /* Take care of any remaining odd trailing SImode data piece. */ + if (j < words) + { + data_regs[nregs + i] = gen_reg_rtx (SImode); + emit_move_insn (data_regs[nregs + i], + adjust_address (orig_src, SImode, ofs + j * 4)); + ++i; + } + + nregs += i; bytes -= words * 4; ofs += words * 4; } @@ -4056,13 +4086,12 @@ alpha_expand_block_move (rtx operands[]) } /* Due to the above, this won't be aligned. */ - /* ??? If we have more than one of these, consider constructing full - words in registers and using alpha_expand_unaligned_store_words. */ while (i < nregs && GET_MODE (data_regs[i]) == SImode) { alpha_expand_unaligned_store (orig_dst, data_regs[i], 4, ofs); ofs += 4; i++; + gcc_assert (i == nregs || GET_MODE (data_regs[i]) != SImode); } if (dst_align >= 16) Index: gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c =================================================================== --- /dev/null +++ gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } } */ + +unsigned int aligned_src_si[17] = { [0 ... 16] = 0xeaebeced }; +unsigned int aligned_dst_si[17] = { [0 ... 
16] = 0xdcdbdad9 }; + +void +memcpy_aligned_data_si (void) +{ + __builtin_memcpy (aligned_dst_si + 1, aligned_src_si + 1, 60); +} + +/* { dg-final { scan-assembler-times "\\sldl\\s" 15 } } */ +/* { dg-final { scan-assembler-times "\\sstl\\s" 15 } } */ +/* { dg-final { scan-assembler-not "\\s(?:ldq_u|stq_u)\\s" } } */ Index: gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c =================================================================== --- /dev/null +++ gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } } */ + +unsigned int unaligned_src_si[17] = { [0 ... 16] = 0xfefdfcfb }; + +void +memcpy_unaligned_dst_si (void *dst) +{ + __builtin_memcpy (dst, unaligned_src_si + 1, 60); +} + +/* { dg-final { scan-assembler-times "\\sldl\\s" 15 } } */ +/* { dg-final { scan-assembler-times "\\sldq_u\\s" 4 } } */ +/* { dg-final { scan-assembler-times "\\sstq_u\\s" 10 } } */ +/* { dg-final { scan-assembler-not "\\sstl\\s" } } */ Index: gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c =================================================================== --- /dev/null +++ gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-mbwx" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } } */ + +#include "memcpy-si-unaligned-src.c" + +/* { dg-final { scan-assembler-times "\\sldbu\\s" 4 } } */ +/* { dg-final { scan-assembler-times "\\sldq_u\\s" 8 } } */ +/* { dg-final { scan-assembler-times "\\sstb\\s" 4 } } */ +/* { dg-final { scan-assembler-times "\\sstl\\s" 14 } } */ +/* { dg-final { scan-assembler-not "\\s(?:ldl|stq_u)\\s" } } */ Index: gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c =================================================================== --- /dev/null +++ gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c @@ -0,0 +1,15 @@ +/* { dg-do compile } 
*/ +/* { dg-options "-mno-bwx" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } } */ + +unsigned int unaligned_dst_si[17] = { [0 ... 16] = 0xc8c9cacb }; + +void +memcpy_unaligned_src_si (const void *src) +{ + __builtin_memcpy (unaligned_dst_si + 1, src, 60); +} + +/* { dg-final { scan-assembler-times "\\sldq_u\\s" 10 } } */ +/* { dg-final { scan-assembler-times "\\sstl\\s" 15 } } */ +/* { dg-final { scan-assembler-not "\\s(?:ldl|stq_u)\\s" } } */ Index: gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c =================================================================== --- /dev/null +++ gcc/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c @@ -0,0 +1,51 @@ +/* { dg-do run } */ +/* { dg-additional-sources memcpy-si-aligned.c } */ +/* { dg-additional-sources memcpy-si-unaligned-src.c } */ +/* { dg-additional-sources memcpy-si-unaligned-dst.c } */ +/* { dg-options "" } */ + +void memcpy_aligned_data_si (void); +void memcpy_unaligned_dst_si (void *); +void memcpy_unaligned_src_si (const void *); + +extern unsigned int aligned_src_si[]; +extern unsigned int aligned_dst_si[]; +extern unsigned int unaligned_src_si[]; +extern unsigned int unaligned_dst_si[]; + +int +main (void) +{ + unsigned int v; + int i; + + for (i = 1, v = 0x04030201; i < 16; i++, v += 0x04040404) + unaligned_src_si[i] = v; + asm ("" : : : "memory"); + memcpy_unaligned_dst_si (aligned_src_si + 1); + asm ("" : : : "memory"); + memcpy_aligned_data_si (); + asm ("" : : : "memory"); + memcpy_unaligned_src_si (aligned_dst_si + 1); + asm ("" : : : "memory"); + for (i = 1, v = 0x04030201; i < 16; i++, v += 0x04040404) + if (unaligned_dst_si[i] != v) + return 1; + if (unaligned_src_si[0] != 0xfefdfcfb) + return 1; + if (unaligned_src_si[16] != 0xfefdfcfb) + return 1; + if (aligned_src_si[0] != 0xeaebeced) + return 1; + if (aligned_src_si[16] != 0xeaebeced) + return 1; + if (aligned_dst_si[0] != 0xdcdbdad9) + return 1; + if (aligned_dst_si[16] != 0xdcdbdad9) + return 1; + if (unaligned_dst_si[0] != 
0xc8c9cacb) + return 1; + if (unaligned_dst_si[16] != 0xc8c9cacb) + return 1; + return 0; +}