https://gcc.gnu.org/g:4962d1309be98585ed05980eb7064dd5cc0d113a
commit r16-5503-g4962d1309be98585ed05980eb7064dd5cc0d113a
Author: zhaozhou <[email protected]>
Date:   Fri Nov 14 11:18:46 2025 +0800

    LoongArch: Optimize V4SImode vec_construct for load index length of two.

    For V4SImode, a vec_construct with load index {0, 1, 0, 1} now uses
    vldrepl.d, and one with load index {0, 1, 0, 0} uses vldrepl.d plus
    vshuf4i, reducing the number of scalar loads and vinsgr2vr
    instructions.

    gcc/ChangeLog:

            * config/loongarch/lsx.md (lsx_vshuf4i_mem_w_0): Add template.
            (lsx_vldrepl_merge_w_0): Ditto.

    gcc/testsuite/ChangeLog:

            * gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c:
            Adjust v4i32 scan pattern and add vec_construct_v4i32_1.

Diff:
---
 gcc/config/loongarch/lsx.md                        | 62 ++++++++++++++++++++++
 .../loongarch/vector/lsx/lsx-vec-construct-opt.c   | 21 ++++++--
 2 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index 917eca95722a..3b06d2e20cfc 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -1614,6 +1614,39 @@
   [(set_attr "type" "simd_shf")
    (set_attr "mode" "<MODE>")])
 
+(define_insn_and_split "lsx_vshuf4i_mem_w_0"
+  [(set (match_operand:V4SI 0 "register_operand" "=f")
+	(vec_merge:V4SI
+	  (vec_duplicate:V4SI
+	    (mem:SI (match_operand:DI 1 "register_operand" "r")))
+	  (vec_duplicate:V4SI
+	    (mem:SI (plus:DI (match_dup 1) (const_int 4))))
+	  (match_operand 2 "const_uimm4_operand" "")))]
+  "ISA_HAS_LSX"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  operands[0] = gen_rtx_REG (V2DImode, REGNO (operands[0]));
+  emit_insn (gen_lsx_vldrepl_d_insn_0 (operands[0], operands[1]));
+
+  operands[0] = gen_rtx_REG (V4SImode, REGNO (operands[0]));
+  rtx sel[4];
+  int op2 = INTVAL (operands[2]);
+  int mask = 1;
+
+  /* Convert imm to a selection.  */
+  for (int i = 0; i < 4; ++i)
+    {
+      sel[i] = (op2 & mask) ? const0_rtx : const1_rtx;
+      mask = mask << 1;
+    }
+
+  rtx shuf4i_mask = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, sel));
+  emit_insn (gen_lsx_vshuf4i_w (operands[0], operands[0], shuf4i_mask));
+  DONE;
+})
+
 (define_insn "lsx_vsrar_<lsxfmt>"
   [(set (match_operand:ILSX 0 "register_operand" "=f")
 	(unspec:ILSX [(match_operand:ILSX 1 "register_operand" "f")
@@ -2537,6 +2570,35 @@
    (set_attr "mode" "<MODE>")
    (set_attr "length" "4")])
 
+;; In a 128-bit register, this template loads identical consecutive
+;; SImode data into both the upper 64 bits and the lower 64 bits.
+;; Operand 2 performs a vec_merge on two SImode data items at
+;; consecutive addresses and places the result in either the lower
+;; 64 bits or the upper 64 bits.  When operand 3 is 0, the lower
+;; 64 bits are copied to the upper 64 bits; when operand 3 is 1, the
+;; upper 64 bits are copied to the lower 64 bits.
+
+(define_insn "lsx_vldrepl_merge_w_0"
+  [(set (match_operand:V4SI 0 "register_operand" "=f")
+	(unspec:V4SI
+	  [(vec_merge:V4SI
+	     (vec_duplicate:V4SI
+	       (mem:SI (match_operand:DI 1 "register_operand" "r")))
+	     (vec_duplicate:V4SI
+	       (mem:SI (plus:DI (match_dup 1) (const_int 4))))
+	     (match_operand 2 "const_uimm4_operand" ""))
+	   (match_operand 3 "const_0_or_1_operand" "")]
+	  UNSPEC_LSX_VREPLVEI_MIRROR))]
+  "ISA_HAS_LSX
+   && (INTVAL (operands[3]) ? (INTVAL (operands[2]) & 0xc) == 0x4
+			    : (INTVAL (operands[2]) & 0x3) == 0x1)"
+{
+  return "vldrepl.d\t%w0,%1,0";
+}
+  [(set_attr "type" "simd_load")
+   (set_attr "mode" "V4SI")
+   (set_attr "length" "4")])
+
 ;; Offset store by sel
 (define_expand "lsx_vstelm_<lsxfmt_f>"
   [(match_operand:LSX 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
index 92da1c8af9ce..a35cda62f12e 100644
--- a/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
+++ b/gcc/testsuite/gcc.target/loongarch/vector/lsx/lsx-vec-construct-opt.c
@@ -20,9 +20,9 @@ vec_construct_v2i64 ()
   return res;
 }
 
-/* Only load the lowest 2 elements and directly copy them to high half-part,
-   reducing more vinsgr2vr.w.  */
-/* { dg-final { scan-assembler-times "v4i32:.*\tvreplvei\\.d.*v4i32" 1 } } */
+/* Load the lowest 2 elements and directly copy them to the high half
+   with vldrepl.d.  */
+/* { dg-final { scan-assembler-times "v4i32:.*\tvldrepl\\.d.*v4i32" 1 } } */
 v4i32
 vec_construct_v4i32 ()
 {
@@ -32,6 +32,21 @@ vec_construct_v4i32 ()
   return res;
 }
 
+/* Load 2 elements of the vector at once with vldrepl.d and shuffle with
+   vshuf4i.w to avoid using vinsgr2vr.  */
+/* { dg-final { scan-assembler-times "v4i32_1:.*\tvldrepl\\.d.*v4i32_1" 1 } }
+ */
+/* { dg-final { scan-assembler-times "v4i32_1:.*\tvshuf4i\\.w.*v4i32_1" 1 } }
+ */
+v4i32
+vec_construct_v4i32_1 ()
+{
+  v4i32 res =
+  { x_si[0], x_si[1], x_si[0], x_si[0] }
+  ;
+  return res;
+}
+
 /* Only load the lowest 4 elements and directly copy them to high half-part,
    reducing more vinsgr2vr.h.  */
 /* { dg-final { scan-assembler-times "v8i16:.*\tvreplvei\\.d.*v8i16" 1 } } */
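
For reference, a minimal standalone reproducer for the two index
patterns this patch handles -- a sketch only: the v4i32 typedef and the
x_si array mirror lsx-vec-construct-opt.c but are spelled out here as
assumptions, and it should be compiled with -O3 -mlsx:

typedef int v4i32 __attribute__ ((vector_size (16)));

int x_si[4];

/* Index {0, 1, 0, 1}: per the commit message, expected to collapse to
   a single vldrepl.d (lsx_vldrepl_merge_w_0).  */
v4i32
construct_0101 (void)
{
  v4i32 res = { x_si[0], x_si[1], x_si[0], x_si[1] };
  return res;
}

/* Index {0, 1, 0, 0}: expected to become vldrepl.d plus vshuf4i.w
   (lsx_vshuf4i_mem_w_0).  */
v4i32
construct_0100 (void)
{
  v4i32 res = { x_si[0], x_si[1], x_si[0], x_si[0] };
  return res;
}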

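And a small sketch of how the split in lsx_vshuf4i_mem_w_0 turns the
uimm4 vec_merge mask into a vshuf4i selector.  Plain C with a
hypothetical mask value; it relies on the RTL vec_merge convention that
a set mask bit selects from the first input, here the word at (%1),
i.e. element 0 of the vldrepl.d result:

#include <stdio.h>

int
main (void)
{
  /* Hypothetical mask for the testcase's index {0, 1, 0, 0}: after
     vldrepl.d the register holds {m0, m1, m0, m1}, and lanes 0, 2 and 3
     must take m0 (element 0), so bits 0, 2 and 3 are set: 0xd.  */
  int op2 = 0xd;

  /* Mirrors the split's loop: a set bit selects element 0, a clear
     bit selects element 1.  */
  for (int i = 0; i < 4; ++i)
    printf ("lane %d <- element %d\n", i, (op2 & (1 << i)) ? 0 : 1);
  return 0;
}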