Currently, LoongArch implements a shuffle that selects each element from the corresponding position of one of the two input vectors with the [x]vshuf instruction, but this introduces additional copies of the index vector. In this case, the [x]vbitsel.v instruction can be used instead as an optimization.
gcc/ChangeLog: * config/loongarch/lasx.md (lasx_xvbitsel_<lasxfmt_f>): Remove. * config/loongarch/loongarch-builtins.cc (CODE_FOR_lsx_vbitsel_v): Adjust. (CODE_FOR_lasx_xvbitsel_v): Ditto. * config/loongarch/loongarch.cc (loongarch_try_expand_lsx_vshuf_const): Ditto. (loongarch_is_bitsel_pattern): Add new check function. (loongarch_expand_vec_perm_bitsel): Add new implement function. (loongarch_expand_lsx_shuffle): Adjust. (loongarch_expand_vec_perm_const): Add new optimize case. * config/loongarch/lsx.md (lsx_vbitsel_<lsxfmt>): Adjust insn pattern mode. * config/loongarch/simd.md (@simd_vbitsel<mode>): New define_insn template. gcc/testsuite/ChangeLog: * gcc.target/loongarch/vec_perm-xvshuf.c: Move to... * gcc.target/loongarch/vec_perm-xvbitsel.c: ...here. * gcc.target/loongarch/vec_perm-vbitsel.c: New test. --- gcc/config/loongarch/lasx.md | 12 --- gcc/config/loongarch/loongarch-builtins.cc | 4 +- gcc/config/loongarch/loongarch.cc | 89 ++++++++++++++++++- gcc/config/loongarch/lsx.md | 12 --- gcc/config/loongarch/simd.md | 13 +++ .../gcc.target/loongarch/vec_perm-vbitsel.c | 17 ++++ ...{vec_perm-xvshuf.c => vec_perm-xvbitsel.c} | 4 +- 7 files changed, 121 insertions(+), 30 deletions(-) create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c rename gcc/testsuite/gcc.target/loongarch/{vec_perm-xvshuf.c => vec_perm-xvbitsel.c} (77%) diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md index a37c85a25a4..c1049e319db 100644 --- a/gcc/config/loongarch/lasx.md +++ b/gcc/config/loongarch/lasx.md @@ -1217,18 +1217,6 @@ (define_insn "lasx_xvbitrevi_<lasxfmt>" [(set_attr "type" "simd_bit") (set_attr "mode" "<MODE>")]) -(define_insn "lasx_xvbitsel_<lasxfmt_f>" - [(set (match_operand:LASX 0 "register_operand" "=f") - (ior:LASX (and:LASX (not:LASX - (match_operand:LASX 3 "register_operand" "f")) - (match_operand:LASX 1 "register_operand" "f")) - (and:LASX (match_dup 3) - (match_operand:LASX 2 "register_operand" "f"))))] - "ISA_HAS_LASX" - 
"xvbitsel.v\t%u0,%u1,%u2,%u3" - [(set_attr "type" "simd_bitmov") - (set_attr "mode" "<MODE>")]) - (define_insn "lasx_xvbitseli_b" [(set (match_operand:V32QI 0 "register_operand" "=f") (ior:V32QI (and:V32QI (not:V32QI diff --git a/gcc/config/loongarch/loongarch-builtins.cc b/gcc/config/loongarch/loongarch-builtins.cc index 92d995a916a..0682bc6baf9 100644 --- a/gcc/config/loongarch/loongarch-builtins.cc +++ b/gcc/config/loongarch/loongarch-builtins.cc @@ -247,7 +247,7 @@ AVAIL_ALL (lasx_frecipe, ISA_HAS_LASX && ISA_HAS_FRECIPE) #define CODE_FOR_lsx_vandi_b CODE_FOR_andv16qi3 #define CODE_FOR_lsx_bnz_v CODE_FOR_lsx_bnz_v_b #define CODE_FOR_lsx_bz_v CODE_FOR_lsx_bz_v_b -#define CODE_FOR_lsx_vbitsel_v CODE_FOR_lsx_vbitsel_b +#define CODE_FOR_lsx_vbitsel_v CODE_FOR_simd_vbitselv16qi #define CODE_FOR_lsx_vseqi_b CODE_FOR_lsx_vseq_b #define CODE_FOR_lsx_vseqi_h CODE_FOR_lsx_vseq_h #define CODE_FOR_lsx_vseqi_w CODE_FOR_lsx_vseq_w @@ -538,7 +538,7 @@ AVAIL_ALL (lasx_frecipe, ISA_HAS_LASX && ISA_HAS_FRECIPE) #define CODE_FOR_lasx_xvaddi_du CODE_FOR_addv4di3 #define CODE_FOR_lasx_xvand_v CODE_FOR_andv32qi3 #define CODE_FOR_lasx_xvandi_b CODE_FOR_andv32qi3 -#define CODE_FOR_lasx_xvbitsel_v CODE_FOR_lasx_xvbitsel_b +#define CODE_FOR_lasx_xvbitsel_v CODE_FOR_simd_vbitselv32qi #define CODE_FOR_lasx_xvseqi_b CODE_FOR_lasx_xvseq_b #define CODE_FOR_lasx_xvseqi_h CODE_FOR_lasx_xvseq_h #define CODE_FOR_lasx_xvseqi_w CODE_FOR_lasx_xvseq_w diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index 3ac6a74f15b..2de3110383a 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -8372,7 +8372,10 @@ loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d) else { sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm)); - emit_move_insn (d->target, sel); + /* Weakening dependencies by copying indices (for vshuf). 
*/ + tmp = gen_reg_rtx (d->vmode); + emit_move_insn (tmp, sel); + emit_move_insn (d->target, tmp); } switch (d->vmode) @@ -8444,9 +8447,31 @@ loongarch_is_imm_set_shuffle (struct expand_vec_perm_d *d) return true; } +/* Check if the d->perm meets the requirements of the [x]vbitsel.v insn. */ +static bool +loongarch_is_bitsel_pattern (struct expand_vec_perm_d *d) +{ + bool result = true; + + for (int i = 0; i < d->nelt; i++) + { + unsigned char buf = d->perm[i]; + if ((buf % d->nelt) != i) + { + result = false; + break; + } + } + + return result; +} + static bool loongarch_expand_vec_perm_even_odd (struct expand_vec_perm_d *); +static bool +loongarch_expand_vec_perm_bitsel (struct expand_vec_perm_d *); + /* Try to match and expand all kinds of 128-bit const vector permutation cases. */ @@ -8462,6 +8487,9 @@ loongarch_expand_lsx_shuffle (struct expand_vec_perm_d *d) if (loongarch_expand_vec_perm_even_odd (d)) return true; + if (loongarch_expand_vec_perm_bitsel (d)) + return true; + return loongarch_try_expand_lsx_vshuf_const (d); } @@ -9122,6 +9150,57 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) } } +/* Try to use the [x]vbitsel.v insn to optimize the vector shuffle, which + can reduce one copy insn in the loop compared to [x]vshuff. */ +static bool +loongarch_expand_vec_perm_bitsel (struct expand_vec_perm_d *d) +{ + gcc_assert (ISA_HAS_LSX || ISA_HAS_LASX); + + if (!loongarch_is_bitsel_pattern (d)) + return false; + + if (d->testing_p) + return true; + + int i, val; + rtx tmp, tmp2, sel, op0, op1, target; + rtx rperm[MAX_VECT_LEN]; + + for (i = 0; i < d->nelt; i += 1) + { + /* Here -1 means that all bits of the corresponding type are 1 + (including the sign bit). */ + val = d->perm[i] >= d->nelt ? 
-1 : 0; + rperm[i] = GEN_INT (val); + } + + tmp2 = gen_reg_rtx (d->vmode); + machine_mode vimode = mode_for_vector + (int_mode_for_size (GET_MODE_BITSIZE + (GET_MODE_INNER + (d->vmode)), 0).require (), d->nelt).require (); + + sel = gen_rtx_CONST_VECTOR (vimode, gen_rtvec_v (d->nelt, rperm)); + if (GET_MODE_CLASS (d->vmode) == MODE_VECTOR_FLOAT) + { + /* Because the [x]vbitsel.v insn pattern requires that all src + operands and dest operands are of the same type, they need to + be type-converted. */ + tmp = simplify_gen_subreg (vimode, tmp2, d->vmode, 0); + emit_move_insn (tmp, sel); + } + else + emit_move_insn (tmp2, sel); + + target = d->target; + op0 = d->op0; + op1 = d->one_vector_p ? d->op0 : d->op1; + + emit_insn (gen_simd_vbitsel (d->vmode, target, op0, op1, tmp2)); + return true; +} + /* Following are the assist function for const vector permutation support. */ static bool loongarch_is_quad_duplicate (struct expand_vec_perm_d *d) @@ -9598,6 +9677,9 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d) return true; } + if (loongarch_expand_vec_perm_bitsel (d)) + return true; + if (loongarch_if_match_xvshuffle (d)) { if (d->testing_p) @@ -9666,7 +9748,10 @@ expand_perm_const_end: default: sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm)); - emit_move_insn (d->target, sel); + /* Weakening dependencies by copying indices (for xvshuf). 
*/ + tmp = gen_reg_rtx (d->vmode); + emit_move_insn (tmp, sel); + emit_move_insn (d->target, tmp); break; } diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md index ca0066a21ed..0d7a8588819 100644 --- a/gcc/config/loongarch/lsx.md +++ b/gcc/config/loongarch/lsx.md @@ -1073,18 +1073,6 @@ (define_insn "lsx_vbitrevi_<lsxfmt>" [(set_attr "type" "simd_bit") (set_attr "mode" "<MODE>")]) -(define_insn "lsx_vbitsel_<lsxfmt>" - [(set (match_operand:ILSX 0 "register_operand" "=f") - (ior:ILSX (and:ILSX (not:ILSX - (match_operand:ILSX 3 "register_operand" "f")) - (match_operand:ILSX 1 "register_operand" "f")) - (and:ILSX (match_dup 3) - (match_operand:ILSX 2 "register_operand" "f"))))] - "ISA_HAS_LSX" - "vbitsel.v\t%w0,%w1,%w2,%w3" - [(set_attr "type" "simd_bitmov") - (set_attr "mode" "<MODE>")]) - (define_insn "lsx_vbitseli_b" [(set (match_operand:V16QI 0 "register_operand" "=f") (ior:V16QI (and:V16QI (not:V16QI diff --git a/gcc/config/loongarch/simd.md b/gcc/config/loongarch/simd.md index 7605b17d21e..4df19b06727 100644 --- a/gcc/config/loongarch/simd.md +++ b/gcc/config/loongarch/simd.md @@ -546,6 +546,19 @@ (define_expand "cbranch<mode>4" DONE; }) +(define_insn "@simd_vbitsel<mode>" + [(set (match_operand:ALLVEC 0 "register_operand" "=f") + (ior:ALLVEC + (and:ALLVEC + (not:ALLVEC (match_operand:ALLVEC 3 "register_operand" "f")) + (match_operand:ALLVEC 1 "register_operand" "f")) + (and:ALLVEC (match_dup 3) + (match_operand:ALLVEC 2 "register_operand" "f"))))] + "" + "<x>vbitsel.v\t%<wu>0,%<wu>1,%<wu>2,%<wu>3" + [(set_attr "type" "simd_bitmov") + (set_attr "mode" "<MODE>")]) + ; The LoongArch SX Instructions. 
(include "lsx.md") diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c new file mode 100644 index 00000000000..7a5118273c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-vbitsel.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mlsx" } */ +/* { dg-final { scan-assembler-not "vshuf.w" } } */ +/* { dg-final { scan-assembler-not "vori.b" } } */ +/* { dg-final { scan-assembler "vbitsel.v" } } */ + +void +foo (int a[], int b[], int c[]) +{ + for (int i = 0; i < 100; i += 4) + { + c[i + 0] = a[i + 0] + b[i + 0]; + c[i + 1] = a[i + 1] - b[i + 1]; + c[i + 2] = a[i + 2] - b[i + 2]; + c[i + 3] = a[i + 3] + b[i + 3]; + } +} diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c similarity index 77% rename from gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c rename to gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c index 6b19c2c2fd8..b3808b550e5 100644 --- a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvshuf.c +++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel.c @@ -1,8 +1,8 @@ /* { dg-do compile } */ /* { dg-options "-O3 -mlasx" } */ -/* { dg-final { scan-assembler "xvshuf.w" } } */ +/* { dg-final { scan-assembler-not "xvshuf.w" } } */ /* { dg-final { scan-assembler-not "xvperm.w" } } */ -/* { dg-final { scan-assembler-not "xvbitsel.v" } } */ +/* { dg-final { scan-assembler "xvbitsel.v" } } */ void foo (int a[], int b[], int c[]) -- 2.38.1