On LoongArch, when the permutation indices of a 256-bit V8SI/V8SF/V4DI/V4DF shuffle come from different vectors and no index is repeated (modulo the number of elements), the shuffle can be implemented with two xvperm.w instructions plus one xvbitsel.v instruction (V8SI/V8SF), or with two xvpermi.d instructions plus one xvbitsel.v instruction (V4DI/V4DF).
gcc/ChangeLog: * config/loongarch/loongarch.cc (loongarch_expand_vec_perm_generic_bitsel): Add new vector shuffle optimize function. (loongarch_expand_vec_perm_const): Adjust. gcc/testsuite/ChangeLog: * gcc.target/loongarch/vec_perm-xvbitsel-2.c: New test. * gcc.target/loongarch/vec_perm-xvbitsel-3.c: New test. --- gcc/config/loongarch/loongarch.cc | 136 ++++++++++++++++++ .../loongarch/vec_perm-xvbitsel-2.c | 18 +++ .../loongarch/vec_perm-xvbitsel-3.c | 22 +++ 3 files changed, 176 insertions(+) create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index a4a72923b7f..16fe755742b 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -9229,6 +9229,139 @@ loongarch_expand_vec_perm_bitsel (struct expand_vec_perm_d *d) return true; } +/* A general shuffle method for 256-bit V8SI/V8SF/V4DI/V4DF modes, used when + the permutation indices come from different vectors and no index repeats modulo nelt. Returns true if the shuffle is handled here, false otherwise. */ +static bool +loongarch_expand_vec_perm_generic_bitsel (struct expand_vec_perm_d *d) +{ + if (!ISA_HAS_LASX) + return false; + + auto_bitmap used; + machine_mode mode = d->vmode; + int nelt = d->nelt, val, i; + + /* Due to instruction set restrictions, only the following modes support + this optimization method; every other mode is rejected. */ + if (mode != E_V8SImode && mode != E_V8SFmode + && mode != E_V4DImode && mode != E_V4DFmode) + return false; + + /* Ensure that the values d->perm[i] % nelt are all distinct. 
*/ + for (i = 0; i < nelt; i += 1) + { + if (bitmap_bit_p (used, d->perm[i] % nelt)) + return false; + else + bitmap_set_bit (used, d->perm[i] % nelt); + } + + if (d->testing_p) + return true; + + rtx reg_bitsel, tmp_bitsel, sel_bitsel, op0, op1; + rtx rmap_bitsel[MAX_VECT_LEN]; + op0 = gen_reg_rtx (mode); + op1 = gen_reg_rtx (mode); + reg_bitsel = gen_reg_rtx (mode); + + if (mode == E_V8SImode || mode == E_V8SFmode) + { + rtx rmap_xvperm[MAX_VECT_LEN]; + rtx sel_xvperm, reg_xvperm; + + for (i = 0; i < nelt; i += 1) + { + /* For the xvperm insn we just copy the original permutation index. */ + rmap_xvperm[i] = GEN_INT (d->perm[i]); + val = d->perm[i] >= nelt ? -1 : 0; + /* For the xvbitsel insn we convert the index to a mask, where -1 means + the destination element comes from operand1, and 0 means the + destination element comes from operand0. */ + rmap_bitsel[i] = GEN_INT (val); + } + + reg_xvperm = gen_reg_rtx (E_V8SImode); + + /* Prepare the register holding the selection indices for xvperm. */ + sel_xvperm = gen_rtx_CONST_VECTOR (E_V8SImode, + gen_rtvec_v (nelt, rmap_xvperm)); + emit_move_insn (reg_xvperm, sel_xvperm); + + /* Prepare the register holding the selection mask for xvbitsel. 
*/ + sel_bitsel = gen_rtx_CONST_VECTOR (E_V8SImode, + gen_rtvec_v (nelt, rmap_bitsel)); + if (mode == E_V8SFmode) + { + tmp_bitsel = simplify_gen_subreg (E_V8SImode, reg_bitsel, mode, 0); + emit_move_insn (tmp_bitsel, sel_bitsel); + } + else + emit_move_insn (reg_bitsel, sel_bitsel); + + switch (mode) + { + case E_V8SFmode: + emit_insn (gen_lasx_xvperm_w_f (op0, d->op0, reg_xvperm)); + emit_insn (gen_lasx_xvperm_w_f (op1, d->op1, reg_xvperm)); + break; + case E_V8SImode: + emit_insn (gen_lasx_xvperm_w (op0, d->op0, reg_xvperm)); + emit_insn (gen_lasx_xvperm_w (op1, d->op1, reg_xvperm)); + break; + default: + gcc_unreachable (); + break; + } + + emit_insn (gen_simd_vbitsel (mode, d->target, op0, op1, reg_bitsel)); + } + else + { + unsigned int imm = 0; + unsigned int val2; + + for (i = nelt - 1; i >= 0; i -= 1) + { + val = d->perm[i] >= nelt ? -1 : 0; + rmap_bitsel[i] = GEN_INT (val); + val2 = d->perm[i] % nelt; + imm |= val2; + imm = (i != 0) ? imm << 2 : imm; + } + + /* Prepare the register holding the selection mask for xvbitsel. */ + sel_bitsel = gen_rtx_CONST_VECTOR (E_V4DImode, + gen_rtvec_v (nelt, rmap_bitsel)); + if (mode == E_V4DFmode) + { + tmp_bitsel = simplify_gen_subreg (E_V4DImode, reg_bitsel, mode, 0); + emit_move_insn (tmp_bitsel, sel_bitsel); + } + else + emit_move_insn (reg_bitsel, sel_bitsel); + + switch (mode) + { + case E_V4DFmode: + emit_insn (gen_lasx_xvpermi_d_v4df (op0, d->op0, GEN_INT (imm))); + emit_insn (gen_lasx_xvpermi_d_v4df (op1, d->op1, GEN_INT (imm))); + break; + case E_V4DImode: + emit_insn (gen_lasx_xvpermi_d_v4di (op0, d->op0, GEN_INT (imm))); + emit_insn (gen_lasx_xvpermi_d_v4di (op1, d->op1, GEN_INT (imm))); + break; + default: + gcc_unreachable (); + break; + } + + emit_insn (gen_simd_vbitsel (mode, d->target, op0, op1, reg_bitsel)); + } + + return true; +} + /* Following are the assist function for const vector permutation support. 
*/ static bool loongarch_is_quad_duplicate (struct expand_vec_perm_d *d) @@ -9749,6 +9882,9 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d) goto expand_perm_const_end; } + if (loongarch_expand_vec_perm_generic_bitsel (d)) + return true; + expand_perm_const_end: if (flag) { diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c new file mode 100644 index 00000000000..3c38199126a --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mlasx" } */ +/* { dg-final { scan-assembler "xvpermi.d" } } */ +/* { dg-final { scan-assembler-not "xvrepli.w" } } */ +/* { dg-final { scan-assembler-not "xvand.v" } } */ +/* { dg-final { scan-assembler-not "xvseq.w" } } */ + +void +foo (double a[], double b[], double c[]) +{ + for (int i = 0; i < 800; i += 4) + { + c[i + 0] = a[i + 0] + b[i + 0]; + c[i + 1] = a[i + 2] - b[i + 2]; + c[i + 2] = a[i + 3] - b[i + 3]; + c[i + 3] = a[i + 1] + b[i + 1]; + } +} diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c new file mode 100644 index 00000000000..065c816a15d --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mlasx" } */ +/* { dg-final { scan-assembler "xvperm.w" } } */ +/* { dg-final { scan-assembler-not "xvrepli.w" } } */ +/* { dg-final { scan-assembler-not "xvand.v" } } */ +/* { dg-final { scan-assembler-not "xvseq.w" } } */ + +void +foo (float a[], float b[], float c[]) +{ + for (int i = 0; i < 800; i += 8) + { + c[i + 0] = a[i + 0] + b[i + 0]; + c[i + 1] = a[i + 1] + b[i + 1]; + c[i + 2] = a[i + 4] - b[i + 4]; + c[i + 3] = a[i + 5] - b[i + 5]; + c[i + 4] = a[i + 2] - b[i + 2]; + c[i + 5] = a[i + 3] - b[i + 3]; + c[i + 6] = a[i + 6] + b[i + 6]; + c[i + 7] = a[i + 7] + b[i + 7]; + } +} -- 2.38.1