From: Robin Dapp <rd...@ventanamicro.com>

This patch adds shuffle_slide_patterns to expand_vec_perm_const.  It
recognizes permutations like

  {0, 1, 4, 5} or {2, 3, 6, 7}

which can be constructed by a slideup or slidedown of one of the vectors
into the other one.

gcc/ChangeLog:

        * config/riscv/riscv-v.cc (shuffle_slide_patterns): New.
        (expand_vec_perm_const_1): Call new function.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide.c: New test.

RISC-V: Add interleave pattern.

This patch adds efficient handling of interleaving patterns like
[0 4 1 5] to vec_perm_const.  It is implemented by a slideup and a
gather.

gcc/ChangeLog:

        * config/riscv/riscv-v.cc (shuffle_interleave_patterns): New function.
        (expand_vec_perm_const_1): Use new function.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c: New test.

RISC-V: Add even/odd vec_perm_const pattern.

This adds handling for even/odd patterns like [0 2 4 6], implemented by
two vcompress operations and a slideup.

gcc/ChangeLog:

        * config/riscv/riscv-v.cc (shuffle_evenodd_patterns): New function.
        (expand_vec_perm_const_1): Use new function.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c: New test.

RISC-V: Improve slide1up pattern.

This patch adds a second variant to implement the extract/slide1up
pattern.  In order to do a permutation like <3, 4, 5, 6> from vectors
<0, 1, 2, 3> and <4, 5, 6, 7> we currently extract <3> from the first
vector and re-insert it into the second vector.  Unless register-file
crossing latency is essentially zero it should be preferable to first
slide the second vector up by one and then slide the first vector down
by (nunits - 1).

gcc/ChangeLog:

        * config/riscv/riscv-protos.h (riscv_register_move_cost): Export.
        * config/riscv/riscv-v.cc (shuffle_extract_and_slide1up_patterns):
        Add slideup/slidedown variant.
        * config/riscv/riscv.cc (riscv_register_move_cost): Remove static.
        Add vector register move costs.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/pr112599-2.c: Adjust test expectation.

varasm: Use PRECISION instead of SIZE [PR123456].

When optimizing the constant pool we currently don't handle vector mask
modes properly.  While emitting two compress masks {1, 0, 1, 0} (V4BI)
and {1, 0, 1, 0, 1, 0, 1, 0} (V8BI) I noticed that they would end up
with the same constant-pool entry of "10" (= {1, 0, 1, 0}).  This is
because we compare GET_MODE_SIZE bytes instead of GET_MODE_PRECISION
bits, so the hashes of both constants are identical.

This patch uses GET_MODE_PRECISION instead and also fixes one similar
instance in simplify-rtx.

gcc/ChangeLog:

        * simplify-rtx.cc (native_encode_rtx): Use GET_MODE_PRECISION.
        * varasm.cc (optimize_constant_pool): Ditto.
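
To make the new expanders concrete, here is a small standalone sketch
(not part of the patch) that mirrors the 4-element DImode cases in the
new vls-vlmax tests; the v4di typedef is a local placeholder for the
typedefs perm.h provides.  Built with -O3 -march=rv64gcv these three
permutations are expected to go through the new slide, interleave and
even/odd paths respectively rather than through a generic two-source
gather.

typedef long long v4di __attribute__ ((vector_size (32)));

/* {0, 1, 4, 5}: low halves of both vectors -> single vslideup.  */
v4di
slide_low (v4di a, v4di b)
{
  return __builtin_shufflevector (a, b, 0, 1, 4, 5);
}

/* {0, 4, 1, 5}: interleave the low halves -> vslideup + vrgather.  */
v4di
interleave_low (v4di a, v4di b)
{
  return __builtin_shufflevector (a, b, 0, 4, 1, 5);
}

/* {0, 2, 4, 6}: even elements of both vectors
   -> two vcompress + one vslideup.  */
v4di
even (v4di a, v4di b)
{
  return __builtin_shufflevector (a, b, 0, 2, 4, 6);
}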
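
The improved extract/slide1up case can be sketched the same way (again
not part of the patch, reusing a placeholder vector type).  Previously
the permutation below extracted element 3 of the first operand into a
scalar register and re-inserted it with vslide1up; with this change,
and unless the tuning makes vector-to-scalar moves essentially free, it
is expected to become a vslideup of the second operand by one followed
by a vslidedown of the first operand by nunits - 1, which is why
pr112599-2.c now only scans for "vslide".

typedef long long v4di __attribute__ ((vector_size (32)));

/* <3, 4, 5, 6> from <a0, a1, a2, a3> and <b0, b1, b2, b3>,
   i.e. <a3, b0, b1, b2>.  */
v4di
shift_in_last (v4di a, v4di b)
{
  return __builtin_shufflevector (a, b, 3, 4, 5, 6);
}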
--- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 287 +++++++++++++++++- gcc/config/riscv/riscv.cc | 18 +- gcc/simplify-rtx.cc | 2 +- .../gcc.target/riscv/rvv/autovec/pr112599-2.c | 2 +- .../autovec/vls-vlmax/shuffle-evenodd-run.c | 122 ++++++++ .../rvv/autovec/vls-vlmax/shuffle-evenodd.c | 68 +++++ .../vls-vlmax/shuffle-interleave-run.c | 122 ++++++++ .../autovec/vls-vlmax/shuffle-interleave.c | 69 +++++ .../rvv/autovec/vls-vlmax/shuffle-slide-run.c | 266 ++++++++++++++++ .../rvv/autovec/vls-vlmax/shuffle-slide.c | 207 +++++++++++++ gcc/varasm.cc | 2 +- 12 files changed, 1152 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 500b357f6eb..ecb4e64cdf8 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -139,6 +139,7 @@ extern void riscv_expand_ussub (rtx, rtx, rtx); extern void riscv_expand_sssub (rtx, rtx, rtx); extern void riscv_expand_ustrunc (rtx, rtx); extern void riscv_expand_sstrunc (rtx, rtx); +extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t); #ifdef RTX_CODE extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0); diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index a0e22b6454b..76ee95d5b21 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -3377,6 +3377,245 @@ shuffle_compress_patterns (struct expand_vec_perm_d *d) return true; } +/* Recognize patterns like [4 5 6 7 12 13 14 15] where either the lower + or the higher parts of both vectors are combined into one. */ + +static bool +shuffle_slide_patterns (struct expand_vec_perm_d *d) +{ + machine_mode vmode = d->vmode; + poly_int64 vec_len = d->perm.length (); + + if (!vec_len.is_constant ()) + return false; + + int vlen = vec_len.to_constant (); + if (vlen < 4) + return false; + + if (d->one_vector_p) + return false; + + /* For a slideup OP0 can stay, for a slidedown OP1 can. + The former requires that the first element of the permutation + is the first element of OP0, the latter that the last permutation + element is the last element of OP1. */ + bool slideup = false; + bool slidedown = false; + + /* For a slideup the permutation must start at OP0's first element. */ + if (known_eq (d->perm[0], 0)) + slideup = true; + + /* For a slidedown the permutation must end at OP1's last element. */ + if (known_eq (d->perm[vlen - 1], 2 * vlen - 1)) + slidedown = true; + + if (slideup && slidedown) + return false; + + if (!slideup && !slidedown) + return false; + + /* Check for a monotonic sequence with one pivot. */ + int pivot = -1; + for (int i = 0; i < vlen; i++) + { + if (pivot == -1 && known_ge (d->perm[i], vec_len)) + pivot = i; + if (i > 0 && i != pivot + && maybe_ne (d->perm[i], d->perm[i - 1] + 1)) + return false; + } + + if (pivot == -1) + return false; + + /* For a slideup OP1's part (to be slid up) must be a low part, + i.e. starting with its first element. 
*/ + if (slideup && maybe_ne (d->perm[pivot], vlen)) + return false; + + /* For a slidedown OP0's part (to be slid down) must be a high part, + i.e. ending with its last element. */ + if (slidedown && maybe_ne (d->perm[pivot - 1], vlen - 1)) + return false; + + /* Success! */ + if (d->testing_p) + return true; + + /* PIVOT is the start of the lower/higher part of OP1 or OP2. + For a slideup it indicates how many elements of OP1 to + skip/slide over. For a slidedown it indicates how long + OP1's high part is, while VLEN - PIVOT is the amount to slide. */ + int slide_cnt = slideup ? pivot : vlen - pivot; + insn_code icode; + if (slideup) + { + /* No need for a vector length because we slide up until the + end of OP1 anyway. */ + rtx ops[] = {d->target, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)}; + icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode); + emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops); + } + else + { + /* Here we need a length because we slide to the beginning of OP1 + leaving the remaining elements undisturbed. */ + int len = pivot; + rtx ops[] = {d->target, d->op1, d->op0, + gen_int_mode (slide_cnt, Pmode)}; + icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode); + emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops, + gen_int_mode (len, Pmode)); + } + + return true; +} + +/* Recognize interleaving patterns like [0 4 1 5]. */ + +static bool +shuffle_interleave_patterns (struct expand_vec_perm_d *d) +{ + machine_mode vmode = d->vmode; + machine_mode sel_mode = related_int_vector_mode (vmode).require (); + poly_int64 vec_len = d->perm.length (); + int n_patterns = d->perm.encoding ().npatterns (); + + if (!vec_len.is_constant ()) + return false; + + if (n_patterns != 2) + return false; + + int vlen = vec_len.to_constant (); + + if (vlen < 4 || vlen > 64) + return false; + + if (d->one_vector_p) + return false; + + bool low = true; + if (d->perm.series_p (0, 2, 0, 1) + && d->perm.series_p (1, 2, vlen, 1)) + low = true; + else if (d->perm.series_p (0, 2, vlen / 2, 1) + && d->perm.series_p (1, 2, vlen + vlen / 2, 1)) + low = false; + else + return false; + + vec_perm_builder sel (vlen, 2, 1); + sel.safe_grow (vlen); + int cnt = 0; + for (int i = 0; i < vlen; i += 2) + { + sel[i] = cnt; + sel[i + 1] = cnt + vlen / 2; + cnt++; + } + + vec_perm_indices indices (sel, 2, vlen); + + if (vlen != (int)indices.length ().to_constant ()) + return false; + + /* Success! */ + if (d->testing_p) + return true; + + int slide_cnt = vlen / 2; + rtx tmp = gen_reg_rtx (vmode); + + if (low) + { + /* No need for a vector length because we slide up until the + end of OP1 anyway. */ + rtx ops[] = {tmp, d->op0, d->op1, gen_int_mode (slide_cnt, Pmode)}; + insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode); + emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops); + } + else + { + rtx ops[] = {tmp, d->op1, d->op0, gen_int_mode (slide_cnt, Pmode)}; + insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, vmode); + emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops, + gen_int_mode (slide_cnt, Pmode)); + } + + rtx sel_rtx = vec_perm_indices_to_rtx (sel_mode, indices); + emit_vlmax_gather_insn (gen_lowpart (vmode, d->target), tmp, sel_rtx); + + return true; +} + + +/* Recognize even/odd patterns like [0 2 4 6]. 
We use two compress + and one slideup.j */ + +static bool +shuffle_evenodd_patterns (struct expand_vec_perm_d *d) +{ + machine_mode vmode = d->vmode; + poly_int64 vec_len = d->perm.length (); + int n_patterns = d->perm.encoding ().npatterns (); + + if (n_patterns != 1) + return false; + + if (!vec_len.is_constant ()) + return false; + + int vlen = vec_len.to_constant (); + if (vlen < 4 || vlen > 64) + return false; + + if (d->one_vector_p) + return false; + + bool even = true; + if (!d->perm.series_p (0, 1, 0, 2)) + { + even = false; + if (!d->perm.series_p (0, 1, 1, 2)) + return false; + } + + /* Success! */ + if (d->testing_p) + return true; + + machine_mode mask_mode = get_mask_mode (vmode); + rvv_builder builder (mask_mode, vlen, 1); + int bit = even ? 0 : 1; + for (int i = 0; i < vlen; i++) + { + bit ^= 1; + if (bit) + builder.quick_push (CONST1_RTX (BImode)); + else + builder.quick_push (CONST0_RTX (BImode)); + } + rtx mask = force_reg (mask_mode, builder.build ()); + + insn_code icode = code_for_pred_compress (vmode); + rtx ops1[] = {d->target, d->op0, mask}; + emit_vlmax_insn (icode, COMPRESS_OP, ops1); + + rtx tmp2 = gen_reg_rtx (vmode); + rtx ops2[] = {tmp2, d->op1, mask}; + emit_vlmax_insn (icode, COMPRESS_OP, ops2); + + rtx ops[] = {d->target, d->target, tmp2, gen_int_mode (vlen / 2, Pmode)}; + icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode); + emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops); + + return true; +} + /* Recognize decompress patterns: 1. VEC_PERM_EXPR op0 and op1 @@ -3561,17 +3800,39 @@ shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d) if (d->testing_p) return true; - /* Extract the last element of the first vector. */ - scalar_mode smode = GET_MODE_INNER (d->vmode); - rtx tmp = gen_reg_rtx (smode); - emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode)); + int scalar_cost = riscv_register_move_cost (d->vmode, V_REGS, GR_REGS) + + riscv_register_move_cost (d->vmode, GR_REGS, V_REGS) + 2; + int slide_cost = 2; + + if (slide_cost < scalar_cost) + { + /* This variant should always be preferable because we just need two + slides. The extract-variant also requires two slides but additionally + pays the latency for register-file crossing. */ + rtx tmp = gen_reg_rtx (d->vmode); + rtx ops[] = {tmp, d->op1, gen_int_mode (1, Pmode)}; + insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, d->vmode); + emit_vlmax_insn (icode, BINARY_OP, ops); + + rtx ops2[] = {d->target, tmp, d->op0, gen_int_mode (nunits - 1, Pmode)}; + icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode); + emit_nonvlmax_insn (icode, BINARY_OP_TUMA, ops2, gen_int_mode (1, Pmode)); + } + else + { + /* Extract the last element of the first vector. */ + scalar_mode smode = GET_MODE_INNER (d->vmode); + rtx tmp = gen_reg_rtx (smode); + emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode)); + + /* Insert the scalar into element 0. */ + unsigned int unspec + = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; + insn_code icode = code_for_pred_slide (unspec, d->vmode); + rtx ops[] = {d->target, d->op1, tmp}; + emit_vlmax_insn (icode, BINARY_OP, ops); + } - /* Insert the scalar into element 0. */ - unsigned int unspec - = FLOAT_MODE_P (d->vmode) ? 
UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; - insn_code icode = code_for_pred_slide (unspec, d->vmode); - rtx ops[] = {d->target, d->op1, tmp}; - emit_vlmax_insn (icode, BINARY_OP, ops); return true; } @@ -3691,6 +3952,12 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; if (shuffle_consecutive_patterns (d)) return true; + if (shuffle_slide_patterns (d)) + return true; + if (shuffle_interleave_patterns (d)) + return true; + if (shuffle_evenodd_patterns (d)) + return true; if (shuffle_compress_patterns (d)) return true; if (shuffle_decompress_patterns (d)) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 7694954c4c5..62b80fefedd 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -9464,7 +9464,7 @@ riscv_secondary_memory_needed (machine_mode mode, reg_class_t class1, /* Implement TARGET_REGISTER_MOVE_COST. */ -static int +int riscv_register_move_cost (machine_mode mode, reg_class_t from, reg_class_t to) { @@ -9472,6 +9472,22 @@ riscv_register_move_cost (machine_mode mode, (from == GR_REGS && to == FP_REGS)) return tune_param->fmv_cost; + if (from == V_REGS) + { + if (to == GR_REGS) + return get_vector_costs ()->regmove->VR2GR; + else if (to == FP_REGS) + return get_vector_costs ()->regmove->VR2FR; + } + + if (to == V_REGS) + { + if (from == GR_REGS) + return get_vector_costs ()->regmove->GR2VR; + else if (from == FP_REGS) + return get_vector_costs ()->regmove->FR2VR; + } + return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2; } diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index d05efac20dc..d00e5521059 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -7404,7 +7404,7 @@ native_encode_rtx (machine_mode mode, rtx x, vec<target_unit> &bytes, /* Make sure that the region is in range. 
*/ unsigned int end_byte = first_byte + num_bytes; - unsigned int mode_bytes = GET_MODE_SIZE (smode); + unsigned int mode_bytes = GET_MODE_PRECISION (smode); gcc_assert (end_byte <= mode_bytes); if (CONST_SCALAR_INT_P (x)) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c index fd87565b054..79d87196bf7 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112599-2.c @@ -48,4 +48,4 @@ foo(void) } /* { dg-final { scan-assembler-not {vrgather} } } */ -/* { dg-final { scan-assembler-times {vslide1up\.vx} 1 } } */ +/* { dg-final { scan-assembler {vslide} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c new file mode 100644 index 00000000000..c0760e5ed30 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd-run.c @@ -0,0 +1,122 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target riscv_v_ok } */ +/* { dg-add-options riscv_v } */ +/* { dg-additional-options "-O3 -mrvv-max-lmul=m8 -std=gnu99" } */ + +#include "shuffle-evenodd.c" + +#define SERIES_2(x, y) (x), (x + 1) +#define SERIES_4(x, y) SERIES_2 (x, y), SERIES_2 (x + 2, y) +#define SERIES_8(x, y) SERIES_4 (x, y), SERIES_4 (x + 4, y) +#define SERIES_16(x, y) SERIES_8 (x, y), SERIES_8 (x + 8, y) +#define SERIES_32(x, y) SERIES_16 (x, y), SERIES_16 (x + 16, y) +#define SERIES_64(x, y) SERIES_32 (x, y), SERIES_32 (x + 32, y) + +#define comp(a, b, n) \ + for (unsigned i = 0; i < n; ++i) \ + if ((a)[i] != (b)[i]) \ + __builtin_abort (); + +#define CHECK1(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check1_##TYPE () \ + { \ + TYPE v0 = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v1 = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref = (TYPE){MASKE_##NUNITS (0, NUNITS)}; \ + TYPE res; \ + permute1_##TYPE (v0, v1, &res); \ + comp (res, ref, NUNITS); \ + } + +#define CHECK2(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check2_##TYPE () \ + { \ + TYPE v0 = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v1 = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref = (TYPE){MASKO_##NUNITS (0, NUNITS)}; \ + TYPE res; \ + permute2_##TYPE (v0, v1, &res); \ + comp (res, ref, NUNITS); \ + } + +#define CHECK_ALL(T) \ + T (vnx4qi, 4) \ + T (vnx8qi, 8) \ + T (vnx16qi, 16) \ + T (vnx32qi, 32) \ + T (vnx64qi, 64) \ + T (vnx4hi, 4) \ + T (vnx8hi, 8) \ + T (vnx16hi, 16) \ + T (vnx32hi, 32) \ + T (vnx64hi, 64) \ + T (vnx4si, 4) \ + T (vnx8si, 8) \ + T (vnx16si, 16) \ + T (vnx32si, 32) \ + T (vnx4di, 4) \ + T (vnx8di, 8) \ + T (vnx16di, 16) \ + T (vnx4sf, 4) \ + T (vnx8sf, 8) \ + T (vnx16sf, 16) \ + T (vnx32sf, 32) \ + T (vnx4df, 4) \ + T (vnx8df, 8) \ + T (vnx16df, 16) + +CHECK_ALL (CHECK1) +CHECK_ALL (CHECK2) + +int +main () +{ + check1_vnx4qi (); + check1_vnx8qi (); + check1_vnx16qi (); + check1_vnx32qi (); + check1_vnx64qi (); + check1_vnx4hi (); + check1_vnx8hi (); + check1_vnx16hi (); + check1_vnx32hi (); + check1_vnx64hi (); + check1_vnx4si (); + check1_vnx8si (); + check1_vnx16si (); + check1_vnx32si (); + check1_vnx4di (); + check1_vnx8di (); + check1_vnx16di (); + check1_vnx4sf (); + check1_vnx8sf (); + check1_vnx16sf (); + check1_vnx32sf (); + check1_vnx4df (); + check1_vnx8df (); + check1_vnx16df (); + check2_vnx4qi (); + check2_vnx8qi (); + check2_vnx16qi (); + check2_vnx32qi (); + check2_vnx64qi (); + check2_vnx4hi (); + check2_vnx8hi (); + 
check2_vnx16hi (); + check2_vnx32hi (); + check2_vnx64hi (); + check2_vnx4si (); + check2_vnx8si (); + check2_vnx16si (); + check2_vnx32si (); + check2_vnx4di (); + check2_vnx8di (); + check2_vnx16di (); + check2_vnx4sf (); + check2_vnx8sf (); + check2_vnx16sf (); + check2_vnx32sf (); + check2_vnx4df (); + check2_vnx8df (); + check2_vnx16df (); +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c new file mode 100644 index 00000000000..21570d7986e --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-evenodd.c @@ -0,0 +1,68 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mrvv-max-lmul=m8" } */ + +#include "perm.h" + +#define MASKE_2(x, y) (x), (x + 2) +#define MASKE_4(x, y) MASKE_2 (x, y), MASKE_2 (x + 4, y) +#define MASKE_8(x, y) MASKE_4 (x, y), MASKE_4 (x + 8, y) +#define MASKE_16(x, y) MASKE_8 (x, y), MASKE_8 (x + 16, y) +#define MASKE_32(x, y) MASKE_16 (x, y), MASKE_16 (x + 32, y) +#define MASKE_64(x, y) MASKE_32 (x, y), MASKE_32 (x + 64, y) + +#define MASKO_2(x, y) (x + 1), (x + 3) +#define MASKO_4(x, y) MASKO_2 (x, y), MASKO_2 (x + 4, y) +#define MASKO_8(x, y) MASKO_4 (x, y), MASKO_4 (x + 8, y) +#define MASKO_16(x, y) MASKO_8 (x, y), MASKO_8 (x + 16, y) +#define MASKO_32(x, y) MASKO_16 (x, y), MASKO_16 (x + 32, y) +#define MASKO_64(x, y) MASKO_32 (x, y), MASKO_32 (x + 64, y) + +#define PERMUTE1(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute1_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASKE_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define PERMUTE2(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute2_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASKO_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define TEST_ALL(T) \ + T (vnx4qi, 4) \ + T (vnx8qi, 8) \ + T (vnx16qi, 16) \ + T (vnx32qi, 32) \ + T (vnx64qi, 64) \ + T (vnx4hi, 4) \ + T (vnx8hi, 8) \ + T (vnx16hi, 16) \ + T (vnx32hi, 32) \ + T (vnx64hi, 64) \ + T (vnx4si, 4) \ + T (vnx8si, 8) \ + T (vnx16si, 16) \ + T (vnx32si, 32) \ + T (vnx4di, 4) \ + T (vnx8di, 8) \ + T (vnx16di, 16) \ + T (vnx4sf, 4) \ + T (vnx8sf, 8) \ + T (vnx16sf, 16) \ + T (vnx32sf, 32) \ + T (vnx4df, 4) \ + T (vnx8df, 8) \ + T (vnx16df, 16) + +TEST_ALL (PERMUTE1) +TEST_ALL (PERMUTE2) + +/* { dg-final { scan-assembler-times "vslideup" 48 } } */ +/* { dg-final { scan-assembler-times "vcompress" 96 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c new file mode 100644 index 00000000000..57748d95362 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave-run.c @@ -0,0 +1,122 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target riscv_v_ok } */ +/* { dg-add-options riscv_v } */ +/* { dg-additional-options "-O3 -mrvv-max-lmul=m8 -std=gnu99" } */ + +#include "shuffle-interleave.c" + +#define SERIES_2(x, y) (x), (x + 1) +#define SERIES_4(x, y) SERIES_2 (x, y), SERIES_2 (x + 2, y) +#define SERIES_8(x, y) SERIES_4 (x, y), SERIES_4 (x + 4, y) +#define SERIES_16(x, y) SERIES_8 (x, y), SERIES_8 (x + 8, y) +#define SERIES_32(x, y) SERIES_16 (x, y), SERIES_16 (x + 16, y) +#define SERIES_64(x, y) SERIES_32 (x, y), SERIES_32 (x + 32, y) + +#define comp(a, b, n) \ + for (unsigned i = 0; 
i < n; ++i) \ + if ((a)[i] != (b)[i]) \ + __builtin_abort (); + +#define CHECK1(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check1_##TYPE () \ + { \ + TYPE v0 = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v1 = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref = (TYPE){MASKL_##NUNITS (0, NUNITS)}; \ + TYPE res; \ + permute1_##TYPE (v0, v1, &res); \ + comp (res, ref, NUNITS); \ + } + +#define CHECK2(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check2_##TYPE () \ + { \ + TYPE v0 = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v1 = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref = (TYPE){MASKH_##NUNITS (0, NUNITS)}; \ + TYPE res; \ + permute2_##TYPE (v0, v1, &res); \ + comp (res, ref, NUNITS); \ + } + +#define CHECK_ALL(T) \ + T (vnx4qi, 4) \ + T (vnx8qi, 8) \ + T (vnx16qi, 16) \ + T (vnx32qi, 32) \ + T (vnx64qi, 64) \ + T (vnx4hi, 4) \ + T (vnx8hi, 8) \ + T (vnx16hi, 16) \ + T (vnx32hi, 32) \ + T (vnx64hi, 64) \ + T (vnx4si, 4) \ + T (vnx8si, 8) \ + T (vnx16si, 16) \ + T (vnx32si, 32) \ + T (vnx4di, 4) \ + T (vnx8di, 8) \ + T (vnx16di, 16) \ + T (vnx4sf, 4) \ + T (vnx8sf, 8) \ + T (vnx16sf, 16) \ + T (vnx32sf, 32) \ + T (vnx4df, 4) \ + T (vnx8df, 8) \ + T (vnx16df, 16) + +CHECK_ALL (CHECK1) +CHECK_ALL (CHECK2) + +int +main () +{ + check1_vnx4qi (); + check1_vnx8qi (); + check1_vnx16qi (); + check1_vnx32qi (); + check1_vnx64qi (); + check1_vnx4hi (); + check1_vnx8hi (); + check1_vnx16hi (); + check1_vnx32hi (); + check1_vnx64hi (); + check1_vnx4si (); + check1_vnx8si (); + check1_vnx16si (); + check1_vnx32si (); + check1_vnx4di (); + check1_vnx8di (); + check1_vnx16di (); + check1_vnx4sf (); + check1_vnx8sf (); + check1_vnx16sf (); + check1_vnx32sf (); + check1_vnx4df (); + check1_vnx8df (); + check1_vnx16df (); + check2_vnx4qi (); + check2_vnx8qi (); + check2_vnx16qi (); + check2_vnx32qi (); + check2_vnx64qi (); + check2_vnx4hi (); + check2_vnx8hi (); + check2_vnx16hi (); + check2_vnx32hi (); + check2_vnx64hi (); + check2_vnx4si (); + check2_vnx8si (); + check2_vnx16si (); + check2_vnx32si (); + check2_vnx4di (); + check2_vnx8di (); + check2_vnx16di (); + check2_vnx4sf (); + check2_vnx8sf (); + check2_vnx16sf (); + check2_vnx32sf (); + check2_vnx4df (); + check2_vnx8df (); + check2_vnx16df (); +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c new file mode 100644 index 00000000000..3e241f01871 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-interleave.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mrvv-max-lmul=m8" } */ + +#include "perm.h" + +#define MASKL_2(x, y) (x), (x + y) +#define MASKL_4(x, y) MASKL_2 (x, y), MASKL_2 (x + 1, y) +#define MASKL_8(x, y) MASKL_4 (x, y), MASKL_4 (x + 2, y) +#define MASKL_16(x, y) MASKL_8 (x, y), MASKL_8 (x + 4, y) +#define MASKL_32(x, y) MASKL_16 (x, y), MASKL_16 (x + 8, y) +#define MASKL_64(x, y) MASKL_32 (x, y), MASKL_32 (x + 16, y) + +#define MASKH_2(x, y) (x + y / 2), (x + y / 2 + y) +#define MASKH_4(x, y) MASKH_2 (x, y), MASKH_2 (x + 1, y) +#define MASKH_8(x, y) MASKH_4 (x, y), MASKH_4 (x + 2, y) +#define MASKH_16(x, y) MASKH_8 (x, y), MASKH_8 (x + 4, y) +#define MASKH_32(x, y) MASKH_16 (x, y), MASKH_16 (x + 8, y) +#define MASKH_64(x, y) MASKH_32 (x, y), MASKH_32 (x + 16, y) + +#define PERMUTE1(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute1_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, 
values2, \ + MASKL_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define PERMUTE2(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute2_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASKH_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define TEST_ALL(T) \ + T (vnx4qi, 4) \ + T (vnx8qi, 8) \ + T (vnx16qi, 16) \ + T (vnx32qi, 32) \ + T (vnx64qi, 64) \ + T (vnx4hi, 4) \ + T (vnx8hi, 8) \ + T (vnx16hi, 16) \ + T (vnx32hi, 32) \ + T (vnx64hi, 64) \ + T (vnx4si, 4) \ + T (vnx8si, 8) \ + T (vnx16si, 16) \ + T (vnx32si, 32) \ + T (vnx4di, 4) \ + T (vnx8di, 8) \ + T (vnx16di, 16) \ + T (vnx4sf, 4) \ + T (vnx8sf, 8) \ + T (vnx16sf, 16) \ + T (vnx32sf, 32) \ + T (vnx4df, 4) \ + T (vnx8df, 8) \ + T (vnx16df, 16) + +TEST_ALL (PERMUTE1) +TEST_ALL (PERMUTE2) + +/* { dg-final { scan-assembler-times "vslideup" 24 } } */ +/* { dg-final { scan-assembler-times "vslidedown" 24 } } */ +/* { dg-final { scan-assembler-times "vrgather" 48 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run.c new file mode 100644 index 00000000000..50cba3fc8f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide-run.c @@ -0,0 +1,266 @@ +/* { dg-do run } */ +/* { dg-require-effective-target riscv_v_ok } */ +/* { dg-add-options riscv_v } */ +/* { dg-additional-options "-O3 -std=gnu99 -mrvv-max-lmul=m8 -Wno-overflow" } */ + +#include "shuffle-slide.c" + +#define comp(a, b, n) \ + for (unsigned i = 0; i < n; ++i) \ + if ((a)[i] != (b)[i]) \ + __builtin_abort (); + +#define CHECK1(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check1_##TYPE () \ + { \ + TYPE v10_##TYPE = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v11_##TYPE = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref1_##TYPE = (TYPE){MASK1_##NUNITS (0, NUNITS)}; \ + TYPE res1_##TYPE; \ + permute1_##TYPE (v10_##TYPE, v11_##TYPE, &res1_##TYPE); \ + comp (res1_##TYPE, ref1_##TYPE, NUNITS); \ + } + +#define CHECK2(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check2_##TYPE () \ + { \ + TYPE v20_##TYPE = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v21_##TYPE = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref2_##TYPE = (TYPE){MASK1D_##NUNITS (0, NUNITS)}; \ + TYPE res2_##TYPE; \ + permute2_##TYPE (v20_##TYPE, v21_##TYPE, &res2_##TYPE); \ + comp (res2_##TYPE, ref2_##TYPE, NUNITS); \ + } + +#define CHECK3(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check3_##TYPE () \ + { \ + TYPE v30_##TYPE = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v31_##TYPE = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref3_##TYPE = (TYPE){MASK2U_##NUNITS (0, NUNITS)}; \ + TYPE res3_##TYPE; \ + permute3_##TYPE (v30_##TYPE, v31_##TYPE, &res3_##TYPE); \ + comp (res3_##TYPE, ref3_##TYPE, NUNITS); \ + } + +#define CHECK4(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check4_##TYPE () \ + { \ + TYPE v40_##TYPE = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v41_##TYPE = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref4_##TYPE = (TYPE){MASK3U_##NUNITS (0, NUNITS)}; \ + TYPE res4_##TYPE; \ + permute4_##TYPE (v40_##TYPE, v41_##TYPE, &res4_##TYPE); \ + comp (res4_##TYPE, ref4_##TYPE, NUNITS); \ + } + +#define CHECK5(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check5_##TYPE () \ + { \ + TYPE v50_##TYPE = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v51_##TYPE = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref5_##TYPE = (TYPE){MASK2D_##NUNITS (0, 
NUNITS)}; \ + TYPE res5_##TYPE; \ + permute5_##TYPE (v50_##TYPE, v51_##TYPE, &res5_##TYPE); \ + comp (res5_##TYPE, ref5_##TYPE, NUNITS); \ + } + +#define CHECK6(TYPE, NUNITS) \ + __attribute__ ((noipa)) void check6_##TYPE () \ + { \ + TYPE v60_##TYPE = (TYPE){SERIES_##NUNITS (0, NUNITS)}; \ + TYPE v61_##TYPE = (TYPE){SERIES_##NUNITS (NUNITS, NUNITS)}; \ + TYPE ref6_##TYPE = (TYPE){MASK3D_##NUNITS (0, NUNITS)}; \ + TYPE res6_##TYPE; \ + permute6_##TYPE (v60_##TYPE, v61_##TYPE, &res6_##TYPE); \ + comp (res6_##TYPE, ref6_##TYPE, NUNITS); \ + } + +#define CHECK_ALL(T) \ + T (vnx4qi, 4) \ + T (vnx8qi, 8) \ + T (vnx16qi, 16) \ + T (vnx32qi, 32) \ + T (vnx64qi, 64) \ + T (vnx128qi, 128) \ + T (vnx4hi, 4) \ + T (vnx8hi, 8) \ + T (vnx16hi, 16) \ + T (vnx32hi, 32) \ + T (vnx64hi, 64) \ + T (vnx4si, 4) \ + T (vnx8si, 8) \ + T (vnx16si, 16) \ + T (vnx32si, 32) \ + T (vnx4di, 4) \ + T (vnx8di, 8) \ + T (vnx16di, 16) \ + T (vnx4sf, 4) \ + T (vnx8sf, 8) \ + T (vnx16sf, 16) \ + T (vnx32sf, 32) \ + T (vnx4df, 4) \ + T (vnx8df, 8) \ + T (vnx16df, 16) + +CHECK_ALL (CHECK1) +CHECK_ALL (CHECK2) +CHECK_ALL (CHECK3) +CHECK_ALL (CHECK4) +CHECK_ALL (CHECK5) +CHECK_ALL (CHECK6) + +int +main () +{ + check1_vnx4qi (); + check1_vnx8qi (); + check1_vnx16qi (); + check1_vnx32qi (); + check1_vnx64qi (); + check1_vnx128qi (); + check1_vnx4hi (); + check1_vnx8hi (); + check1_vnx16hi (); + check1_vnx32hi (); + check1_vnx64hi (); + check1_vnx4si (); + check1_vnx8si (); + check1_vnx16si (); + check1_vnx32si (); + check1_vnx4di (); + check1_vnx8di (); + check1_vnx16di (); + check1_vnx4sf (); + check1_vnx8sf (); + check1_vnx16sf (); + check1_vnx32sf (); + check1_vnx4df (); + check1_vnx8df (); + check1_vnx16df (); + check2_vnx4qi (); + check2_vnx8qi (); + check2_vnx16qi (); + check2_vnx32qi (); + check2_vnx64qi (); + check2_vnx128qi (); + check2_vnx4hi (); + check2_vnx8hi (); + check2_vnx16hi (); + check2_vnx32hi (); + check2_vnx64hi (); + check2_vnx4si (); + check2_vnx8si (); + check2_vnx16si (); + check2_vnx32si (); + check2_vnx4di (); + check2_vnx8di (); + check2_vnx16di (); + check2_vnx4sf (); + check2_vnx8sf (); + check2_vnx16sf (); + check2_vnx32sf (); + check2_vnx4df (); + check2_vnx8df (); + check2_vnx16df (); + check3_vnx4qi (); + check3_vnx8qi (); + check3_vnx16qi (); + check3_vnx32qi (); + check3_vnx64qi (); + check3_vnx128qi (); + check3_vnx4hi (); + check3_vnx8hi (); + check3_vnx16hi (); + check3_vnx32hi (); + check3_vnx64hi (); + check3_vnx4si (); + check3_vnx8si (); + check3_vnx16si (); + check3_vnx32si (); + check3_vnx4di (); + check3_vnx8di (); + check3_vnx16di (); + check3_vnx4sf (); + check3_vnx8sf (); + check3_vnx16sf (); + check3_vnx32sf (); + check3_vnx4df (); + check3_vnx8df (); + check3_vnx16df (); + check4_vnx4qi (); + check4_vnx8qi (); + check4_vnx16qi (); + check4_vnx32qi (); + check4_vnx64qi (); + check4_vnx128qi (); + check4_vnx4hi (); + check4_vnx8hi (); + check4_vnx16hi (); + check4_vnx32hi (); + check4_vnx64hi (); + check4_vnx4si (); + check4_vnx8si (); + check4_vnx16si (); + check4_vnx32si (); + check4_vnx4di (); + check4_vnx8di (); + check4_vnx16di (); + check4_vnx4sf (); + check4_vnx8sf (); + check4_vnx16sf (); + check4_vnx32sf (); + check4_vnx4df (); + check4_vnx8df (); + check4_vnx16df (); + check5_vnx4qi (); + check5_vnx8qi (); + check5_vnx16qi (); + check5_vnx32qi (); + check5_vnx64qi (); + check5_vnx128qi (); + check5_vnx4hi (); + check5_vnx8hi (); + check5_vnx16hi (); + check5_vnx32hi (); + check5_vnx64hi (); + check5_vnx4si (); + check5_vnx8si (); + check5_vnx16si (); + check5_vnx32si 
(); + check5_vnx4di (); + check5_vnx8di (); + check5_vnx16di (); + check5_vnx4sf (); + check5_vnx8sf (); + check5_vnx16sf (); + check5_vnx32sf (); + check5_vnx4df (); + check5_vnx8df (); + check5_vnx16df (); + check6_vnx4qi (); + check6_vnx8qi (); + check6_vnx16qi (); + check6_vnx32qi (); + check6_vnx64qi (); + check6_vnx128qi (); + check6_vnx4hi (); + check6_vnx8hi (); + check6_vnx16hi (); + check6_vnx32hi (); + check6_vnx64hi (); + check6_vnx4si (); + check6_vnx8si (); + check6_vnx16si (); + check6_vnx32si (); + check6_vnx4di (); + check6_vnx8di (); + check6_vnx16di (); + check6_vnx4sf (); + check6_vnx8sf (); + check6_vnx16sf (); + check6_vnx32sf (); + check6_vnx4df (); + check6_vnx8df (); + check6_vnx16df (); +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide.c new file mode 100644 index 00000000000..4f40094746c --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/shuffle-slide.c @@ -0,0 +1,207 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mrvv-max-lmul=m8 -Wno-overflow" } */ + +#include "perm.h" + +#define SERIES_2(x, y) (x), (x + 1) +#define SERIES_4(x, y) SERIES_2 (x, y), SERIES_2 (x + 2, y) +#define SERIES_8(x, y) SERIES_4 (x, y), SERIES_4 (x + 4, y) +#define SERIES_16(x, y) SERIES_8 (x, y), SERIES_8 (x + 8, y) +#define SERIES_32(x, y) SERIES_16 (x, y), SERIES_16 (x + 16, y) +#define SERIES_64(x, y) SERIES_32 (x, y), SERIES_32 (x + 32, y) +#define SERIES_128(x, y) SERIES_64 (x, y), SERIES_64 (x + 64, y) + +#define MASK1_4(X, Y) SERIES_2 (X, Y), SERIES_2 (X + 4, Y) +#define MASK1_8(X, Y) SERIES_4 (X, Y), SERIES_4 (X + 8, Y) +#define MASK1_16(X, Y) SERIES_8 (X, Y), SERIES_8 (X + 16, Y) +#define MASK1_32(X, Y) SERIES_16 (X, Y), SERIES_16 (X + 32, Y) +#define MASK1_64(X, Y) SERIES_32 (X, Y), SERIES_32 (X + 64, Y) +#define MASK1_128(X, Y) SERIES_64 (X, Y), SERIES_64 (X + 128, Y) + +#define MASK1D_4(X, Y) SERIES_2 (X + 2, Y), SERIES_2 (X + 6, Y) +#define MASK1D_8(X, Y) SERIES_4 (X + 4, Y), SERIES_4 (X + 12, Y) +#define MASK1D_16(X, Y) SERIES_8 (X + 8, Y), SERIES_8 (X + 24, Y) +#define MASK1D_32(X, Y) SERIES_16 (X + 16, Y), SERIES_16 (X + 48, Y) +#define MASK1D_64(X, Y) SERIES_32 (X + 32, Y), SERIES_32 (X + 96, Y) +#define MASK1D_128(X, Y) SERIES_64 (X + 64, Y), SERIES_64 (X + 192, Y) + +#define MASK2U_4(X, Y) 0, 1, 2, 4 +#define MASK2U_8(X, Y) 0, 1, 2, 3, 4, 5, 6, 8 +#define MASK2U_16(X, Y) 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16 +#define MASK2U_32(X, Y) \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32 +#define MASK2U_64(X, Y) \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, \ + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, \ + 57, 58, 59, 60, 61, 62, 64 +#define MASK2U_128(X, Y) \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, \ + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, \ + 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, \ + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, \ + 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, \ + 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, \ + 124, 125, 126, 128 + 
+#define MASK3U_4(X, Y) 0, 4, 5, 6 +#define MASK3U_8(X, Y) 0, 8, 9, 10, 11, 12, 13, 14 +#define MASK3U_16(X, Y) \ + 0, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 +#define MASK3U_32(X, Y) \ + 0, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, \ + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62 +#define MASK3U_64(X, Y) \ + 0, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, \ + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, \ + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, \ + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126 +#define MASK3U_128(X, Y) \ + 0, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, \ + 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, \ + 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, \ + 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, \ + 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, \ + 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, \ + 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, \ + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, \ + 247, 248, 249, 250, 251, 252, 253, 254 + +#define MASK2D_4(X, Y) 1, 2, 3, 7 +#define MASK2D_8(X, Y) 1, 2, 3, 4, 5, 6, 7, 15 +#define MASK2D_16(X, Y) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 31 +#define MASK2D_32(X, Y) \ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, \ + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 63 +#define MASK2D_64(X, Y) \ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, \ + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, \ + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, \ + 58, 59, 60, 61, 62, 63, 127 +#define MASK2D_128(X, Y) \ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, \ + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, \ + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, \ + 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, \ + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, \ + 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, \ + 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, \ + 125, 126, 127, 255 + +#define MASK3D_4(X, Y) 3, 5, 6, 7 +#define MASK3D_8(X, Y) 7, 9, 10, 11, 12, 13, 14, 15 +#define MASK3D_16(X, Y) \ + 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +#define MASK3D_32(X, Y) \ + 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, \ + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 +#define MASK3D_64(X, Y) \ + 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, \ + 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, \ + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, \ + 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 +#define MASK3D_128(X, Y) \ + 127, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, \ + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, \ + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, \ + 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 
187, \ + 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, \ + 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, \ + 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, \ + 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, \ + 248, 249, 250, 251, 252, 253, 254, 255 + +#define PERMUTE1(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute1_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASK1_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define PERMUTE2(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute2_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASK1D_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define PERMUTE3(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute3_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASK2U_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define PERMUTE4(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute4_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASK3U_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define PERMUTE5(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute5_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASK2D_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define PERMUTE6(TYPE, NUNITS) \ + __attribute__ ((noipa)) void permute6_##TYPE (TYPE values1, TYPE values2, \ + TYPE *out) \ + { \ + TYPE v = __builtin_shufflevector (values1, values2, \ + MASK3D_##NUNITS (0, NUNITS)); \ + *(TYPE *) out = v; \ + } + +#define TEST_ALL(T) \ + T (vnx4qi, 4) \ + T (vnx8qi, 8) \ + T (vnx16qi, 16) \ + T (vnx32qi, 32) \ + T (vnx64qi, 64) \ + T (vnx128qi, 128) \ + T (vnx4hi, 4) \ + T (vnx8hi, 8) \ + T (vnx16hi, 16) \ + T (vnx32hi, 32) \ + T (vnx64hi, 64) \ + T (vnx4si, 4) \ + T (vnx8si, 8) \ + T (vnx16si, 16) \ + T (vnx32si, 32) \ + T (vnx4di, 4) \ + T (vnx8di, 8) \ + T (vnx16di, 16) \ + T (vnx4sf, 4) \ + T (vnx8sf, 8) \ + T (vnx16sf, 16) \ + T (vnx32sf, 32) \ + T (vnx4df, 4) \ + T (vnx8df, 8) \ + T (vnx16df, 16) + +TEST_ALL (PERMUTE1) +TEST_ALL (PERMUTE2) +TEST_ALL (PERMUTE3) +TEST_ALL (PERMUTE4) +TEST_ALL (PERMUTE5) +TEST_ALL (PERMUTE6) + +/* { dg-final { scan-assembler-times "vslideup" 75 } } */ +/* { dg-final { scan-assembler-times "vslidedown" 75 } } */ +/* { dg-final { scan-assembler-not "vrgather" } } */ +/* { dg-final { scan-assembler-not "vmerge" } } */ diff --git a/gcc/varasm.cc b/gcc/varasm.cc index 0712b486029..2f1e375c13d 100644 --- a/gcc/varasm.cc +++ b/gcc/varasm.cc @@ -4500,7 +4500,7 @@ optimize_constant_pool (struct rtx_constant_pool *pool) constant_descriptor_rtx_data *data = data_pool.allocate (); data->desc = desc; data->bytes = NULL; - data->size = GET_MODE_SIZE (desc->mode); + data->size = GET_MODE_PRECISION (desc->mode); data->offset = 0; data->hash = idx++; size += data->size; -- 2.47.0