Hi,

for a generic vec_construct from scalar elements we need to load each scalar
element and move it over to a vector register.  Right now we only use a cost
of 1 per element.

This patch instead uses the register-move cost plus the scalar_to_vec cost
and multiplies the sum by the number of elements in the vector.
Regtested on rv64gcv_zvl512b.
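To illustrate with hypothetical cost-table values: assuming GR2VR = 2 and
scalar_to_vec_cost = 1, building a 4-element integer vector from scalars
would now be costed at (2 + 1) * 4 = 12 instead of 4, so element-wise
construction becomes less attractive relative to e.g. a vector load.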
Changes from V1:
- Added a test case.

Regards
 Robin

	PR target/118019

gcc/ChangeLog:

	* config/riscv/riscv.cc (riscv_builtin_vectorization_cost):
	Increase vec_construct cost.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/pr118019.c: New test.
---
 gcc/config/riscv/riscv.cc                     |  8 ++-
 .../gcc.target/riscv/rvv/autovec/pr118019.c   | 52 +++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index be2ebf9d9c0..aa8a4562d9a 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -12263,7 +12263,13 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
       return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
 
     case vec_construct:
-      return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
+      {
+	/* TODO: This is too pessimistic in case we can splat.  */
+	int regmove_cost = fp ? costs->regmove->FR2VR
+	  : costs->regmove->GR2VR;
+	return (regmove_cost + common_costs->scalar_to_vec_cost)
+	  * estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
+      }
 
     default:
       gcc_unreachable ();
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019.c
new file mode 100644
index 00000000000..b1431d123bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl512b -mstrict-align -mvector-strict-align" } */
+
+/* Make sure we do not construct the vector element-wise despite
+   slow misaligned scalar and vector accesses.  */
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3)	\
+  {							\
+    int t0 = s0 + s1;					\
+    int t1 = s0 - s1;					\
+    int t2 = s2 + s3;					\
+    int t3 = s2 - s3;					\
+    d0 = t0 + t2;					\
+    d2 = t0 - t2;					\
+    d1 = t1 + t3;					\
+    d3 = t1 - t3;					\
+  }
+
+uint32_t
+abs2 (uint32_t a)
+{
+  uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
+  return (a + s) ^ s;
+}
+
+int
+x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
+{
+  uint32_t tmp[4][4];
+  uint32_t a0, a1, a2, a3;
+  int sum = 0;
+  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+      HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
+    }
+  for (int i = 0; i < 4; i++)
+    {
+      HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+      sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
+    }
+  return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
+}
+
+/* { dg-final { scan-assembler-not "lbu" } } */
-- 
2.47.1