This patch introduces multiplicative cost scaling (x2/x4/x8) to model
the higher latency and register pressure of large LMULs. The scaling
is applied uniformly in adjust_stmt_cost for all vector operations.
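
As a standalone sketch of the arithmetic (illustrative only; the real
helper is get_lmul_cost_scaling in the diff below), the adjustment is a
plain multiply by the LMUL group size:

  /* Illustrative sketch: scale a base statement cost by the LMUL
     group size.  m1 and the fractional LMULs keep the 1x baseline.  */
  static unsigned
  scale_stmt_cost (unsigned base_cost, unsigned lmul_factor)
  {
    /* lmul_factor is 1 for m1/mf2/mf4/mf8, else 2, 4 or 8.  */
    return base_cost * lmul_factor;
  }

For example, a statement costed 3 at m8 becomes 3 * 8 = 24, steering
the vectorizer toward smaller or fractional LMULs where they are
cheaper overall.
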
Performance impact: 3.69% uplift in SPEC2017 525.x264_r on an internal
uarch.

Changes from v1 based on feedback from Robin:
- Rename penalty to scaling.
- Use multiplicative factors instead of additive ones.
- Apply scaling uniformly and remove NITERS checks.

Regarding Jeff's implementation concerns: targeting GCC 17 is fine, and
there is no rush for GCC 16 unless a proper fix is found.
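
To reproduce, the new test can be compiled with its own dg-options (the
cross-compiler name below is illustrative):

  $ riscv64-unknown-elf-gcc -S -O3 -ftree-vectorize -march=rv64gcv \
      -mabi=lp64d -mrvv-max-lmul=dynamic -fdump-tree-vect-all pr122558.c

With the scaling applied, the vect dump should contain "Choosing vector
mode RVVMF2QI" instead of a larger-LMUL mode, which is what the test
asserts.
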
PR target/122558

gcc/ChangeLog:

* config/riscv/riscv-vector-costs.cc (get_lmul_cost_scaling):
New function to calculate multiplicative scaling factors.
(costs::adjust_stmt_cost): Apply LMUL scaling uniformly to all
vector statements and remove duplicate logic.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr122558.c: New test.

Signed-off-by: Zhongyao Chen <[email protected]>
---
 gcc/config/riscv/riscv-vector-costs.cc       | 50 ++++++++++++++++++
 .../gcc.target/riscv/rvv/autovec/pr122558.c  | 36 ++++++++++++++
 2 files changed, 86 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 27ced61e815..1a27b05c934 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1099,6 +1099,45 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
return 0;
}
+/* Calculate LMUL-based cost scaling factor.
+ Larger LMUL values process more data but have proportionally
+ higher latency and register pressure.
+
+ Returns the cost scaling factor based on LMUL. For LMUL > 1,
+ the factor represents the relative cost increase (2x, 4x, 8x).
+ For LMUL <= 1, returns 1 (no scaling). */
+static unsigned
+get_lmul_cost_scaling (machine_mode mode)
+{
+ if (!riscv_v_ext_vector_mode_p (mode))
+ return 1;
+
+ enum vlmul_type vlmul = get_vlmul (mode);
+
+ /* Cost scaling based on LMUL and data processed.
+ Larger LMUL values have proportionally higher latency:
+ - m1 (LMUL_1): 1x (baseline)
+ - m2 (LMUL_2): 2x (processes 2x data, ~2x latency)
+ - m4 (LMUL_4): 4x (processes 4x data, ~4x latency)
+ - m8 (LMUL_8): 8x (processes 8x data, ~8x latency)
+ - mf2/mf4/mf8: 1x (fractional LMUL, already efficient) */
+ switch (vlmul)
+ {
+ case LMUL_2:
+ return 2;
+ case LMUL_4:
+ return 4;
+ case LMUL_8:
+ return 8;
+ case LMUL_1:
+ case LMUL_F2:
+ case LMUL_F4:
+ case LMUL_F8:
+ default:
+ return 1;
+ }
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1243,6 +1282,17 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
default:
break;
}
+
+ /* Apply LMUL cost scaling uniformly to all vector operations.
+ Larger LMUL values have higher latency and register pressure,
+ which affects performance regardless of loop structure. */
+ if (vectype)
+ {
+ unsigned lmul_factor = get_lmul_cost_scaling (TYPE_MODE (vectype));
+ if (lmul_factor > 1)
+ stmt_cost *= lmul_factor;
+ }
+
return stmt_cost;
}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
new file mode 100644
index 00000000000..c9dbba64961
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -mabi=lp64d -march=rv64gcv -mrvv-max-lmul=dynamic -fdump-tree-vect-all" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-O2" "-Os" "-Og" "-Oz" } } */
+
+#include <stdint-gcc.h>
+
+void dct( int16_t d[16], int16_t dct[16] )
+{
+ int16_t tmp[16];
+ for( int i = 0; i < 4; i++ )
+ {
+ int s03 = d[i*4+0] + d[i*4+3];
+ int s12 = d[i*4+1] + d[i*4+2];
+ int d03 = d[i*4+0] - d[i*4+3];
+ int d12 = d[i*4+1] - d[i*4+2];
+ tmp[0*4+i] = s03 + s12;
+ tmp[1*4+i] = 2*d03 + d12;
+ tmp[2*4+i] = s03 - s12;
+ tmp[3*4+i] = d03 - 2*d12;
+ }
+ for( int i = 0; i < 4; i++ )
+ {
+ int s03 = tmp[i*4+0] + tmp[i*4+3];
+ int s12 = tmp[i*4+1] + tmp[i*4+2];
+ int d03 = tmp[i*4+0] - tmp[i*4+3];
+ int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+ dct[i*4+0] = s03 + s12;
+ dct[i*4+1] = 2*d03 + d12;
+ dct[i*4+2] = s03 - s12;
+ dct[i*4+3] = d03 - 2*d12;
+ }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing vector mode RVVMF2QI" "vect" } } */
--
2.43.0