> This patch looks odd to me.
> I don't see memrefs in the trunk code.
It's on top of the vle/vse offset handling patch from a while back that I
haven't committed yet.

> Also, I prefer list all cost in cost tune info for NF = 2 ~ 8 like ARM SVE
> does:

I don't mind having separate costs for each, but I figured they scale with
the number of vectors anyway.  The attached v2 is more similar to aarch64.

Regards
 Robin

Subject: [PATCH v2] RISC-V: Add initial cost handling for segment
loads/stores.

This patch makes segment loads and stores more expensive.  It adds
segment_permute_2 (as well as _4 and _8) cost fields to the common vector
costs and adds handling to adjust_stmt_cost.
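For illustration (a hypothetical example, not part of the patch): a loop
accessing two interleaved fields like

  /* Illustrative only.  With a group size of 2 the interleaved accesses
     below can be vectorized as segment loads/stores
     (e.g. vlseg2e32.v/vsseg2e32.v), i.e. VMAT_LOAD_STORE_LANES.  */
  struct pixel { float r; float g; };

  void
  scale (struct pixel *p, int n)
  {
    for (int i = 0; i < n; i++)
      {
        p[i].r *= 2.0f;
        p[i].g *= 2.0f;
      }
  }

would then have each of the group's vector load/store statements charged a
segment_permute_2 cost on top of the plain load/store cost.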
gcc/ChangeLog:

	* config/riscv/riscv-protos.h (struct common_vector_cost): Add
	segment_permute cost.
	* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
	Handle segment loads/stores.
	* config/riscv/riscv.cc: Initialize segment_permute_[248] to 1.
---
 gcc/config/riscv/riscv-protos.h        |   5 +
 gcc/config/riscv/riscv-vector-costs.cc | 139 +++++++++++++++++--------
 gcc/config/riscv/riscv.cc              |   6 ++
 3 files changed, 108 insertions(+), 42 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..9b737aca1a3 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,11 @@ struct common_vector_cost
   const int gather_load_cost;
   const int scatter_store_cost;
 
+  /* Segment load/store permute cost.  */
+  const int segment_permute_2;
+  const int segment_permute_4;
+  const int segment_permute_8;
+
   /* Cost of a vector-to-scalar operation.  */
   const int vec_to_scalar_cost;
 
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..c8178d71101 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,25 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
   return vector_costs::better_main_loop_than_p (other);
 }
 
+/* Return the group size, i.e. the number of vectors to be loaded or
+   stored by a segment load/store instruction.  Return 0 if it is not a
+   segment load/store.  */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+			      stmt_vec_info stmt_info)
+{
+  if (stmt_info
+      && (kind == vector_load || kind == vector_store)
+      && STMT_VINFO_DATA_REF (stmt_info))
+    {
+      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      if (stmt_info
+	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+	return DR_GROUP_SIZE (stmt_info);
+    }
+  return 0;
+}
+
 /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
    For some statement, we would like to further fine-grain tweak the cost on
    top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1086,91 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
 	case vector_load:
 	case vector_store:
 	  {
-	    /* Unit-stride vector loads and stores do not have offset addressing
-	       as opposed to scalar loads and stores.
-	       If the address depends on a variable we need an additional
-	       add/sub for each load/store in the worst case.  */
-	    if (stmt_info && stmt_info->stmt)
+	    if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
 	      {
-		data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
-		class loop *father = stmt_info->stmt->bb->loop_father;
-		if (!loop && father && !father->inner && father->superloops)
+		/* Segment loads and stores.  When the group size is > 1
+		   the vectorizer will add a vector load/store statement for
+		   each vector in the group.  Here we additionally add permute
+		   costs for each.  */
+		/* TODO: Indexed and ordered/unordered cost.  */
+		int group_size = segment_loadstore_group_size (kind, stmt_info);
+		if (group_size > 1)
+		  {
+		    switch (group_size)
+		      {
+		      case 2:
+			if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			  stmt_cost += costs->vla->segment_permute_2;
+			else
+			  stmt_cost += costs->vls->segment_permute_2;
+			break;
+		      case 4:
+			if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			  stmt_cost += costs->vla->segment_permute_4;
+			else
+			  stmt_cost += costs->vls->segment_permute_4;
+			break;
+		      case 8:
+			if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+			  stmt_cost += costs->vla->segment_permute_8;
+			else
+			  stmt_cost += costs->vls->segment_permute_8;
+			break;
+		      default:
+			break;
+		      }
+		  }
+		else
 		  {
-		    tree ref;
-		    if (TREE_CODE (dr->ref) != MEM_REF
-			|| !(ref = TREE_OPERAND (dr->ref, 0))
-			|| TREE_CODE (ref) != SSA_NAME)
-		      break;
+		    /* Unit-stride vector loads and stores do not have offset
+		       addressing as opposed to scalar loads and stores.
+		       If the address depends on a variable we need an additional
+		       add/sub for each load/store in the worst case.  */
+		    data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+		    class loop *father = stmt_info->stmt->bb->loop_father;
+		    if (!loop && father && !father->inner && father->superloops)
+		      {
+			tree ref;
+			if (TREE_CODE (dr->ref) != MEM_REF
+			    || !(ref = TREE_OPERAND (dr->ref, 0))
+			    || TREE_CODE (ref) != SSA_NAME)
+			  break;
 
-		    if (SSA_NAME_IS_DEFAULT_DEF (ref))
-		      break;
+			if (SSA_NAME_IS_DEFAULT_DEF (ref))
+			  break;
 
-		    if (memrefs.contains ({ref, cst0}))
-		      break;
+			if (memrefs.contains ({ref, cst0}))
+			  break;
 
-		    memrefs.add ({ref, cst0});
+			memrefs.add ({ref, cst0});
 
-		    /* In case we have not seen REF before and the base address
-		       is a pointer operation try a bit harder.  */
-		    tree base = DR_BASE_ADDRESS (dr);
-		    if (TREE_CODE (base) == POINTER_PLUS_EXPR
-			|| TREE_CODE (base) == POINTER_DIFF_EXPR)
-		      {
-			/* Deconstruct BASE's first operand.  If it is a binary
-			   operation, i.e. a base and an "offset" store this
-			   pair.  Only increase the stmt_cost if we haven't seen
-			   it before.  */
-			tree argp = TREE_OPERAND (base, 1);
-			typedef std::pair<tree, tree> addr_pair;
-			addr_pair pair;
-			if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+			/* In case we have not seen REF before and the base
+			   address is a pointer operation try a bit harder.  */
+			tree base = DR_BASE_ADDRESS (dr);
+			if (TREE_CODE (base) == POINTER_PLUS_EXPR
+			    || TREE_CODE (base) == POINTER_DIFF_EXPR)
 			  {
-			    tree argp0 = tree_strip_nop_conversions
-			      (TREE_OPERAND (argp, 0));
-			    tree argp1 = TREE_OPERAND (argp, 1);
-			    pair = addr_pair (argp0, argp1);
-			    if (memrefs.contains (pair))
-			      break;
-
-			    memrefs.add (pair);
-			    stmt_cost += builtin_vectorization_cost (scalar_stmt,
-								     NULL_TREE, 0);
+			    /* Deconstruct BASE's first operand.  If it is a
+			       binary operation, i.e. a base and an "offset"
+			       store this pair.  Only increase the stmt_cost if
+			       we haven't seen it before.  */
+			    tree argp = TREE_OPERAND (base, 1);
+			    typedef std::pair<tree, tree> addr_pair;
+			    addr_pair pair;
+			    if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+			      {
+				tree argp0 = tree_strip_nop_conversions
+				  (TREE_OPERAND (argp, 0));
+				tree argp1 = TREE_OPERAND (argp, 1);
+				pair = addr_pair (argp0, argp1);
+				if (memrefs.contains (pair))
+				  break;
+
+				memrefs.add (pair);
+				stmt_cost
+				  += builtin_vectorization_cost (scalar_stmt,
+								 NULL_TREE, 0);
+			      }
 			  }
 		      }
 		  }
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..141278ec35e 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -365,6 +365,9 @@ static const common_vector_cost rvv_vls_vector_cost = {
   1, /* fp_stmt_cost */
   1, /* gather_load_cost */
   1, /* scatter_store_cost */
+  1, /* segment_permute (2) */
+  1, /* segment_permute (4) */
+  1, /* segment_permute (8) */
   1, /* vec_to_scalar_cost */
   1, /* scalar_to_vec_cost */
   1, /* permute_cost */
@@ -381,6 +384,9 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
   1, /* fp_stmt_cost */
   1, /* gather_load_cost */
   1, /* scatter_store_cost */
+  1, /* segment_permute (2) */
+  1, /* segment_permute (4) */
+  1, /* segment_permute (8) */
   1, /* vec_to_scalar_cost */
   1, /* scalar_to_vec_cost */
   1, /* permute_cost */
-- 
2.43.2