vec_insert accepts 3 arguments: arg0 is the input vector, arg1 is the value to be inserted, and arg2 is the position at which to insert arg1 into arg0. This patch adds __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert so that it is not expanded too early in the gimple stage when arg2 is a variable, to avoid generating store-hit-load instructions.
For Power9 V4SI: addi 9,1,-16 rldic 6,6,2,60 stxv 34,-16(1) stwx 5,9,6 lxv 34,-16(1) => addis 9,2,.LC0@toc@ha addi 9,9,.LC0@toc@l mtvsrwz 33,5 lxv 32,0(9) sradi 9,6,2 addze 9,9 sldi 9,9,2 subf 9,9,6 subfic 9,9,3 sldi 9,9,2 subfic 9,9,20 lvsl 13,0,9 xxperm 33,33,45 xxperm 32,32,45 xxsel 34,34,33,32 Though the instruction count increases from 5 to 15, performance is improved by 60% in typical cases. gcc/ChangeLog: * config/rs6000/altivec.md (altivec_lvsl_reg_<mode>2): Extend to SDI mode. * config/rs6000/rs6000-builtin.def (BU_VSX_X): Add support macros for vec_insert built-in functions. * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Generate built-in calls for vec_insert. * config/rs6000/rs6000-call.c (altivec_expand_vec_insert_builtin): New function. (altivec_expand_builtin): Add case entry for VSX_BUILTIN_VEC_INSERT_V16QI, VSX_BUILTIN_VEC_INSERT_V8HI, VSX_BUILTIN_VEC_INSERT_V4SF, VSX_BUILTIN_VEC_INSERT_V4SI, VSX_BUILTIN_VEC_INSERT_V2DF, VSX_BUILTIN_VEC_INSERT_V2DI. (altivec_init_builtins): Define the vec_insert built-in functions. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_insert): New declaration. * config/rs6000/rs6000.c (rs6000_expand_vector_insert): New function. * config/rs6000/rs6000.md (FQHS): New mode iterator. (FD): New mode iterator. (p8_mtvsrwz_v16qi<mode>2): New define_insn. (p8_mtvsrd_v16qi<mode>2): New define_insn. * config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2. gcc/testsuite/ChangeLog: * gcc.target/powerpc/pr79251.c: New test. 
--- gcc/config/rs6000/altivec.md | 4 +- gcc/config/rs6000/rs6000-builtin.def | 6 + gcc/config/rs6000/rs6000-c.c | 61 +++++++++ gcc/config/rs6000/rs6000-call.c | 74 +++++++++++ gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c | 146 +++++++++++++++++++++ gcc/config/rs6000/rs6000.md | 19 +++ gcc/config/rs6000/vsx.md | 2 +- gcc/testsuite/gcc.target/powerpc/pr79251.c | 23 ++++ 9 files changed, 333 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.c diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0a2e634d6b0..66b636059a6 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2772,10 +2772,10 @@ DONE; }) -(define_insn "altivec_lvsl_reg" +(define_insn "altivec_lvsl_reg_<mode>2" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SDI 1 "gpc_reg_operand" "b")] UNSPEC_LVSL_REG))] "TARGET_ALTIVEC" "lvsl %0,0,%1" diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index f9f0fece549..d095b365c14 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -2047,6 +2047,12 @@ BU_VSX_X (VEC_INIT_V2DI, "vec_init_v2di", CONST) BU_VSX_X (VEC_SET_V1TI, "vec_set_v1ti", CONST) BU_VSX_X (VEC_SET_V2DF, "vec_set_v2df", CONST) BU_VSX_X (VEC_SET_V2DI, "vec_set_v2di", CONST) +BU_VSX_X (VEC_INSERT_V16QI, "vec_insert_v16qi", CONST) +BU_VSX_X (VEC_INSERT_V8HI, "vec_insert_v8hi", CONST) +BU_VSX_X (VEC_INSERT_V4SI, "vec_insert_v4si", CONST) +BU_VSX_X (VEC_INSERT_V4SF, "vec_insert_v4sf", CONST) +BU_VSX_X (VEC_INSERT_V2DI, "vec_insert_v2di", CONST) +BU_VSX_X (VEC_INSERT_V2DF, "vec_insert_v2df", CONST) BU_VSX_X (VEC_EXT_V1TI, "vec_ext_v1ti", CONST) BU_VSX_X (VEC_EXT_V2DF, "vec_ext_v2df", CONST) BU_VSX_X (VEC_EXT_V2DI, "vec_ext_v2di", CONST) diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 
2fad3d94706..03b00738a5e 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1563,6 +1563,67 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return build_call_expr (call, 3, arg1, arg0, arg2); } + else if (VECTOR_MEM_VSX_P (mode)) + { + tree call = NULL_TREE; + + arg2 = fold_for_warn (arg2); + + /* If the second argument is variable, we can optimize it if we are + generating 64-bit code on a machine with direct move. */ + if (TREE_CODE (arg2) != INTEGER_CST && TARGET_DIRECT_MOVE_64BIT) + { + switch (mode) + { + default: + break; + + case E_V2DImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V2DI]; + break; + + case E_V2DFmode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V2DF]; + break; + + case E_V4SFmode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V4SF]; + break; + + case E_V4SImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V4SI]; + break; + + case E_V8HImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V8HI]; + break; + + case E_V16QImode: + call = rs6000_builtin_decls[VSX_BUILTIN_VEC_INSERT_V16QI]; + break; + } + } + + if (call) + { + if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) + arg2 = build_int_cst (TREE_TYPE (arg2), 0); + else + arg2 = build_binary_op ( + loc, BIT_AND_EXPR, arg2, + build_int_cst (TREE_TYPE (arg2), + TYPE_VECTOR_SUBPARTS (arg1_type) - 1), + 0); + tree result + = build_call_expr (call, 3, arg1, + convert (TREE_TYPE (arg1_type), arg0), + convert (integer_type_node, arg2)); + /* Coerce the result to vector element type. May be no-op. */ + result = fold_convert (TREE_TYPE (arg1), result); + return result; + } + } + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. 
*/ arg1_inner_type = TREE_TYPE (arg1_type); if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index e39cfcf672b..339e9ae87e3 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -10660,6 +10660,40 @@ altivec_expand_vec_set_builtin (tree exp) return op0; } +/* Expand vec_insert builtin. */ +static rtx +altivec_expand_vec_insert_builtin (tree exp, rtx target) +{ + machine_mode tmode, mode1, mode2; + tree arg0, arg1, arg2; + rtx op0 = NULL_RTX, op1, op2; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + + tmode = TYPE_MODE (TREE_TYPE (arg0)); + mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); + mode2 = TYPE_MODE ((TREE_TYPE (arg2))); + gcc_assert (VECTOR_MODE_P (tmode)); + + op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); + op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); + op2 = expand_expr (arg2, NULL_RTX, mode2, EXPAND_NORMAL); + + if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) + op1 = convert_modes (mode1, GET_MODE (op1), op1, true); + + op0 = force_reg (tmode, op0); + op1 = force_reg (mode1, op1); + op2 = force_reg (mode2, op2); + + target = gen_reg_rtx (V16QImode); + rs6000_expand_vector_insert (target, op0, op1, op2); + + return target; +} + /* Expand vec_ext builtin. 
*/ static rtx altivec_expand_vec_ext_builtin (tree exp, rtx target) @@ -10922,6 +10956,14 @@ altivec_expand_builtin (tree exp, rtx target, bool *expandedp) case VSX_BUILTIN_VEC_SET_V1TI: return altivec_expand_vec_set_builtin (exp); + case VSX_BUILTIN_VEC_INSERT_V16QI: + case VSX_BUILTIN_VEC_INSERT_V8HI: + case VSX_BUILTIN_VEC_INSERT_V4SF: + case VSX_BUILTIN_VEC_INSERT_V4SI: + case VSX_BUILTIN_VEC_INSERT_V2DF: + case VSX_BUILTIN_VEC_INSERT_V2DI: + return altivec_expand_vec_insert_builtin (exp, target); + case ALTIVEC_BUILTIN_VEC_EXT_V4SI: case ALTIVEC_BUILTIN_VEC_EXT_V8HI: case ALTIVEC_BUILTIN_VEC_EXT_V16QI: @@ -13681,6 +13723,38 @@ altivec_init_builtins (void) integer_type_node, NULL_TREE); def_builtin ("__builtin_vec_set_v2di", ftype, VSX_BUILTIN_VEC_SET_V2DI); + /* Access to the vec_insert patterns. */ + ftype = build_function_type_list (V16QI_type_node, V16QI_type_node, + intQI_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v16qi", ftype, + VSX_BUILTIN_VEC_INSERT_V16QI); + + ftype = build_function_type_list (V8HI_type_node, V8HI_type_node, + intHI_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v8hi", ftype, VSX_BUILTIN_VEC_INSERT_V8HI); + + ftype = build_function_type_list (V4SI_type_node, V4SI_type_node, + integer_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v4si", ftype, VSX_BUILTIN_VEC_INSERT_V4SI); + + ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, + float_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v4sf", ftype, VSX_BUILTIN_VEC_INSERT_V4SF); + + ftype = build_function_type_list (V2DI_type_node, V2DI_type_node, + intDI_type_node, + integer_type_node, NULL_TREE); + def_builtin ("__builtin_vec_insert_v2di", ftype, VSX_BUILTIN_VEC_INSERT_V2DI); + + ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, + double_type_node, + integer_type_node, NULL_TREE); + def_builtin 
("__builtin_vec_insert_v2df", ftype, VSX_BUILTIN_VEC_INSERT_V2DF); + /* Access to the vec_extract patterns. */ ftype = build_function_type_list (intSI_type_node, V4SI_type_node, integer_type_node, NULL_TREE); diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 28e859f4381..78b5b31d79f 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_insert (rtx, rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fe93cf6ff2b..afa845f3dff 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6788,6 +6788,152 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) emit_insn (gen_rtx_SET (target, x)); } +/* Insert value from VEC into idx of TARGET. */ + +void +rs6000_expand_vector_insert (rtx target, rtx vec, rtx val, rtx idx) +{ + machine_mode mode = GET_MODE (vec); + + if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx)) + gcc_unreachable (); + else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx) + && TARGET_DIRECT_MOVE_64BIT) + { + gcc_assert (GET_MODE (idx) == E_SImode); + machine_mode inner_mode = GET_MODE (val); + HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); + + rtx tmp = gen_reg_rtx (GET_MODE (idx)); + if (GET_MODE_SIZE (inner_mode) == 8) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 1 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx)); + /* idx = idx * 8. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3))); + /* idx = 16 - idx. 
*/ + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (3))); + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 4) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 3 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx)); + /* idx = idx * 4. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2))); + /* idx = 20 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2))); + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 2) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 7 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (7), idx)); + /* idx = idx * 2. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); + /* idx = 22 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (22), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); + emit_insn (gen_subsi3 (tmp, GEN_INT (22), idx)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 1) + if (!BYTES_BIG_ENDIAN) + emit_insn (gen_addsi3 (tmp, idx, GEN_INT (8))); + else + emit_insn (gen_subsi3 (tmp, GEN_INT (23), idx)); + else + gcc_unreachable (); + + /* lxv vs32, mask. + DImode: 0xffffffffffffffff0000000000000000 + SImode: 0x00000000ffffffff0000000000000000 + HImode: 0x000000000000ffff0000000000000000. + QImode: 0x00000000000000ff0000000000000000. 
*/ + rtx mask = gen_reg_rtx (V16QImode); + rtx mask_v2di = gen_reg_rtx (V2DImode); + rtvec v = rtvec_alloc (2); + if (!BYTES_BIG_ENDIAN) + { + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); + } + else + { + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); + } + emit_insn ( + gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); + rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0); + emit_insn (gen_rtx_SET (mask, sub_mask)); + + /* mtvsrd[wz] f0,val. */ + rtx val_v16qi = gen_reg_rtx (V16QImode); + switch (inner_mode) + { + default: + gcc_unreachable (); + break; + case E_QImode: + emit_insn (gen_p8_mtvsrwz_v16qiqi2 (val_v16qi, val)); + break; + case E_HImode: + emit_insn (gen_p8_mtvsrwz_v16qihi2 (val_v16qi, val)); + break; + case E_SImode: + emit_insn (gen_p8_mtvsrwz_v16qisi2 (val_v16qi, val)); + break; + case E_SFmode: + emit_insn (gen_p8_mtvsrwz_v16qisf2 (val_v16qi, val)); + break; + case E_DImode: + emit_insn (gen_p8_mtvsrd_v16qidi2 (val_v16qi, val)); + break; + case E_DFmode: + emit_insn (gen_p8_mtvsrd_v16qidf2 (val_v16qi, val)); + break; + } + + /* lvsl v1,0,idx. */ + rtx pcv = gen_reg_rtx (V16QImode); + emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp)); + + /* xxperm vs0,vs0,vs33. */ + /* xxperm vs32,vs32,vs33. */ + rtx val_perm = gen_reg_rtx (V16QImode); + rtx mask_perm = gen_reg_rtx (V16QImode); + emit_insn ( + gen_altivec_vperm_v8hiv16qi (val_perm, val_v16qi, val_v16qi, pcv)); + emit_insn (gen_altivec_vperm_v8hiv16qi (mask_perm, mask, mask, pcv)); + + rtx sub_target = simplify_gen_subreg (V16QImode, vec, mode, 0); + emit_insn (gen_rtx_SET (target, sub_target)); + + /* xxsel vs34,vs34,vs0,vs32. */ + emit_insn (gen_vector_select_v16qi (target, target, val_perm, mask_perm)); + } +} + /* Extract field ELT from VEC into TARGET. 
*/ void diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 43b620ae1c0..b02fda836d4 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -8713,6 +8713,25 @@ "mtvsrwz %x0,%1" [(set_attr "type" "mftgpr")]) +(define_mode_iterator FQHS [SF QI HI SI]) +(define_mode_iterator FD [DF DI]) + +(define_insn "p8_mtvsrwz_v16qi<mode>2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRWZ))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrwz %x0,%1" + [(set_attr "type" "mftgpr")]) + +(define_insn "p8_mtvsrd_v16qi<mode>2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:FD 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRD))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrd %x0,%1" + [(set_attr "type" "mftgpr")]) + (define_insn_and_split "reload_fpr_from_gpr<mode>" [(set (match_operand:FMOVE64X 0 "register_operand" "=d") (unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")] diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index dd750210758..7e82690d12d 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -5349,7 +5349,7 @@ rtx rtx_vtmp = gen_reg_rtx (V16QImode); rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2])); + emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2])); emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp)); emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp, diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c new file mode 100644 index 00000000000..877659a0146 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-require-effective-target lp64 } */ 
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ + +#include <stddef.h> +#include <altivec.h> + +#define TYPE int + +__attribute__ ((noinline)) +vector TYPE test (vector TYPE v, TYPE i, size_t n) +{ + vector TYPE v1 = v; + v1 = vec_insert (i, v, n); + + return v1; +} + +/* { dg-final { scan-assembler-not {\mstwx\M} } } */ +/* { dg-final { scan-assembler-times {\mlvsl\M} 1 } } */ +/* { dg-final { scan-assembler-times {\mxxperm\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxxsel\M} 1 } } */ -- 2.27.0.90.geebb51ba8c