From: Robin Dapp <rd...@ventanamicro.com>

This adds zero else operands to masked loads and their intrinsics.
I needed to adjust more than initially thought because we rely on
combine for several instructions and a change in a "base" pattern
needs to propagate to all those.
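For illustration only (not part of the patch; the function name below is made up):
an ACLE contiguous load such as svld1 already defines inactive lanes to be zero,
and the new operand simply makes that zero explicit as the else value of the
IFN_MASK_LOAD / RTL form instead of leaving the inactive lanes implicit:

  #include <arm_sve.h>

  /* Inactive lanes of the LD1 are defined to be zero; with this change the
     builtin expands to IFN_MASK_LOAD with an explicit zero else operand.  */
  svint32_t
  load_active (svbool_t pg, const int32_t *ptr)
  {
    return svld1_s32 (pg, ptr);
  }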
For the lack of a better idea I used a function call property to specify
whether a builtin needs an else operand or not.  Somebody with better
knowledge of the aarch64 target can surely improve that.

gcc/ChangeLog:

	* config/aarch64/aarch64-sve-builtins-base.cc: Add else handling.
	* config/aarch64/aarch64-sve-builtins.cc
	(function_expander::use_contiguous_load_insn): Ditto.
	* config/aarch64/aarch64-sve-builtins.h: Add else operand to
	contiguous load.
	* config/aarch64/aarch64-sve.md
	(@aarch64_load<SVE_PRED_LOAD:pred_load>
	_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Split and add else operand.
	(@aarch64_load_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Ditto.
	(*aarch64_load_<ANY_EXTEND:optab>_mov<SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Ditto.
	* config/aarch64/aarch64-sve2.md: Ditto.
	* config/aarch64/iterators.md: Remove unused iterators.
	* config/aarch64/predicates.md (aarch64_maskload_else_operand):
	Add zero else operand.
---
 .../aarch64/aarch64-sve-builtins-base.cc   | 46 ++++++++++------
 gcc/config/aarch64/aarch64-sve-builtins.cc |  7 ++-
 gcc/config/aarch64/aarch64-sve-builtins.h  |  2 +-
 gcc/config/aarch64/aarch64-sve.md          | 53 ++++++++++++++++---
 gcc/config/aarch64/aarch64-sve2.md         |  3 +-
 gcc/config/aarch64/iterators.md            |  4 --
 gcc/config/aarch64/predicates.md           |  4 ++
 7 files changed, 90 insertions(+), 29 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index fe16d93adcd..406ceb13a4c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1523,11 +1523,12 @@ public:
     gimple_seq stmts = NULL;
     tree pred = f.convert_pred (stmts, vectype, 0);
     tree base = f.fold_contiguous_base (stmts, vectype);
+    tree els = build_zero_cst (vectype);
     gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
 
     tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
-    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
-                                                  base, cookie, pred);
+    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 4,
+                                                  base, cookie, pred, els);
     gimple_call_set_lhs (new_call, f.lhs);
     return new_call;
   }
@@ -1537,11 +1538,14 @@
   {
     insn_code icode;
     if (e.vectors_per_tuple () == 1)
-      icode = convert_optab_handler (maskload_optab,
-                                     e.vector_mode (0), e.gp_mode (0));
+      {
+        icode = convert_optab_handler (maskload_optab,
+                                       e.vector_mode (0), e.gp_mode (0));
+        e.args.quick_push (CONST0_RTX (e.vector_mode (0)));
+      }
     else
       icode = code_for_aarch64 (UNSPEC_LD1_COUNT, e.tuple_mode (0));
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1551,13 +1555,19 @@ class svld1_extend_impl : public extending_load
 public:
   using extending_load::extending_load;
 
+  unsigned int
+  call_properties (const function_instance &) const override
+  {
+    return CP_READ_MEMORY;
+  }
+
   rtx
   expand (function_expander &e) const override
   {
-    insn_code icode = code_for_aarch64_load (UNSPEC_LD1_SVE, extend_rtx_code (),
+    insn_code icode = code_for_aarch64_load (extend_rtx_code (),
                                              e.vector_mode (0),
                                              e.memory_vector_mode ());
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1576,6 +1586,8 @@ public:
     e.prepare_gather_address_operands (1);
     /* Put the predicate last, as required by mask_gather_load_optab.  */
     e.rotate_inputs_left (0, 5);
+    /* Add the else operand.  */
+    e.args.quick_push (CONST0_RTX (e.vector_mode (0)));
     machine_mode mem_mode = e.memory_vector_mode ();
     machine_mode int_mode = aarch64_sve_int_mode (mem_mode);
     insn_code icode = convert_optab_handler (mask_gather_load_optab,
@@ -1599,6 +1611,8 @@ public:
     e.rotate_inputs_left (0, 5);
     /* Add a constant predicate for the extension rtx.  */
     e.args.quick_push (CONSTM1_RTX (VNx16BImode));
+    /* Add the else operand.  */
+    e.args.quick_push (CONST0_RTX (e.vector_mode (1)));
     insn_code icode = code_for_aarch64_gather_load (extend_rtx_code (),
                                                     e.vector_mode (0),
                                                     e.memory_vector_mode ());
@@ -1741,6 +1755,7 @@ public:
     /* Get the predicate and base pointer.  */
     gimple_seq stmts = NULL;
     tree pred = f.convert_pred (stmts, vectype, 0);
+    tree els = build_zero_cst (vectype);
     tree base = f.fold_contiguous_base (stmts, vectype);
     gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
 
@@ -1759,8 +1774,8 @@ public:
 
     /* Emit the load itself.  */
     tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
-    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
-                                                  base, cookie, pred);
+    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 4,
+                                                  base, cookie, pred, els);
     gimple_call_set_lhs (new_call, lhs_array);
     gsi_insert_after (f.gsi, new_call, GSI_SAME_STMT);
 
@@ -1773,7 +1788,7 @@ public:
     machine_mode tuple_mode = e.result_mode ();
     insn_code icode = convert_optab_handler (vec_mask_load_lanes_optab,
                                              tuple_mode, e.vector_mode (0));
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1840,11 +1855,12 @@ public:
   rtx
   expand (function_expander &e) const override
   {
-    insn_code icode = (e.vectors_per_tuple () == 1
-                       ? code_for_aarch64_ldnt1 (e.vector_mode (0))
-                       : code_for_aarch64 (UNSPEC_LDNT1_COUNT,
-                                           e.tuple_mode (0)));
-    return e.use_contiguous_load_insn (icode);
+    insn_code icode;
+    if (e.vectors_per_tuple () == 1)
+      icode = code_for_aarch64_ldnt1 (e.vector_mode (0));
+    else
+      icode = code_for_aarch64 (UNSPEC_LDNT1_COUNT, e.tuple_mode (0));
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index ef14f8cd39d..84c0a0caa50 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -4229,7 +4229,7 @@ function_expander::use_vcond_mask_insn (insn_code icode,
    Extending loads have a further predicate (operand 3)
    that nominally controls the extension.  */
 rtx
-function_expander::use_contiguous_load_insn (insn_code icode)
+function_expander::use_contiguous_load_insn (insn_code icode, bool has_else)
 {
   machine_mode mem_mode = memory_vector_mode ();
 
@@ -4238,6 +4238,11 @@ function_expander::use_contiguous_load_insn (insn_code icode)
   add_input_operand (icode, args[0]);
   if (GET_MODE_UNIT_BITSIZE (mem_mode) < type_suffix (0).element_bits)
     add_input_operand (icode, CONSTM1_RTX (VNx16BImode));
+
+  /* If we have an else operand, add it.  */
+  if (has_else)
+    add_input_operand (icode, CONST0_RTX (mem_mode));
+
   return generate_insn (icode);
 }
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
index 4cdc0541bdc..1aa9caf84ba 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -695,7 +695,7 @@ public:
   rtx use_pred_x_insn (insn_code);
   rtx use_cond_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO);
   rtx use_vcond_mask_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO);
-  rtx use_contiguous_load_insn (insn_code);
+  rtx use_contiguous_load_insn (insn_code, bool = false);
   rtx use_contiguous_prefetch_insn (insn_code);
   rtx use_contiguous_store_insn (insn_code);
 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 06bd3e4bb2c..a2e9f52d024 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1291,7 +1291,8 @@ (define_insn "maskload<mode><vpred>"
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
        (unspec:SVE_ALL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_ALL 1 "memory_operand" "m")]
+          (match_operand:SVE_ALL 1 "memory_operand" "m")
+          (match_operand:SVE_ALL 3 "aarch64_maskload_else_operand")]
          UNSPEC_LD1_SVE))]
   "TARGET_SVE"
   "ld1<Vesize>\t%0.<Vctype>, %2/z, %1"
@@ -1302,11 +1303,14 @@ (define_expand "vec_load_lanes<mode><vsingle>"
   [(set (match_operand:SVE_STRUCT 0 "register_operand")
        (unspec:SVE_STRUCT
          [(match_dup 2)
-          (match_operand:SVE_STRUCT 1 "memory_operand")]
+          (match_operand:SVE_STRUCT 1 "memory_operand")
+          (match_dup 3)
+         ]
          UNSPEC_LDN))]
   "TARGET_SVE"
   {
     operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+    operands[3] = CONST0_RTX (<MODE>mode);
   }
 )
@@ -1315,7 +1319,8 @@ (define_insn "vec_mask_load_lanes<mode><vsingle>"
   [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
        (unspec:SVE_STRUCT
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_STRUCT 1 "memory_operand" "m")]
+          (match_operand:SVE_STRUCT 1 "memory_operand" "m")
+          (match_operand 3 "aarch64_maskload_else_operand")]
          UNSPEC_LDN))]
   "TARGET_SVE"
   "ld<vector_count><Vesize>\t%0, %2/z, %1"
@@ -1334,15 +1339,16 @@ (define_insn "vec_mask_load_lanes<mode><vsingle>"
 ;; -------------------------------------------------------------------------
 
 ;; Predicated load and extend, with 8 elements per 128-bit block.
-(define_insn_and_rewrite "@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
+(define_insn_and_rewrite "@aarch64_load_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
        (unspec:SVE_HSDI
          [(match_operand:<SVE_HSDI:VPRED> 3 "general_operand" "UplDnm")
           (ANY_EXTEND:SVE_HSDI
             (unspec:SVE_PARTIAL_I
               [(match_operand:<SVE_PARTIAL_I:VPRED> 2 "register_operand" "Upl")
-               (match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
-              SVE_PRED_LOAD))]
+               (match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")
+               (match_operand:SVE_PARTIAL_I 4 "aarch64_maskload_else_operand")]
+              UNSPEC_LD1_SVE))]
          UNSPEC_PRED_X))]
   "TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
   "ld1<ANY_EXTEND:s><SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vctype>, %2/z, %1"
@@ -1352,6 +1358,26 @@ (define_insn_and_rewrite "@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:opt
   }
 )
 
+;; Same as above without the maskload_else_operand to still allow combine to
+;; match a sign-extended pred_mov pattern.
+(define_insn_and_rewrite "*aarch64_load_<ANY_EXTEND:optab>_mov<SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
+  [(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
+       (unspec:SVE_HSDI
+         [(match_operand:<SVE_HSDI:VPRED> 3 "general_operand" "UplDnm")
+          (ANY_EXTEND:SVE_HSDI
+            (unspec:SVE_PARTIAL_I
+              [(match_operand:<SVE_PARTIAL_I:VPRED> 2 "register_operand" "Upl")
+               (match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
+              UNSPEC_PRED_X))]
+         UNSPEC_PRED_X))]
+  "TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
+  "ld1<ANY_EXTEND:s><SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vctype>, %2/z, %1"
+  "&& !CONSTANT_P (operands[3])"
+  {
+    operands[3] = CONSTM1_RTX (<SVE_HSDI:VPRED>mode);
+  }
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- First-faulting contiguous loads
 ;; -------------------------------------------------------------------------
@@ -1433,7 +1459,8 @@ (define_insn "@aarch64_ldnt1<mode>"
   [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
        (unspec:SVE_FULL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_FULL 1 "memory_operand" "m")]
+          (match_operand:SVE_FULL 1 "memory_operand" "m")
+          (match_operand:SVE_FULL 3 "aarch64_maskload_else_operand")]
          UNSPEC_LDNT1_SVE))]
   "TARGET_SVE"
   "ldnt1<Vesize>\t%0.<Vetype>, %2/z, %1"
@@ -1456,11 +1483,13 @@ (define_expand "gather_load<mode><v_int_container>"
           (match_operand:<V_INT_CONTAINER> 2 "register_operand")
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_dup 6)
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
   {
     operands[5] = aarch64_ptrue_reg (<VPRED>mode);
+    operands[6] = CONST0_RTX (<MODE>mode);
   }
 )
@@ -1474,6 +1503,7 @@ (define_insn "mask_gather_load<mode><v_int_container>"
           (match_operand:VNx4SI 2 "register_operand")
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_4 6 "aarch64_maskload_else_operand")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1503,6 +1533,7 @@ (define_insn "mask_gather_load<mode><v_int_container>"
           (match_operand:VNx2DI 2 "register_operand")
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_2 6 "aarch64_maskload_else_operand")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1531,6 +1562,7 @@ (define_insn_and_rewrite "*mask_gather_load<mode><v_int_container>_<su>xtw_unpac
            UNSPEC_PRED_X)
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
          (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1561,6 +1593,7 @@ (define_insn_and_rewrite "*mask_gather_load<mode><v_int_container>_sxtw"
            UNSPEC_PRED_X)
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
          (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1588,6 +1621,7 @@ (define_insn "*mask_gather_load<mode><v_int_container>_uxtw"
            (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
          (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1624,6 +1658,7 @@ (define_insn_and_rewrite "@aarch64_gather_load_<ANY_EXTEND:optab><SVE_4HSI:mode>
              (match_operand:VNx4SI 2 "register_operand")
              (match_operand:DI 3 "const_int_operand")
              (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_4BHI:Vesize>")
+             (match_operand:SVE_4BHI 7 "aarch64_maskload_else_operand")
              (mem:BLK (scratch))]
             UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
@@ -1663,6 +1698,7 @@ (define_insn_and_rewrite "@aarch64_gather_load_<ANY_EXTEND:optab><SVE_2HSDI:mode
              (match_operand:VNx2DI 2 "register_operand")
              (match_operand:DI 3 "const_int_operand")
              (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+             (match_operand:SVE_2BHSI 7 "aarch64_maskload_else_operand")
              (mem:BLK (scratch))]
             UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
@@ -1701,6 +1737,7 @@ (define_insn_and_rewrite "*aarch64_gather_load_<ANY_EXTEND:optab><SVE_2HSDI:mode
               UNSPEC_PRED_X)
              (match_operand:DI 3 "const_int_operand")
              (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+             (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
              (mem:BLK (scratch))]
             UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
@@ -1738,6 +1775,7 @@ (define_insn_and_rewrite "*aarch64_gather_load_<ANY_EXTEND:optab><SVE_2HSDI:mode
               UNSPEC_PRED_X)
              (match_operand:DI 3 "const_int_operand")
              (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+             (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
              (mem:BLK (scratch))]
             UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
@@ -1772,6 +1810,7 @@ (define_insn_and_rewrite "*aarch64_gather_load_<ANY_EXTEND:optab><SVE_2HSDI:mode
               (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
              (match_operand:DI 3 "const_int_operand")
              (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+             (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
              (mem:BLK (scratch))]
             UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 5f2697c3179..22e8632af80 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -138,7 +138,8 @@ (define_insn "@aarch64_<optab><mode>"
   [(set (match_operand:SVE_FULLx24 0 "aligned_register_operand" "=Uw<vector_count>")
        (unspec:SVE_FULLx24
          [(match_operand:VNx16BI 2 "register_operand" "Uph")
-          (match_operand:SVE_FULLx24 1 "memory_operand" "m")]
+          (match_operand:SVE_FULLx24 1 "memory_operand" "m")
+          (match_operand:SVE_FULLx24 3 "aarch64_maskload_else_operand")]
          LD1_COUNT))]
   "TARGET_STREAMING_SME2"
   "<optab><Vesize>\t%0, %K2/z, %1"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0bc98315bb6..6592b3df3b2 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3224,10 +3224,6 @@ (define_int_iterator SVE_SHIFT_WIDE [UNSPEC_ASHIFT_WIDE
 
 (define_int_iterator SVE_LDFF1_LDNF1 [UNSPEC_LDFF1 UNSPEC_LDNF1])
 
-(define_int_iterator SVE_PRED_LOAD [UNSPEC_PRED_X UNSPEC_LD1_SVE])
-
-(define_int_attr pred_load [(UNSPEC_PRED_X "_x") (UNSPEC_LD1_SVE "")])
-
 (define_int_iterator LD1_COUNT [UNSPEC_LD1_COUNT UNSPEC_LDNT1_COUNT])
 
 (define_int_iterator ST1_COUNT [UNSPEC_ST1_COUNT UNSPEC_STNT1_COUNT])
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 6ad9a4bd8b9..26cfaed2402 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -1067,3 +1067,7 @@ (define_predicate "aarch64_granule16_simm9"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), -4096, 4080)
                    && !(INTVAL (op) & 0xf)")))
+
+(define_predicate "aarch64_maskload_else_operand"
+  (and (match_code "const_int,const_vector")
+       (match_test "op == CONST0_RTX (GET_MODE (op))")))
-- 
2.47.0