From: Robin Dapp <rd...@ventanamicro.com>

This patch adds zero else operands to masked loads and their intrinsics.
I needed to adjust more than initially expected because we rely on combine
for several instructions, and a change in a "base" pattern must propagate
to all of the patterns that build on it.
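As a concrete illustration (my own sketch, not part of the patch): with
e.g. -O2 -march=armv8.2-a+sve, a simple loop like the one below is
vectorized as a fully-masked loop using a predicated LD1W, and after this
change the resulting masked load carries an explicit zero else operand
describing its inactive lanes instead of leaving them implicit:

    /* Hypothetical example, not from the testsuite.  The vectorizer
       loads src[] under a WHILELO-generated predicate; the new else
       operand states that inactive lanes of that load are zero.  */
    void
    add_one (int *restrict dst, int *restrict src, int n)
    {
      for (int i = 0; i < n; i++)
	dst[i] = src[i] + 1;
    }
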
gcc/ChangeLog:

	* config/aarch64/aarch64-sve-builtins-base.cc: Add else handling.
	* config/aarch64/aarch64-sve-builtins.cc
	(function_expander::use_contiguous_load_insn): Ditto.
	* config/aarch64/aarch64-sve-builtins.h: Add else operand to
	contiguous load.
	* config/aarch64/aarch64-sve.md
	(@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Split and add else operand.
	(@aarch64_load_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Ditto.
	(*aarch64_load_<ANY_EXTEND:optab>_mov<SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
	Ditto.
	* config/aarch64/aarch64-sve2.md: Ditto.
	* config/aarch64/iterators.md: Remove unused iterators.
	* config/aarch64/predicates.md (aarch64_maskload_else_operand):
	Add zero else operand.
---
 .../aarch64/aarch64-sve-builtins-base.cc      | 24 +++++----
 gcc/config/aarch64/aarch64-sve-builtins.cc    | 12 ++++-
 gcc/config/aarch64/aarch64-sve-builtins.h     |  2 +-
 gcc/config/aarch64/aarch64-sve.md             | 52 ++++++++++++++++---
 gcc/config/aarch64/aarch64-sve2.md            |  3 +-
 gcc/config/aarch64/iterators.md               |  4 --
 gcc/config/aarch64/predicates.md              |  4 ++
 7 files changed, 77 insertions(+), 24 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index fe16d93adcd..d840f590202 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1523,11 +1523,12 @@ public:
     gimple_seq stmts = NULL;
     tree pred = f.convert_pred (stmts, vectype, 0);
     tree base = f.fold_contiguous_base (stmts, vectype);
+    tree els = build_zero_cst (vectype);
     gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
 
     tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
-    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
-						  base, cookie, pred);
+    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 4,
+						  base, cookie, pred, els);
     gimple_call_set_lhs (new_call, f.lhs);
     return new_call;
   }
@@ -1541,7 +1542,7 @@ public:
 				     e.vector_mode (0), e.gp_mode (0));
     else
       icode = code_for_aarch64 (UNSPEC_LD1_COUNT, e.tuple_mode (0));
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1554,10 +1555,10 @@ public:
 
   rtx
   expand (function_expander &e) const override
   {
-    insn_code icode = code_for_aarch64_load (UNSPEC_LD1_SVE, extend_rtx_code (),
+    insn_code icode = code_for_aarch64_load (extend_rtx_code (),
					     e.vector_mode (0),
					     e.memory_vector_mode ());
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
  }
 };
@@ -1576,6 +1577,8 @@ public:
     e.prepare_gather_address_operands (1);
     /* Put the predicate last, as required by mask_gather_load_optab.  */
     e.rotate_inputs_left (0, 5);
+    /* Add the else operand.  */
+    e.args.quick_push (CONST0_RTX (e.vector_mode (0)));
     machine_mode mem_mode = e.memory_vector_mode ();
     machine_mode int_mode = aarch64_sve_int_mode (mem_mode);
     insn_code icode = convert_optab_handler (mask_gather_load_optab,
@@ -1599,6 +1602,8 @@ public:
     e.rotate_inputs_left (0, 5);
     /* Add a constant predicate for the extension rtx.  */
     e.args.quick_push (CONSTM1_RTX (VNx16BImode));
+    /* Add the else operand.  */
+    e.args.quick_push (CONST0_RTX (e.vector_mode (1)));
     insn_code icode = code_for_aarch64_gather_load (extend_rtx_code (),
						    e.vector_mode (0),
						    e.memory_vector_mode ());
@@ -1741,6 +1746,7 @@ public:
     /* Get the predicate and base pointer.  */
     gimple_seq stmts = NULL;
     tree pred = f.convert_pred (stmts, vectype, 0);
+    tree els = build_zero_cst (vectype);
     tree base = f.fold_contiguous_base (stmts, vectype);
     gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
 
@@ -1759,8 +1765,8 @@ public:
 
     /* Emit the load itself.  */
     tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
-    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
-						  base, cookie, pred);
+    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 4,
+						  base, cookie, pred, els);
     gimple_call_set_lhs (new_call, lhs_array);
     gsi_insert_after (f.gsi, new_call, GSI_SAME_STMT);
 
@@ -1773,7 +1779,7 @@ public:
     machine_mode tuple_mode = e.result_mode ();
     insn_code icode = convert_optab_handler (vec_mask_load_lanes_optab,
					     tuple_mode, e.vector_mode (0));
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1844,7 +1850,7 @@ public:
	       ? code_for_aarch64_ldnt1 (e.vector_mode (0))
	       : code_for_aarch64 (UNSPEC_LDNT1_COUNT,
				   e.tuple_mode (0)));
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index ef14f8cd39d..0db9a7e9dbe 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -4227,9 +4227,12 @@ function_expander::use_vcond_mask_insn (insn_code icode,
 
 /* Implement the call using instruction ICODE, which loads memory operand 1
    into register operand 0 under the control of predicate operand 2.
    Extending loads have a further predicate (operand 3) that nominally
-   controls the extension.  */
+   controls the extension.
+   HAS_ELSE is true if the pattern has an additional operand that specifies
+   the values of inactive lanes.  This exists to match the general maskload
+   interface and is always zero for AArch64.  */
 rtx
-function_expander::use_contiguous_load_insn (insn_code icode)
+function_expander::use_contiguous_load_insn (insn_code icode, bool has_else)
 {
   machine_mode mem_mode = memory_vector_mode ();
@@ -4238,6 +4241,11 @@ function_expander::use_contiguous_load_insn (insn_code icode)
   add_input_operand (icode, args[0]);
   if (GET_MODE_UNIT_BITSIZE (mem_mode) < type_suffix (0).element_bits)
     add_input_operand (icode, CONSTM1_RTX (VNx16BImode));
+
+  /* If we have an else operand, add it.  */
+  if (has_else)
+    add_input_operand (icode, CONST0_RTX (mem_mode));
+
   return generate_insn (icode);
 }
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
index 4cdc0541bdc..1aa9caf84ba 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -695,7 +695,7 @@ public:
   rtx use_pred_x_insn (insn_code);
   rtx use_cond_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO);
   rtx use_vcond_mask_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO);
-  rtx use_contiguous_load_insn (insn_code);
+  rtx use_contiguous_load_insn (insn_code, bool = false);
   rtx use_contiguous_prefetch_insn (insn_code);
   rtx use_contiguous_store_insn (insn_code);
 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 06bd3e4bb2c..17cca97555c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1291,7 +1291,8 @@ (define_insn "maskload<mode><vpred>"
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
	(unspec:SVE_ALL
	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
-	   (match_operand:SVE_ALL 1 "memory_operand" "m")]
+	   (match_operand:SVE_ALL 1 "memory_operand" "m")
+	   (match_operand:SVE_ALL 3 "aarch64_maskload_else_operand")]
	  UNSPEC_LD1_SVE))]
   "TARGET_SVE"
   "ld1<Vesize>\t%0.<Vctype>, %2/z, %1"
@@ -1302,11 +1303,13 @@ (define_expand "vec_load_lanes<mode><vsingle>"
   [(set (match_operand:SVE_STRUCT 0 "register_operand")
	(unspec:SVE_STRUCT
	  [(match_dup 2)
-	   (match_operand:SVE_STRUCT 1 "memory_operand")]
+	   (match_operand:SVE_STRUCT 1 "memory_operand")
+	   (match_dup 3)]
	  UNSPEC_LDN))]
   "TARGET_SVE"
   {
     operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+    operands[3] = CONST0_RTX (<MODE>mode);
   }
 )
 
@@ -1315,7 +1318,8 @@ (define_insn "vec_mask_load_lanes<mode><vsingle>"
   [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
	(unspec:SVE_STRUCT
	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
-	   (match_operand:SVE_STRUCT 1 "memory_operand" "m")]
+	   (match_operand:SVE_STRUCT 1 "memory_operand" "m")
+	   (match_operand 3 "aarch64_maskload_else_operand")]
	  UNSPEC_LDN))]
   "TARGET_SVE"
   "ld<vector_count><Vesize>\t%0, %2/z, %1"
@@ -1334,15 +1338,16 @@ (define_insn "vec_mask_load_lanes<mode><vsingle>"
 ;; -------------------------------------------------------------------------
 
 ;; Predicated load and extend, with 8 elements per 128-bit block.
-(define_insn_and_rewrite "@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
+(define_insn_and_rewrite "@aarch64_load_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
	(unspec:SVE_HSDI
	  [(match_operand:<SVE_HSDI:VPRED> 3 "general_operand" "UplDnm")
	   (ANY_EXTEND:SVE_HSDI
	     (unspec:SVE_PARTIAL_I
	       [(match_operand:<SVE_PARTIAL_I:VPRED> 2 "register_operand" "Upl")
-		(match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
-	       SVE_PRED_LOAD))]
+		(match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")
+		(match_operand:SVE_PARTIAL_I 4 "aarch64_maskload_else_operand")]
+	       UNSPEC_LD1_SVE))]
	   UNSPEC_PRED_X))]
   "TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
   "ld1<ANY_EXTEND:s><SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vctype>, %2/z, %1"
@@ -1352,6 +1357,26 @@ (define_insn_and_rewrite "@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:opt
   }
 )
 
+;; Same as above without the maskload_else_operand to still allow combine to
+;; match a sign-extended pred_mov pattern.
+(define_insn_and_rewrite "*aarch64_load_<ANY_EXTEND:optab>_mov<SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
+  [(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
+	(unspec:SVE_HSDI
+	  [(match_operand:<SVE_HSDI:VPRED> 3 "general_operand" "UplDnm")
+	   (ANY_EXTEND:SVE_HSDI
+	     (unspec:SVE_PARTIAL_I
+	       [(match_operand:<SVE_PARTIAL_I:VPRED> 2 "register_operand" "Upl")
+		(match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
+	       UNSPEC_PRED_X))]
+	   UNSPEC_PRED_X))]
+  "TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
+  "ld1<ANY_EXTEND:s><SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vctype>, %2/z, %1"
+  "&& !CONSTANT_P (operands[3])"
+  {
+    operands[3] = CONSTM1_RTX (<SVE_HSDI:VPRED>mode);
+  }
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- First-faulting contiguous loads
 ;; -------------------------------------------------------------------------
@@ -1433,7 +1458,8 @@ (define_insn "@aarch64_ldnt1<mode>"
   [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
	(unspec:SVE_FULL
	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
-	   (match_operand:SVE_FULL 1 "memory_operand" "m")]
+	   (match_operand:SVE_FULL 1 "memory_operand" "m")
+	   (match_operand:SVE_FULL 3 "aarch64_maskload_else_operand")]
	  UNSPEC_LDNT1_SVE))]
   "TARGET_SVE"
   "ldnt1<Vesize>\t%0.<Vetype>, %2/z, %1"
@@ -1456,11 +1482,13 @@ (define_expand "gather_load<mode><v_int_container>"
	   (match_operand:<V_INT_CONTAINER> 2 "register_operand")
	   (match_operand:DI 3 "const_int_operand")
	   (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+	   (match_dup 6)
	   (mem:BLK (scratch))]
	  UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
   {
     operands[5] = aarch64_ptrue_reg (<VPRED>mode);
+    operands[6] = CONST0_RTX (<MODE>mode);
   }
 )
 
@@ -1474,6 +1502,7 @@ (define_insn "mask_gather_load<mode><v_int_container>"
	   (match_operand:VNx4SI 2 "register_operand")
	   (match_operand:DI 3 "const_int_operand")
	   (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+	   (match_operand:SVE_4 6 "aarch64_maskload_else_operand")
	   (mem:BLK (scratch))]
	  UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1503,6 +1532,7 @@ (define_insn "mask_gather_load<mode><v_int_container>"
	   (match_operand:VNx2DI 2 "register_operand")
	   (match_operand:DI 3 "const_int_operand")
	   (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+	   (match_operand:SVE_2 6 "aarch64_maskload_else_operand")
	   (mem:BLK (scratch))]
	  UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1531,6 +1561,7 @@ (define_insn_and_rewrite "*mask_gather_load<mode><v_int_container>_<su>xtw_unpac
	     UNSPEC_PRED_X)
	   (match_operand:DI 3 "const_int_operand")
	   (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+	   (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
	   (mem:BLK (scratch))]
	  UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1561,6 +1592,7 @@ (define_insn_and_rewrite "*mask_gather_load<mode><v_int_container>_sxtw"
	     UNSPEC_PRED_X)
	   (match_operand:DI 3 "const_int_operand")
	   (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+	   (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
	   (mem:BLK (scratch))]
	  UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1588,6 +1620,7 @@ (define_insn "*mask_gather_load<mode><v_int_container>_uxtw"
	      (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
	   (match_operand:DI 3 "const_int_operand")
	   (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+	   (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
	   (mem:BLK (scratch))]
	  UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1624,6 +1657,7 @@ (define_insn_and_rewrite "@aarch64_gather_load_<ANY_EXTEND:optab><SVE_4HSI:mode>
	       (match_operand:VNx4SI 2 "register_operand")
	       (match_operand:DI 3 "const_int_operand")
	       (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_4BHI:Vesize>")
+	       (match_operand:SVE_4BHI 7 "aarch64_maskload_else_operand")
	       (mem:BLK (scratch))]
	      UNSPEC_LD1_GATHER))]
	   UNSPEC_PRED_X))]
@@ -1663,6 +1697,7 @@ (define_insn_and_rewrite "@aarch64_gather_load_<ANY_EXTEND:optab><SVE_2HSDI:mode
	       (match_operand:VNx2DI 2 "register_operand")
	       (match_operand:DI 3 "const_int_operand")
	       (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+	       (match_operand:SVE_2BHSI 7 "aarch64_maskload_else_operand")
	       (mem:BLK (scratch))]
	      UNSPEC_LD1_GATHER))]
	   UNSPEC_PRED_X))]
@@ -1701,6 +1736,7 @@ (define_insn_and_rewrite "*aarch64_gather_load_<ANY_EXTEND:optab><SVE_2HSDI:mode
		 UNSPEC_PRED_X)
	       (match_operand:DI 3 "const_int_operand")
	       (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+	       (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
	       (mem:BLK (scratch))]
	      UNSPEC_LD1_GATHER))]
	   UNSPEC_PRED_X))]
@@ -1738,6 +1774,7 @@ (define_insn_and_rewrite "*aarch64_gather_load_<ANY_EXTEND:optab><SVE_2HSDI:mode
		 UNSPEC_PRED_X)
	       (match_operand:DI 3 "const_int_operand")
	       (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+	       (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
	       (mem:BLK (scratch))]
	      UNSPEC_LD1_GATHER))]
	   UNSPEC_PRED_X))]
@@ -1772,6 +1809,7 @@ (define_insn_and_rewrite "*aarch64_gather_load_<ANY_EXTEND:optab><SVE_2HSDI:mode
		  (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
	       (match_operand:DI 3 "const_int_operand")
	       (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+	       (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
	       (mem:BLK (scratch))]
	      UNSPEC_LD1_GATHER))]
	   UNSPEC_PRED_X))]
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 5f2697c3179..22e8632af80 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -138,7 +138,8 @@ (define_insn "@aarch64_<optab><mode>"
   [(set (match_operand:SVE_FULLx24 0 "aligned_register_operand" "=Uw<vector_count>")
	(unspec:SVE_FULLx24
	  [(match_operand:VNx16BI 2 "register_operand" "Uph")
-	   (match_operand:SVE_FULLx24 1 "memory_operand" "m")]
+	   (match_operand:SVE_FULLx24 1 "memory_operand" "m")
+	   (match_operand:SVE_FULLx24 3 "aarch64_maskload_else_operand")]
	  LD1_COUNT))]
   "TARGET_STREAMING_SME2"
   "<optab><Vesize>\t%0, %K2/z, %1"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0bc98315bb6..6592b3df3b2 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3224,10 +3224,6 @@ (define_int_iterator SVE_SHIFT_WIDE [UNSPEC_ASHIFT_WIDE
 
 (define_int_iterator SVE_LDFF1_LDNF1 [UNSPEC_LDFF1 UNSPEC_LDNF1])
 
-(define_int_iterator SVE_PRED_LOAD [UNSPEC_PRED_X UNSPEC_LD1_SVE])
-
-(define_int_attr pred_load [(UNSPEC_PRED_X "_x") (UNSPEC_LD1_SVE "")])
-
 (define_int_iterator LD1_COUNT [UNSPEC_LD1_COUNT UNSPEC_LDNT1_COUNT])
 
 (define_int_iterator ST1_COUNT [UNSPEC_ST1_COUNT UNSPEC_STNT1_COUNT])
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 6ad9a4bd8b9..26cfaed2402 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -1067,3 +1067,7 @@ (define_predicate "aarch64_granule16_simm9"
   (and (match_code "const_int")
	(match_test "IN_RANGE (INTVAL (op), -4096, 4080)
		     && !(INTVAL (op) & 0xf)")))
+
+(define_predicate "aarch64_maskload_else_operand"
+  (and (match_code "const_int,const_vector")
+       (match_test "op == CONST0_RTX (GET_MODE (op))")))
-- 
2.47.0