This patch adds support for hardware loops as described in: https://docs.openhwgroup.org/projects/cv32e40p-user-manual/en/cv32e40p_v1.3.2/instruction_set_extensions.html#hardware-loops .
riscv32-corev-elf (using newlib) regression tested for multilibs: rv32imc_zicsr-ilp32-- rv32imfc_zicsr-ilp32-- rv32imc_zicsr_zfinx-ilp32-- rv32imfc_zicsr_xcvmac_xcvalu-ilp32-- also tested against this: rv32imc_zicsr_xcvhwlp-ilp32-- rv32imfc_zicsr_xcvhwlp-ilp32-- rv32imc_zicsr_zfinx_xcvhwlp-ilp32-- rv32imfc_zicsr_xcvmac_xcvalu_xcvhwlp-ilp32- Bootstrapped on x86_64 build 'all-gcc' for x86_64 x sh-elf
Add support for XCVhwlp extension in CV32E40P 2023-11-18 Joern Rennecke <joern.renne...@embecosm.com> gcc/ * common/config/riscv/riscv-common.cc (riscv_ext_version_table): Add xcvhwlp. (riscv_ext_flag_table): Likewise. * config.gcc (riscv*): Add corev.o to extra_objs. * config/riscv/constraints.md (xcvl0s, xcvl0e): New constraints. (xcvl0c, xcvl1s, xcvl1e, xcvl1c): Likewise. (CVl0, xcvlb5, xcvlbs, xcvlbe, CV12): Likewise. * config/riscv/corev.cc: New file. * config/riscv/corev.md (UNSPEC_CV_LOOPBUG): New constant. (UNSPECV_CV_LOOPALIGN, UNSPEC_CV_FOLLOWS): Likewise. (UNSPEC_CV_LP_START_12): Likewise. (UNSPEC_CV_LP_END_5, UNSPEC_CV_LP_END_12): Likewise. (doloop_end_i, *cv_start, *cv_end, *cv_count): New insn patterns. (doloop_align): Likewise. (doloop_end, doloop_begin): New expanders. (doloop_begin_i): New define_insn_and_split. (doloop_begin_i+1): New splitter. * config/riscv/predicates.md (lpstart_reg_op): New predicate. (lpend_reg_op, lpcount_reg_op): Likewise. (label_register_operand, move_dest_operand): Likewise. * config/riscv/riscv-passes.def (pass_riscv_doloop_begin): Add. (pass_riscv_doloop_ranges): Insert before and after register allocation. * config/riscv/riscv-protos.h (make_pass_riscv_doloop_begin): Declare. (make_pass_riscv_doloop_ranges): Likewise. (riscv_can_use_doloop_p, riscv_invalid_within_doloop): Likewise. (hwloop_setupi_p, add_label_op_ref, corev_label_align): Likewise. * config/riscv/riscv.cc (riscv_regno_to_class): Add classes for hardware loop start, end and counter registers. (riscv_strip_unspec_address): Also strip UNSPEC_CV_LP_START_12, UNSPEC_CV_LP_END_5 and UNSPEC_CV_LP_END_12. (riscv_output_move): Add support to read loop counter registers. (TARGET_CAN_USE_DOLOOP_P, TARGET_INVALID_WITHIN_DOLOOP): Override. * config/riscv/riscv.h (enum reg_class): Add items for hardware loop start, end and counter registers. (REG_CLASS_NAMES): Likewise. (REG_CLASS_CONTENTS): Likewise. (REG_ALLOC_ORDER): Likewise. (REGISTER_NAMES): Likewise. 
(LABEL_ALIGN): Define. * config/riscv/riscv.md (LPSTART0_REGNUM): New constant. (LPEND0_REGNUM, LPCOUNT0_REGNUM): Likewise. (LPSTART1_REGNUM, LPEND1_REGNUM, LPCOUNT1_REGNUM): Likewise. (attr ext): New value xcvhwlp. (attr enabled): Handle xcvhwlp. (movsi_internal): Add alternatives to read loop counters. Use move_dest_operand. * config/riscv/riscv.opt (XCVHWLP): New Mask. * config/riscv/t-riscv (corev.o): New rule. * doc/md.texi (doloop_end): Document optional operand 2. * loop-doloop.cc (doloop_optimize): Provide 3rd operand to gen_doloop_end. * target-insns.def (doloop_end): Add optional 3rd operand. gcc/testsuite/ * gcc.target/riscv/cv-hwlp-shiftsub.c: New test. diff --git a/gcc/common/config/riscv/riscv-common.cc b/gcc/common/config/riscv/riscv-common.cc index 5111626157b..55b56235134 100644 --- a/gcc/common/config/riscv/riscv-common.cc +++ b/gcc/common/config/riscv/riscv-common.cc @@ -312,6 +312,7 @@ static const struct riscv_ext_version riscv_ext_version_table[] = {"xcvmac", ISA_SPEC_CLASS_NONE, 1, 0}, {"xcvalu", ISA_SPEC_CLASS_NONE, 1, 0}, + {"xcvhwlp", ISA_SPEC_CLASS_NONE, 1, 0}, {"xtheadba", ISA_SPEC_CLASS_NONE, 1, 0}, {"xtheadbb", ISA_SPEC_CLASS_NONE, 1, 0}, @@ -1676,6 +1677,7 @@ static const riscv_ext_flag_table_t riscv_ext_flag_table[] = {"xcvmac", &gcc_options::x_riscv_xcv_subext, MASK_XCVMAC}, {"xcvalu", &gcc_options::x_riscv_xcv_subext, MASK_XCVALU}, + {"xcvhwlp", &gcc_options::x_riscv_xcv_subext, MASK_XCVHWLP}, {"xtheadba", &gcc_options::x_riscv_xthead_subext, MASK_XTHEADBA}, {"xtheadbb", &gcc_options::x_riscv_xthead_subext, MASK_XTHEADBB}, diff --git a/gcc/config.gcc b/gcc/config.gcc index 6d51bd93f3f..8cddfbb12b3 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -546,7 +546,7 @@ riscv*) extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-selftests.o riscv-string.o" extra_objs="${extra_objs} riscv-v.o riscv-vsetvl.o riscv-vector-costs.o riscv-avlprop.o" extra_objs="${extra_objs} riscv-vector-builtins.o 
riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o" - extra_objs="${extra_objs} thead.o riscv-target-attr.o" + extra_objs="${extra_objs} thead.o riscv-target-attr.o corev.o" d_target_objs="riscv-d.o" extra_headers="riscv_vector.h" target_gtfiles="$target_gtfiles \$(srcdir)/config/riscv/riscv-vector-builtins.cc" diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md index 68be4515c04..df7f2122bb1 100644 --- a/gcc/config/riscv/constraints.md +++ b/gcc/config/riscv/constraints.md @@ -158,6 +158,50 @@ (and (match_test "IN_RANGE (ival, 0, 1073741823)") (match_test "exact_log2 (ival + 1) != -1")))) +(define_register_constraint "xcvl0s" "TARGET_XCVHWLP ? LP0START_REGS : NO_REGS" + "lpstart0 for Xcv") + +(define_register_constraint "xcvl0e" "TARGET_XCVHWLP ? LP0END_REGS : NO_REGS" + "lpend0 for Xcv") + +(define_register_constraint "xcvl0c" "TARGET_XCVHWLP ? LP0COUNT_REGS : NO_REGS" + "lpcount0 for Xcv") + +(define_register_constraint "xcvl1s" "TARGET_XCVHWLP ? LP1START_REGS : NO_REGS" + "lpstart1 for Xcv") + +(define_register_constraint "xcvl1e" "TARGET_XCVHWLP ? LP1END_REGS : NO_REGS" + "lpend1 for Xcv") + +(define_register_constraint "xcvl1c" "TARGET_XCVHWLP ? 
LP1COUNT_REGS : NO_REGS" + "lpcount1 for Xcv") + +(define_constraint "CVl0" + "A label that follows immediately after the instruction that uses it" + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_CV_FOLLOWS"))) + +(define_constraint "xcvlb5" + "A label for a loop end that can probably be addressed with a 5-bit-offset" + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_CV_LP_END_5"))) + +(define_constraint "xcvlbs" + "A label for a loop start that can definitely be addressed with a 12-bit-offset" + (and (match_code "unspec") + (match_test "XINT (op, 1) == UNSPEC_CV_LP_END_12"))) + +(define_constraint "xcvlbe" + "A label for a loop end that can definitely be addressed with a 12-bit-offset" + (and (match_code "unspec") + (ior (match_test "XINT (op, 1) == UNSPEC_CV_LP_END_5") + (match_test "XINT (op, 1) == UNSPEC_CV_LP_END_12")))) + +(define_constraint "CV12" + "A 12-bit unsigned immediate to set up a loop counter with cv.setupi" + (and (match_code "const_int") + (match_test "IN_RANGE (ival, 0, 4095)"))) + ;; Vector constraints. (define_register_constraint "vr" "TARGET_VECTOR ? V_REGS : NO_REGS" diff --git a/gcc/config/riscv/corev.cc b/gcc/config/riscv/corev.cc new file mode 100644 index 00000000000..5bc711a6afe --- /dev/null +++ b/gcc/config/riscv/corev.cc @@ -0,0 +1,392 @@ +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "insn-config.h" +#include "recog.h" +#include "function.h" +#include "memmodel.h" +#include "emit-rtl.h" +#include "tm_p.h" +#include "tree-pass.h" +#include "df.h" + +/* Creating doloop_begin patterns fully formed with a named pattern + reusults in the labels they use to refer to the loop start being + removed from the insn list during loop_done pass, so instead we + put the doloop_end insn in the place of the label, and patch this + up after the loop_done pass. 
+ Also, while being at that, replace the pseudo reg used for the + counter in doloop_begin / doloop_end by the appropriate hard register, + since lra doesn't find the right solution. */ + +namespace { + +const pass_data pass_data_riscv_doloop_begin = +{ + RTL_PASS, /* type */ + "riscv_doloop_begin", /* name */ + OPTGROUP_LOOP, /* optinfo_flags */ + TV_LOOP_DOLOOP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_riscv_doloop_begin : public rtl_opt_pass +{ +public: + pass_riscv_doloop_begin (gcc::context *ctxt) + : rtl_opt_pass (pass_data_riscv_doloop_begin, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return TARGET_XCVHWLP + && flag_branch_on_count_reg && optimize > 0; + } + virtual unsigned int execute (function *); +}; // class pass_riscv_doloop_begin + +unsigned int +pass_riscv_doloop_begin::execute (function *) +{ + for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) + { + if (!NONJUMP_INSN_P (insn) + || recog_memoized (insn) != CODE_FOR_doloop_begin_i) + continue; + rtx *lref_loc = &SET_SRC (XVECEXP (PATTERN (insn), 0, 0)); + rtx_insn *end_insn + = as_a <rtx_insn *> (XEXP (XVECEXP (*lref_loc, 0, 0), 0)); + rtx pat = PATTERN (end_insn); + rtx start_label_ref = XEXP (SET_SRC (XVECEXP (pat, 0, 0)), 1); + start_label_ref + = gen_rtx_LABEL_REF (SImode, label_ref_label (start_label_ref)); + *lref_loc = start_label_ref; + add_label_op_ref (insn, start_label_ref); + rtx *reg_loc0 = &SET_DEST (XVECEXP (PATTERN (insn), 0, 2)); + rtx *reg_loc1 = &XEXP (XEXP (SET_SRC (XVECEXP (pat, 0, 0)), 0), 0); + rtx *reg_loc2 = &XEXP (SET_SRC (XVECEXP (pat, 0, 1)), 0); + rtx *reg_loc3 = &SET_DEST (XVECEXP (pat, 0, 1)); + gcc_assert (rtx_equal_p (*reg_loc0, *reg_loc1)); + gcc_assert (rtx_equal_p (*reg_loc0, *reg_loc2)); + gcc_assert (rtx_equal_p (*reg_loc0, *reg_loc3)); + rtx start_reg = SET_DEST (XVECEXP 
(PATTERN (insn), 0, 0)); + rtx hreg = gen_rtx_REG (SImode, + LPCOUNT0_REGNUM + + REGNO (start_reg) - LPSTART0_REGNUM); + *reg_loc0 = hreg; + *reg_loc1 = hreg; + *reg_loc2 = hreg; + *reg_loc3 = hreg; + df_insn_rescan (insn); + df_insn_rescan (end_insn); + } + + return 0; +} + +} // anon namespace + +rtl_opt_pass * +make_pass_riscv_doloop_begin (gcc::context *ctxt) +{ + return new pass_riscv_doloop_begin (ctxt); +} + +/* We'd like to check that there's no flow control inside the loop + except for nested HW loops and the final branch back to the loop latch. + However, we can't do that becaue we are not being passed the loop + structure. + Likewise, if there is a large loop that has hardly any iterations, + the loop setup can't be amortized, but we can't test here if the + loop is large. */ +bool +riscv_can_use_doloop_p (const widest_int &, const widest_int &, + unsigned int loop_depth, bool entered_at_top) +{ + if (!TARGET_XCVHWLP) + return false; + if (loop_depth > 2) + return false; + if (!entered_at_top) + return false; + return true; +} + +/* The only control flow allowed inside a HW loop is another HW loop, + ebreak, and ecall. */ +const char * +riscv_invalid_within_doloop (const rtx_insn *insn) +{ + if (CALL_P (insn)) + return "Function call in the loop."; + /* Alas, the jump at the end of the loop is considered part of the loop, + and there's no good way here to distinguish it from interspersed control + flow. We have to leave it to the doloop_end expander to analyze the loop + again. */ +#if 0 + if (JUMP_P (insn) && recog_memoized (const_cast <rtx_insn *> (insn)) != CODE_FOR_doloop_end_i) + return "Jump in loop."; +#endif + + return NULL; +} + +/* Starting at INSN, try to find, within the next COUNT insn, + a doloop_end_i pattern that provides the label END . + If found, return the remaining value of COUNT, otherwise, 0. 
*/ +static unsigned +doloop_end_range_check (rtx_insn *insn, rtx_insn *end, unsigned count) +{ + for (; count > 0; insn = NEXT_INSN (insn)) + { + if (insn == NULL_RTX) + return 0; + if (!active_insn_p (insn)) + continue; + if (recog_memoized (insn) == CODE_FOR_doloop_end_i) + { + rtx label_use = XVECEXP (PATTERN (insn), 0, 4); + if (label_ref_label (XEXP (label_use, 0)) == end) + break; + } + count--; + } + return count; +} + +/* Determine if we can implement the loop setup MD_INSN with cv.setupi, + considering the hardware loop starts at the labels in the LABEL_REFs + START_REF and END_REF. */ + +bool +hwloop_setupi_p (rtx md_insn, rtx start_ref, rtx end_ref) +{ + rtx_insn *insn = as_a <rtx_insn *> (md_insn); + if (GET_CODE (start_ref) == UNSPEC) + start_ref = XVECEXP (start_ref, 0, 0); + if (GET_CODE (end_ref) == UNSPEC) + end_ref = XVECEXP (end_ref, 0, 0); + rtx_insn *start = label_ref_label (start_ref); + rtx_insn *end = label_ref_label (end_ref); + + /* The loop must directly follow the cv.setupi instruction. */ + if (next_active_insn (insn) != next_active_insn (start)) + return false; + + /* Loops with >= 4K instructions can't be setup with cv.setupi . */ + if (doloop_end_range_check (insn, end, 4095) == 0) + return false; + + return true; +} + +void +add_label_op_ref (rtx_insn *insn, rtx label) +{ + if (GET_CODE (label) == LABEL_REF) + label = label_ref_label (label); + add_reg_note (insn, REG_LABEL_OPERAND, label); + ++LABEL_NUSES (label); +} + + +/* Before register allocation, we need to know if a cv.setupi instruction + might need to be replaced with instructions that use an extra scratch + register because the labels are out of range. If we split into + cv.starti / cv.endi / cv.counti, all three parameters can use a + 12 bit immediate. Considering the instructions inside the loop, + we got a three-address machine, so a typical instruction has three + operands, each of which might need reloading. 
To load or store a + register from a stack slot on a 32 bit RISC-V, worst case we might + need a LUI and a load or store instruction. Thus seven instructions + after reload for one instruction before reload. The 12 bit unsigned + offset allows 4095 instructions, so for a safe number before reload, + we divide by seven to arrive at 585. That seems a comfortable number + that we don't have to worry too much about pessimizing the code when + we reserve a scratch register when the loop gets that big. + + For performance, we like to use cv.setupi or at least cv.setup where + possible, as it is only a single instruction; we assume that usually, + there will be no reloads for a HW loop if they currently fit into the 5 bit + immediate range, as that makes them a small inner loop. + + loop start not immediately following -> need to split + otherwise, if loop end won't fit in u5, probably in u12 -> aim for cv.setup + otherwise, if loop count won't fit in u12 -> aim for cv.setup + loop end might not fit into u12 after reload -> need scratch register + in case end needs to be loaded with cv.end . 
*/ + +namespace { + +const pass_data pass_data_riscv_doloop_ranges = +{ + RTL_PASS, /* type */ + "riscv_doloop_ranges", /* name */ + OPTGROUP_LOOP, /* optinfo_flags */ + TV_LOOP_DOLOOP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_riscv_doloop_ranges : public rtl_opt_pass +{ +public: + pass_riscv_doloop_ranges (gcc::context *ctxt) + : rtl_opt_pass (pass_data_riscv_doloop_ranges, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return TARGET_XCVHWLP + && flag_branch_on_count_reg && optimize > 0; + } + virtual unsigned int execute (function *); + + opt_pass *clone () + { + return new pass_riscv_doloop_ranges (m_ctxt); + } +}; // class pass_riscv_doloop_ranges + +/* Look for doloop_begin_i patterns and make sure start labels are + appropriatly encapsulated or non-encapsulated in UNSPECs to show + if they satisfy offset range requirements. + We run this once just before register allocation and once afterwards, + so we can't just formulate this as a branch shortening problem. + In the post-reload pass, also add doloop_align if necessary. 
*/ +unsigned int +pass_riscv_doloop_ranges::execute (function *) +{ + for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) + { + if (!NONJUMP_INSN_P (insn) + || recog_memoized (insn) != CODE_FOR_doloop_begin_i) + continue; + rtx *lref_s_loc = &SET_SRC (XVECEXP (PATTERN (insn), 0, 0)); + rtx *lref_e_loc = &SET_SRC (XVECEXP (PATTERN (insn), 0, 1)); + rtx lp_count = SET_SRC (XVECEXP (PATTERN (insn), 0, 2)); + rtx scratch = SET_DEST (XVECEXP (PATTERN (insn), 0, 3)); + rtx start_label_ref = *lref_s_loc; + rtx end_label_ref = *lref_e_loc; + if (GET_CODE (start_label_ref) == UNSPEC) + start_label_ref = XVECEXP (start_label_ref, 0, 0); + if (GET_CODE (end_label_ref) == UNSPEC) + end_label_ref = XVECEXP (end_label_ref, 0, 0); + + if (reload_completed + && GET_CODE (scratch) == SCRATCH + && (next_active_insn (label_ref_label (start_label_ref)) + != next_active_insn (insn)) + && CONST_INT_P (lp_count)) + { + /* This is supposed to be a cv.setupi, but register allocation + put spill code in between the doloop_setup_i and the loop + start. Move the doloop_begin_i back to the start of the loop. + We can't do this if the loop counter is initialized from a + register, because that register might be used by the spill code; + it fact, it is likely to be used by it, so there is little point + to analyze if it is. + ??? We could allow the doloop_begin_i pattern to read the count + from memory (using clobber and splitter to fix that up) to have + a better chance to get code that allows the doloop_begin_i to + be moved back to the start of the loop. + ??? Much better would be to have a hook or other mechanism to + prevent reload / lra from inserting spill code between + doloop_begin_i and the loop start. 
*/ + + rtx_insn *prev = PREV_INSN (insn); + rtx_insn *next = NEXT_INSN (insn); + SET_NEXT_INSN (prev) = NEXT_INSN (insn); + SET_PREV_INSN (next) = PREV_INSN (insn); + SET_PREV_INSN (insn) = NULL_RTX; + SET_NEXT_INSN (insn) = NULL_RTX; + + emit_insn_after (insn, PREV_INSN (label_ref_label (start_label_ref))); + + insn = next; + continue; + } + + if (next_active_insn (label_ref_label (start_label_ref)) + != next_active_insn (insn)) + { + if (GET_CODE (*lref_s_loc) == UNSPEC) + *lref_s_loc = gen_rtx_UNSPEC (SImode, + gen_rtvec (1, start_label_ref), + UNSPEC_CV_LP_START_12); + else + *lref_s_loc = start_label_ref; + /* We must not emit an insn outside of basic blocks, so + emit the align after the NOTE_INSN_BASIC_BLOCK note. */ + rtx_insn *after = NEXT_INSN (label_ref_label (start_label_ref)); + if (reload_completed && TARGET_RVC) + emit_insn_after (gen_doloop_align (), after); + } + else + { + if (GET_CODE (*lref_s_loc) != UNSPEC) + *lref_s_loc = gen_rtx_UNSPEC (SImode, + gen_rtvec (1, start_label_ref), + UNSPEC_CV_FOLLOWS); + if (reload_completed && TARGET_RVC) + emit_insn_before (gen_doloop_align (), insn); + } + + rtx_insn *end_label = label_ref_label (end_label_ref); + unsigned count = (reload_completed ? 4095 : 585); + unsigned rest = doloop_end_range_check (insn, end_label, count); + + if (rest) + { + /* Check if an unsigned 5 bit offset is enough. */ + bool short_p = count - rest <= 31; + HOST_WIDE_INT val + = short_p ? UNSPEC_CV_LP_END_5 : UNSPEC_CV_LP_END_12; + if (GET_CODE (*lref_e_loc) != UNSPEC + || XINT (*lref_e_loc, 1) != val) + *lref_e_loc + = gen_rtx_UNSPEC (SImode, gen_rtvec (1, end_label_ref), val); + } + else + *lref_e_loc = end_label_ref; + } + + return 0; +} + +} // anon namespace + +rtl_opt_pass * +make_pass_riscv_doloop_ranges (gcc::context *ctxt) +{ + return new pass_riscv_doloop_ranges (ctxt); +} + +/* Return alignment requested for a label as a power of two. 
+ We can't put doloop_align instructions before doloop start labels lest + they end up outside of basic blocks in case there's a preceding BARRIER, + so we put them after the label. However, the label must be aligned. */ +int +corev_label_align (rtx_insn *label) +{ + rtx_insn *next = label; + do + next = NEXT_INSN (next); + while (next && NOTE_P (next)); + if (next && NONJUMP_INSN_P (next) + && recog_memoized (next) == CODE_FOR_doloop_align) + return 2; + return 0; +} diff --git a/gcc/config/riscv/corev.md b/gcc/config/riscv/corev.md index 1350bd4b81e..7d4890eab52 100644 --- a/gcc/config/riscv/corev.md +++ b/gcc/config/riscv/corev.md @@ -24,6 +24,15 @@ UNSPEC_CV_ALU_CLIPR UNSPEC_CV_ALU_CLIPU UNSPEC_CV_ALU_CLIPUR + + ;; CORE-V HWLP + UNSPEC_CV_LOOPBUG + UNSPECV_CV_LOOPALIGN + UNSPEC_CV_FOLLOWS + UNSPEC_CV_LP_START_12 + UNSPEC_CV_LP_END_5 + UNSPEC_CV_LP_END_12 + ]) ;; XCVMAC extension. @@ -691,3 +700,288 @@ cv.suburnr\t%0,%2,%3" [(set_attr "type" "arith") (set_attr "mode" "SI")]) + +;; ??? The manual is unclear what the hardware loops actually do. +;; We are just guessing here. 
+(define_insn "doloop_end_i" + [(set (pc) + (if_then_else + (ne (match_operand:SI 0 "nonimmediate_operand" "+xcvl0c,xcvl1c") + (const_int 1)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set (match_dup 0) + (plus:SI (match_dup 0) + (const_int -1))) + (use (match_operand:SI 2 "" "xcvl0s,xcvl1s")) + (use (match_operand:SI 3 "" "xcvl0e,xcvl1e")) + (use (match_operand:SI 4 "" "X,X"))] + "TARGET_XCVHWLP" +{ + unsigned n_nops = 3; + for (rtx_insn * curr = PREV_INSN (current_output_insn); + curr && n_nops && !LABEL_P (curr); + curr = PREV_INSN (curr)) + if (active_insn_p (curr)) + { + n_nops--; + if (recog_memoized (curr) == CODE_FOR_doloop_end_i) + break; + } + while (n_nops--) + asm_fprintf (asm_out_file, "\tnop\n"); + output_asm_insn ("%4:", operands); + if (TARGET_RVC) + asm_fprintf (asm_out_file, "\t.option rvc\n"); + return ""; +} + [(set_attr "type" "branch") + (set_attr "length" "0")] +) + +(define_expand "doloop_end" + [(match_operand:SI 0 "nonimmediate_operand" "") + (match_operand 1 "" "") + (match_operand 2 "" "")] + "TARGET_XCVHWLP" +{ + if (GET_MODE (operands[0]) != SImode) + FAIL; + + rtx_insn *start_label = as_a<rtx_insn *> (operands[1]); + + /* A HW loop must contain at least three insns. If there are less than + two insns in the loop, we must add two or more nops, which is worse + than just using a normal loop with separate decrement and + branch instructions. */ + unsigned n_insns = 0; + /* We must not set the counter register inside the loop, except for the + increment that'll be folded into the doloop_end. But that is already + taken care of by loop_optimize, which creates a new register just for + counting. */ + + /* If nesting HW loops, the inner loop must be using + lpspart0, lpend0, lpcount0 . It's OK if we have more than one inner + loop, as long as they are not nested into each other; we have already + checked the nesting depth in riscv_can_use_doloop_p. 
*/ + bool inner_loop_p = false; + rtx_insn *bb_last = NULL; + rtx_insn *bb_succ = NULL; + for (rtx_insn *insn = start_label; ; + insn = (insn == bb_last ? bb_succ : NEXT_INSN (insn))) + { + if (!insn) + FAIL; + + /* For: int f (int i, int j) { while (--j) i = (i << 1) - 13; return i; } + we get passed a start label that's actually after the final branch. */ + + if (NOTE_INSN_BASIC_BLOCK_P (insn)) + { + basic_block bb = NOTE_BASIC_BLOCK (insn); + bb_last = BB_END (bb); + if (single_succ_p (bb)) + bb_succ = BB_HEAD (single_succ (bb)); + else if (recog_memoized (bb_last) == CODE_FOR_doloop_end_i) + bb_succ = BB_HEAD (FALLTHRU_EDGE (bb)->dest); + else if (bb_last == operands[2]) + bb_succ = NULL; + else + FAIL; + } + + if (NONJUMP_INSN_P (insn)) + n_insns++; + else if (JUMP_P (insn)) + { + if (recog_memoized (insn) == CODE_FOR_doloop_end_i) + inner_loop_p = true; + else if (insn != operands[2]) + FAIL; + else + break; + } + } + /* We have counted in the counter decrement, so we need three insns for the + cost of the HW loop to be amortized. */ + if (n_insns < 3) + FAIL; + + rtx start = gen_rtx_REG (SImode, LPSTART0_REGNUM + (inner_loop_p ? 3 : 0)); + rtx end = gen_rtx_REG (SImode, LPEND0_REGNUM + (inner_loop_p ? 3 : 0)); + rtx ref = gen_rtx_LABEL_REF (SImode, gen_label_rtx ()); + rtx_insn *jump + = emit_jump_insn (gen_doloop_end_i (operands[0], operands[1], + start, end, ref)); + add_label_op_ref (jump, ref); + DONE; +}) + +;; Although the alignment can be thought of taking up to two bytes, that is +;; only the case if the assembler first saved space by creating a short insn. +;; The compiler doesn't generally take short insn into account when calculating +;; lengths. 
+(define_insn "doloop_align" + [(unspec_volatile [(const_int 0)] UNSPECV_CV_LOOPALIGN)] + "TARGET_XCVHWLP && TARGET_RVC" + ".balign\t4\;.option norvc" + [(set_attr "type" "ghost")]) + +; We use an actual doloop_begin pattern to make sure the loop counter +; gets allocated to the right registers, and that we have a scratch GPR +; if we need it. +; We do want the doloop_begin_i pattern to be right at the top of the loop +; for efficiency, as we can use cv.setup / cv.setupi then. +; If we must, we can, however, split the instruction into a triplet +; of instructions that can go anywhere - with potentially some extra +; instructions to load constants into GPR registers first, particularly +; if the loop start setup ends up below the loop. + +;; Sometimes - e.g. newlib/libc/stdlib/arc4random.c +;; -Os -march=rv32imc_zicsr_xcvhwlp - we have spaghetti code at split2, with +;; the loop setup below the loop, and it's still spaghetti at peephole2, but +;; it gets sorted out at bbro. Should we delay the doloop_begin_i split +;; until after bbro, and add another split pass - or always drive the split +;; with a '#' output pattern, to avoid this issue? 
+ +(define_insn_and_split "doloop_begin_i" + [(set (match_operand:SI 0 "lpstart_reg_op") + (match_operand:SI 1)) + (set (match_operand:SI 2 "lpend_reg_op") + (match_operand:SI 3)) + (set (match_operand:SI 4 "register_operand") + (match_operand:SI 5 "immediate_register_operand")) + (clobber (match_scratch:SI 6))] + "TARGET_XCVHWLP" + {@ [cons: =0, 1, =2, 3, =4, 5, =6; attrs: length ] + [xcvl0s, CVl0, xcvl0e, xcvlb5, xcvl0c, CV12, X ; 4] cv.setupi\t0, %5, %3 + [xcvl1s, CVl0, xcvl1e, xcvlb5, xcvl1c, CV12, X ; 4] cv.setupi\t1, %5, %3 + [xcvl0s, CVl0, xcvl0e, xcvlbe, xcvl0c, r, X ; 4] cv.setup\t0, %5, %3 + [xcvl1s, CVl0, xcvl1e, xcvlbe, xcvl1c, r, X ; 4] cv.setup\t1, %5, %3 + [xcvl0s,?iCVl0,xcvl0e,?ixcvlbe,xcvl0c, ?ri, &r ; 12] # + [xcvl1s,?iCVl0,xcvl1e,?ixcvlbe,xcvl1c, ?ri, &r ; 12] # + } + ;; We don't know the loop length until after register allocation. + ;; Even in the cases where we already can know before reload that we must + ;; split, the test is costly, and splitting early could confuse RA. 
+ "&& reload_completed + && (GET_CODE (operands[1]) == LABEL_REF + || GET_CODE (operands[1]) == UNSPEC) + && !hwloop_setupi_p (insn, operands[1], operands[3])" + [(set (match_dup 4) (match_dup 5))] +{ + if (GET_CODE (operands[1]) == UNSPEC) + operands[1] = XVECEXP (operands[1], 0, 0); + else + { + emit_move_insn (operands[6], operands[1]); + operands[1] = operands[6]; + } + emit_insn (gen_rtx_SET (operands[0], operands[1])); + if (GET_CODE (operands[3]) == UNSPEC) + operands[3] = XVECEXP (operands[3], 0, 0); + else + { + emit_move_insn (operands[6], operands[3]); + operands[3] = operands[6]; + } + emit_insn (gen_rtx_SET (operands[2], operands[3])); + if (!REG_P (operands[5]) + && !satisfies_constraint_CV12 (operands[5])) + { + emit_move_insn (operands[6], operands[5]); + operands[5] = operands[6]; + } +} + [(set_attr "move_type" "move")] +) + +;; If we have a doloop_begin_i instruction that has labels that +;; statisfy cv.setup, but not cv.setupi, yet the loop count is an +;; immediate, split to load the immediate into the scratch register. +(define_split + [(set (match_operand:SI 0 "lpstart_reg_op") + (match_operand:SI 1)) + (set (match_operand:SI 2 "lpend_reg_op") + (match_operand:SI 3)) + (set (match_operand:SI 4 "register_operand") + (match_operand:SI 5 "immediate_operand")) + (clobber (match_operand:SI 6 "register_operand"))] + "TARGET_XCVHWLP + && reload_completed + && hwloop_setupi_p (insn, operands[1], operands[3]) + && !satisfies_constraint_xcvlb5 (operands[3])" + [(set (match_dup 6) (match_dup 5)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3)) + (set (match_dup 4) (match_dup 6)) + (clobber (scratch:SI))])] +) + +(define_expand "doloop_begin" + [(use (match_operand 0 "register_operand")) + (use (match_operand 1 ""))] + "TARGET_XCVHWLP" +{ + rtx pat = PATTERN (operands[1]); + /* ??? 
cleanup_cfg, called from pass_rtl_loop_done::execute, deletes + loop latches without updating LABEL_REFS in non-jump instructions + even when marked with REG_LABEL_OPERAND notes. */ +#if 0 + rtx start_label_ref + = XEXP (SET_SRC (XVECEXP (pat, 0, 0)), 1); +#else + rtx lst = gen_rtx_INSN_LIST (VOIDmode, operands[1], NULL_RTX); + rtx start_label_ref + = gen_rtx_UNSPEC (SImode, gen_rtvec (1, lst), UNSPEC_CV_LOOPBUG); +#endif + rtx start_reg = XEXP (XVECEXP (pat, 0, 2), 0); + rtx end_reg = XEXP (XVECEXP (pat, 0, 3), 0); + rtx end_label_ref = XEXP (XVECEXP (pat, 0, 4), 0); + rtx_insn *insn = emit_insn (gen_doloop_begin_i (start_reg, start_label_ref, + end_reg, end_label_ref, + operands[0], operands[0])); + //add_label_op_ref (insn, start_label_ref); + add_label_op_ref (insn, end_label_ref); + DONE; +}) + +;; Although cv.start / cv.end / cv.count could be seen as move instructions +;; and therefore belonging to movsi_internal, that is problematic because +;; using them outside loopsetup contexts might confuse the HW loop logic +;; of the processor. We might model this with UNSPEC_VOLATILEs, but +;; that'd likely get too much into the way of optimizations. +(define_insn "*cv_start" + [(set (match_operand:SI 0 "lpstart_reg_op" "=xcvl0s,xcvl1s,xcvl0s,xcvl1s") + (match_operand:SI 1 "label_register_operand" "i,i,r,r"))] + "TARGET_XCVHWLP" +{ + if (!REG_P (operands[1]) && TARGET_RVC) + asm_fprintf (asm_out_file, "\t.balign\t4\n"); + operands[0] = GEN_INT (REGNO (operands[0]) == LPSTART0_REGNUM ? 0 : 1); + return REG_P (operands[1]) ? "cv.start %0,%1" : "cv.starti %0, %1"; +} + [(set_attr "move_type" "move")]) + +(define_insn "*cv_end" + [(set (match_operand:SI 0 "lpend_reg_op" "=xcvl0e,xcvl1e,xcvl0e,xcvl1e") + (match_operand:SI 1 "label_register_operand" "i,i,r,r"))] + "TARGET_XCVHWLP" +{ + if (!REG_P (operands[1]) && TARGET_RVC) + asm_fprintf (asm_out_file, "\t.balign\t4\n"); + operands[0] = GEN_INT (REGNO (operands[0]) == LPEND0_REGNUM ? 
0 : 1); + return REG_P (operands[1]) ? "cv.end %0,%1" : "cv.endi %0, %1"; +} + [(set_attr "move_type" "move")]) + +(define_insn "*cv_count" + [(set (match_operand:SI 0 "lpcount_reg_op" "=xcvl0c,xcvl1c,xcvl0c,xcvl1c") + (match_operand:SI 1 "immediate_register_operand" "CV12,CV12,r,r"))] + "TARGET_XCVHWLP" +{ + operands[0] = GEN_INT (REGNO (operands[0]) == LPCOUNT0_REGNUM ? 0 : 1); + return REG_P (operands[1]) ? "cv.count %0,%1" : "cv.counti %0, %1"; +} + [(set_attr "move_type" "move")]) diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index 90567a817a7..5fb89500146 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -400,6 +400,31 @@ (ior (match_operand 0 "register_operand") (match_code "const_int"))) +(define_predicate "label_register_operand" + (ior (match_operand 0 "register_operand") + (match_code "label_ref"))) + +(define_predicate "lpstart_reg_op" + (and (match_code "reg") + (match_test "REGNO (op) == LPSTART0_REGNUM || REGNO (op) == LPSTART1_REGNUM"))) + +(define_predicate "lpend_reg_op" + (and (match_code "reg") + (match_test "REGNO (op) == LPEND0_REGNUM || REGNO (op) == LPEND1_REGNUM"))) + +(define_predicate "lpcount_reg_op" + (and (match_code "reg") + (match_test "REGNO (op) == LPCOUNT0_REGNUM || REGNO (op) == LPCOUNT1_REGNUM"))) + +;; The instructions to set hardware loop start / end are special. +;; We don't want the register allocator to morph these from/to ordinary +;; moves, since pc-relative loads are position dependent in their range. +(define_predicate "move_dest_operand" + (and (match_operand 0 "nonimmediate_operand") + (not (match_operand 0 "lpstart_reg_op")) + (not (match_operand 0 "lpend_reg_op")) + (not (match_operand 0 "lpcount_reg_op")))) + ;; Predicates for the V extension. 
(define_special_predicate "vector_length_operand" (ior (match_operand 0 "pmode_register_operand") diff --git a/gcc/config/riscv/riscv-passes.def b/gcc/config/riscv/riscv-passes.def index b6260939d5c..4efe09ab04f 100644 --- a/gcc/config/riscv/riscv-passes.def +++ b/gcc/config/riscv/riscv-passes.def @@ -20,3 +20,6 @@ INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs); INSERT_PASS_AFTER (pass_split_all_insns, 1, pass_avlprop); INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl); +INSERT_PASS_AFTER (pass_rtl_loop_done, 1, pass_riscv_doloop_begin); +INSERT_PASS_BEFORE (pass_ira, 1, pass_riscv_doloop_ranges); +INSERT_PASS_BEFORE (pass_split_after_reload, 1, pass_riscv_doloop_ranges); diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 196b53f10f3..018215b7f7a 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -669,4 +669,15 @@ struct riscv_tune_info { const struct riscv_tune_info * riscv_parse_tune (const char *, bool); +/* Routines implemented in corev.cc. */ +rtl_opt_pass * make_pass_riscv_doloop_begin (gcc::context *ctxt); +rtl_opt_pass * make_pass_riscv_doloop_ranges (gcc::context *ctxt); +extern bool riscv_can_use_doloop_p (const widest_int &, const widest_int &, + unsigned int, bool); +extern const char *riscv_invalid_within_doloop (const rtx_insn *insn); +extern bool hwloop_setupi_p (rtx insn, rtx start_ref, rtx end_ref); +extern void add_label_op_ref (rtx_insn *insn, rtx label); +extern int corev_label_align (rtx_insn *); + + #endif /* ! 
GCC_RISCV_PROTOS_H */ diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index c2bd1c2ed29..291f1bbcdf6 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -314,8 +314,8 @@ const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = { FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FRAME_REGS, FRAME_REGS, NO_REGS, NO_REGS, - NO_REGS, NO_REGS, NO_REGS, NO_REGS, - NO_REGS, NO_REGS, NO_REGS, NO_REGS, + NO_REGS, NO_REGS, LP0START_REGS, LP0END_REGS, + LP0COUNT_REGS,LP1START_REGS, LP1END_REGS, LP1COUNT_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, NO_REGS, @@ -1769,7 +1769,8 @@ riscv_unspec_address (rtx address, enum riscv_symbol_type symbol_type) return riscv_unspec_address_offset (base, offset, symbol_type); } -/* If OP is an UNSPEC address, return the address to which it refers, +/* If OP is an UNSPEC address, UNSPEC_CV_LP_START_12, UNSPEC_CV_LP_END_5, + or UNSPEC_CV_LP_END_12, return the address to which it refers, otherwise return OP itself. 
*/ static rtx @@ -1778,7 +1779,11 @@ riscv_strip_unspec_address (rtx op) rtx base, offset; split_const (op, &base, &offset); - if (UNSPEC_ADDRESS_P (base)) + if ((UNSPEC_ADDRESS_P (base)) + || (GET_CODE (base) == UNSPEC + && (XINT (base, 1) == UNSPEC_CV_LP_START_12 + || XINT (base, 1) == UNSPEC_CV_LP_END_5 + || XINT (base, 1) == UNSPEC_CV_LP_END_12))) op = plus_constant (Pmode, UNSPEC_ADDRESS (base), INTVAL (offset)); return op; } @@ -3436,6 +3441,19 @@ riscv_output_move (rtx dest, rtx src) return "fmv.x.d\t%0,%1"; } + if (src_code == REG && REGNO (src) == LPCOUNT0_REGNUM) + { + gcc_assert (width == 4); + gcc_assert (TARGET_XCVHWLP); + return "csrr\t%0,0xcc2"; + } + if (src_code == REG && REGNO (src) == LPCOUNT1_REGNUM) + { + gcc_assert (width == 4); + gcc_assert (TARGET_XCVHWLP); + return "csrr\t%0,0xcc5"; + } + if (src_code == MEM) switch (width) { @@ -9971,6 +9989,12 @@ riscv_preferred_else_value (unsigned ifn, tree vectype, unsigned int nops, #undef TARGET_EXPAND_BUILTIN #define TARGET_EXPAND_BUILTIN riscv_expand_builtin +#undef TARGET_CAN_USE_DOLOOP_P +#define TARGET_CAN_USE_DOLOOP_P riscv_can_use_doloop_p + +#undef TARGET_INVALID_WITHIN_DOLOOP +#define TARGET_INVALID_WITHIN_DOLOOP riscv_invalid_within_doloop + #undef TARGET_HARD_REGNO_NREGS #define TARGET_HARD_REGNO_NREGS riscv_hard_regno_nregs #undef TARGET_HARD_REGNO_MODE_OK diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 6205d7533f4..a17acbf49f2 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -497,6 +497,12 @@ enum reg_class GR_REGS, /* integer registers */ FP_REGS, /* floating-point registers */ FRAME_REGS, /* arg pointer and frame pointer */ + LP0START_REGS, /* xcv loop 0 start register */ + LP0END_REGS, /* xcv loop 0 end register */ + LP0COUNT_REGS, /* xcv loop 0 count register */ + LP1START_REGS, /* xcv loop 1 start register */ + LP1END_REGS, /* xcv loop 1 end register */ + LP1COUNT_REGS, /* xcv loop 1 count register */ VM_REGS, /* v0.t registers */ 
VD_REGS, /* vector registers except v0.t */ V_REGS, /* vector registers */ @@ -520,6 +526,12 @@ enum reg_class "GR_REGS", \ "FP_REGS", \ "FRAME_REGS", \ + "LP0START_REGS", \ + "LP0END_REGS", \ + "LP0COUNT_REGS", \ + "LP1START_REGS", \ + "LP1END_REGS", \ + "LP1COUNT_REGS", \ "VM_REGS", \ "VD_REGS", \ "V_REGS", \ @@ -545,10 +557,16 @@ enum reg_class { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 }, /* GR_REGS */ \ { 0x00000000, 0xffffffff, 0x00000000, 0x00000000 }, /* FP_REGS */ \ { 0x00000000, 0x00000000, 0x00000003, 0x00000000 }, /* FRAME_REGS */ \ + { 0x00000000, 0x00000000, 0x00000040, 0x00000000 }, /* LP0START_REGS */\ + { 0x00000000, 0x00000000, 0x00000080, 0x00000000 }, /* LP0END_REGS */\ + { 0x00000000, 0x00000000, 0x00000100, 0x00000000 }, /* LP0COUNT_REGS */\ + { 0x00000000, 0x00000000, 0x00000200, 0x00000000 }, /* LP1START_REGS */\ + { 0x00000000, 0x00000000, 0x00000400, 0x00000000 }, /* LP1END_REGS */\ + { 0x00000000, 0x00000000, 0x00000800, 0x00000000 }, /* LP1COUNT_REGS */\ { 0x00000000, 0x00000000, 0x00000000, 0x00000001 }, /* V0_REGS */ \ { 0x00000000, 0x00000000, 0x00000000, 0xfffffffe }, /* VNoV0_REGS */ \ { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }, /* V_REGS */ \ - { 0xffffffff, 0xffffffff, 0x00000003, 0xffffffff } /* ALL_REGS */ \ + { 0xffffffff, 0xffffffff, 0x00000fc3, 0xffffffff } /* ALL_REGS */ \ } /* A C expression whose value is a register class containing hard @@ -596,7 +614,7 @@ enum reg_class 96, \ /* None of the remaining classes have defined call-saved \ registers. */ \ - 64, 65, 66, 67 \ + 64, 65, 66, 67, 70, 71, 72, 73, 74, 75 \ } /* True if VALUE is a signed 12-bit number. 
*/ @@ -912,8 +930,8 @@ extern enum riscv_cc get_riscv_cc (const rtx use); "fs0", "fs1", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", \ "fa6", "fa7", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", \ "fs8", "fs9", "fs10","fs11","ft8", "ft9", "ft10","ft11", \ - "arg", "frame", "vl", "vtype", "vxrm", "frm", "N/A", "N/A", \ - "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", \ + "arg", "frame", "vl", "vtype", "vxrm", "frm", "lpstart0", "lpend0", \ + "lpcount0","lpstart1","lpend1","lpcount1", "N/A", "N/A", "N/A", "N/A", \ "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", \ "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", \ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", \ @@ -989,6 +1007,8 @@ extern enum riscv_cc get_riscv_cc (const rtx use); { "f31", 31 + FP_REG_FIRST }, \ } +#define LABEL_ALIGN(label) corev_label_align (label) + /* Globalizing directive for a label. */ #define GLOBAL_ASM_OP "\t.globl\t" diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 8f28e8e56ab..cdae62044e8 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -157,6 +157,12 @@ (VTYPE_REGNUM 67) (VXRM_REGNUM 68) (FRM_REGNUM 69) + (LPSTART0_REGNUM 70) + (LPEND0_REGNUM 71) + (LPCOUNT0_REGNUM 72) + (LPSTART1_REGNUM 73) + (LPEND1_REGNUM 74) + (LPCOUNT1_REGNUM 75) ]) (include "predicates.md") @@ -258,7 +264,7 @@ (const_string "no"))) ;; ISA attributes. -(define_attr "ext" "base,f,d,vector" +(define_attr "ext" "base,f,d,vector,xcvhwlp" (const_string "base")) ;; True if the extension is enabled. 
@@ -277,6 +283,10 @@ (and (eq_attr "ext" "vector") (match_test "TARGET_VECTOR")) (const_string "yes") + + (and (eq_attr "ext" "xcvhwlp") + (match_test "TARGET_XCVHWLP")) + (const_string "yes") ] (const_string "no"))) @@ -2095,17 +2105,17 @@ }) (define_insn "*movsi_internal" - [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r, m, *f,*f,*r,*m,r") - (match_operand:SI 1 "move_operand" " r,T,m,rJ,*r*J,*m,*f,*f,vp"))] + [(set (match_operand:SI 0 "move_dest_operand" "=r,r,r, m, *f,*f,*r,*m,r,r,r") + (match_operand:SI 1 "move_operand" " r,T,m,rJ,*r*J,*m,*f,*f,vp,xcvl0c,xcvl1c"))] "(register_operand (operands[0], SImode) || reg_or_0_operand (operands[1], SImode)) && !(register_operand (operands[1], SImode) && reg_or_subregno (operands[1]) == VL_REGNUM)" { return riscv_output_move (operands[0], operands[1]); } - [(set_attr "move_type" "move,const,load,store,mtc,fpload,mfc,fpstore,rdvlenb") + [(set_attr "move_type" "move,const,load,store,mtc,fpload,mfc,fpstore,rdvlenb,move,move") (set_attr "mode" "SI") (set_attr "type" "move") - (set_attr "ext" "base,base,base,base,f,f,f,f,vector")]) + (set_attr "ext" "base,base,base,base,f,f,f,f,vector,xcvhwlp,xcvhwlp")]) ;; 16-bit Integer moves diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index 1bd661a3fe4..14ec810e023 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -411,6 +411,8 @@ Mask(XCVMAC) Var(riscv_xcv_subext) Mask(XCVALU) Var(riscv_xcv_subext) +Mask(XCVHWLP) Var(riscv_xcv_subext) + TargetVariable int riscv_xthead_subext diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv index 3b9686daa58..e246923fd82 100644 --- a/gcc/config/riscv/t-riscv +++ b/gcc/config/riscv/t-riscv @@ -126,6 +126,13 @@ thead.o: $(srcdir)/config/riscv/thead.cc \ $(COMPILE) $< $(POSTCOMPILE) +corev.o: $(srcdir)/config/riscv/corev.cc \ + $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TARGET_H) backend.h $(RTL_H) \ + memmodel.h $(EMIT_RTL_H) insn-config.h $(RECOG_H) $(FUNCTION_H) \ + $(TM_P_H) 
tree-pass.h df.h + $(COMPILE) $< + $(POSTCOMPILE) + PASSES_EXTRA += $(srcdir)/config/riscv/riscv-passes.def $(common_out_file): $(srcdir)/config/riscv/riscv-cores.def \ diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index e01cdcbe22c..a55d07f0497 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -7724,6 +7724,10 @@ Conditional branch instruction that decrements a register and jumps if the register is nonzero. Operand 0 is the register to decrement and test; operand 1 is the label to jump to if the register is nonzero. +Operand 2 is the original branch instruction that jumps back to the loop +latch. It is useful when the target accepts some nested loops but needs +to analyze them: since nested loops may share the loop latch, operand 2 +distinguishes the loop being handled from its nested loops. @xref{Looping Patterns}. This optional instruction pattern should be defined for machines with diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc index 4feb0a25ab9..d703cb5f2af 100644 --- a/gcc/loop-doloop.cc +++ b/gcc/loop-doloop.cc @@ -720,7 +720,8 @@ doloop_optimize (class loop *loop) count = copy_rtx (desc->niter_expr); start_label = block_label (desc->in_edge->dest); doloop_reg = gen_reg_rtx (mode); - rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label); + rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label, + BB_END (desc->in_edge->src)); word_mode_size = GET_MODE_PRECISION (word_mode); word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1; @@ -737,7 +738,8 @@ doloop_optimize (class loop *loop) else count = lowpart_subreg (word_mode, count, mode); PUT_MODE (doloop_reg, word_mode); - doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label); + doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label, + BB_END (desc->in_edge->src)); } if (! 
doloop_seq) { diff --git a/gcc/target-insns.def b/gcc/target-insns.def index c4415d00735..962c5cc51d1 100644 --- a/gcc/target-insns.def +++ b/gcc/target-insns.def @@ -48,7 +48,7 @@ DEF_TARGET_INSN (casesi, (rtx x0, rtx x1, rtx x2, rtx x3, rtx x4)) DEF_TARGET_INSN (check_stack, (rtx x0)) DEF_TARGET_INSN (clear_cache, (rtx x0, rtx x1)) DEF_TARGET_INSN (doloop_begin, (rtx x0, rtx x1)) -DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1)) +DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1, rtx opt2)) DEF_TARGET_INSN (eh_return, (rtx x0)) DEF_TARGET_INSN (epilogue, (void)) DEF_TARGET_INSN (exception_receiver, (void)) diff --git a/gcc/testsuite/gcc.target/riscv/cv-hwlp-shiftsub.c b/gcc/testsuite/gcc.target/riscv/cv-hwlp-shiftsub.c new file mode 100644 index 00000000000..c0ead8056b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/cv-hwlp-shiftsub.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } {"-O0" "-O1" "-Os" "-Og" "-O3" "-Oz" "-flto"} } */ +/* { dg-options "-march=rv32imc_xcvhwlp -mabi=ilp32 -O2" } */ + +int f (int i, int j) +{ + while (--j) + i = (i << 1) - 13; + return i; +} +/* { dg-final { scan-assembler {\mcv.setup\M} } } */