This patch adds support for hardware loops as described in:
https://docs.openhwgroup.org/projects/cv32e40p-user-manual/en/cv32e40p_v1.3.2/instruction_set_extensions.html#hardware-loops
.

riscv32-corev-elf (using newlib) regression tested for multilibs:
rv32imc_zicsr-ilp32--
rv32imfc_zicsr-ilp32--
rv32imc_zicsr_zfinx-ilp32--
rv32imfc_zicsr_xcvmac_xcvalu-ilp32--

also tested against this:

rv32imc_zicsr_xcvhwlp-ilp32--
rv32imfc_zicsr_xcvhwlp-ilp32--
rv32imc_zicsr_zfinx_xcvhwlp-ilp32--
rv32imfc_zicsr_xcvmac_xcvalu_xcvhwlp-ilp32-

Bootstrapped on x86_64

build 'all-gcc' for x86_64 x sh-elf
Add support for XCVhwlp extension in CV32E40P

2023-11-18  Joern Rennecke  <joern.renne...@embecosm.com>

gcc/
        * common/config/riscv/riscv-common.cc (riscv_ext_version_table):
        Add xcvhwlp.
        (riscv_ext_flag_table): Likewise.
        * config.gcc (riscv*): Add corev.o to extra_objs.
        * config/riscv/constraints.md (xcvl0s, xcvl0e): New constraints.
        (xcvl0c, xcvl1s, xcvl1e, xcvl1c): Likewise.
        (CVl0, xcvlb5, xcvlbs, xcvlbe, CV12): Likewise.
        * config/riscv/corev.cc: New file.
        * config/riscv/corev.md (UNSPEC_CV_LOOPBUG): New constant.
        (UNSPECV_CV_LOOPALIGN, UNSPEC_CV_FOLLOWS): Likewise.
        (UNSPEC_CV_LP_START_12): Likewise.
        (UNSPEC_CV_LP_END_5, UNSPEC_CV_LP_END_12): Likewise.
        (doloop_end_i, *cv_start, *cv_end, *cv_count): New insn patterns.
        (doloop_align): Likewise.
        (doloop_end, doloop_begin): New expanders.
        (doloop_begin_i): New define_insn_and_split.
        (doloop_begin_i+1): New splitter.
        * config/riscv/predicates.md (lpstart_reg_op): New predicate.
        (lpend_reg_op, lpcount_reg_op): Likewise.
        (label_register_operand, move_dest_operand): Likewise.
        * config/riscv/riscv-passes.def (pass_riscv_doloop_begin): Add.
        (pass_riscv_doloop_ranges):
        Insert before and after register allocation.
        * config/riscv/riscv-protos.h (make_pass_riscv_doloop_begin): Declare.
        (make_pass_riscv_doloop_ranges): Likewise.
        (riscv_can_use_doloop_p, riscv_invalid_within_doloop): Likewise.
        (hwloop_setupi_p, add_label_op_ref, corev_label_align): Likewise.
        * config/riscv/riscv.cc (riscv_regno_to_class): Add classes for
        hardware loop start, end and counter registers.
        (riscv_strip_unspec_address): Also strip UNSPEC_CV_LP_START_12,
        UNSPEC_CV_LP_END_5 and UNSPEC_CV_LP_END_12.
        (riscv_output_move): Add support to read loop counter registers.
        (TARGET_CAN_USE_DOLOOP_P, TARGET_INVALID_WITHIN_DOLOOP): Override.
        * config/riscv/riscv.h (enum reg_class): Add items for hardware
        loop start, end and counter registers.
        (REG_CLASS_NAMES): Likewise.
        (REG_CLASS_CONTENTS): Likewise.
        (REG_ALLOC_ORDER): Likewise.
        (REGISTER_NAMES): Likewise.
        (LABEL_ALIGN): Define.
        * config/riscv/riscv.md (LPSTART0_REGNUM): New constant.
        (LPEND0_REGNUM, LPCOUNT0_REGNUM): Likewise.
        (LPSTART1_REGNUM, LPEND1_REGNUM, LPCOUNT1_REGNUM): Likewise.
        (attr ext): New value xcvhwlp.
        (attr enabled): Handle xcvhwlp.
        (movsi_internal): Add alternatives to read loop counters.
        Use move_dest_operand.
        * config/riscv/riscv.opt (XCVHWLP): New Mask.
        * config/riscv/t-riscv (corev.o): New rule.
        * doc/md.texi (doloop_end): Document optional operand 2.
        * loop-doloop.cc (doloop_optimize): Provide 3rd operand to
        gen_doloop_end.
        * target-insns.def (doloop_end): Add optional 3rd operand.
gcc/testsuite/
        * gcc.target/riscv/cv-hwlp-shiftsub.c: New test.

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index 5111626157b..55b56235134 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -312,6 +312,7 @@ static const struct riscv_ext_version 
riscv_ext_version_table[] =
 
   {"xcvmac", ISA_SPEC_CLASS_NONE, 1, 0},
   {"xcvalu", ISA_SPEC_CLASS_NONE, 1, 0},
+  {"xcvhwlp", ISA_SPEC_CLASS_NONE, 1, 0},
 
   {"xtheadba", ISA_SPEC_CLASS_NONE, 1, 0},
   {"xtheadbb", ISA_SPEC_CLASS_NONE, 1, 0},
@@ -1676,6 +1677,7 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
 
   {"xcvmac",        &gcc_options::x_riscv_xcv_subext, MASK_XCVMAC},
   {"xcvalu",        &gcc_options::x_riscv_xcv_subext, MASK_XCVALU},
+  {"xcvhwlp",       &gcc_options::x_riscv_xcv_subext, MASK_XCVHWLP},
 
   {"xtheadba",      &gcc_options::x_riscv_xthead_subext, MASK_XTHEADBA},
   {"xtheadbb",      &gcc_options::x_riscv_xthead_subext, MASK_XTHEADBB},
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 6d51bd93f3f..8cddfbb12b3 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -546,7 +546,7 @@ riscv*)
        extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o 
riscv-shorten-memrefs.o riscv-selftests.o riscv-string.o"
        extra_objs="${extra_objs} riscv-v.o riscv-vsetvl.o riscv-vector-costs.o 
riscv-avlprop.o"
        extra_objs="${extra_objs} riscv-vector-builtins.o 
riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o"
-       extra_objs="${extra_objs} thead.o riscv-target-attr.o"
+       extra_objs="${extra_objs} thead.o riscv-target-attr.o corev.o"
        d_target_objs="riscv-d.o"
        extra_headers="riscv_vector.h"
        target_gtfiles="$target_gtfiles 
\$(srcdir)/config/riscv/riscv-vector-builtins.cc"
diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index 68be4515c04..df7f2122bb1 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -158,6 +158,50 @@
        (and (match_test "IN_RANGE (ival, 0, 1073741823)")
             (match_test "exact_log2 (ival + 1) != -1"))))
 
+(define_register_constraint "xcvl0s" "TARGET_XCVHWLP ? LP0START_REGS : NO_REGS"
+  "lpstart0 for Xcv")
+
+(define_register_constraint "xcvl0e" "TARGET_XCVHWLP ? LP0END_REGS : NO_REGS"
+  "lpend0 for Xcv")
+
+(define_register_constraint "xcvl0c" "TARGET_XCVHWLP ? LP0COUNT_REGS : NO_REGS"
+  "lpcount0 for Xcv")
+
+(define_register_constraint "xcvl1s" "TARGET_XCVHWLP ? LP1START_REGS : NO_REGS"
+  "lpstart1 for Xcv")
+
+(define_register_constraint "xcvl1e" "TARGET_XCVHWLP ? LP1END_REGS : NO_REGS"
+  "lpend1 for Xcv")
+
+(define_register_constraint "xcvl1c" "TARGET_XCVHWLP ? LP1COUNT_REGS : NO_REGS"
+  "lpcount1 for Xcv")
+
+(define_constraint "CVl0"
+  "A label that follows immediately after the instruction that uses it"
+  (and (match_code "unspec")
+       (match_test "XINT (op, 1) == UNSPEC_CV_FOLLOWS")))
+
+(define_constraint "xcvlb5"
+  "A label for a loop end that can probably be addressed with a 5-bit-offset"
+  (and (match_code "unspec")
+       (match_test "XINT (op, 1) == UNSPEC_CV_LP_END_5")))
+
+(define_constraint "xcvlbs"
+  "A label for a loop start that can definitely be addressed with a 
12-bit-offset"
+  (and (match_code "unspec")
+       (match_test "XINT (op, 1) == UNSPEC_CV_LP_END_12")))
+
+(define_constraint "xcvlbe"
+  "A label for a loop end that can definitely be addressed with a 
12-bit-offset"
+  (and (match_code "unspec")
+       (ior (match_test "XINT (op, 1) == UNSPEC_CV_LP_END_5")
+           (match_test "XINT (op, 1) == UNSPEC_CV_LP_END_12"))))
+
+(define_constraint "CV12"
+  "A 12-bit unsigned immediate to set up a loop counter with cv.setupi"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (ival, 0, 4095)")))
+
 ;; Vector constraints.
 
 (define_register_constraint "vr" "TARGET_VECTOR ? V_REGS : NO_REGS"
diff --git a/gcc/config/riscv/corev.cc b/gcc/config/riscv/corev.cc
new file mode 100644
index 00000000000..5bc711a6afe
--- /dev/null
+++ b/gcc/config/riscv/corev.cc
@@ -0,0 +1,392 @@
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "function.h"
+#include "memmodel.h"
+#include "emit-rtl.h"
+#include "tm_p.h"
+#include "tree-pass.h"
+#include "df.h"
+
+/* Creating doloop_begin patterns fully formed with a named pattern
+   results in the labels they use to refer to the loop start being
+   removed from the insn list during loop_done pass, so instead we
+   put the doloop_end insn in the place of the label, and patch this
+   up after the loop_done pass.
+   Also, while being at that, replace the pseudo reg used for the
+   counter in doloop_begin / doloop_end by the appropriate hard register,
+   since lra doesn't find the right solution.  */
+
+namespace {
+
+const pass_data pass_data_riscv_doloop_begin =
+{
+  RTL_PASS, /* type */
+  "riscv_doloop_begin", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */
+  TV_LOOP_DOLOOP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_riscv_doloop_begin : public rtl_opt_pass
+{
+public:
+  pass_riscv_doloop_begin (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_riscv_doloop_begin, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return TARGET_XCVHWLP
+       && flag_branch_on_count_reg && optimize > 0;
+    }
+  virtual unsigned int execute (function *);
+}; // class pass_riscv_doloop_begin
+
+unsigned int
+pass_riscv_doloop_begin::execute (function *)
+{
+  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (!NONJUMP_INSN_P (insn)
+         || recog_memoized (insn) != CODE_FOR_doloop_begin_i)
+       continue;
+      rtx *lref_loc = &SET_SRC (XVECEXP (PATTERN (insn), 0, 0));
+      rtx_insn *end_insn
+       = as_a <rtx_insn *> (XEXP (XVECEXP (*lref_loc, 0, 0), 0));
+      rtx pat = PATTERN (end_insn);
+      rtx start_label_ref = XEXP (SET_SRC (XVECEXP (pat, 0, 0)), 1);
+      start_label_ref
+       = gen_rtx_LABEL_REF (SImode, label_ref_label (start_label_ref));
+      *lref_loc = start_label_ref;
+      add_label_op_ref (insn, start_label_ref);
+      rtx *reg_loc0 = &SET_DEST (XVECEXP (PATTERN (insn), 0, 2));
+      rtx *reg_loc1 = &XEXP (XEXP (SET_SRC (XVECEXP (pat, 0, 0)), 0), 0);
+      rtx *reg_loc2 = &XEXP (SET_SRC (XVECEXP (pat, 0, 1)), 0);
+      rtx *reg_loc3 = &SET_DEST (XVECEXP (pat, 0, 1));
+      gcc_assert (rtx_equal_p (*reg_loc0, *reg_loc1));
+      gcc_assert (rtx_equal_p (*reg_loc0, *reg_loc2));
+      gcc_assert (rtx_equal_p (*reg_loc0, *reg_loc3));
+      rtx start_reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 0));
+      rtx hreg = gen_rtx_REG (SImode,
+                             LPCOUNT0_REGNUM
+                             + REGNO (start_reg) - LPSTART0_REGNUM);
+      *reg_loc0 = hreg;
+      *reg_loc1 = hreg;
+      *reg_loc2 = hreg;
+      *reg_loc3 = hreg;
+      df_insn_rescan (insn);
+      df_insn_rescan (end_insn);
+    }
+
+  return 0;
+}
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_riscv_doloop_begin (gcc::context *ctxt)
+{
+  return new pass_riscv_doloop_begin (ctxt);
+}
+
+/* We'd like to check that there's no flow control inside the loop
+   except for nested HW loops and the final branch back to the loop latch.
+   However, we can't do that because we are not being passed the loop
+   structure.
+   Likewise, if there is a large loop that has hardly any iterations,
+   the loop setup can't be amortized, but we can't test here if the
+   loop is large.  */
+bool
+riscv_can_use_doloop_p (const widest_int &, const widest_int &,
+                       unsigned int loop_depth, bool entered_at_top)
+{
+  if (!TARGET_XCVHWLP)
+    return false;
+  if (loop_depth > 2)
+    return false;
+  if (!entered_at_top)
+    return false;
+  return true;
+}
+
+/* The only control flow allowed inside a HW loop is another HW loop,
+   ebreak, and ecall.  */
+const char *
+riscv_invalid_within_doloop (const rtx_insn *insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+  /* Alas, the jump at the end of the loop is considered part of the loop,
+     and there's no good way here to distinguish it from interspersed control
+     flow.  We have to leave it to the doloop_end expander to analyze the loop
+     again.  */
+#if 0
+  if (JUMP_P (insn) && recog_memoized (const_cast <rtx_insn *> (insn)) != 
CODE_FOR_doloop_end_i)
+    return "Jump in loop.";
+#endif
+
+  return NULL;
+}
+
+/* Starting at INSN, try to find, within the next COUNT insn,
+   a doloop_end_i pattern that provides the label END .
+   If found, return the remaining value of COUNT, otherwise, 0.  */
+static unsigned
+doloop_end_range_check (rtx_insn *insn, rtx_insn *end, unsigned count)
+{
+  for (; count > 0; insn = NEXT_INSN (insn))
+    {
+      if (insn == NULL_RTX)
+       return 0;
+      if (!active_insn_p (insn))
+       continue;
+      if (recog_memoized (insn) == CODE_FOR_doloop_end_i)
+       {
+         rtx label_use = XVECEXP (PATTERN (insn), 0, 4);
+         if (label_ref_label (XEXP (label_use, 0)) == end)
+           break;
+       }
+      count--;
+    }
+  return count;
+}
+
+/* Determine if we can implement the loop setup MD_INSN with cv.setupi,
+   considering the hardware loop starts at the labels in the LABEL_REFs
+   START_REF and END_REF.  */
+
+bool
+hwloop_setupi_p (rtx md_insn, rtx start_ref, rtx end_ref)
+{
+  rtx_insn *insn = as_a <rtx_insn *> (md_insn);
+  if (GET_CODE (start_ref) == UNSPEC)
+    start_ref = XVECEXP (start_ref, 0, 0);
+  if (GET_CODE (end_ref) == UNSPEC)
+    end_ref = XVECEXP (end_ref, 0, 0);
+  rtx_insn *start = label_ref_label (start_ref);
+  rtx_insn *end = label_ref_label (end_ref);
+
+  /* The loop must directly follow the cv.setupi instruction.  */
+  if (next_active_insn (insn) != next_active_insn (start))
+    return false;
+
+  /* Loops with >= 4K instructions can't be setup with cv.setupi .  */
+  if (doloop_end_range_check (insn, end, 4095) == 0)
+    return false;
+
+  return true;
+}
+
+void
+add_label_op_ref (rtx_insn *insn, rtx label)
+{
+  if (GET_CODE (label) == LABEL_REF)
+    label = label_ref_label (label);
+  add_reg_note (insn, REG_LABEL_OPERAND, label);
+  ++LABEL_NUSES (label);
+}
+
+
+/* Before register allocation, we need to know if a cv.setupi instruction
+   might need to replaced with instructions that use an extra scratch
+   register becasue the labels are out of range.  If we split into
+   cv.starti / cv.endi / cv.counti, all three parameters can use a
+   12 bit immediate.  Considering the instructions inside the loop,
+   we got a three-address machine, so a typical instruction has three
+   operands, each of which might need reloading.  To load or store a
+   register from a stack slot on a 32 bit RISC-V, worst case we might
+   need a LUI and a load or store instruction.  Thus seven instructions
+   after reload for one instruction before reload.  The 12 bit unsigned
+   offset allows 4095 instructions, so for a safe number before reload,
+   we divide by seven to arrive at 585.  That seems a comfortable number
+   that we don't have to worry too much about pessimizing the code when
+   we reserve a scratch register when the loop gets that big.
+
+   For performance, we like to use cv.setupi or at least cv.setup where
+   possible, as it is only a single instruction; we assume that usually,
+   there will be no reloads for a HW loop if they currently fit into the 5 bit
+   immediate range, as that makes them a small inner loop.
+
+   loop start not immediately following -> need to split
+   otherwise, if loop end won't fit in u5, probably in u12 -> aim for cv.setup
+   otherwise, if loop count won't fit in u12 -> aim for cv.setup
+   loop end might not fit into u12 after reload -> need scratch register
+    in case end needs to be loaded with cv.end .  */
+
+namespace {
+
+const pass_data pass_data_riscv_doloop_ranges =
+{
+  RTL_PASS, /* type */
+  "riscv_doloop_ranges", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */
+  TV_LOOP_DOLOOP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_riscv_doloop_ranges : public rtl_opt_pass
+{
+public:
+  pass_riscv_doloop_ranges (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_riscv_doloop_ranges, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return TARGET_XCVHWLP
+       && flag_branch_on_count_reg && optimize > 0;
+    }
+  virtual unsigned int execute (function *);
+
+  opt_pass *clone ()
+  {
+    return new pass_riscv_doloop_ranges (m_ctxt);
+  }
+}; // class pass_riscv_doloop_ranges
+
+/* Look for doloop_begin_i patterns and make sure start labels are
+   appropriately encapsulated or non-encapsulated in UNSPECs to show
+   if they satisfy offset range requirements.
+   We run this once just before register allocation and once afterwards,
+   so we can't just formulate this as a branch shortening problem.
+   In the post-reload pass, also add doloop_align if necessary.  */
+unsigned int
+pass_riscv_doloop_ranges::execute (function *)
+{
+  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (!NONJUMP_INSN_P (insn)
+         || recog_memoized (insn) != CODE_FOR_doloop_begin_i)
+       continue;
+      rtx *lref_s_loc = &SET_SRC (XVECEXP (PATTERN (insn), 0, 0));
+      rtx *lref_e_loc = &SET_SRC (XVECEXP (PATTERN (insn), 0, 1));
+      rtx lp_count =        SET_SRC (XVECEXP (PATTERN (insn), 0, 2));
+      rtx scratch =     SET_DEST (XVECEXP (PATTERN (insn), 0, 3));
+      rtx start_label_ref = *lref_s_loc;
+      rtx end_label_ref = *lref_e_loc;
+      if (GET_CODE (start_label_ref) == UNSPEC)
+       start_label_ref = XVECEXP (start_label_ref, 0, 0);
+      if (GET_CODE (end_label_ref) == UNSPEC)
+       end_label_ref = XVECEXP (end_label_ref, 0, 0);
+
+      if (reload_completed
+         && GET_CODE (scratch) == SCRATCH
+         && (next_active_insn (label_ref_label (start_label_ref))
+             != next_active_insn (insn))
+         && CONST_INT_P (lp_count))
+       {
+         /* This is supposed to be a cv.setupi, but register allocation
+            put spill code in between the doloop_setup_i and the loop
+            start.  Move the doloop_begin_i back to the start of the loop.
+            We can't do this if the loop counter is initialized from a
+            register, because that register might be used by the spill code;
+            it fact, it is likely to be used by it, so there is little point
+            to analyze if it is.
+            ??? We could allow the doloop_begin_i pattern to read the count
+            from memory (using clobber and splitter to fix that up) to have
+            a better chance to get code that allows the doloop_begin_i to
+            be moved back to the start of the loop.
+            ??? Much better would be to have a hook or other mechanism to
+            prevent reload / lra from inserting spill code between
+            doloop_begin_i and the loop start.  */
+
+         rtx_insn *prev = PREV_INSN (insn);
+         rtx_insn *next = NEXT_INSN (insn);
+         SET_NEXT_INSN (prev) = NEXT_INSN (insn);
+         SET_PREV_INSN (next) = PREV_INSN (insn);
+         SET_PREV_INSN (insn) = NULL_RTX;
+         SET_NEXT_INSN (insn) = NULL_RTX;
+
+         emit_insn_after (insn, PREV_INSN (label_ref_label (start_label_ref)));
+
+         insn = next;
+         continue;
+       }
+
+      if (next_active_insn (label_ref_label (start_label_ref))
+         != next_active_insn (insn))
+       {
+         if (GET_CODE (*lref_s_loc) == UNSPEC)
+           *lref_s_loc = gen_rtx_UNSPEC (SImode,
+                                         gen_rtvec (1, start_label_ref),
+                                         UNSPEC_CV_LP_START_12);
+         else
+           *lref_s_loc = start_label_ref;
+         /* We must not emit an insn outside of basic blocks, so
+            emit the align after the NOTE_INSN_BASIC_BLOCK note.  */
+         rtx_insn *after = NEXT_INSN (label_ref_label (start_label_ref));
+         if (reload_completed && TARGET_RVC)
+           emit_insn_after (gen_doloop_align (), after);
+       }
+      else
+       {
+         if (GET_CODE (*lref_s_loc) != UNSPEC)
+           *lref_s_loc = gen_rtx_UNSPEC (SImode,
+                                         gen_rtvec (1, start_label_ref),
+                                         UNSPEC_CV_FOLLOWS);
+         if (reload_completed && TARGET_RVC)
+           emit_insn_before (gen_doloop_align (), insn);
+       }
+
+      rtx_insn *end_label = label_ref_label (end_label_ref);
+      unsigned count = (reload_completed ? 4095 : 585);
+      unsigned rest = doloop_end_range_check (insn, end_label, count);
+
+      if (rest)
+       {
+         /* Check if an unsigned 5 bit offset is enough.  */
+         bool short_p = count - rest <= 31;
+         HOST_WIDE_INT val
+           = short_p ? UNSPEC_CV_LP_END_5 : UNSPEC_CV_LP_END_12;
+         if (GET_CODE (*lref_e_loc) != UNSPEC
+             || XINT (*lref_e_loc, 1) != val)
+           *lref_e_loc
+             = gen_rtx_UNSPEC (SImode, gen_rtvec (1, end_label_ref), val);
+       }
+      else
+       *lref_e_loc = end_label_ref;
+    }
+
+  return 0;
+}
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_riscv_doloop_ranges (gcc::context *ctxt)
+{
+  return new pass_riscv_doloop_ranges (ctxt);
+}
+
+/* Return alignment requested for a label as a power of two.
+   We can't put doloop_align instructions before doloop start labels lest
+   they end up outside of basic blocks in case there's a preceding BARRIER,
+   so we put them after the label.  However, the label must be aligned.  */
+int
+corev_label_align (rtx_insn *label)
+{
+  rtx_insn *next = label;
+  do
+    next = NEXT_INSN (next);
+  while (next && NOTE_P (next));
+  if (next && NONJUMP_INSN_P (next)
+      && recog_memoized (next) == CODE_FOR_doloop_align)
+    return 2;
+  return 0;
+}
diff --git a/gcc/config/riscv/corev.md b/gcc/config/riscv/corev.md
index 1350bd4b81e..7d4890eab52 100644
--- a/gcc/config/riscv/corev.md
+++ b/gcc/config/riscv/corev.md
@@ -24,6 +24,15 @@
   UNSPEC_CV_ALU_CLIPR
   UNSPEC_CV_ALU_CLIPU
   UNSPEC_CV_ALU_CLIPUR
+
+  ;; CORE-V HWLP
+  UNSPEC_CV_LOOPBUG
+  UNSPECV_CV_LOOPALIGN
+  UNSPEC_CV_FOLLOWS
+  UNSPEC_CV_LP_START_12
+  UNSPEC_CV_LP_END_5
+  UNSPEC_CV_LP_END_12
+
 ])
 
 ;; XCVMAC extension.
@@ -691,3 +700,288 @@
   cv.suburnr\t%0,%2,%3"
   [(set_attr "type" "arith")
   (set_attr "mode" "SI")])
+
+;; ??? The manual is unclear what the hardware loops actually do.
+;; We are just guessing here.
+(define_insn "doloop_end_i"
+  [(set (pc)
+       (if_then_else
+         (ne (match_operand:SI 0 "nonimmediate_operand" "+xcvl0c,xcvl1c")
+             (const_int 1))
+         (label_ref (match_operand 1 "" ""))
+         (pc)))
+   (set (match_dup 0)
+       (plus:SI (match_dup 0)
+                (const_int -1)))
+   (use (match_operand:SI 2 "" "xcvl0s,xcvl1s"))
+   (use (match_operand:SI 3 "" "xcvl0e,xcvl1e"))
+   (use (match_operand:SI 4 "" "X,X"))]
+  "TARGET_XCVHWLP"
+{
+  unsigned n_nops = 3;
+  for (rtx_insn * curr = PREV_INSN (current_output_insn);
+       curr && n_nops && !LABEL_P (curr);
+       curr = PREV_INSN (curr))
+    if (active_insn_p (curr))
+      {
+       n_nops--;
+       if (recog_memoized (curr) == CODE_FOR_doloop_end_i)
+         break;
+      }
+  while (n_nops--)
+    asm_fprintf (asm_out_file, "\tnop\n");
+  output_asm_insn ("%4:", operands);
+  if (TARGET_RVC)
+    asm_fprintf (asm_out_file, "\t.option rvc\n");
+  return "";
+}
+  [(set_attr "type" "branch")
+   (set_attr "length" "0")]
+)
+
+(define_expand "doloop_end"
+  [(match_operand:SI 0 "nonimmediate_operand" "")
+   (match_operand 1 "" "")
+   (match_operand 2 "" "")]
+  "TARGET_XCVHWLP"
+{
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+
+  rtx_insn *start_label = as_a<rtx_insn *> (operands[1]);
+
+  /* A HW loop must contain at least three insns.  If there are less than
+     two insns in the loop, we must add two or more nops, which is worse
+     than just using a normal loop with separate decrement and
+     branch instructions.  */
+  unsigned n_insns = 0;
+  /* We must not set the counter register inside the loop, except for the
+     increment that'll be folded into the doloop_end.  But that is already
+    taken care of by loop_optimize, which creates a new register just for
+    counting.  */
+
+  /* If nesting HW loops, the inner loop must be using
+     lpstart0, lpend0, lpcount0 .  It's OK if we have more than one inner
+     loop, as long as they are not nested into each other; we have already
+     checked the nesting depth in riscv_can_use_doloop_p.  */
+  bool inner_loop_p = false;
+  rtx_insn *bb_last = NULL;
+  rtx_insn *bb_succ = NULL;
+  for (rtx_insn *insn = start_label; ;
+       insn = (insn == bb_last ? bb_succ : NEXT_INSN (insn)))
+    {
+      if (!insn)
+       FAIL;
+
+      /* For: int f (int i, int j) { while (--j) i = (i << 1) - 13; return i; }
+        we get passed a start label that's actually after the final branch.  */
+
+      if (NOTE_INSN_BASIC_BLOCK_P (insn))
+       {
+         basic_block bb = NOTE_BASIC_BLOCK (insn);
+         bb_last = BB_END (bb);
+         if (single_succ_p (bb))
+           bb_succ = BB_HEAD (single_succ (bb));
+         else if (recog_memoized (bb_last) == CODE_FOR_doloop_end_i)
+           bb_succ = BB_HEAD (FALLTHRU_EDGE (bb)->dest);
+         else if (bb_last == operands[2])
+           bb_succ = NULL;
+         else
+           FAIL;
+       }
+
+      if (NONJUMP_INSN_P (insn))
+       n_insns++;
+      else if (JUMP_P (insn))
+       {
+         if (recog_memoized (insn) == CODE_FOR_doloop_end_i)
+           inner_loop_p = true;
+         else if (insn != operands[2])
+           FAIL;
+         else
+           break;
+       }
+    }
+  /* We have counted in the counter decrement, so we need three insns for the
+     cost of the HW loop to be amortized.  */
+  if (n_insns < 3)
+    FAIL;
+
+  rtx start = gen_rtx_REG (SImode, LPSTART0_REGNUM + (inner_loop_p ? 3 : 0));
+  rtx end = gen_rtx_REG (SImode, LPEND0_REGNUM + (inner_loop_p ?  3 : 0));
+  rtx ref = gen_rtx_LABEL_REF (SImode, gen_label_rtx ());
+  rtx_insn *jump
+    = emit_jump_insn (gen_doloop_end_i (operands[0], operands[1],
+                                       start, end, ref));
+  add_label_op_ref (jump, ref);
+  DONE;
+})
+
+;; Although the alignment can be thought of taking up to two bytes, that is
+;; only the case if the assembler first saved space by creating a short insn.
+;; The compiler doesn't generally take short insn into account when calculating
+;; lengths.
+(define_insn "doloop_align"
+  [(unspec_volatile [(const_int 0)] UNSPECV_CV_LOOPALIGN)]
+  "TARGET_XCVHWLP && TARGET_RVC"
+  ".balign\t4\;.option norvc"
+  [(set_attr "type" "ghost")])
+
+; We use an actual doloop_begin pattern to make sure the loop counter
+; gets allocated to the right registers, and that we have a scratch GPR
+; if we need it.
+; We do want the doloop_begin_i pattern to be right at the top of the loop
+; for efficiency, as we can use cv.setup / cv.setupi then.
+; If we must, we can, however, split the instruction into a triplet
+; of instructions that can go anywhere - with potentially some extra
+; instructions to load constants into GPR registers first, particularly
+; if the loop start setup ends up below the loop.
+
+;; Sometimes - e.g. newlib/libc/stdlib/arc4random.c
+;; -Os  -march=rv32imc_zicsr_xcvhwlp - we have spaghetti code at split2, with
+;; the loop setup below the loop, and it's still spaghetti at peephole2, but
+;; it gets sorted out at bbro.  Should we delay the doloop_begin_i split
+;; until after bbro, and add another split pass - or always drive the split
+;; with a '#' output pattern, to avoid this issue?
+
+(define_insn_and_split "doloop_begin_i"
+  [(set (match_operand:SI 0 "lpstart_reg_op")
+       (match_operand:SI 1))
+   (set (match_operand:SI 2 "lpend_reg_op")
+       (match_operand:SI 3))
+   (set (match_operand:SI 4 "register_operand")
+       (match_operand:SI 5 "immediate_register_operand"))
+   (clobber (match_scratch:SI 6))]
+  "TARGET_XCVHWLP"
+  {@ [cons: =0, 1, =2, 3, =4, 5, =6; attrs: length ]
+    [xcvl0s, CVl0, xcvl0e, xcvlb5, xcvl0c, CV12, X ; 4] cv.setupi\t0, %5, %3
+    [xcvl1s, CVl0, xcvl1e, xcvlb5, xcvl1c, CV12, X ; 4] cv.setupi\t1, %5, %3
+    [xcvl0s, CVl0, xcvl0e, xcvlbe, xcvl0c, r,    X ; 4] cv.setup\t0, %5, %3
+    [xcvl1s, CVl0, xcvl1e, xcvlbe, xcvl1c, r,    X ; 4] cv.setup\t1, %5, %3
+    [xcvl0s,?iCVl0,xcvl0e,?ixcvlbe,xcvl0c, ?ri, &r ; 12] #
+    [xcvl1s,?iCVl0,xcvl1e,?ixcvlbe,xcvl1c, ?ri, &r ; 12] #
+  }
+  ;; We don't know the loop length until after register allocation.
+  ;; Even in the cases where we already can know before reload that we must
+  ;; split, the test is costly, and splitting early could confuse RA.
+  "&& reload_completed
+   && (GET_CODE (operands[1]) == LABEL_REF
+       || GET_CODE (operands[1]) == UNSPEC)
+   && !hwloop_setupi_p (insn, operands[1], operands[3])"
+  [(set (match_dup 4) (match_dup 5))]
+{
+  if (GET_CODE (operands[1]) == UNSPEC)
+    operands[1] = XVECEXP (operands[1], 0, 0);
+  else
+    {
+      emit_move_insn (operands[6], operands[1]);
+      operands[1] = operands[6];
+    }
+  emit_insn (gen_rtx_SET (operands[0], operands[1]));
+  if (GET_CODE (operands[3]) == UNSPEC)
+    operands[3] = XVECEXP (operands[3], 0, 0);
+  else
+    {
+      emit_move_insn (operands[6], operands[3]);
+      operands[3] = operands[6];
+    }
+  emit_insn (gen_rtx_SET (operands[2], operands[3]));
+  if (!REG_P (operands[5])
+      && !satisfies_constraint_CV12 (operands[5]))
+    {
+      emit_move_insn (operands[6], operands[5]);
+      operands[5] = operands[6];
+    }
+}
+  [(set_attr "move_type" "move")]
+)
+
+;; If we have a doloop_begin_i instruction that has labels that
+;; satisfy cv.setup, but not cv.setupi, yet the loop count is an
+;; immediate, split to load the immediate into the scratch register.
+(define_split
+  [(set (match_operand:SI 0 "lpstart_reg_op")
+        (match_operand:SI 1))
+   (set (match_operand:SI 2 "lpend_reg_op")
+        (match_operand:SI 3))
+   (set (match_operand:SI 4 "register_operand")
+        (match_operand:SI 5 "immediate_operand"))
+   (clobber (match_operand:SI 6 "register_operand"))]
+  "TARGET_XCVHWLP
+   && reload_completed
+   && hwloop_setupi_p (insn, operands[1], operands[3])
+   && !satisfies_constraint_xcvlb5 (operands[3])"
+  [(set (match_dup 6) (match_dup 5))
+   (parallel
+     [(set (match_dup 0) (match_dup 1))
+      (set (match_dup 2) (match_dup 3))
+      (set (match_dup 4) (match_dup 6))
+      (clobber (scratch:SI))])]
+)
+
+(define_expand "doloop_begin"
+  [(use (match_operand 0 "register_operand"))
+   (use (match_operand 1 ""))]
+  "TARGET_XCVHWLP"
+{
+  rtx pat = PATTERN (operands[1]);
+  /* ??? cleanup_cfg, called from pass_rtl_loop_done::execute, deletes
+     loop latches without updating LABEL_REFS in non-jump instructions
+     even when marked with REG_LABEL_OPERAND notes.  */
+#if 0
+  rtx start_label_ref
+    = XEXP (SET_SRC (XVECEXP (pat, 0, 0)), 1);
+#else
+  rtx lst = gen_rtx_INSN_LIST (VOIDmode, operands[1], NULL_RTX);
+  rtx start_label_ref
+    = gen_rtx_UNSPEC (SImode, gen_rtvec (1, lst), UNSPEC_CV_LOOPBUG);
+#endif
+  rtx start_reg = XEXP (XVECEXP (pat, 0, 2), 0);
+  rtx end_reg = XEXP (XVECEXP (pat, 0, 3), 0);
+  rtx end_label_ref = XEXP (XVECEXP (pat, 0, 4), 0);
+  rtx_insn *insn = emit_insn (gen_doloop_begin_i (start_reg, start_label_ref,
+                                                 end_reg, end_label_ref,
+                                                 operands[0], operands[0]));
+  //add_label_op_ref (insn, start_label_ref);
+  add_label_op_ref (insn, end_label_ref);
+  DONE;
+})
+
+;; Although cv.start / cv.end / cv.count could be seen as move instructions
+;; and therefore belonging to movsi_internal, that is problematic because
+;; using them outside loopsetup contexts might confuse the HW loop logic
+;; of the processor.  We might model this with UNSPEC_VOLATILEs, but
+;; that'd likely get too much into the way of optimizations.
+(define_insn "*cv_start"
+  [(set (match_operand:SI 0 "lpstart_reg_op" "=xcvl0s,xcvl1s,xcvl0s,xcvl1s")
+       (match_operand:SI 1 "label_register_operand" "i,i,r,r"))]
+  "TARGET_XCVHWLP"
+{
+  if (!REG_P (operands[1]) && TARGET_RVC)
+    asm_fprintf (asm_out_file, "\t.balign\t4\n");
+  operands[0] = GEN_INT (REGNO (operands[0]) == LPSTART0_REGNUM ? 0 : 1);
+  return REG_P (operands[1]) ? "cv.start %0,%1" : "cv.starti %0, %1";
+}
+  [(set_attr "move_type" "move")])
+
+(define_insn "*cv_end"
+  [(set (match_operand:SI 0 "lpend_reg_op" "=xcvl0e,xcvl1e,xcvl0e,xcvl1e")
+       (match_operand:SI 1 "label_register_operand" "i,i,r,r"))]
+  "TARGET_XCVHWLP"
+{
+  if (!REG_P (operands[1]) && TARGET_RVC)
+    asm_fprintf (asm_out_file, "\t.balign\t4\n");
+  operands[0] = GEN_INT (REGNO (operands[0]) == LPEND0_REGNUM ? 0 : 1);
+  return REG_P (operands[1]) ? "cv.end %0,%1" : "cv.endi %0, %1";
+}
+  [(set_attr "move_type" "move")])
+
+(define_insn "*cv_count"
+  [(set (match_operand:SI 0 "lpcount_reg_op" "=xcvl0c,xcvl1c,xcvl0c,xcvl1c")
+       (match_operand:SI 1 "immediate_register_operand" "CV12,CV12,r,r"))]
+  "TARGET_XCVHWLP"
+{
+  operands[0] = GEN_INT (REGNO (operands[0]) == LPCOUNT0_REGNUM ? 0 : 1);
+  return REG_P (operands[1]) ? "cv.count %0,%1" : "cv.counti %0, %1";
+}
+  [(set_attr "move_type" "move")])
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 90567a817a7..5fb89500146 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -400,6 +400,31 @@
   (ior (match_operand 0 "register_operand")
        (match_code "const_int")))
 
+(define_predicate "label_register_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_code "label_ref")))
+
+(define_predicate "lpstart_reg_op"
+  (and (match_code "reg")
+       (match_test "REGNO (op) == LPSTART0_REGNUM || REGNO (op) == LPSTART1_REGNUM")))
+
+(define_predicate "lpend_reg_op"
+  (and (match_code "reg")
+       (match_test "REGNO (op) == LPEND0_REGNUM || REGNO (op) == LPEND1_REGNUM")))
+
+(define_predicate "lpcount_reg_op"
+  (and (match_code "reg")
+       (match_test "REGNO (op) == LPCOUNT0_REGNUM || REGNO (op) == LPCOUNT1_REGNUM")))
+
+;; The instructions to set hardware loop start / end are special.
+;; We don't want the register allocator to morph these from/to ordinary
+;; moves, since pc-relative loads are position dependent in their range.
+(define_predicate "move_dest_operand"
+  (and (match_operand 0 "nonimmediate_operand")
+       (not (match_operand 0 "lpstart_reg_op"))
+       (not (match_operand 0 "lpend_reg_op"))
+       (not (match_operand 0 "lpcount_reg_op"))))
+
 ;; Predicates for the V extension.
 (define_special_predicate "vector_length_operand"
   (ior (match_operand 0 "pmode_register_operand")
diff --git a/gcc/config/riscv/riscv-passes.def b/gcc/config/riscv/riscv-passes.def
index b6260939d5c..4efe09ab04f 100644
--- a/gcc/config/riscv/riscv-passes.def
+++ b/gcc/config/riscv/riscv-passes.def
@@ -20,3 +20,6 @@
 INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs);
 INSERT_PASS_AFTER (pass_split_all_insns, 1, pass_avlprop);
 INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl);
+INSERT_PASS_AFTER (pass_rtl_loop_done, 1, pass_riscv_doloop_begin);
+INSERT_PASS_BEFORE (pass_ira, 1, pass_riscv_doloop_ranges);
+INSERT_PASS_BEFORE (pass_split_after_reload, 1, pass_riscv_doloop_ranges);
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 196b53f10f3..018215b7f7a 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -669,4 +669,15 @@ struct riscv_tune_info {
 const struct riscv_tune_info *
 riscv_parse_tune (const char *, bool);
 
+/* Routines implemented in corev.cc.  */
+rtl_opt_pass * make_pass_riscv_doloop_begin (gcc::context *ctxt);
+rtl_opt_pass * make_pass_riscv_doloop_ranges (gcc::context *ctxt);
+extern bool riscv_can_use_doloop_p (const widest_int &, const widest_int &,
+                                   unsigned int, bool);
+extern const char *riscv_invalid_within_doloop (const rtx_insn *insn);
+extern bool hwloop_setupi_p (rtx insn, rtx start_ref, rtx end_ref);
+extern void add_label_op_ref (rtx_insn *insn, rtx label);
+extern int corev_label_align (rtx_insn *);
+
+
 #endif /* ! GCC_RISCV_PROTOS_H */
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index c2bd1c2ed29..291f1bbcdf6 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -314,8 +314,8 @@ const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = {
   FP_REGS,     FP_REGS,        FP_REGS,        FP_REGS,
   FP_REGS,     FP_REGS,        FP_REGS,        FP_REGS,
   FRAME_REGS,  FRAME_REGS,     NO_REGS,        NO_REGS,
-  NO_REGS,     NO_REGS,        NO_REGS,        NO_REGS,
-  NO_REGS,     NO_REGS,        NO_REGS,        NO_REGS,
+  NO_REGS,     NO_REGS,        LP0START_REGS,  LP0END_REGS,
+  LP0COUNT_REGS,LP1START_REGS, LP1END_REGS,    LP1COUNT_REGS,
   NO_REGS,     NO_REGS,        NO_REGS,        NO_REGS,
   NO_REGS,     NO_REGS,        NO_REGS,        NO_REGS,
   NO_REGS,     NO_REGS,        NO_REGS,        NO_REGS,
@@ -1769,7 +1769,8 @@ riscv_unspec_address (rtx address, enum riscv_symbol_type symbol_type)
   return riscv_unspec_address_offset (base, offset, symbol_type);
 }
 
-/* If OP is an UNSPEC address, return the address to which it refers,
+/* If OP is an UNSPEC address, UNSPEC_CV_LP_START_12, UNSPEC_CV_LP_END_5,
+   or UNSPEC_CV_LP_END_12, return the address to which it refers,
    otherwise return OP itself.  */
 
 static rtx
@@ -1778,7 +1779,11 @@ riscv_strip_unspec_address (rtx op)
   rtx base, offset;
 
   split_const (op, &base, &offset);
-  if (UNSPEC_ADDRESS_P (base))
+  if ((UNSPEC_ADDRESS_P (base))
+      || (GET_CODE (base) == UNSPEC
+         && (XINT (base, 1) == UNSPEC_CV_LP_START_12
+             || XINT (base, 1) == UNSPEC_CV_LP_END_5
+             || XINT (base, 1) == UNSPEC_CV_LP_END_12)))
     op = plus_constant (Pmode, UNSPEC_ADDRESS (base), INTVAL (offset));
   return op;
 }
@@ -3436,6 +3441,19 @@ riscv_output_move (rtx dest, rtx src)
            return "fmv.x.d\t%0,%1";
          }
 
+      if (src_code == REG && REGNO (src) == LPCOUNT0_REGNUM)
+       {
+         gcc_assert (width == 4);
+         gcc_assert (TARGET_XCVHWLP);
+         return "csrr %0, 0xcc2";
+       }
+      if (src_code == REG && REGNO (src) == LPCOUNT1_REGNUM)
+       {
+         gcc_assert (width == 4);
+         gcc_assert (TARGET_XCVHWLP);
+         return "csrr %0, 0xcc5";
+       }
+
       if (src_code == MEM)
        switch (width)
          {
@@ -9971,6 +9989,12 @@ riscv_preferred_else_value (unsigned ifn, tree vectype, 
unsigned int nops,
 #undef TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN riscv_expand_builtin
 
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P riscv_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP riscv_invalid_within_doloop
+
 #undef TARGET_HARD_REGNO_NREGS
 #define TARGET_HARD_REGNO_NREGS riscv_hard_regno_nregs
 #undef TARGET_HARD_REGNO_MODE_OK
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 6205d7533f4..a17acbf49f2 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -497,6 +497,12 @@ enum reg_class
   GR_REGS,                     /* integer registers */
   FP_REGS,                     /* floating-point registers */
   FRAME_REGS,                  /* arg pointer and frame pointer */
+  LP0START_REGS,               /* xcv loop 0 start register */
+  LP0END_REGS,                 /* xcv loop 0 end register */
+  LP0COUNT_REGS,               /* xcv loop 0 count register */
+  LP1START_REGS,               /* xcv loop 1 start register */
+  LP1END_REGS,                 /* xcv loop 1 end register */
+  LP1COUNT_REGS,               /* xcv loop 1 count register */
   VM_REGS,                     /* v0.t registers */
   VD_REGS,                     /* vector registers except v0.t */
   V_REGS,                      /* vector registers */
@@ -520,6 +526,12 @@ enum reg_class
   "GR_REGS",                                                           \
   "FP_REGS",                                                           \
  "FRAME_REGS",                                                                \
+  "LP0START_REGS",                                                     \
+  "LP0END_REGS",                                                       \
+  "LP0COUNT_REGS",                                                     \
+  "LP1START_REGS",                                                     \
+  "LP1END_REGS",                                                       \
+  "LP1COUNT_REGS",                                                     \
   "VM_REGS",                                                           \
   "VD_REGS",                                                           \
   "V_REGS",                                                            \
@@ -545,10 +557,16 @@ enum reg_class
   { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 },  /* GR_REGS */          \
   { 0x00000000, 0xffffffff, 0x00000000, 0x00000000 },  /* FP_REGS */          \
   { 0x00000000, 0x00000000, 0x00000003, 0x00000000 },  /* FRAME_REGS */       \
+  { 0x00000000, 0x00000000, 0x00000040, 0x00000000 },  /* LP0START_REGS */\
+  { 0x00000000, 0x00000000, 0x00000080, 0x00000000 },  /* LP0END_REGS */\
+  { 0x00000000, 0x00000000, 0x00000100, 0x00000000 },  /* LP0COUNT_REGS */\
+  { 0x00000000, 0x00000000, 0x00000200, 0x00000000 },  /* LP1START_REGS */\
+  { 0x00000000, 0x00000000, 0x00000400, 0x00000000 },  /* LP1END_REGS */\
+  { 0x00000000, 0x00000000, 0x00000800, 0x00000000 },  /* LP1COUNT_REGS */\
   { 0x00000000, 0x00000000, 0x00000000, 0x00000001 },  /* V0_REGS */          \
   { 0x00000000, 0x00000000, 0x00000000, 0xfffffffe },  /* VNoV0_REGS */       \
   { 0x00000000, 0x00000000, 0x00000000, 0xffffffff },  /* V_REGS */           \
-  { 0xffffffff, 0xffffffff, 0x00000003, 0xffffffff }   /* ALL_REGS */         \
+  { 0xffffffff, 0xffffffff, 0x00000fc3, 0xffffffff }   /* ALL_REGS */         \
 }
 
 /* A C expression whose value is a register class containing hard
@@ -596,7 +614,7 @@ enum reg_class
   96,                                                                  \
   /* None of the remaining classes have defined call-saved             \
      registers.  */                                                    \
-  64, 65, 66, 67                                                       \
+  64, 65, 66, 67, 70, 71, 72, 73, 74, 75                               \
 }
 
 /* True if VALUE is a signed 12-bit number.  */
@@ -912,8 +930,8 @@ extern enum riscv_cc get_riscv_cc (const rtx use);
   "fs0", "fs1", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5",      \
   "fa6", "fa7", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7",      \
   "fs8", "fs9", "fs10","fs11","ft8", "ft9", "ft10","ft11",     \
-  "arg", "frame", "vl", "vtype", "vxrm", "frm", "N/A", "N/A",   \
-  "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A",      \
+  "arg", "frame", "vl", "vtype", "vxrm", "frm", "lpstart0", "lpend0",  \
+  "lpcount0","lpstart1","lpend1","lpcount1", "N/A", "N/A", "N/A", "N/A",       \
   "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A",      \
   "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A",      \
   "v0",  "v1",  "v2",  "v3",  "v4",  "v5",  "v6",  "v7",       \
@@ -989,6 +1007,8 @@ extern enum riscv_cc get_riscv_cc (const rtx use);
   { "f31",     31 + FP_REG_FIRST },                                    \
 }
 
+#define LABEL_ALIGN(label) corev_label_align (label)
+
 /* Globalizing directive for a label.  */
 #define GLOBAL_ASM_OP "\t.globl\t"
 
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 8f28e8e56ab..cdae62044e8 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -157,6 +157,12 @@
    (VTYPE_REGNUM               67)
    (VXRM_REGNUM                        68)
    (FRM_REGNUM                 69)
+   (LPSTART0_REGNUM            70)
+   (LPEND0_REGNUM              71)
+   (LPCOUNT0_REGNUM            72)
+   (LPSTART1_REGNUM            73)
+   (LPEND1_REGNUM              74)
+   (LPCOUNT1_REGNUM            75)
 ])
 
 (include "predicates.md")
@@ -258,7 +264,7 @@
        (const_string "no")))
 
 ;; ISA attributes.
-(define_attr "ext" "base,f,d,vector"
+(define_attr "ext" "base,f,d,vector,xcvhwlp"
   (const_string "base"))
 
 ;; True if the extension is enabled.
@@ -277,6 +283,10 @@
         (and (eq_attr "ext" "vector")
              (match_test "TARGET_VECTOR"))
         (const_string "yes")
+
+        (and (eq_attr "ext" "xcvhwlp")
+             (match_test "TARGET_XCVHWLP"))
+        (const_string "yes")
        ]
        (const_string "no")))
 
@@ -2095,17 +2105,17 @@
 })
 
 (define_insn "*movsi_internal"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r, m,  *f,*f,*r,*m,r")
-       (match_operand:SI 1 "move_operand"         " r,T,m,rJ,*r*J,*m,*f,*f,vp"))]
+  [(set (match_operand:SI 0 "move_dest_operand" "=r,r,r, m,  *f,*f,*r,*m,r,r,r")
+       (match_operand:SI 1 "move_operand"      " r,T,m,rJ,*r*J,*m,*f,*f,vp,xcvl0c,xcvl1c"))]
   "(register_operand (operands[0], SImode)
     || reg_or_0_operand (operands[1], SImode))
     && !(register_operand (operands[1], SImode)
         && reg_or_subregno (operands[1]) == VL_REGNUM)"
   { return riscv_output_move (operands[0], operands[1]); }
-  [(set_attr "move_type" "move,const,load,store,mtc,fpload,mfc,fpstore,rdvlenb")
+  [(set_attr "move_type" "move,const,load,store,mtc,fpload,mfc,fpstore,rdvlenb,move,move")
    (set_attr "mode" "SI")
    (set_attr "type" "move")
-   (set_attr "ext" "base,base,base,base,f,f,f,f,vector")])
+   (set_attr "ext" "base,base,base,base,f,f,f,f,vector,xcvhwlp,xcvhwlp")])
 
 ;; 16-bit Integer moves
 
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 1bd661a3fe4..14ec810e023 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -411,6 +411,8 @@ Mask(XCVMAC) Var(riscv_xcv_subext)
 
 Mask(XCVALU) Var(riscv_xcv_subext)
 
+Mask(XCVHWLP) Var(riscv_xcv_subext)
+
 TargetVariable
 int riscv_xthead_subext
 
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 3b9686daa58..e246923fd82 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -126,6 +126,13 @@ thead.o: $(srcdir)/config/riscv/thead.cc \
        $(COMPILE) $<
        $(POSTCOMPILE)
 
+corev.o: $(srcdir)/config/riscv/corev.cc \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TARGET_H) backend.h $(RTL_H) \
+  memmodel.h $(EMIT_RTL_H) insn-config.h $(RECOG_H) $(FUNCTION_H) \
+  $(TM_P.H) tree-pass.h df.h
+       $(COMPILE) $<
+       $(POSTCOMPILE)
+
 PASSES_EXTRA += $(srcdir)/config/riscv/riscv-passes.def
 
 $(common_out_file): $(srcdir)/config/riscv/riscv-cores.def \
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index e01cdcbe22c..a55d07f0497 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -7724,6 +7724,10 @@ Conditional branch instruction that decrements a register and
 jumps if the register is nonzero.  Operand 0 is the register to
 decrement and test; operand 1 is the label to jump to if the
 register is nonzero.
+Operand 2 is the original branch back to the loop latch; this is useful
+if you want to accept some nested loops, but need to analyze the nested
+loops, to tell the loop being handled from its nested loops, as they might
+share the loop latch.
 @xref{Looping Patterns}.
 
 This optional instruction pattern should be defined for machines with
diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc
index 4feb0a25ab9..d703cb5f2af 100644
--- a/gcc/loop-doloop.cc
+++ b/gcc/loop-doloop.cc
@@ -720,7 +720,8 @@ doloop_optimize (class loop *loop)
   count = copy_rtx (desc->niter_expr);
   start_label = block_label (desc->in_edge->dest);
   doloop_reg = gen_reg_rtx (mode);
-  rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
+  rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label,
+                                                BB_END (desc->in_edge->src));
 
   word_mode_size = GET_MODE_PRECISION (word_mode);
   word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
@@ -737,7 +738,8 @@ doloop_optimize (class loop *loop)
       else
        count = lowpart_subreg (word_mode, count, mode);
       PUT_MODE (doloop_reg, word_mode);
-      doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
+      doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label,
+                                          BB_END (desc->in_edge->src));
     }
   if (! doloop_seq)
     {
diff --git a/gcc/target-insns.def b/gcc/target-insns.def
index c4415d00735..962c5cc51d1 100644
--- a/gcc/target-insns.def
+++ b/gcc/target-insns.def
@@ -48,7 +48,7 @@ DEF_TARGET_INSN (casesi, (rtx x0, rtx x1, rtx x2, rtx x3, rtx 
x4))
 DEF_TARGET_INSN (check_stack, (rtx x0))
 DEF_TARGET_INSN (clear_cache, (rtx x0, rtx x1))
 DEF_TARGET_INSN (doloop_begin, (rtx x0, rtx x1))
-DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1))
+DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1, rtx opt2))
 DEF_TARGET_INSN (eh_return, (rtx x0))
 DEF_TARGET_INSN (epilogue, (void))
 DEF_TARGET_INSN (exception_receiver, (void))
diff --git a/gcc/testsuite/gcc.target/riscv/cv-hwlp-shiftsub.c b/gcc/testsuite/gcc.target/riscv/cv-hwlp-shiftsub.c
new file mode 100644
index 00000000000..c0ead8056b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/cv-hwlp-shiftsub.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } {"-O0" "-O1" "-Os" "-Og" "-O3" "-Oz" "-flto"} } */
+/* { dg-options "-march=rv32imc_xcvhwlp -mabi=ilp32 -O2" } */
+
+int f (int i, int j)
+{
+ while (--j)
+   i = (i << 1) - 13;
+  return i;
+}
+/* { dg-final { scan-assembler {\mcv.setup\M} } } */

Reply via email to