Kunal Pai has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/70798?usp=email )

Change subject: stdlib: Edit RISCVMatched Configuration
......................................................................

stdlib: Edit RISCVMatched Configuration

This patch changes the RISCVMatched cache hierarchy from
private L1, private L2 to private L1, shared L2.
It also changes the RISCVMatched Core's parameters to
better match hardware performance.
Also, sizes are changed to MiB or KiB instead of MB
or KB, to match the datasheet.
All deviations from the datasheet and from the ARM HPI CPU
(the reference for the pipeline parameters) are documented
in the corresponding docstrings.

Change-Id: I4235140f33be6a3b529a819ae6a7223cb88bb7ab
---
M src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
M src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py
M src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py
3 files changed, 89 insertions(+), 55 deletions(-)



diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
index ae483cc..a11927e 100644
--- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
+++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_board.py
@@ -109,7 +109,7 @@
     def __init__(
         self,
         clk_freq: str = "1.2GHz",
-        l2_size: str = "2MB",
+        l2_size: str = "2MiB",
         is_fs: bool = False,
     ) -> None:
         """
@@ -321,7 +321,7 @@
         root.appendCompatible(["riscv-virtio"])

         for mem_range in self.mem_ranges:
-            node = FdtNode(f"memory@{int(mem_range.start):x}")
+            node = FdtNode("memory@%x" % int(mem_range.start))
             node.append(FdtPropertyStrings("device_type", ["memory"]))
             node.append(
                 FdtPropertyWords(
diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py
index dc66af3..a424419 100644
--- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py
+++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_cache.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 The Regents of the University of California
+# Copyright (c) 2023 The Regents of the University of California
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -42,6 +42,7 @@
 from m5.objects import Cache, L2XBar, BaseXBar, SystemXBar, BadAddr, Port

 from gem5.utils.override import *
+from typing import Type


 class RISCVMatchedCacheHierarchy(
@@ -50,17 +51,7 @@
     """

     A cache setup where each core has a private L1 Data and Instruction Cache,
-    and a private L2 cache.
-    The HiFive board has a partially inclusive cache hierarchy, hence this hierarchy is chosen.
-    The details of the cache hierarchy are in Table 7, page 36 of the datasheet.
-
-    - L1 Instruction Cache:
-        - 32 KiB 4-way set associative
-    - L1 Data Cache
-        - 32 KiB 8-way set associative
-    - L2 Cache
-        - 2 MiB 16-way set associative
-
+    and a shared L2 cache.
     """

     def __init__(
@@ -74,9 +65,9 @@
         AbstractClassicCacheHierarchy.__init__(self=self)
         AbstractTwoLevelCacheHierarchy.__init__(
             self,
-            l1i_size="32kB",
+            l1i_size="32KiB",
             l1i_assoc=4,
-            l1d_size="32kB",
+            l1d_size="32KiB",
             l1d_assoc=8,
             l2_size=l2_size,
             l2_assoc=16,
@@ -108,16 +99,17 @@
             for i in range(board.get_processor().get_num_cores())
         ]
         self.l1dcaches = [
-            L1DCache(size=self._l1d_size, assoc=self._l1d_assoc)
+            L1DCache(
+                size=self._l1d_size, assoc=self._l1d_assoc, response_latency=10
+            )
             for i in range(board.get_processor().get_num_cores())
         ]
-        self.l2buses = [
-            L2XBar() for i in range(board.get_processor().get_num_cores())
-        ]
-        self.l2caches = [
-            L2Cache(size=self._l2_size, assoc=self._l2_assoc)
-            for i in range(board.get_processor().get_num_cores())
-        ]
+        self.l2bus = L2XBar()
+
+        self.l2cache = L2Cache(
+            size=self._l2_size, assoc=self._l2_assoc, data_latency=20
+        )
+
         # ITLB Page walk caches
         self.iptw_caches = [
             MMUCache(size="4KiB")
@@ -137,14 +129,10 @@
             cpu.connect_icache(self.l1icaches[i].cpu_side)
             cpu.connect_dcache(self.l1dcaches[i].cpu_side)

-            self.l1icaches[i].mem_side = self.l2buses[i].cpu_side_ports
-            self.l1dcaches[i].mem_side = self.l2buses[i].cpu_side_ports
-            self.iptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports
-            self.dptw_caches[i].mem_side = self.l2buses[i].cpu_side_ports
-
-            self.l2buses[i].mem_side_ports = self.l2caches[i].cpu_side
-
-            self.membus.cpu_side_ports = self.l2caches[i].mem_side
+            self.l1icaches[i].mem_side = self.l2bus.cpu_side_ports
+            self.l1dcaches[i].mem_side = self.l2bus.cpu_side_ports
+            self.iptw_caches[i].mem_side = self.l2bus.cpu_side_ports
+            self.dptw_caches[i].mem_side = self.l2bus.cpu_side_ports

             cpu.connect_walker_ports(
                 self.iptw_caches[i].cpu_side, self.dptw_caches[i].cpu_side
@@ -157,6 +145,9 @@
             else:
                 cpu.connect_interrupt()

+        self.l2bus.mem_side_ports = self.l2cache.cpu_side
+        self.membus.cpu_side_ports = self.l2cache.mem_side
+
     def _setup_io_cache(self, board: AbstractBoard) -> None:
         """Create a cache for coherent I/O connections"""
         self.iocache = Cache(
diff --git a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py
index 0b4375c..48291bf 100644
--- a/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py
+++ b/src/python/gem5/prebuilt/riscvmatched/riscvmatched_core.py
@@ -61,8 +61,14 @@
     pass


-class U74MemFU(MinorDefaultMemFU):
-    opLat = 3
+class U74MemReadFU(MinorDefaultMemFU):
+    opClasses = minorMakeOpClassSet(["MemRead", "FloatMemRead"])
+    opLat = 2
+
+
+class U74MemWriteFU(MinorDefaultMemFU):
+    opClasses = minorMakeOpClassSet(["MemWrite", "FloatMemWrite"])
+    opLat = 2


 class U74MiscFU(MinorDefaultMiscFU):
@@ -77,18 +83,24 @@
         U74IntDivFU(),
         U74FloatSimdFU(),
         U74PredFU(),
-        U74MemFU(),
+        U74MemReadFU(),
+        U74MemWriteFU(),
         U74MiscFU(),
     ]


 class U74BP(TournamentBP):
-    BTBEntries = 16
-    RASSize = 6
+    BTBEntries = 32
+    RASSize = 12
     localHistoryTableSize = 4096  # is 3.6 KiB but gem5 requires power of 2
-
+    localPredictorSize = 16384
+    globalPredictorSize = 16384
+    choicePredictorSize = 16384
+    localCtrBits = 4
+    globalCtrBits = 4
+    choiceCtrBits = 4
     indirectBranchPred = SimpleIndirectPredictor()
-    indirectBranchPred.indirectSets = 8
+    indirectBranchPred.indirectSets = 16


 class U74CPU(RiscvMinorCPU):
@@ -97,26 +109,49 @@
     This information about the CPU can be found on page 15 of
     gem5_rsk_gem5-21.2.pdf at https://github.com/arm-university/arm-gem5-rsk

-    The only parameter that is changed is the decodeToExecuteForwardDelay.
-    This is changed from 1 to 2 to avoid a PMC address fault.
+    The parameters that are changed are:
+    - threadPolicy:
+        This is initialized to "SingleThreaded".
+    - decodeToExecuteForwardDelay:
+        This was earlier raised from 1 to 2 (PMC address fault); now 1 again.
+    - fetch1ToFetch2BackwardDelay:
+        This is changed from 1 to 0 to better match hardware performance.
+    - fetch2InputBufferSize:
+        This is changed from 2 to 1 to better match hardware performance.
+    - decodeInputBufferSize:
+        This is changed from 3 to 2 to better match hardware performance.
+    - decodeToExecuteForwardDelay:
+        This is changed from 2 to 1 to better match hardware performance.
+    - executeInputBufferSize:
+        This is changed from 7 to 4 to better match hardware performance.
+    - executeMaxAccessesInMemory:
+        This is changed from 2 to 1 to better match hardware performance.
+    - executeLSQStoreBufferSize:
+        This is changed from 5 to 3 to better match hardware performance.
+    - executeBranchDelay:
+        This is changed from 1 to 2 to better match hardware performance.
+    - enableIdling:
+        This is changed to False to better match hardware performance.

     """

+    threadPolicy = "SingleThreaded"
+
     # Fetch1 stage
     fetch1LineSnapWidth = 0
     fetch1LineWidth = 0
     fetch1FetchLimit = 1
     fetch1ToFetch2ForwardDelay = 1
-    fetch1ToFetch2BackwardDelay = 1
+    fetch1ToFetch2BackwardDelay = 0

     # Fetch2 stage
-    fetch2InputBufferSize = 2
+    fetch2InputBufferSize = 1
     fetch2ToDecodeForwardDelay = 1
     fetch2CycleInput = True

     # Decode stage
-    decodeInputBufferSize = 3
-    decodeToExecuteForwardDelay = 2
+    decodeInputBufferSize = 2
+    decodeToExecuteForwardDelay = 1
     decodeInputWidth = 2
     decodeCycleInput = True

@@ -127,17 +162,17 @@
     executeMemoryIssueLimit = 1
     executeCommitLimit = 2
     executeMemoryCommitLimit = 1
-    executeInputBufferSize = 7
-    executeMaxAccessesInMemory = 2
+    executeInputBufferSize = 4
+    executeMaxAccessesInMemory = 1
     executeLSQMaxStoreBufferStoresPerCycle = 2
     executeLSQRequestsQueueSize = 1
     executeLSQTransfersQueueSize = 2
-    executeLSQStoreBufferSize = 5
-    executeBranchDelay = 1
+    executeLSQStoreBufferSize = 3
+    executeBranchDelay = 2
     executeSetTraceTimeOnCommit = True
     executeSetTraceTimeOnIssue = False
     executeAllowEarlyMemoryIssue = True
-    enableIdling = True
+    enableIdling = False

     # Functional Units and Branch Prediction
     executeFuncUnits = U74FUPool()
@@ -152,13 +187,21 @@
       - IntFU: 1 cycle
       - IntMulFU: 3 cycles
       - IntDivFU: 6 cycles (NOTE: latency is variable, but is set to 6 cycles)
-      - MemFU: 3 cycles
+      - MemReadFU: 2 cycles
+      - MemWriteFU: 2 cycles
     The branch predictor is a TournamentBP, based on Section 4.2.5 on page 38.
-      - BTBEntries: 16 entries
-      - RASSize: 6 entries
-      - IndirectSets: 8 sets
+      - BTBEntries: 32 entries
+      - RASSize: 12 entries
+      - IndirectSets: 16 sets
+      - localPredictorSize: 16384
+      - globalPredictorSize: 16384
+      - choicePredictorSize: 16384
+      - localCtrBits: 4
+      - globalCtrBits: 4
+      - choiceCtrBits: 4
       - localHistoryTableSize: 4096 B
-    NOTE: The BHT of the HiFive Board is 3.6KiB but gem5 requires a power of 2, so the BHT is 4096B.
+    NOTE: The TournamentBP deviates from the actual BP.
+    This configuration performs the best in relation to the hardware.
     """

     def __init__(

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/70798?usp=email To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings?usp=email

Gerrit-MessageType: newchange
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I4235140f33be6a3b529a819ae6a7223cb88bb7ab
Gerrit-Change-Number: 70798
Gerrit-PatchSet: 1
Gerrit-Owner: Kunal Pai <kun...@ucdavis.edu>
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org

Reply via email to