Hi,

Below you can find the configuration I usually use. A few things are a bit unrealistic, for example the large SQ size, but I do that to account for features of a real machine that we do not model at the moment. The command would have:

--cpu-type=DerivO3CPU --bp-type=TAGE_SC_L_64KB --l1d-hwp-type=IndirectMemoryPrefetcher --l2-hwp-type=L2MultiPrefetcher
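Combined with the cache and memory options used later in this thread, a full invocation would look roughly like the following. This is only a sketch: the cache sizes and the STREAM binary path are copied from the commands quoted below, the L2MultiPrefetcher only exists once the patch below is applied, and on some gem5 versions the O3 model is registered as O3CPU rather than DerivO3CPU.

./build/X86/gem5.opt configs/example/se.py \
    --cpu-type=DerivO3CPU --bp-type=TAGE_SC_L_64KB \
    --caches --l2cache --l1d_size=256kB --l1i_size=256kB --l2_size=8MB \
    --l1d-hwp-type=IndirectMemoryPrefetcher --l2-hwp-type=L2MultiPrefetcher \
    --mem-type=DDR3_1600_8x8 -c ../stream/stream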
I am still working on a better design, especially a better prefetcher configuration, but wanted to post it here to open up the discussion. With this, I usually get 6 GB/s for STREAM on a single core, compared to 10-11 GB/s on a real machine. This is significantly better than the default configuration, which delivers something around 1-2 GB/s.

diff --git a/configs/common/Caches.py b/configs/common/Caches.py
index 1468b953c..fe22a7c27 100644
--- a/configs/common/Caches.py
+++ b/configs/common/Caches.py
@@ -47,11 +47,12 @@ from m5.objects import *
 # specific instantiations.
 
 class L1Cache(Cache):
-    assoc = 2
-    tag_latency = 2
-    data_latency = 2
-    response_latency = 2
-    mshrs = 4
+    assoc = 8
+    tag_latency = 4
+    data_latency = 4
+    response_latency = 4
+    mshrs = 20
+    write_buffers = 20
     tgts_per_mshr = 20
 
 class L1_ICache(L1Cache):
@@ -63,13 +64,13 @@ class L1_DCache(L1Cache):
     pass
 
 class L2Cache(Cache):
-    assoc = 8
-    tag_latency = 20
-    data_latency = 20
-    response_latency = 20
-    mshrs = 20
+    assoc = 16
+    tag_latency = 12
+    data_latency = 12
+    response_latency = 12
+    mshrs = 32
     tgts_per_mshr = 12
-    write_buffers = 8
+    write_buffers = 32
 
 class IOCache(Cache):
     assoc = 8
@@ -81,7 +82,7 @@ class IOCache(Cache):
     tgts_per_mshr = 12
 
 class PageTableWalkerCache(Cache):
-    assoc = 2
+    assoc = 4
     tag_latency = 2
     data_latency = 2
     response_latency = 2
diff --git a/configs/common/Options.py b/configs/common/Options.py
index a63cc7b08..ad3a6b25e 100644
--- a/configs/common/Options.py
+++ b/configs/common/Options.py
@@ -148,9 +148,9 @@ def addNoISAOptions(parser):
     parser.add_argument("--l1i_size", type=str, default="32kB")
     parser.add_argument("--l2_size", type=str, default="2MB")
     parser.add_argument("--l3_size", type=str, default="16MB")
-    parser.add_argument("--l1d_assoc", type=int, default=2)
-    parser.add_argument("--l1i_assoc", type=int, default=2)
-    parser.add_argument("--l2_assoc", type=int, default=8)
+    parser.add_argument("--l1d_assoc", type=int, default=8)
+    parser.add_argument("--l1i_assoc", type=int, default=8)
+    parser.add_argument("--l2_assoc", type=int, default=16)
     parser.add_argument("--l3_assoc", type=int, default=16)
     parser.add_argument("--cacheline_size", type=int, default=64)
@@ -238,7 +238,7 @@ def addCommonOptions(parser):
                          the selected cache)""")
     parser.add_argument("--checker", action="store_true")
     parser.add_argument("--cpu-clock", action="store", type=str,
-                        default='2GHz',
+                        default='3.66GHz',
                         help="Clock for blocks running at CPU speed")
     parser.add_argument("--smt", action="store_true", default=False,
                         help="""
diff --git a/src/arch/x86/X86TLB.py b/src/arch/x86/X86TLB.py
index 8abc93c19..d5139c162 100644
--- a/src/arch/x86/X86TLB.py
+++ b/src/arch/x86/X86TLB.py
@@ -54,7 +54,7 @@ class X86TLB(BaseTLB):
     cxx_class = 'gem5::X86ISA::TLB'
     cxx_header = 'arch/x86/tlb.hh'
 
-    size = Param.Unsigned(64, "TLB size")
+    size = Param.Unsigned(128, "TLB size")
     system = Param.System(Parent.any, "system object")
     walker = Param.X86PagetableWalker(\
             X86PagetableWalker(), "page table walker")
diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py
index fb1a9dc9d..e39e73267 100644
--- a/src/cpu/o3/O3CPU.py
+++ b/src/cpu/o3/O3CPU.py
@@ -73,9 +73,9 @@ class O3CPU(BaseCPU):
 
     activity = Param.Unsigned(0, "Initial count")
 
-    cacheStorePorts = Param.Unsigned(200, "Cache Ports. "
+    cacheStorePorts = Param.Unsigned(4, "Cache Ports. "
                                      "Constrains stores only.")
-    cacheLoadPorts = Param.Unsigned(200, "Cache Ports. "
+    cacheLoadPorts = Param.Unsigned(4, "Cache Ports. "
                                     "Constrains loads only.")
 
     decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay")
@@ -85,7 +85,7 @@ class O3CPU(BaseCPU):
     commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
     fetchWidth = Param.Unsigned(8, "Fetch width")
     fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
-    fetchQueueSize = Param.Unsigned(32, "Fetch queue size in micro-ops "
+    fetchQueueSize = Param.Unsigned(128, "Fetch queue size in micro-ops "
                                     "per-thread")
 
     renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
@@ -123,15 +123,15 @@ class O3CPU(BaseCPU):
     backComSize = Param.Unsigned(5, "Time buffer size for backwards communication")
     forwardComSize = Param.Unsigned(5, "Time buffer size for forward communication")
 
-    LQEntries = Param.Unsigned(32, "Number of load queue entries")
-    SQEntries = Param.Unsigned(32, "Number of store queue entries")
+    LQEntries = Param.Unsigned(128, "Number of load queue entries")
+    SQEntries = Param.Unsigned(128, "Number of store queue entries")
     LSQDepCheckShift = Param.Unsigned(4, "Number of places to shift addr before check")
     LSQCheckLoads = Param.Bool(True, "Should dependency violations be checked for loads & stores or just stores")
     store_set_clear_period = Param.Unsigned(250000, "Number of load/store insts before the dep predictor should be invalidated")
-    LFSTSize = Param.Unsigned(1024, "Last fetched store table size")
-    SSITSize = Param.Unsigned(1024, "Store set ID table size")
+    LFSTSize = Param.Unsigned(2048, "Last fetched store table size")
+    SSITSize = Param.Unsigned(2048, "Store set ID table size")
 
     numRobs = Param.Unsigned(1, "Number of Reorder Buffers");
 
@@ -154,8 +154,8 @@ class O3CPU(BaseCPU):
                                       "registers")
     numPhysCCRegs = Param.Unsigned(_defaultNumPhysCCRegs,
                                    "Number of physical cc registers")
-    numIQEntries = Param.Unsigned(64, "Number of instruction queue entries")
-    numROBEntries = Param.Unsigned(192, "Number of reorder buffer entries")
+    numIQEntries = Param.Unsigned(128, "Number of instruction queue entries")
+    numROBEntries = Param.Unsigned(320, "Number of reorder buffer entries")
 
     smtNumFetchingThreads = Param.Unsigned(1, "SMT Number of Fetching Threads")
     smtFetchPolicy = Param.SMTFetchPolicy('RoundRobin', "SMT Fetch policy")
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 78999ee46..6d921bb2b 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -115,6 +115,8 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const O3CPUParams &params)
         thread[tid].init(cpu, iew_ptr, params, this, tid);
         thread[tid].setDcachePort(&dcachePort);
     }
+
+    std::cout << "maxLQEntries " << maxLQEntries << " maxSQEntries " << maxSQEntries << std::endl;
 }
diff --git a/src/mem/MemCtrl.py b/src/mem/MemCtrl.py
index 90d0e5004..84b2b6f39 100644
--- a/src/mem/MemCtrl.py
+++ b/src/mem/MemCtrl.py
@@ -70,11 +70,11 @@ class MemCtrl(QoSMemCtrl):
 
     # threshold in percent for when to forcefully trigger writes and
     # start emptying the write buffer
-    write_high_thresh_perc = Param.Percent(85, "Threshold to force writes")
+    write_high_thresh_perc = Param.Percent(60, "Threshold to force writes")
 
     # threshold in percentage for when to start writes if the read
     # queue is empty
-    write_low_thresh_perc = Param.Percent(50, "Threshold to start writes")
+    write_low_thresh_perc = Param.Percent(40, "Threshold to start writes")
 
     # minimum write bursts to schedule before switching back to reads
     min_writes_per_switch = Param.Unsigned(16, "Minimum write bursts before "
diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py
index 7d704881b..a6974c863 100644
--- a/src/mem/cache/prefetch/Prefetcher.py
+++ b/src/mem/cache/prefetch/Prefetcher.py
@@ -120,13 +120,16 @@ class MultiPrefetcher(BasePrefetcher):
 
     prefetchers = VectorParam.BasePrefetcher([], "Array of prefetchers")
 
+
+
+
 class QueuedPrefetcher(BasePrefetcher):
     type = "QueuedPrefetcher"
     abstract = True
     cxx_class = 'gem5::prefetch::Queued'
     cxx_header = "mem/cache/prefetch/queued.hh"
     latency = Param.Int(1, "Latency for generated prefetches")
-    queue_size = Param.Int(32, "Maximum number of queued prefetches")
+    queue_size = Param.Int(64, "Maximum number of queued prefetches")
     max_prefetch_requests_with_pending_translation = Param.Int(32,
         "Maximum number of queued prefetches that have a missing translation")
     queue_squash = Param.Bool(True, "Squash queued prefetch on demand access")
@@ -191,7 +194,7 @@ class IndirectMemoryPrefetcher(QueuedPrefetcher):
     type = 'IndirectMemoryPrefetcher'
     cxx_class = 'gem5::prefetch::IndirectMemory'
     cxx_header = "mem/cache/prefetch/indirect_memory.hh"
-    pt_table_entries = Param.MemorySize("16",
+    pt_table_entries = Param.MemorySize("32",
         "Number of entries of the Prefetch Table")
     pt_table_assoc = Param.Unsigned(16, "Associativity of the Prefetch Table")
     pt_table_indexing_policy = Param.BaseIndexingPolicy(
@@ -257,9 +260,9 @@ class SignaturePathPrefetcher(QueuedPrefetcher):
     pattern_table_replacement_policy = Param.BaseReplacementPolicy(LRURP(),
         "Replacement policy of the pattern table")
-    prefetch_confidence_threshold = Param.Float(0.5,
+    prefetch_confidence_threshold = Param.Float(0.25,
         "Minimum confidence to issue prefetches")
-    lookahead_confidence_threshold = Param.Float(0.75,
+    lookahead_confidence_threshold = Param.Float(0.275,
         "Minimum confidence to continue exploring lookahead entries")
 
 class SignaturePathPrefetcherV2(SignaturePathPrefetcher):
@@ -529,3 +532,6 @@ class PIFPrefetcher(QueuedPrefetcher):
         if not isinstance(simObj, SimObject):
             raise TypeError("argument must be of SimObject type")
         self.addEvent(HWPProbeEventRetiredInsts(self, simObj,"RetiredInstsPC"))
+
+class L2MultiPrefetcher(MultiPrefetcher):
+    prefetchers = VectorParam.BasePrefetcher([SignaturePathPrefetcher(), AMPMPrefetcher(), DCPTPrefetcher()], "Array of prefetchers")
\ No newline at end of file

On Sat, Apr 23, 2022 at 12:21 PM Jason Lowe-Power <ja...@lowepower.com> wrote:

> Majid,
>
> These are all great suggestions! Do you have a configuration file that you
> would be willing to share? It would be a huge benefit to the community if
> we had some better default configurations in the "examples" for gem5
> configuration files.
>
> We're also trying to use the new standard library for these kinds of
> "good" configurations. We can work with you to create a "prebuilt board"
> with all of these parameters and even run nightly/weekly tests to make
> sure there are no performance regressions.
>
> Thanks!
> Jason
>
> On Fri, Apr 22, 2022 at 7:52 PM Majid Jalili <majid...@gmail.com> wrote:
>
>> I think it is hard to get to a real machine level in terms of BW. But by
>> looking at your stats, I found that lsqFullEvents is high.
>> You can go after the CPU to make it more aggressive; increasing the
>> load/store queue sizes and the ROB depth are the minimal changes you can
>> make. I usually do ROB sizes of at least 256 or 320. With that, you may
>> set the LSQ size to at least 1/4 of the ROB size.
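Those ROB/LSQ suggestions are what the O3CPU.py changes above implement. If you would rather not patch gem5, here is a minimal sketch of the same sizing done from a run script instead; it assumes an se.py-style `system.cpu` vector of O3 CPUs, and the parameter names are the ones from src/cpu/o3/O3CPU.py shown in the diff.

# Sketch: enlarge the out-of-order window from the config script instead
# of editing src/cpu/o3/O3CPU.py. Values mirror the diff above.
for cpu in system.cpu:
    cpu.numROBEntries = 320   # reorder buffer entries
    cpu.numIQEntries = 128    # instruction queue entries
    cpu.LQEntries = 128       # load queue, >= ROB/4 as suggested above
    cpu.SQEntries = 128       # store queue, >= ROB/4 as suggested above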
>> For MSHRs, your numbers are good now; 10 is too little even in Intel
>> machines, and I found recently they increased that to 16-20.
>> The other thing you can try to set is the cache latencies; make sure
>> they are reasonable.
>> For the prefetcher, you can use IMPPrefetcher in addition to DCPT; it
>> has a pretty aggressive stream prefetcher inside.
>> Also, DRAM memory mapping is important; I do not remember what the
>> default is for the mem type you are using.
>>
>> Majid
>>
>> On Sat, Apr 16, 2022 at 2:12 AM 王子聪 <wangzic...@nudt.edu.cn> wrote:
>>
>>> Hi Majid,
>>>
>>> Thanks for your suggestion! I checked the default number of MSHRs (in
>>> configs/common/Caches.py) and found the default #MSHRs of L1/L2 are 4
>>> and 20 respectively.
>>>
>>> According to the PACT'18 paper "Cimple: Instruction and Memory Level
>>> Parallelism: A DSL for Uncovering ILP and MLP", "Modern processors
>>> typically have 6-10 L1 cache MSHRs", and "Intel's Haswell
>>> microarchitecture uses 10 L1 MSHRs (Line Fill Buffers) for handling
>>> outstanding L1 misses". So I changed the L1 #MSHRs to 16 and the L2
>>> #MSHRs to 32 (which I think is enough to handle outstanding misses),
>>> and then changed the L1/L2 prefetcher type to DCPT. I got the STREAM
>>> output shown below:
>>>
>>> ./build/X86/gem5.opt configs/example/se.py --cpu-type=O3CPU --caches
>>> --l1d_size=256kB --l1i_size=256kB
>>> --param="system.cpu[0].dcache.mshrs=16;system.cpu[0].icache.mshrs=16;system.l2.mshrs=32"
>>> --l2cache --l2_size=8MB --l1i-hwp-type=DCPTPrefetcher
>>> --l1d-hwp-type=DCPTPrefetcher --l2-hwp-type=DCPTPrefetcher
>>> --mem-type=DDR3_1600_8x8 -c ../stream/stream
>>>
>>> -------------------------------------------------------------
>>> Function    Best Rate MB/s  Avg time     Min time     Max time
>>> Copy:            3479.8     0.004598     0.004598     0.004598
>>> Scale:           3554.0     0.004502     0.004502     0.004502
>>> Add:             4595.0     0.005223     0.005223     0.005223
>>> Triad:           4705.9     0.005100     0.005100     0.005100
>>> -------------------------------------------------------------
>>>
>>> The busUtil of the DRAM also improved:
>>> -------------------------------------------------------------
>>> system.mem_ctrls.dram.bytesRead       239947840     # Total bytes read (Byte)
>>> system.mem_ctrls.dram.bytesWritten    121160640     # Total bytes written (Byte)
>>> system.mem_ctrls.dram.avgRdBW       1611.266685     # Average DRAM read bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW        813.602251     # Average DRAM write bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW           12800.00     # Theoretical peak bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil             18.94     # Data bus utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead         12.59     # Data bus utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite         6.36     # Data bus utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate         89.16     # Row buffer hit rate, read and write combined (Ratio)
>>> -------------------------------------------------------------
>>>
>>> It is indeed improving the achieved bandwidth, but it is still far from
>>> the peak bandwidth of DDR3_1600 (12800 MiB/s). stats.txt is uploaded
>>> for reference
>>> (https://gist.github.com/wzc314/cf29275f853ee0b2fcd865f9b492c355).
>>>
>>> Any idea is appreciated!
>>> Thank you in advance!
>>>
>>> Bests,
>>> Zicong
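If you want to go beyond a single DCPT at the L2 and try the combined prefetcher stack from the diff earlier in this thread without patching Prefetcher.py, a minimal sketch of doing it in the run script is below. It assumes a classic-cache setup like the one se.py builds, with the shared L2 reachable as system.l2, and uses only SimObjects that already exist in Prefetcher.py.

from m5.objects import (MultiPrefetcher, SignaturePathPrefetcher,
                        AMPMPrefetcher, DCPTPrefetcher)

# Sketch: attach SPP + AMPM + DCPT behind one MultiPrefetcher at the L2,
# mirroring the L2MultiPrefetcher class added in the diff above.
system.l2.prefetcher = MultiPrefetcher(prefetchers=[
    SignaturePathPrefetcher(),
    AMPMPrefetcher(),
    DCPTPrefetcher(),
])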
>>>
>>> On Apr 16, 2022, at 00:08, Majid Jalili <majid...@gmail.com> wrote:
>>>
>>> Hi,
>>> Make sure your system has enough MSHRs; out of the box, L1 and L2 are
>>> set to have only a few MSHR entries.
>>> Also, the stride prefetcher is not the best; you may try something
>>> better: DCPT gives me better numbers.
>>>
>>> On Fri, Apr 15, 2022 at 4:57 AM Zicong Wang via gem5-users <gem5-users@gem5.org> wrote:
>>> Hi Jason,
>>>
>>> We are testing the memory bandwidth program STREAM
>>> (https://www.cs.virginia.edu/stream/), but the results show that the
>>> CPU cannot fully utilize the DDR bandwidth: the achieved bandwidth is
>>> quite low, about 1/10 of the peak bandwidth (peakBW in stats.txt). I
>>> tested the STREAM binary on my x86 computer and got near-peak
>>> bandwidth, so I believe the program is OK.
>>>
>>> I've seen the mailing list dialogue
>>> https://www.mail-archive.com/gem5-users@gem5.org/msg12965.html, and I
>>> think I've met a similar problem. So I tried the suggestions proposed
>>> by Andreas, including enabling the L1/L2 prefetchers and using the ARM
>>> detailed CPU. Although these methods can improve the bandwidth, the
>>> results show they have limited effect. Besides, I've also tested the
>>> STREAM program in FS mode with the x86 O3/Minor/TimingSimple CPUs, and
>>> tested it in SE mode with the ruby option, but all the results are
>>> similar and there is no essential difference.
>>>
>>> I guess it is a general problem in simulation with gem5. I'm wondering
>>> whether this result is expected or there is something wrong with the
>>> system model.
>>>
>>> Two of the experimental results are attached for reference:
>>>
>>> 1. X86 O3CPU, SE-mode, w/o l2 prefetcher:
>>>
>>> ./build/X86/gem5.opt --outdir=m5out-stream configs/example/se.py
>>> --cpu-type=O3CPU --caches --l1d_size=256kB --l1i_size=256kB --l2cache
>>> --l2_size=8MB --mem-type=DDR3_1600_8x8 -c ../stream/stream
>>>
>>> STREAM output:
>>>
>>> -------------------------------------------------------------
>>> Function    Best Rate MB/s  Avg time     Min time     Max time
>>> Copy:            1099.0     0.014559     0.014559     0.014559
>>> Scale:           1089.7     0.014683     0.014683     0.014683
>>> Add:             1213.0     0.019786     0.019786     0.019786
>>> Triad:           1222.1     0.019639     0.019639     0.019639
>>> -------------------------------------------------------------
>>>
>>> stats.txt (dram related):
>>>
>>> system.mem_ctrls.dram.bytesRead       238807808     # Total bytes read (Byte)
>>> system.mem_ctrls.dram.bytesWritten    121179776     # Total bytes written (Byte)
>>> system.mem_ctrls.dram.avgRdBW        718.689026     # Average DRAM read bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW        364.688977     # Average DRAM write bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW           12800.00     # Theoretical peak bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil              8.46     # Data bus utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead          5.61     # Data bus utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite         2.85     # Data bus utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate         40.57     # Row buffer hit rate, read and write combined (Ratio)
>>>
>>> 2. X86 O3CPU, SE-mode, w/ l2 prefetcher:
>>>
>>> ./build/X86/gem5.opt --outdir=m5out-stream-l2hwp configs/example/se.py
>>> --cpu-type=O3CPU --caches --l1d_size=256kB --l1i_size=256kB --l2cache
>>> --l2_size=8MB --l2-hwp-typ=StridePrefetcher --mem-type=DDR3_1600_8x8 -c
>>> ../stream/stream
>>>
>>> STREAM output:
>>>
>>> -------------------------------------------------------------
>>> Function    Best Rate MB/s  Avg time     Min time     Max time
>>> Copy:            1703.9     0.009390     0.009390     0.009390
>>> Scale:           1718.6     0.009310     0.009310     0.009310
>>> Add:             2087.3     0.011498     0.011498     0.011498
>>> Triad:           2227.2     0.010776     0.010776     0.010776
>>> -------------------------------------------------------------
>>>
>>> stats.txt (dram related):
>>>
>>> system.mem_ctrls.dram.bytesRead       238811712     # Total bytes read (Byte)
>>> system.mem_ctrls.dram.bytesWritten    121179840     # Total bytes written (Byte)
>>> system.mem_ctrls.dram.avgRdBW       1014.129912     # Average DRAM read bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW        514.598298     # Average DRAM write bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW           12800.00     # Theoretical peak bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil             11.94     # Data bus utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead          7.92     # Data bus utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite         4.02     # Data bus utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate         75.37     # Row buffer hit rate, read and write combined (Ratio)
>>>
>>> STREAM compiling options:
>>>
>>> gcc -O2 -static -DSTREAM_ARRAY_SIZE=1000000 -DNTIMES=2 stream.c -o stream
>>>
>>> All the experiments were performed on the latest stable
>>> version (141cc37c2d4b93959d4c249b8f7e6a8b2ef75338, v21.2.1).
>>>
>>> Thank you very much!
>>>
>>> Best Regards,
>>>
>>> Zicong
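One note on the compile line quoted above: with -DSTREAM_ARRAY_SIZE=1000000 and STREAM's default double-precision STREAM_TYPE (an assumption, since the quoted flags do not override it), the three arrays together occupy roughly 24 MB, so the working set comfortably exceeds the 8 MB L2 and the benchmark really is exercising DRAM rather than the caches. A rough back-of-the-envelope:

# Rough STREAM working-set estimate for the compile flags quoted above,
# assuming the default STREAM_TYPE of double (8 bytes per element).
array_size = 1_000_000          # -DSTREAM_ARRAY_SIZE=1000000
bytes_per_element = 8           # sizeof(double)
num_arrays = 3                  # a[], b[], c[]

footprint_mb = array_size * bytes_per_element * num_arrays / 1e6
print(f"STREAM working set ~ {footprint_mb:.0f} MB")   # ~24 MB vs. an 8 MB L2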