[gem5-dev] Change in gem5/gem5[develop]: configs, mem: Support running VIPER with GCN3

Kyle Roarty (Gerrit) via gem5-dev Thu, 11 Jun 2020 22:08:10 -0700

Hello Tony Gutierrez,

I'd like you to do a code review. Please visit


    https://gem5-review.googlesource.com/c/public/gem5/+/30274

to review the following change.


Change subject: configs, mem: Support running VIPER with GCN3
......................................................................

configs, mem: Support running VIPER with GCN3

This changeset adds the necessary changes for running
GCN3 ISA with VIPER in apu_se.py.

Changes to the VIPER protocol configs are made to add support
for DMA and scalar caches.

hsaTopology is added to help the pseudo FS create the files
needed by ROCm to understand the device on which the SW is
being run.

Change-Id: I0f47a6a36bb241a26972c0faafafcf332a7d7d1f
---
M configs/example/apu_se.py
A configs/example/hsaTopology.py
M configs/ruby/GPU_VIPER.py
M configs/ruby/Ruby.py
M src/mem/packet.cc
M src/mem/packet.hh
M src/mem/request.hh
M src/mem/ruby/protocol/GPU_VIPER-SQC.sm
8 files changed, 295 insertions(+), 78 deletions(-)



diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index fee85f0..bd59da8 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -32,7 +32,7 @@
 from __future__ import print_function
 from __future__ import absolute_import

-import optparse, os, re
+import optparse, os, re, getpass
 import math
 import glob
 import inspect
@@ -49,6 +49,9 @@
 from common import Simulation
 from common import GPUTLBOptions, GPUTLBConfig

+import hsaTopology
+from common import FileSystemConfig
+
 ########################## Script Options ########################
 def setOption(parser, opt_str, value = 1):
     # check to make sure the option actually exists
@@ -77,9 +80,7 @@
 parser.add_option("--cpu-only-mode", action="store_true", default=False,
                   help="APU mode. Used to take care of problems in "\
                        "Ruby.py while running APU protocols")
-parser.add_option("-k", "--kernel-files",

- help="file(s) containing GPU kernel code (colonseparated)")

-parser.add_option("-u", "--num-compute-units", type="int", default=1,
+parser.add_option("-u", "--num-compute-units", type="int", default=4,
                   help="number of GPU compute units"),
 parser.add_option("--num-cp", type="int", default=0,
                   help="Number of GPU Command Processors (CP)")
@@ -89,21 +90,34 @@
 # it an option/knob

parser.add_option("--cu-per-sqc", type="int", default=4, help="number ofCUs" \

                   "sharing an SQC (icache, and thus icache TLB)")
+parser.add_option('--cu-per-scalar-cache', type='int', default=4,
+                  help='Number of CUs sharing a scalar cache')

parser.add_option("--simds-per-cu", type="int", default=4, help="SIMDunits" \

                   "per CU")
+parser.add_option('--cu-per-sa', type='int', default=4,
+                  help='Number of CUs per shader array. This must be a '

+ 'multiple of options.cu-per-sqc andoptions.cu-per-scalar')

+parser.add_option('--sa-per-complex', type='int', default=1,
+                  help='Number of shader arrays per complex')
+parser.add_option('--num-gpu-complexes', type='int', default=1,
+                  help='Number of GPU complexes')
 parser.add_option("--wf-size", type="int", default=64,
                   help="Wavefront size(in workitems)")
 parser.add_option("--sp-bypass-path-length", type="int", default=4, \

- help="Number of stages of bypass path in vector ALU forSingle Precision ops")

+                  help="Number of stages of bypass path in vector ALU for "
+                  "Single Precision ops")
 parser.add_option("--dp-bypass-path-length", type="int", default=4, \

- help="Number of stages of bypass path in vector ALU forDouble Precision ops")

+                  help="Number of stages of bypass path in vector ALU for "
+                  "Double Precision ops")

# issue period per SIMD unit: number of cycles before issuing anothervector

 parser.add_option("--issue-period", type="int", default=4, \

help="Number of cycles per vector instruction issueperiod")

 parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \

- help="VGPR to Coalescer (Global Memory) data bus widthin bytes")

+                  help="VGPR to Coalescer (Global Memory) data bus width "
+                  "in bytes")
 parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \

- help="Coalescer to VGPR (Global Memory) data bus widthin bytes")+ help="Coalescer to VGPR (Global Memory) data bus widthin "

+                  "bytes")
 # Currently we only support 1 local memory pipe
 parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \
                   help="Number of Shared Memory pipelines per CU")
@@ -113,13 +127,27 @@

parser.add_option("--wfs-per-simd", type="int", default=10, help="Numberof " \

                   "WF slots per SIMD")

+parser.add_option("--registerManagerPolicy", type="string",default="static",

+                  help="Register manager policy")
 parser.add_option("--vreg-file-size", type="int", default=2048,
                   help="number of physical vector registers per SIMD")
+parser.add_option("--vreg-min-alloc", type="int", default=4,
+                  help="Minimum number of registers that can be allocated "
+                  "from the VRF. The total number of registers will be "
+                  "aligned to this value.")
+
+parser.add_option("--sreg-file-size", type="int", default=2048,
+                  help="number of physical vector registers per SIMD")
+parser.add_option("--sreg-min-alloc", type="int", default=4,
+                  help="Minimum number of registers that can be allocated "
+                  "from the SRF. The total number of registers will be "
+                  "aligned to this value.")
+
 parser.add_option("--bw-scalor", type="int", default=0,
                   help="bandwidth scalor for scalability analysis")
 parser.add_option("--CPUClock", type="string", default="2GHz",
                   help="CPU clock")
-parser.add_option("--GPUClock", type="string", default="1GHz",
+parser.add_option("--gpu-clock", type="string", default="1GHz",
                   help="GPU clock")
 parser.add_option("--cpu-voltage", action="store", type="string",
                   default='1.0V',
@@ -129,8 +157,6 @@
                   help = """CPU  voltage domain""")
 parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                   help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
-parser.add_option("--xact-cas-mode", action="store_true",
-                  help="enable load_compare mode (transactional CAS)")
 parser.add_option("--SegFaultDebug",action="store_true",
                  help="checks for GPU seg fault before TLB access")
 parser.add_option("--FunctionalTLB",action="store_true",
@@ -152,9 +178,8 @@
                   help = 'fast forward using kvm until the m5_switchcpu'
                   ' pseudo-op is encountered, then switch cpus. subsequent'
                   ' m5_switchcpu pseudo-ops will toggle back and forth')
-parser.add_option('--outOfOrderDataDelivery', action='store_true',
-                  default=False, help='enable OoO data delivery in the GM'
-                  ' pipeline')
+parser.add_option("--num-hw-queues", type="int", default=10,
+                  help="number of hw queues in packet processor")

 Ruby.define_options(parser)

@@ -195,12 +220,17 @@
 n_cu = options.num_compute_units
 num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
 options.num_sqc = num_sqc # pass this to Ruby

+num_scalar_cache = int(math.ceil(float(n_cu) /options.cu_per_scalar_cache))

+options.num_scalar_cache = num_scalar_cache
+
+print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
+      'Num CU = ', n_cu)

 ########################## Creating the GPU system ########################
 # shader is the GPU
 shader = Shader(n_wf = options.wfs_per_simd,
                 clk_domain = SrcClockDomain(
-                    clock = options.GPUClock,
+                    clock = options.gpu_clock,
                     voltage_domain = VoltageDomain(
                         voltage = options.gpu_voltage)))

@@ -227,9 +257,11 @@
 for i in range(n_cu):
     compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                      num_SIMDs = options.simds_per_cu,
-                                     wfSize = options.wf_size,

- spbypass_pipe_length =options.sp_bypass_path_length,- dpbypass_pipe_length =options.dp_bypass_path_length,

+                                     wf_size = options.wf_size,
+                                     spbypass_pipe_length = \
+                                     options.sp_bypass_path_length,
+                                     dpbypass_pipe_length = \
+                                     options.dp_bypass_path_length,
                                      issue_period = options.issue_period,
                                      coalescer_to_vrf_bus_width = \
                                      options.glbmem_rd_bus_width,
@@ -241,7 +273,6 @@
                                      options.shr_mem_pipes_per_cu,
                                      n_wf = options.wfs_per_simd,
                                      execPolicy = options.CUExecPolicy,
-                                     xactCasMode = options.xact_cas_mode,
                                      debugSegFault = options.SegFaultDebug,
                                      functionalTLB = options.FunctionalTLB,

localMemBarrier =options.LocalMemBarrier,

@@ -249,19 +280,38 @@
                                      localDataStore = \
                                      LdsState(banks = options.numLdsBanks,
                                               bankConflictPenalty = \

-options.ldsBankConflictPenalty),

-                                     out_of_order_data_delivery =

-options.outOfOrderDataDelivery))+options.ldsBankConflictPenalty)))

     wavefronts = []
     vrfs = []
-    for j in range(options.simds_per_cu):
-        for k in range(shader.n_wf):
+    vrf_pool_mgrs = []
+    srfs = []
+    srf_pool_mgrs = []
+    for j in xrange(options.simds_per_cu):
+        for k in xrange(shader.n_wf):
             wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
-                                        wfSize = options.wf_size))
-        vrfs.append(VectorRegisterFile(simd_id=j,
-                              num_regs_per_simd=options.vreg_file_size))
+                                        wf_size = options.wf_size))
+        vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
+                                               options.vreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))
+
+        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
+                                       num_regs=options.vreg_file_size))
+
+        srf_pool_mgrs.append(SimplePoolManager(pool_size = \
+                                               options.sreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))
+        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
+                                       num_regs=options.sreg_file_size))
+
     compute_units[-1].wavefronts = wavefronts
     compute_units[-1].vector_register_file = vrfs
+    compute_units[-1].scalar_register_file = srfs
+    compute_units[-1].register_manager = \
+        RegisterManager(policy=options.registerManagerPolicy,
+                        vrf_pool_managers=vrf_pool_mgrs,
+                        srf_pool_managers=srf_pool_mgrs)
     if options.TLB_prefetch:
         compute_units[-1].prefetch_depth = options.TLB_prefetch
         compute_units[-1].prefetch_prev_type = options.pf_type
@@ -348,14 +398,29 @@
     else:
         cpu_list.append(cpu)

-########################## Creating the GPU dispatcher########################

-# Dispatcher dispatches work from host CPU to GPU
 host_cpu = cpu_list[0]
-dispatcher = GpuDispatcher()

-########################## Create and assign the workload########################

-# Check for rel_path in elements of base_list using test, returning
-# the first full path that satisfies test
+hsapp_gpu_map_vaddr = 0x200000000
+hsapp_gpu_map_size = 0x1000
+hsapp_gpu_map_paddr = int(Addr(options.mem_size))
+
+# HSA kernel mode driver
+gpu_driver = GPUComputeDriver(filename="kfd")
+
+# Creating the GPU kernel launching components: that is the HSA
+# packet processor (HSAPP), GPU command processor (CP), and the
+# dispatcher.
+gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
+                               numHWQueues=options.num_hw_queues)
+dispatcher = GPUDispatcher()
+gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
+                                   dispatcher=dispatcher)
+gpu_driver.device = gpu_cmd_proc
+shader.dispatcher = dispatcher
+shader.gpu_cmd_proc = gpu_cmd_proc
+
+# Create and assign the workload Check for rel_path in elements of
+# base_list using test, returning the first full path that satisfies test
 def find_path(base_list, rel_path, test):
     for base in base_list:
         if not base:
@@ -370,31 +435,31 @@
     return find_path(base_list, rel_path, os.path.isfile)

 executable = find_path(benchmark_path, options.cmd, os.path.exists)
-# it's common for a benchmark to be in a directory with the same
+# It's common for a benchmark to be in a directory with the same
 # name as the executable, so we handle that automatically
 if os.path.isdir(executable):
     benchmark_path = [executable]
     executable = find_file(benchmark_path, options.cmd)
-if options.kernel_files:
-    kernel_files = [find_file(benchmark_path, f)
-                    for f in options.kernel_files.split(':')]
-else:
-    # if kernel_files is not set, see if there's a unique .asm file
-    # in the same directory as the executable
-    kernel_path = os.path.dirname(executable)
-    kernel_files = glob.glob(os.path.join(kernel_path, '*.asm'))
-    if kernel_files:
-        print("Using GPU kernel code file(s)", ",".join(kernel_files))
-    else:
-        fatal("Can't locate kernel code (.asm) in " + kernel_path)

-# OpenCL driver
-driver = ClDriver(filename="hsa", codefile=kernel_files)
+if options.env:
+    with open(options.env, 'r') as f:
+        env = [line.rstrip() for line in f]
+else:
+    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
+               "/proj/radl_tools/rocm-1.6/lib",
+               "/proj/radl_tools/rocm-1.6/hcc/lib64",
+               "/tool/pandora64/.package/libunwind-1.1/lib",
+               "/tool/pandora64/.package/gcc-6.4.0/lib64"
+           ]),
+           "HSA_ENABLE_INTERRUPT=0"]
+
+process = Process(executable = executable, cmd = [options.cmd]

+ + options.options.split(), drivers = [gpu_driver], env =env)

+
 for cpu in cpu_list:
     cpu.createThreads()
-    cpu.workload = Process(executable = executable,
-                           cmd = [options.cmd] + options.options.split(),
-                           drivers = [driver])
+    cpu.workload = process
+
 for cp in cp_list:
     cp.workload = host_cpu.workload

@@ -409,10 +474,8 @@
     switch_cpu_list = \

[(cpu_list[i], future_cpu_list[i]) for i inrange(options.num_cpus)]


-# Full list of processing cores in the system. Note that
-# dispatcher is also added to cpu_list although it is
-# not a processing element
-cpu_list = cpu_list + [shader] + cp_list + [dispatcher]
+# Full list of processing cores in the system.
+cpu_list = cpu_list + [shader] + cp_list

 # creating the overall system
 # notice the cpu list is explicitly added as a parameter to System
@@ -442,9 +505,17 @@
 # create Ruby system
 system.piobus = IOXBar(width=32, response_latency=0,
                        frontend_latency=0, forward_latency=0)
-Ruby.create_system(options, None, system)
+dma_list = [gpu_hsapp, gpu_cmd_proc]
+Ruby.create_system(options, None, system, None, dma_list, None)
 system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                     voltage_domain = system.voltage_domain)
+gpu_cmd_proc.pio = system.piobus.master
+#gpu_cmd_proc.dma = system.piobus.slave
+gpu_hsapp.pio = system.piobus.master
+#gpu_hsapp.dma = system.piobus.slave
+
+for i, dma_device in enumerate(dma_list):
+    exec('system.dma_cntrl%d.clk_domain = system.ruby.clk_domain' % i)

 # attach the CPU ports to Ruby
 for i in range(options.num_cpus):
@@ -474,7 +545,8 @@
 # per compute unit and one sequencer per SQC for the math to work out
 # correctly.
 gpu_port_idx = len(system.ruby._cpu_ports) \
-               - options.num_compute_units - options.num_sqc
+               - options.num_compute_units - options.num_sqc \
+               - options.num_scalar_cache
 gpu_port_idx = gpu_port_idx - options.num_cp * 2

 wavefront_size = options.wf_size
@@ -494,6 +566,14 @@
             system.ruby._cpu_ports[gpu_port_idx].slave
 gpu_port_idx = gpu_port_idx + 1

+for i in xrange(n_cu):
+    if i > 0 and not i % options.cu_per_scalar_cache:
+        print("incrementing idx on ", i)
+        gpu_port_idx += 1
+    system.cpu[shader_idx].CUs[i].scalar_port = \
+        system.ruby._cpu_ports[gpu_port_idx].slave
+gpu_port_idx = gpu_port_idx + 1
+
 # attach CP ports to Ruby
 for i in range(options.num_cp):
     system.cpu[cp_idx].createInterruptController()
@@ -506,11 +586,7 @@
     system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
     cp_idx = cp_idx + 1

-# connect dispatcher to the system.piobus
-dispatcher.pio = system.piobus.master
-dispatcher.dma = system.piobus.slave
-

-################# Connect the CPU and GPU via GPU Dispatcher###################+################# Connect the CPU and GPU via GPU Dispatcher##################

 # CPU rings the GPU doorbell to notify a pending task
 # using this interface.
 # And GPU uses this interface to notify the CPU of task completion
@@ -520,16 +596,28 @@
 # parameters must be after the explicit setting of the System cpu list
 if fast_forward:
     shader.cpu_pointer = future_cpu_list[0]
-    dispatcher.cpu = future_cpu_list[0]
 else:
     shader.cpu_pointer = host_cpu
-    dispatcher.cpu = host_cpu
-dispatcher.shader_pointer = shader
-dispatcher.cl_driver = driver

 ########################## Start simulation ########################

+chroot = os.path.expanduser(options.chroot)
+redirect_paths = [RedirectPath(src = "/proc",
+                               dests = ["%s/fs/proc" % m5.options.outdir]),
+                  RedirectPath(src = "/sys",
+                               dests = ["%s/fs/sys"  % m5.options.outdir]),
+                  RedirectPath(src = "/tmp",
+                               dests = ["%s/fs/tmp"  % m5.options.outdir]),
+                  RedirectPath(src = "/dev/shm",
+                               dests = ["/dev/shm/%s/gem5_%s"  %
+                                   (getpass.getuser(), os.getpid())])]
+
+system.redirect_paths = redirect_paths
+
 root = Root(system=system, full_system=False)
+
+hsaTopology.createHsaTopology(options)
+
 m5.ticks.setGlobalFrequency('1THz')
 if options.abs_max_tick:
     maxtick = options.abs_max_tick
@@ -577,4 +665,7 @@

 print("Ticks:", m5.curTick())
 print('Exiting because ', exit_event.getCause())
+
+FileSystemConfig.cleanup_filesystem(options)
+
 sys.exit(exit_event.getCode())
diff --git a/configs/example/hsaTopology.py b/configs/example/hsaTopology.py
new file mode 100644
index 0000000..e936c5d
--- /dev/null
+++ b/configs/example/hsaTopology.py
@@ -0,0 +1,74 @@
+import m5
+
+import operator
+from os import mkdir, makedirs, getpid, listdir, fsync
+from os.path import join as joinpath
+from os.path import isdir
+from shutil import rmtree, copyfile
+
+def file_append(path, contents):
+    with open(joinpath(*path), 'a') as f:
+        f.write(str(contents))
+        f.flush()
+        fsync(f.fileno())
+
+def remake_dir(path):
+    if isdir(path):
+        rmtree(path)
+    makedirs(path)
+
+def createHsaTopology(options):
+    topology_dir = joinpath(m5.options.outdir, \
+        'fs/sys/devices/virtual/kfd/kfd/topology')
+    remake_dir(topology_dir)
+
+    # Ripped from real Kaveri platform to appease kmt version checks
+    # Set up generation_id
+    file_append((topology_dir, 'generation_id'), 1)
+
+    # Set up system properties
+    sys_prop = 'platform_oem 2314885673410447169\n' + \
+               'platform_id 35322352389441\n'       + \
+               'platform_rev 1\n'
+    file_append((topology_dir, 'system_properties'), sys_prop)
+
+    # Populate the topology tree
+    # TODO: Just the bare minimum to pass for now
+    node_dir = joinpath(topology_dir, 'nodes/0')
+    remake_dir(node_dir)
+
+    # must show valid kaveri gpu id or massive meltdown
+    file_append((node_dir, 'gpu_id'), 2765)
+
+    # must have marketing name
+    file_append((node_dir, 'name'), 'Carrizo\n')
+
+    # populate global node properties
+    # NOTE: SIMD count triggers a valid GPU agent creation
+    # TODO: Really need to parse these from options
+    node_prop = 'cpu_cores_count %s\n' % options.num_cpus   + \
+                'simd_count 32\n'                           + \
+                'mem_banks_count 0\n'                       + \
+                'caches_count 0\n'                          + \
+                'io_links_count 0\n'                        + \
+                'cpu_core_id_base 16\n'                     + \
+                'simd_id_base 2147483648\n'                 + \
+                'max_waves_per_simd 40\n'                   + \
+                'lds_size_in_kb 64\n'                       + \
+                'gds_size_in_kb 0\n'                        + \
+                'wave_front_size 64\n'                      + \
+                'array_count 1\n'                           + \
+                'simd_arrays_per_engine 1\n'                + \
+                'cu_per_simd_array 10\n'                    + \
+                'simd_per_cu 4\n'                           + \
+                'max_slots_scratch_cu 32\n'                 + \
+                'vendor_id 4098\n'                          + \
+                'device_id 39028\n'                         + \
+                'location_id 8\n'                           + \
+                'max_engine_clk_fcompute 800\n'             + \
+                'local_mem_size 0\n'                        + \
+                'fw_version 699\n'                          + \
+                'capability 4738\n'                         + \
+                'max_engine_clk_ccompute 2100\n'
+
+    file_append((node_dir, 'properties'), node_prop)
diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py
index 58572ae..92dcf5e 100644
--- a/configs/ruby/GPU_VIPER.py
+++ b/configs/ruby/GPU_VIPER.py
@@ -476,6 +476,15 @@
         dir_cntrl.requestToMemory = MessageBuffer()
         dir_cntrl.responseFromMemory = MessageBuffer()

+        dir_cntrl.requestFromDMA = MessageBuffer(ordered=True)
+        dir_cntrl.requestFromDMA.slave = ruby_system.network.master
+
+        dir_cntrl.responseToDMA = MessageBuffer()
+        dir_cntrl.responseToDMA.master = ruby_system.network.slave
+
+        dir_cntrl.requestToMemory = MessageBuffer()
+        dir_cntrl.responseFromMemory = MessageBuffer()
+
         exec("ruby_system.dir_cntrl%d = dir_cntrl" % i)
         dir_cntrl_nodes.append(dir_cntrl)

@@ -640,7 +649,29 @@
         # SQC also in GPU cluster
         gpuCluster.add(sqc_cntrl)

-    for i in range(options.num_cp):
+    for i in xrange(options.num_scalar_cache):
+        scalar_cntrl = SQCCntrl(TCC_select_num_bits = TCC_bits)
+        scalar_cntrl.create(options, ruby_system, system)
+
+        exec('ruby_system.scalar_cntrl%d = scalar_cntrl' % i)
+
+        cpu_sequencers.append(scalar_cntrl.sequencer)
+
+        scalar_cntrl.requestFromSQC = MessageBuffer(ordered = True)
+        scalar_cntrl.requestFromSQC.master = ruby_system.network.slave
+
+        scalar_cntrl.probeToSQC = MessageBuffer(ordered = True)
+        scalar_cntrl.probeToSQC.slave = ruby_system.network.master
+
+        scalar_cntrl.responseToSQC = MessageBuffer(ordered = True)
+        scalar_cntrl.responseToSQC.slave = ruby_system.network.master
+
+        scalar_cntrl.mandatoryQueue = \
+            MessageBuffer(buffer_size=options.buffers_size)
+
+        gpuCluster.add(scalar_cntrl)
+
+    for i in xrange(options.num_cp):

         tcp_ID = options.num_compute_units + i
         sqc_ID = options.num_sqc + i
@@ -735,13 +766,27 @@
         # TCC cntrls added to the GPU cluster
         gpuCluster.add(tcc_cntrl)

-    # Assuming no DMA devices
-    assert(len(dma_devices) == 0)
+    for i, dma_device in enumerate(dma_devices):
+        dma_seq = DMASequencer(version=i, ruby_system=ruby_system)
+        dma_cntrl = DMA_Controller(version=i, dma_sequencer=dma_seq,
+                                   ruby_system=ruby_system)
+        exec('system.dma_cntrl%d = dma_cntrl' % i)
+        if dma_device.type == 'MemTest':

+ exec('system.dma_cntrl%d.dma_sequencer.slave =dma_devices.test'

+                 % i)
+        else:

+ exec('system.dma_cntrl%d.dma_sequencer.slave =dma_device.dma' % i)

+        dma_cntrl.requestToDir = MessageBuffer(buffer_size=0)
+        dma_cntrl.requestToDir.master = ruby_system.network.slave
+        dma_cntrl.responseFromDir = MessageBuffer(buffer_size=0)
+        dma_cntrl.responseFromDir.slave = ruby_system.network.master
+        dma_cntrl.mandatoryQueue = MessageBuffer(buffer_size = 0)
+        gpuCluster.add(dma_cntrl)

     # Add cpu/gpu clusters to main cluster
     mainCluster.add(cpuCluster)
     mainCluster.add(gpuCluster)

-    ruby_system.network.number_of_virtual_networks = 10
+    ruby_system.network.number_of_virtual_networks = 11

     return (cpu_sequencers, dir_cntrl_nodes, mainCluster)
diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index 9bceaa3..6587a85 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -188,7 +188,7 @@
     try:
         (cpu_sequencers, dir_cntrls, topology) = \

eval("%s.create_system(options, full_system, system,dma_ports,\

-                                    bootmem, ruby)"
+                                    ruby)"
                   % protocol)
     except:

print("Error: could not create sytem for ruby protocol %s" %protocol)

diff --git a/src/mem/packet.cc b/src/mem/packet.cc
index f29f091..2a0f569 100644
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -179,8 +179,15 @@
     /* SwapResp -- for Swap ldstub type operations */
     { SET4(IsRead, IsWrite, IsResponse, HasData),
             InvalidCmd, "SwapResp" },
-    { 0, InvalidCmd, "Deprecated_MessageReq" },
-    { 0, InvalidCmd, "Deprecated_MessageResp" },
+    /* IntReq -- for interrupts */
+    { SET4(IsWrite, IsRequest, NeedsResponse, HasData),
+        MessageResp, "MessageReq" },
+    /* IntResp -- for interrupts */
+    { SET2(IsWrite, IsResponse), InvalidCmd, "MessageResp" },
+    /* MemSyncReq -- for synchronization requests */
+    {SET2(IsRequest, NeedsResponse), MemSyncResp, "MemSyncReq"},
+    /* MemSyncResp -- for synchronization responses */
+    {SET1(IsResponse), InvalidCmd, "MemSyncResp"},
     /* MemFenceReq -- for synchronization requests */
     {SET2(IsRequest, NeedsResponse), MemFenceResp, "MemFenceReq"},
     /* MemSyncReq */
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index 4af0d0b..01b79a7 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -109,10 +109,11 @@
         StoreCondResp,
         SwapReq,
         SwapResp,
-        // MessageReq and MessageResp are deprecated.
-        MemFenceReq = SwapResp + 3,
+        MessageReq,
+        MessageResp,

MemSyncReq, // memory synchronization request (e.g., cacheinvalidate)

         MemSyncResp, // memory synchronization response
+        MemFenceReq,
         MemFenceResp,
         CleanSharedReq,
         CleanSharedResp,
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 4e0ba97..7a23cf9 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -808,7 +808,6 @@
     bool isCondSwap() const { return _flags.isSet(MEM_SWAP_COND); }
     bool isSecure() const { return _flags.isSet(SECURE); }
     bool isPTWalk() const { return _flags.isSet(PT_WALK); }
-    bool isAcquire() const { return _flags.isSet(ACQUIRE); }
     bool isRelease() const { return _flags.isSet(RELEASE); }
     bool isKernel() const { return _flags.isSet(KERNEL); }
     bool isAtomicReturn() const { return _flags.isSet(ATOMIC_RETURN_OP); }
@@ -832,6 +831,8 @@
     bool isToPOC() const { return _flags.isSet(DST_POC); }
     Flags getDest() const { return _flags & DST_BITS; }

+    bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); }
+
     /**

* Accessor functions for the memory space configuration flags andused by* GPU ISAs such as the Heterogeneous System Architecture (HSA). Notethatdiff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.smb/src/mem/ruby/protocol/GPU_VIPER-SQC.sm

index 1885a68..84e86dc 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm
@@ -239,8 +239,6 @@
       peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
         Entry cache_entry := getCacheEntry(in_msg.LineAddress);
         TBE tbe := TBEs.lookup(in_msg.LineAddress);
-
-        assert(in_msg.Type == RubyRequestType:IFETCH);
         trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
       }
     }

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/30274

To unsubscribe, or for help writing mail filters, visithttps://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I0f47a6a36bb241a26972c0faafafcf332a7d7d1f
Gerrit-Change-Number: 30274
Gerrit-PatchSet: 1
Gerrit-Owner: Kyle Roarty <[email protected]>
Gerrit-Reviewer: Tony Gutierrez <[email protected]>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: configs, mem: Support running VIPER with GCN3

Reply via email to