[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Adding separate class for dynamic-reg-alloc...

GAURAV JAIN (Gerrit) via gem5-dev Fri, 31 Jul 2020 00:48:14 -0700

GAURAV JAIN has uploaded this change for review. (https://gem5-review.googlesource.com/c/public/gem5/+/32034 )

Change subject: gpu-compute: Adding separate class for dynamic-reg-allocation

......................................................................

gpu-compute: Adding separate class for
             dynamic-reg-allocation

SimplePoolManager doesn't allow mapping of two WGs
simultaneously on the same Compute Unit (provided
the previous WG has been mapped to all the SIMDs)
even if there is sufficient VRF and SRF space
available.

DynPoolManager takes care of that by dynamically
allocating and deallocating register file space
to wavefronts

Change-Id: I2255c68d4b421615d7b231edc05d3ebb27cbd66c
---
M configs/example/apu_se.py
M src/gpu-compute/GPU.py
M src/gpu-compute/SConscript
M src/gpu-compute/compute_unit.cc
M src/gpu-compute/compute_unit.hh
A src/gpu-compute/dyn_pool_manager.cc
A src/gpu-compute/dyn_pool_manager.hh
M src/gpu-compute/pool_manager.hh
M src/gpu-compute/shader.cc
M src/gpu-compute/static_register_manager_policy.cc
10 files changed, 283 insertions(+), 16 deletions(-)



diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 82e4022..2993143 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -180,6 +180,8 @@
                   ' m5_switchcpu pseudo-ops will toggle back and forth')
 parser.add_option("--num-hw-queues", type="int", default=10,
                   help="number of hw queues in packet processor")
+parser.add_option("--reg-alloc-policy",type="string", default="simple",
+                  help="register allocation policy (simple/dynamic)")

 Ruby.define_options(parser)

@@ -300,18 +302,28 @@
         for k in xrange(shader.n_wf):
             wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                         wf_size = options.wf_size))
-        vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
+
+        if options.reg_alloc_policy == "simple":
+            vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                                options.vreg_file_size,
                                                min_alloc = \
                                                options.vreg_min_alloc))
+            srf_pool_mgrs.append(SimplePoolManager(pool_size = \
+                                               options.sreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))
+        elif options.reg_alloc_policy == "dynamic":
+            vrf_pool_mgrs.append(DynPoolManager(pool_size = \
+                                               options.vreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))
+            srf_pool_mgrs.append(DynPoolManager(pool_size = \
+                                               options.sreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))

         vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                        num_regs=options.vreg_file_size))
-
-        srf_pool_mgrs.append(SimplePoolManager(pool_size = \
-                                               options.sreg_file_size,
-                                               min_alloc = \
-                                               options.vreg_min_alloc))
         srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                        num_regs=options.sreg_file_size))

diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 7408bf9..5d2e6c5 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -30,6 +30,7 @@
 # POSSIBILITY OF SUCH DAMAGE.
 #
 # Authors: Steve Reinhardt
+#          Gaurav Jain

 from m5.defines import buildEnv
 from m5.params import *
@@ -67,6 +68,12 @@
     cxx_class = 'SimplePoolManager'
     cxx_header = "gpu-compute/simple_pool_manager.hh"

+# Pool manager that allocates and frees register-file space dynamically,
+# so that more than one workgroup can be mapped to a CU at the same time.
+class DynPoolManager(PoolManager):
+    type = 'DynPoolManager'
+    cxx_class = 'DynPoolManager'
+    cxx_header = "gpu-compute/dyn_pool_manager.hh"
+
 class RegisterFile(SimObject):
     type = 'RegisterFile'
     cxx_class = 'RegisterFile'
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript
index 0f1afbc..f242818 100644
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -65,6 +65,7 @@
 Source('scheduler.cc')
 Source('scoreboard_check_stage.cc')
 Source('shader.cc')
+Source('dyn_pool_manager.cc')
 Source('simple_pool_manager.cc')
 Source('static_register_manager_policy.cc')
 Source('tlb_coalescer.cc')
@@ -83,6 +84,7 @@
 DebugFlag('GPUPort')
 DebugFlag('GPUPrefetch')
 DebugFlag('GPUReg')
+DebugFlag('GPURegAlloc')
 DebugFlag('GPURename')
 DebugFlag('GPURF')
 DebugFlag('GPURfState')
@@ -100,4 +102,4 @@
 CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
                         'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
                         'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
-                        'GPUInitAbi'])
+                        'GPUInitAbi', 'GPURegAlloc'])

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index 067c254..d7b8059 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -31,8 +31,6 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */

-#include "gpu-compute/compute_unit.hh"
-
 #include <limits>

 #include "base/output.hh"
@@ -46,6 +44,7 @@
 #include "debug/GPURename.hh"
 #include "debug/GPUSync.hh"
 #include "debug/GPUTLB.hh"
+#include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/dispatcher.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
 #include "gpu-compute/gpu_static_inst.hh"
@@ -395,6 +394,19 @@
     injectGlobalMemFence(gpuDynInst, true);
 }

+// Reset the vector and scalar register pool managers of every SIMD unit
+// in this CU, passing each one the per-SIMD register count. Pool managers
+// that keep the default PoolManager::resetRegion() ignore the call.
+void
+ComputeUnit::resetRegisterPool()
+{
+    for (int i = 0; i < numVectorALUs; i++) {
+        // registerManager is dereferenced with '->', matching the existing
+        // accesses in static_register_manager_policy.cc
+        registerManager->vrfPoolMgrs[i]->resetRegion(numVecRegsPerSimd);
+        registerManager->srfPoolMgrs[i]->resetRegion(numScalarRegsPerSimd);
+    }
+}
+
 void
 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
 {

diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh

index 22960c0..2f21188 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -419,6 +419,8 @@
     int cacheLineSize() const { return _cacheLineSize; }
     int getCacheLineBits() const { return cacheLineBits; }

+    void resetRegisterPool();
+
   private:
     WFBarrier&
     barrierSlot(int bar_id)

diff --git a/src/gpu-compute/dyn_pool_manager.cc b/src/gpu-compute/dyn_pool_manager.cc

new file mode 100644
index 0000000..35fb6aa
--- /dev/null
+++ b/src/gpu-compute/dyn_pool_manager.cc
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gaurav Jain
+ */
+
+#include "base/logging.hh"
+#include "debug/GPURegAlloc.hh"
+#include "gpu-compute/dyn_pool_manager.hh"
+
+// Factory method of the DynPoolManager params object: returns a newly
+// allocated DynPoolManager constructed from these params.
+DynPoolManager *
+DynPoolManagerParams::create()
+{
+    return new DynPoolManager(this);
+}
+
+// Return the number of elements the manager actually reserves for a
+// request of "size" elements: size rounded up to the next multiple of
+// minAllocation(). Aborts through fatal_if() when size is zero or larger
+// than poolSize().
+uint32_t
+DynPoolManager::minAllocatedElements(uint32_t size)
+{
+    // size is unsigned, so zero is the only illegal value at the low end
+    fatal_if(size == 0 || size > poolSize(),
+             "Illegal VGPR region size=%d\n", size);
+
+    const uint32_t remainder = size % minAllocation();
+    return remainder > 0 ? size + (minAllocation() - remainder) : size;
+}
+
+// Textual description of the pool. The dynamic manager has no real
+// implementation yet and always returns a fixed placeholder line.
+std::string
+DynPoolManager::printRegion()
+{
+    return std::string("Placeholder\n");
+}
+
+
+// Drop all bookkeeping and start again with a single free chunk that
+// begins at index 0 and spans regsPerSimd registers.
+void
+DynPoolManager::resetRegion(const int & regsPerSimd)
+{
+    freeSpaceRecord.clear();
+    reservedSpaceRecord.clear();
+
+    // everything is free again, as one contiguous chunk
+    freeSpaceRecord.emplace_back(0, regsPerSimd);
+    _totRegSpaceAvailable = regsPerSimd;
+}
+
+// Report whether numRegions regions of "size" elements each (after
+// rounding by minAllocatedElements()) can be carved out of the current
+// free chunks.
+//
+// allocateRegion() needs one contiguous chunk per region, so comparing the
+// request against the total free count is not enough: the free space may
+// be split into chunks that are each too small. Count, chunk by chunk, how
+// many whole regions fit.
+bool
+DynPoolManager::canAllocate(uint32_t numRegions, uint32_t size)
+{
+    uint32_t actualSize = minAllocatedElements(size);
+    DPRINTF(GPURegAlloc,"Can Allocate %d\n",actualSize);
+
+    uint32_t regionsAvailable = 0;
+    for (const auto &chunk : freeSpaceRecord) {
+        // chunk.first is the start index, chunk.second the chunk length
+        regionsAvailable += static_cast<uint32_t>(chunk.second) / actualSize;
+        if (regionsAvailable >= numRegions) {
+            return true;
+        }
+    }
+    return regionsAvailable >= numRegions;
+}
+
+// First-fit allocation of a region of "size" elements (rounded up by
+// minAllocatedElements()).
+//
+// On success the start index of the region is returned, the number of
+// elements actually reserved is written to *reservedPoolSize, and the
+// chosen free chunk is shrunk (or removed when it is used up entirely).
+// When no free chunk is large enough, (unsigned)-1 is returned and
+// *reservedPoolSize is left untouched.
+uint32_t
+DynPoolManager::allocateRegion(const uint32_t size,
+                                    uint32_t *reservedPoolSize)
+{
+    uint32_t startIdx = (unsigned)-1;
+    uint32_t actualSize = minAllocatedElements(size);
+    for (auto it = freeSpaceRecord.begin(); it != freeSpaceRecord.end();
+         ++it) {
+        if (it->second < actualSize) {
+            // chunk too small: move on to the next one. Without this
+            // advance the search would spin forever on the first chunk.
+            continue;
+        }
+
+        // assign the next block starting from here
+        startIdx = it->first;
+        _regionSize = actualSize;
+        *reservedPoolSize = actualSize;
+        _totRegSpaceAvailable -= actualSize;
+
+        // A chunk that is exactly as large as the request cannot
+        // contribute to future requests and is removed; otherwise the
+        // chunk is shrunk from the front.
+        if (it->second == actualSize) {
+            freeSpaceRecord.erase(it);
+        } else {
+            it->first += actualSize;
+            it->second -= actualSize;
+        }
+        break;
+    }
+    DPRINTF(GPURegAlloc,"totRegSpace %d allocating Register at %d and"
+            " size %d\n",_totRegSpaceAvailable,startIdx,actualSize);
+    return startIdx;
+}
+
+// Return the registers [firstIdx, lastIdx) to the free list; lastIdx is
+// one past the last register of the region, so lastIdx - firstIdx is the
+// region size.
+//
+// The free list is kept ordered by start index and the returned chunk is
+// merged with the chunks directly before and after it. Without merging,
+// the list only ever splits, and a pool that is entirely free could still
+// be unable to satisfy a large request.
+void
+DynPoolManager::freeRegion(uint32_t firstIdx,
+                                uint32_t lastIdx)
+{
+    // lastIdx-firstIdx should give the size of free space
+    DPRINTF(GPURegAlloc,"freeing Region at %d %d, size %d\n",
+                                    firstIdx,lastIdx,lastIdx-firstIdx);
+
+    // Current dynamic register allocation does not handle wraparound
+    assert(firstIdx < lastIdx);
+    const int start = firstIdx;
+    const int length = lastIdx - firstIdx;
+    _totRegSpaceAvailable += length;
+
+    // find the first free chunk that starts at or after the freed region
+    auto next = freeSpaceRecord.begin();
+    while (next != freeSpaceRecord.end() && next->first < start) {
+        ++next;
+    }
+    auto cur = freeSpaceRecord.insert(next, std::make_pair(start, length));
+
+    // merge with the following chunk when the two are adjacent
+    if (next != freeSpaceRecord.end() &&
+        cur->first + cur->second == next->first) {
+        cur->second += next->second;
+        freeSpaceRecord.erase(next);
+    }
+
+    // merge with the preceding chunk when the two are adjacent
+    if (cur != freeSpaceRecord.begin()) {
+        auto prev = cur;
+        --prev;
+        if (prev->first + prev->second == cur->first) {
+            prev->second += cur->second;
+            freeSpaceRecord.erase(cur);
+        }
+    }
+}
+
+// Number of elements in the region whose first and last indices are
+// region.first and region.second (both inclusive). A region whose first
+// index is larger than its last index is treated as wrapping around the
+// end of the pool.
+uint32_t
+DynPoolManager::regionSize(std::pair<uint32_t, uint32_t> &region)
+{
+    if (region.first > region.second) {
+        // wrapped region: add the pool size to undo the wraparound
+        return region.second + poolSize() - region.first + 1;
+    }
+    return region.second - region.first + 1;
+}
+

diff --git a/src/gpu-compute/dyn_pool_manager.hh b/src/gpu-compute/dyn_pool_manager.hh

new file mode 100644
index 0000000..510a2a1
--- /dev/null
+++ b/src/gpu-compute/dyn_pool_manager.hh
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Gaurav Jain
+ */
+
+#ifndef __DYN_POOL_MANAGER_HH__
+#define __DYN_POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <list>
+#include <string>
+#include <utility>
+
+#include "gpu-compute/pool_manager.hh"
+#include "params/DynPoolManager.hh"
+
+// Dynamic Pool Manager: allows multiple WGs on the same pool.
+//
+// Free space is tracked as a list of (start index, length) chunks.
+// allocateRegion() takes a region from the first chunk that is large
+// enough and freeRegion() gives a region back to the list.
+class DynPoolManager : public PoolManager
+{
+  public:
+    DynPoolManager(const PoolManagerParams *p)
+        : PoolManager(p), _regionSize(0), _nxtFreeIdx(0),
+          _totRegSpaceAvailable(p->pool_size)
+    {
+        // Start with the whole pool as one free chunk so that the free
+        // list agrees with _totRegSpaceAvailable; otherwise nothing can
+        // be allocated until resetRegion() has been called once.
+        freeSpaceRecord.emplace_back(0, p->pool_size);
+    }
+
+    // number of elements actually reserved for a request of "size"
+    uint32_t minAllocatedElements(uint32_t size);
+    std::string printRegion();
+    bool canAllocate(uint32_t numRegions, uint32_t size);
+    // returns the start index of the new region, or (unsigned)-1 when no
+    // free chunk is large enough
+    uint32_t allocateRegion(const uint32_t size,
+                            uint32_t *reservedPoolSize);
+    // frees [firstIdx, lastIdx); lastIdx is exclusive
+    void freeRegion(uint32_t firstIdx, uint32_t lastIdx);
+    uint32_t regionSize(std::pair<uint32_t,uint32_t> &region);
+
+    // register allocation
+    // NOTE(review): no definition of getNextFreeBlock() is visible in
+    // this change - confirm that it is needed.
+    int getNextFreeBlock(uint32_t size);
+    // forget all allocations; the pool becomes one free chunk of
+    // regsPerSimd registers
+    void resetRegion(const int & regsPerSimd);
+
+  private:
+    // actual size of a region (normalized to the minimum size that can
+    // be reserved)
+    uint32_t _regionSize;
+    // next index to allocate a region
+    int _nxtFreeIdx;
+    // total registers available - across chunks
+    uint32_t _totRegSpaceAvailable;
+    // free chunks, each stored as (start index, length)
+    std::list<std::pair<int,int>> freeSpaceRecord;
+    std::list<std::pair<int,int>> reservedSpaceRecord;
+};
+
+#endif // __DYN_POOL_MANAGER_HH__

diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh

index 9bbaa64..3c2034a 100644
--- a/src/gpu-compute/pool_manager.hh
+++ b/src/gpu-compute/pool_manager.hh
@@ -57,6 +57,14 @@

     virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
     uint32_t poolSize() { return _poolSize; }
+    // I don't think with the current API it is possible to do what
+    // we intend to - reset the entire register pool.
+    // Because we need to reset the register pool when all WGs on
+    // the Compute Unit are finished - before launching WGs from
+    // another kernel.
+    // TsungTai Yeh added a virtual method do the very same - at a diff
+    // place though.
+    virtual void resetRegion(const int & regsPerSimd) {}; // do nothing

   private:
     // minimum size that can be reserved per allocation
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index 1d88e85..8b294d4 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -211,6 +211,9 @@
         _dispatcher.updateInvCounter(kernId, +1);
         // all necessary INV flags are all set now, call cu to execute
         cuList[i_cu]->doInvalidate(req, task->dispatchId());
+
+        // I don't like this. This is intrusive coding.
+        cuList[i_cu]->resetRegisterPool();
     }
 }

diff --git a/src/gpu-compute/static_register_manager_policy.cc b/src/gpu-compute/static_register_manager_policy.cc

index 85f530b..e91be6c 100644
--- a/src/gpu-compute/static_register_manager_policy.cc
+++ b/src/gpu-compute/static_register_manager_policy.cc
@@ -53,6 +53,7 @@
 {
 }

+// restoring old definitions
 int
 StaticRegisterManagerPolicy::mapVgpr(Wavefront* w, int vgprIndex)
 {
@@ -152,13 +153,13 @@
              w->simdId,
              w->computeUnit->scalarRegsReserved[w->simdId]);

-    int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) %
-        w->computeUnit->vrf[w->simdId]->numRegs();
+    // Current dynamic register allocation does not handle wraparound
+    int endIndex = w->startVgprIndex + w->reservedVectorRegs;

     w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]->
         freeRegion(w->startVgprIndex, endIndex);

-    // mark/pre-mark all registers as not busy
+    // mark/pre-mark all registers are not busy
     for (int i = 0; i < w->reservedVectorRegs; i++) {
         uint32_t physVgprIdx = mapVgpr(w, i);
         w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false);
@@ -167,12 +168,11 @@
     w->reservedVectorRegs = 0;
     w->startVgprIndex = 0;

-    endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) %
-        w->computeUnit->srf[w->simdId]->numRegs();
-    w->computeUnit->registerManager->srfPoolMgrs[w->simdId]->
+    endIndex = w->startSgprIndex + w->reservedScalarRegs;
+    w->computeUnit->registerManager.srfPoolMgrs[w->simdId]->
         freeRegion(w->startSgprIndex, endIndex);

-    // mark/pre-mark all registers as not busy
+    // mark/pre-mark all registers are not busy
     for (int i = 0; i < w->reservedScalarRegs; i++) {
         uint32_t physSgprIdx = mapSgpr(w, i);
         w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false);

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/32034

To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I2255c68d4b421615d7b231edc05d3ebb27cbd66c
Gerrit-Change-Number: 32034
Gerrit-PatchSet: 1
Gerrit-Owner: GAURAV JAIN <[email protected]>
Gerrit-MessageType: newchange

_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: gpu-compute: Adding separate class for dynamic-reg-alloc...

Reply via email to