This is an automated email from the ASF dual-hosted git repository.
wilfreds pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/yunikorn-core.git
The following commit(s) were added to refs/heads/master by this push:
new 908d1cb0 [YUNIKORN-2030] headroom check for reserved allocations (#793)
908d1cb0 is described below
commit 908d1cb07083b989230a9a18bb0b66180122fb73
Author: Yongjun Zhang <[email protected]>
AuthorDate: Mon Feb 26 20:12:25 2024 +1100
[YUNIKORN-2030] headroom check for reserved allocations (#793)
During the allocation cycle for reservations we fall back to trying all
reservations on all nodes for an application. During this fallback a
headroom check was missing which caused queue update failures to be
logged.
The allocation did not succeed but it caused scheduling delays and log
spew.
Closes: #793
Signed-off-by: Wilfred Spiegelenburg <[email protected]>
---
pkg/scheduler/objects/application.go | 23 ++++++++++++---------
pkg/scheduler/objects/application_test.go | 33 +++++++++++++++++++++++++++++++
pkg/scheduler/objects/utilities_test.go | 1 +
3 files changed, 48 insertions(+), 9 deletions(-)
diff --git a/pkg/scheduler/objects/application.go
b/pkg/scheduler/objects/application.go
index cde6fbd9..a064d321 100644
--- a/pkg/scheduler/objects/application.go
+++ b/pkg/scheduler/objects/application.go
@@ -1223,6 +1223,12 @@ func (sa *Application)
tryPlaceholderAllocate(nodeIterator func() NodeIterator,
return allocResult
}
+// check ask against both user headRoom and queue headRoom
+func (sa *Application) checkHeadRooms(ask *AllocationAsk, userHeadroom
*resources.Resource, headRoom *resources.Resource) bool {
+ // check if this fits in the users' headroom first, if that fits check
the queues' headroom
+ return userHeadroom.FitInMaxUndef(ask.GetAllocatedResource()) &&
headRoom.FitInMaxUndef(ask.GetAllocatedResource())
+}
+
// Try a reserved allocation of an outstanding reservation
func (sa *Application) tryReservedAllocate(headRoom *resources.Resource,
nodeIterator func() NodeIterator) *Allocation {
sa.Lock()
@@ -1250,13 +1256,8 @@ func (sa *Application) tryReservedAllocate(headRoom
*resources.Resource, nodeIte
alloc := newUnreservedAllocation(reserve.nodeID,
unreserveAsk)
return alloc
}
- // check if this fits in the users' headroom first, if that
fits check the queues' headroom
- if !userHeadroom.FitInMaxUndef(ask.GetAllocatedResource()) {
- continue
- }
- // check if this fits in the queue's headroom
- if !headRoom.FitInMaxUndef(ask.GetAllocatedResource()) {
+ if !sa.checkHeadRooms(ask, userHeadroom, headRoom) {
continue
}
@@ -1280,12 +1281,16 @@ func (sa *Application) tryReservedAllocate(headRoom
*resources.Resource, nodeIte
// lets try this on all other nodes
for _, reserve := range sa.reservations {
// Other nodes cannot be tried if the ask has a required node
- if reserve.ask.GetRequiredNode() != "" {
+ ask := reserve.ask
+ if ask.GetRequiredNode() != "" {
continue
}
iterator := nodeIterator()
if iterator != nil {
- alloc := sa.tryNodesNoReserve(reserve.ask, iterator,
reserve.nodeID)
+ if !sa.checkHeadRooms(ask, userHeadroom, headRoom) {
+ continue
+ }
+ alloc := sa.tryNodesNoReserve(ask, iterator,
reserve.nodeID)
// have a candidate return it, including the node that
was reserved
if alloc != nil {
return alloc
@@ -1490,7 +1495,7 @@ func (sa *Application) tryNode(node *Node, ask
*AllocationAsk) *Allocation {
alloc := NewAllocation(node.NodeID, ask)
if node.AddAllocation(alloc) {
if err :=
sa.queue.IncAllocatedResource(alloc.GetAllocatedResource(), false); err != nil {
- log.Log(log.SchedApplication).Warn("queue update failed
unexpectedly",
+ log.Log(log.SchedApplication).DPanic("queue update
failed unexpectedly",
zap.Error(err))
// revert the node update
node.RemoveAllocation(alloc.GetAllocationID())
diff --git a/pkg/scheduler/objects/application_test.go
b/pkg/scheduler/objects/application_test.go
index bc055ee6..04439672 100644
--- a/pkg/scheduler/objects/application_test.go
+++ b/pkg/scheduler/objects/application_test.go
@@ -2601,6 +2601,39 @@ func TestGetRateLimitedAppLog(t *testing.T) {
assert.Check(t, l != nil)
}
+func TestTryAllocateWithReservedHeadRoomChecking(t *testing.T) {
+ defer func() {
+ if r := recover(); r != nil {
+ t.Fatalf("reserved headroom test regression: %v", r)
+ }
+ }()
+
+ res, err := resources.NewResourceFromConf(map[string]string{"memory":
"2G"})
+ assert.NilError(t, err, "failed to create basic resource")
+ var headRoom *resources.Resource
+ headRoom, err =
resources.NewResourceFromConf(map[string]string{"memory": "1G"})
+ assert.NilError(t, err, "failed to create basic resource")
+
+ app := newApplication(appID1, "default", "root")
+ ask := newAllocationAsk(aKey, appID1, res)
+ var queue *Queue
+ queue, err = createRootQueue(map[string]string{"memory": "1G"})
+ assert.NilError(t, err, "queue create failed")
+ app.queue = queue
+ err = app.AddAllocationAsk(ask)
+ assert.NilError(t, err, "ask should have been added to app")
+
+ node1 := newNodeRes(nodeID1, res)
+ node2 := newNodeRes(nodeID2, res)
+ // reserve that works
+ err = app.Reserve(node1, ask)
+ assert.NilError(t, err, "reservation should not have failed")
+
+ iter := getNodeIteratorFn(node1, node2)
+ alloc := app.tryReservedAllocate(headRoom, iter)
+ assert.Assert(t, alloc == nil, "Alloc is expected to be nil due to
insufficient headroom")
+}
+
func TestUpdateRunnableStatus(t *testing.T) {
app := newApplication(appID0, "default", "root.unknown")
assert.Assert(t, app.runnableInQueue)
diff --git a/pkg/scheduler/objects/utilities_test.go
b/pkg/scheduler/objects/utilities_test.go
index e66e4e98..577e26ff 100644
--- a/pkg/scheduler/objects/utilities_test.go
+++ b/pkg/scheduler/objects/utilities_test.go
@@ -42,6 +42,7 @@ const (
aKey = "alloc-1"
aAllocationID = "alloc-allocationid-1"
nodeID1 = "node-1"
+ nodeID2 = "node-2"
instType1 = "itype-1"
)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]