[
https://issues.apache.org/jira/browse/YUNIKORN-1227?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Craig Condit resolved YUNIKORN-1227.
------------------------------------
Fix Version/s: 1.1.0
Resolution: Fixed
Merged into master.
> Race condition in Predicates code on Node / Pod
> -----------------------------------------------
>
> Key: YUNIKORN-1227
> URL: https://issues.apache.org/jira/browse/YUNIKORN-1227
> Project: Apache YuniKorn
> Issue Type: Bug
> Components: shim - kubernetes
> Reporter: Craig Condit
> Assignee: Craig Condit
> Priority: Major
> Labels: pull-request-available
> Fix For: 1.1.0
>
>
> A data race was recently uncovered during testing:
> {noformat}
> ==================
> WARNING: DATA RACE
> Write at 0x00c000171688 by goroutine 168:
> k8s.io/kubernetes/pkg/scheduler/framework.(*NodeInfo).RemovePod()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/scheduler/framework/types.go:560
> +0x7ec
>
> github.com/apache/yunikorn-k8shim/pkg/cache/external.(*SchedulerCache).updatePod()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/external/scheduler_cache.go:258
> +0x1fe
>
> github.com/apache/yunikorn-k8shim/pkg/cache/external.(*SchedulerCache).UpdatePod()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/external/scheduler_cache.go:244
> +0x11d
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updatePodInCache()
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/context.go:263
> +0x94a
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).updatePodInCache-fm()
> <autogenerated>:1 +0x6d
> k8s.io/client-go/tools/cache.ResourceEventHandlerFuncs.OnUpdate()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/controller.go:238
> +0x8b
> k8s.io/client-go/tools/cache.(*ResourceEventHandlerFuncs).OnUpdate()
> <autogenerated>:1 +0x29
> k8s.io/client-go/tools/cache.FilteringResourceEventHandler.OnUpdate()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/controller.go:273
> +0xf5
> k8s.io/client-go/tools/cache.(*FilteringResourceEventHandler).OnUpdate()
> <autogenerated>:1 +0x8d
> k8s.io/client-go/tools/cache.(*processorListener).run.func1()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/shared_informer.go:775
> +0x2b7
> k8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:155
> +0x48
> k8s.io/apimachinery/pkg/util/wait.BackoffUntil()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:156
> +0xce
> k8s.io/apimachinery/pkg/util/wait.JitterUntil()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:133
> +0x104
> k8s.io/apimachinery/pkg/util/wait.Until()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:90
> +0x78
> k8s.io/client-go/tools/cache.(*processorListener).run()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/shared_informer.go:771
> +0x18
> k8s.io/client-go/tools/cache.(*processorListener).run-fm()
> <autogenerated>:1 +0x39
> k8s.io/apimachinery/pkg/util/wait.(*Group).Start.func1()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:73
> +0x73
>
> Previous read at 0x00c000171688 by goroutine 46:
>
> k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources.fitsRequest()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/scheduler/framework/plugins/noderesources/fit.go:234
> +0xd7
>
> k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources.(*Fit).Filter()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/scheduler/framework/plugins/noderesources/fit.go:201
> +0xbe
>
> github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).runFilterPlugin()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:151
> +0x158
>
> github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).runFilterPlugins()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:129
> +0x126
>
> github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).podFitsNode()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:94
> +0x137
>
> github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).predicatesAllocate()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:78
> +0x211
>
> github.com/apache/yunikorn-k8shim/pkg/plugin/predicates.(*predicateManagerImpl).Predicates()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/plugin/predicates/predicate_manager.go:64
> +0x52
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).IsPodFitNode()
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/context.go:341
> +0x241
>
> github.com/apache/yunikorn-k8shim/pkg/callback.(*AsyncRMCallback).Predicates()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/callback/scheduler_callback.go:187
> +0xb7
>
> github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Node).preConditions()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/node.go:386
> +0x1c7
>
> github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Node).preAllocateConditions()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/node.go:368
> +0xe4
>
> github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Application).tryNode()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/application.go:1190
> +0xe7
>
> github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Application).tryNodes()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/application.go:1112
> +0x7c4
>
> github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Application).tryAllocate()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/application.go:849
> +0x7a4
> github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Queue).TryAllocate()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/queue.go:1070
> +0x18c
> github.com/apache/yunikorn-core/pkg/scheduler/objects.(*Queue).TryAllocate()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/objects/queue.go:1082
> +0xf7
>
> github.com/apache/yunikorn-core/pkg/scheduler.(*PartitionContext).tryAllocate()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/partition.go:831
> +0x15c
> github.com/apache/yunikorn-core/pkg/scheduler.(*ClusterContext).schedule()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/context.go:137
> +0x1b6
>
> github.com/apache/yunikorn-core/pkg/scheduler.(*Scheduler).internalSchedule()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/scheduler.go:77
> +0x47
>
> github.com/apache/yunikorn-core/pkg/scheduler.(*Scheduler).StartService.func2()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/scheduler.go:67
> +0x39
>
> Goroutine 168 (running) created at:
> k8s.io/apimachinery/pkg/util/wait.(*Group).Start()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/pkg/util/wait/wait.go:71
> +0xdc
> k8s.io/client-go/tools/cache.(*sharedProcessor).addListener()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/shared_informer.go:593
> +0x379
>
> k8s.io/client-go/tools/cache.(*sharedIndexInformer).AddEventHandlerWithResyncPeriod()
>
> /home/testuser/go/pkg/mod/k8s.io/[email protected]/tools/cache/shared_informer.go:521
> +0x644
>
> github.com/apache/yunikorn-k8shim/pkg/client.(*APIFactory).addEventHandlers()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/client/apifactory.go:183
> +0x182
> github.com/apache/yunikorn-k8shim/pkg/client.(*APIFactory).AddEventHandler()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/client/apifactory.go:175
> +0x293
>
> github.com/apache/yunikorn-k8shim/pkg/cache.(*Context).AddSchedulingEventHandlers()
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cache/context.go:101
> +0x673
> github.com/apache/yunikorn-k8shim/pkg/shim.(*KubernetesShim).doScheduling()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/shim/scheduler.go:235 +0x55
>
> github.com/apache/yunikorn-k8shim/pkg/shim.(*KubernetesShim).doScheduling-fm()
> <autogenerated>:1 +0x44
> github.com/looplab/fsm.(*FSM).enterStateCallbacks()
> /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:403 +0xb6
> github.com/looplab/fsm.(*FSM).Event.func1()
> /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:308 +0xa8
> github.com/looplab/fsm.transitionerStruct.transition()
> /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:354 +0x99
> github.com/looplab/fsm.(*transitionerStruct).transition()
> <autogenerated>:1 +0x29
> github.com/looplab/fsm.(*FSM).doTransition()
> /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:339
> +0x701
> github.com/looplab/fsm.(*FSM).Event()
> /home/testuser/go/pkg/mod/github.com/looplab/[email protected]/fsm.go:321
> +0x6da
> github.com/apache/yunikorn-k8shim/pkg/shim.(*KubernetesShim).handle()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/shim/scheduler.go:279
> +0x1e4
>
> github.com/apache/yunikorn-k8shim/pkg/shim.(*KubernetesShim).SchedulerEventHandler.func1()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/shim/scheduler.go:152 +0xa9
> github.com/apache/yunikorn-k8shim/pkg/dispatcher.Start.func1()
>
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/dispatcher/dispatcher.go:199
> +0x36b
>
> Goroutine 46 (running) created at:
> github.com/apache/yunikorn-core/pkg/scheduler.(*Scheduler).StartService()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/scheduler/scheduler.go:67
> +0x384
>
> github.com/apache/yunikorn-core/pkg/entrypoint.startAllServicesWithParameters()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/entrypoint/entrypoint.go:90
> +0x624
> github.com/apache/yunikorn-core/pkg/entrypoint.StartAllServices()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/entrypoint/entrypoint.go:44
> +0x4f
> github.com/apache/yunikorn-core/pkg/entrypoint.StartAllServicesWithLogger()
>
> /home/testuser/repos/incubator-yunikorn-core/pkg/entrypoint/entrypoint.go:55
> +0x3b
> main.main()
> /home/testuser/repos/incubator-yunikorn-k8shim/pkg/cmd/shim/main.go:50
> +0x4c4
> {noformat}
> Based on analysis of this race, it appears that we need to make defensive
> copies of Node / Pod information when calling the K8s predicates. The default
> scheduler creates a snapshot per scheduler run; it's likely we need to do
> something similar.
--
This message was sent by Atlassian Jira
(v8.20.7#820007)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]