attilapiros commented on code in PR #53840:
URL: https://github.com/apache/spark/pull/53840#discussion_r2822420319
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala:
##########
@@ -459,32 +462,53 @@ class ExecutorPodsAllocator(
.build()
val resources = replacePVCsIfNeeded(
podWithAttachedContainer,
resolvedExecutorSpec.executorKubernetesResources, reusablePVCs)
- val createdExecutorPod =
-
kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create()
- try {
- addOwnerReference(createdExecutorPod, resources)
- resources
- .filter(_.getKind == "PersistentVolumeClaim")
- .foreach { resource =>
- if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) {
- addOwnerReference(driverPod.get, Seq(resource))
- }
- val pvc = resource.asInstanceOf[PersistentVolumeClaim]
- logInfo(log"Trying to create PersistentVolumeClaim " +
- log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)}
with " +
- log"StorageClass ${MDC(LogKeys.CLASS_NAME,
pvc.getSpec.getStorageClassName)}")
-
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create()
- PVC_COUNTER.incrementAndGet()
- }
- newlyCreatedExecutors(newExecutorId) = (resourceProfileId,
clock.getTimeMillis())
- logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+ val optCreatedExecutorPod = try {
+ Some(kubernetesClient
+ .pods()
+ .inNamespace(namespace)
+ .resource(podWithAttachedContainer)
+ .create())
} catch {
case NonFatal(e) =>
- kubernetesClient.pods()
- .inNamespace(namespace)
- .resource(createdExecutorPod)
- .delete()
- throw e
+ // Register failure with global tracker if lifecycle manager is
available
+ val failureCount = totalFailedPodCreations.incrementAndGet()
+ if (executorPodsLifecycleManager != null) {
+ executorPodsLifecycleManager.registerPodCreationFailure()
+ }
+ logError(log"Failed to create executor pod
${MDC(LogKeys.EXECUTOR_ID, newExecutorId)}. " +
+ log"Total failures: ${MDC(LogKeys.TOTAL, failureCount)}", e)
+ None
+ }
+ optCreatedExecutorPod.foreach { createdExecutorPod =>
+ try {
Review Comment:
Why is an error during pod creation handled differently from an error that
occurs while adding the owner reference and creating the PVC?
In both cases the non-fatal error ends in deleting the pod, so why is the 2nd
case not tracked as a failure?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]