attilapiros commented on code in PR #53840:
URL: https://github.com/apache/spark/pull/53840#discussion_r2831585872
##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala:
##########
@@ -459,32 +462,53 @@ class ExecutorPodsAllocator(
.build()
val resources = replacePVCsIfNeeded(
podWithAttachedContainer,
resolvedExecutorSpec.executorKubernetesResources, reusablePVCs)
- val createdExecutorPod =
-
kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create()
- try {
- addOwnerReference(createdExecutorPod, resources)
- resources
- .filter(_.getKind == "PersistentVolumeClaim")
- .foreach { resource =>
- if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) {
- addOwnerReference(driverPod.get, Seq(resource))
- }
- val pvc = resource.asInstanceOf[PersistentVolumeClaim]
- logInfo(log"Trying to create PersistentVolumeClaim " +
- log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)}
with " +
- log"StorageClass ${MDC(LogKeys.CLASS_NAME,
pvc.getSpec.getStorageClassName)}")
-
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create()
- PVC_COUNTER.incrementAndGet()
- }
- newlyCreatedExecutors(newExecutorId) = (resourceProfileId,
clock.getTimeMillis())
- logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+ val optCreatedExecutorPod = try {
+ Some(kubernetesClient
+ .pods()
+ .inNamespace(namespace)
+ .resource(podWithAttachedContainer)
+ .create())
} catch {
case NonFatal(e) =>
- kubernetesClient.pods()
- .inNamespace(namespace)
- .resource(createdExecutorPod)
- .delete()
- throw e
+ // Register failure with global tracker if lifecycle manager is
available
+ val failureCount = totalFailedPodCreations.incrementAndGet()
+ if (executorPodsLifecycleManager != null) {
+ executorPodsLifecycleManager.registerPodCreationFailure()
+ }
+ logError(log"Failed to create executor pod
${MDC(LogKeys.EXECUTOR_ID, newExecutorId)}. " +
+ log"Total failures: ${MDC(LogKeys.TOTAL, failureCount)}", e)
+ None
+ }
+ optCreatedExecutorPod.foreach { createdExecutorPod =>
+ try {
Review Comment:
Actually, I missed that line. Could you please run a manual test and
check what happens with an exception that is thrown here (by throwing an
exception here directly and running one of the integration tests)?
I am afraid it can propagate all the way up to `CoarseGrainedSchedulerBackend`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]