parthchandra commented on code in PR #53840:
URL: https://github.com/apache/spark/pull/53840#discussion_r2836918757


##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/AbstractPodsAllocator.scala:
##########
@@ -35,6 +35,20 @@ import org.apache.spark.resource.ResourceProfile
  */
 @DeveloperApi
 abstract class AbstractPodsAllocator {
+  /*
+   * Optional lifecycle manager for tracking executor pod lifecycle events.
+   * Set via setExecutorPodsLifecycleManager for backward compatibility.
+   */
+  protected var executorPodsLifecycleManager: ExecutorPodsLifecycleManager = _
+
+  /*
+   * Set the lifecycle manager for tracking executor pod lifecycle events.
+   * This method is optional and may not exist in custom implementations based on older versions.
+   */
+  def setExecutorPodsLifecycleManager(manager: ExecutorPodsLifecycleManager): Unit = {
+    executorPodsLifecycleManager = manager

Review Comment:
   Done
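
   For anyone implementing a custom allocator, a minimal sketch of how the optional field can be consumed (`CustomPodsAllocator` and `reportPodCreationFailure` are illustrative names, not part of this PR):
   ```scala
   // Sketch only: the field may never be injected on older deployments,
   // so implementations should null-check it before use.
   abstract class CustomPodsAllocator extends AbstractPodsAllocator {
     protected def reportPodCreationFailure(): Unit = {
       if (executorPodsLifecycleManager != null) {
         executorPodsLifecycleManager.registerPodCreationFailure()
       }
     }
   }
   ```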



##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala:
##########
@@ -459,32 +462,61 @@ class ExecutorPodsAllocator(
         .build()
       val resources = replacePVCsIfNeeded(
        podWithAttachedContainer, resolvedExecutorSpec.executorKubernetesResources, reusablePVCs)
-      val createdExecutorPod =
-        kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create()
-      try {
-        addOwnerReference(createdExecutorPod, resources)
-        resources
-          .filter(_.getKind == "PersistentVolumeClaim")
-          .foreach { resource =>
-            if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) {
-              addOwnerReference(driverPod.get, Seq(resource))
-            }
-            val pvc = resource.asInstanceOf[PersistentVolumeClaim]
-            logInfo(log"Trying to create PersistentVolumeClaim " +
-              log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)} 
with " +
-              log"StorageClass ${MDC(LogKeys.CLASS_NAME, 
pvc.getSpec.getStorageClassName)}")
-            
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create()
-            PVC_COUNTER.incrementAndGet()
-          }
-        newlyCreatedExecutors(newExecutorId) = (resourceProfileId, 
clock.getTimeMillis())
-        logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+      val optCreatedExecutorPod = try {
+        Some(kubernetesClient
+          .pods()
+          .inNamespace(namespace)
+          .resource(podWithAttachedContainer)
+          .create())
       } catch {
         case NonFatal(e) =>
-          kubernetesClient.pods()
-            .inNamespace(namespace)
-            .resource(createdExecutorPod)
-            .delete()
-          throw e
+          // Register failure with global tracker if lifecycle manager is available
+          val failureCount = totalFailedPodCreations.incrementAndGet()
+          if (executorPodsLifecycleManager != null) {
+            executorPodsLifecycleManager.registerPodCreationFailure()
+          }

Review Comment:
   Done
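
   As a side note, the null guard above can equivalently be written with `Option`; a sketch of that alternative, not what the diff uses:
   ```scala
   // Equivalent null-safe guard: behaves the same as the explicit
   // null comparison in the hunk above.
   val failureCount = totalFailedPodCreations.incrementAndGet()
   Option(executorPodsLifecycleManager).foreach(_.registerPodCreationFailure())
   ```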



##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala:
##########
@@ -184,13 +188,30 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit
       classOf[SparkConf], classOf[org.apache.spark.SecurityManager],
       classOf[KubernetesExecutorBuilder], classOf[KubernetesClient],
       classOf[ExecutorPodsSnapshotsStore], classOf[Clock])
-    cstr.newInstance(
+    val allocatorInstance = cstr.newInstance(
       sc.conf,
       sc.env.securityManager,
       new KubernetesExecutorBuilder(),
       kubernetesClient,
       snapshotsStore,
       new SystemClock())
+
+    // Try to set the lifecycle manager using reflection for backward compatibility
+    // with custom allocators that may not have this method
+    lifecycleManager.foreach { manager =>
+      try {
+        val setLifecycleManagerMethod = cls.getMethod(
+          "setExecutorPodsLifecycleManager",
+          classOf[ExecutorPodsLifecycleManager])
+        setLifecycleManagerMethod.invoke(allocatorInstance, manager)
+      } catch {
+        case _: NoSuchMethodException =>
+          logInfo("Allocator does not support setExecutorPodsLifecycleManager 
method. " +
+            "Pod creation failures will not be tracked.")

Review Comment:
   I was following an earlier comment suggesting that reflection was needed to maintain backwards compatibility. But you're right, reflection is not needed here. Removed.
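
   With `setExecutorPodsLifecycleManager` declared on `AbstractPodsAllocator` itself, the reflective lookup collapses to a direct call. A sketch of the simplified version, assuming `lifecycleManager` is an `Option[ExecutorPodsLifecycleManager]` as in the quoted hunk and that `allocatorInstance` is cast to the abstract type:
   ```scala
   // No getMethod/invoke needed: the setter is part of the
   // AbstractPodsAllocator contract, so it can be called directly.
   val allocator = allocatorInstance.asInstanceOf[AbstractPodsAllocator]
   lifecycleManager.foreach(allocator.setExecutorPodsLifecycleManager)
   ```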



##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala:
##########
@@ -459,32 +462,61 @@ class ExecutorPodsAllocator(
         .build()
       val resources = replacePVCsIfNeeded(
        podWithAttachedContainer, resolvedExecutorSpec.executorKubernetesResources, reusablePVCs)
-      val createdExecutorPod =
-        kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create()
-      try {
-        addOwnerReference(createdExecutorPod, resources)
-        resources
-          .filter(_.getKind == "PersistentVolumeClaim")
-          .foreach { resource =>
-            if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) {
-              addOwnerReference(driverPod.get, Seq(resource))
-            }
-            val pvc = resource.asInstanceOf[PersistentVolumeClaim]
-            logInfo(log"Trying to create PersistentVolumeClaim " +
-              log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)} 
with " +
-              log"StorageClass ${MDC(LogKeys.CLASS_NAME, 
pvc.getSpec.getStorageClassName)}")
-            
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create()
-            PVC_COUNTER.incrementAndGet()
-          }
-        newlyCreatedExecutors(newExecutorId) = (resourceProfileId, 
clock.getTimeMillis())
-        logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+      val optCreatedExecutorPod = try {
+        Some(kubernetesClient
+          .pods()
+          .inNamespace(namespace)
+          .resource(podWithAttachedContainer)
+          .create())
       } catch {
         case NonFatal(e) =>
-          kubernetesClient.pods()
-            .inNamespace(namespace)
-            .resource(createdExecutorPod)
-            .delete()
-          throw e
+          // Register failure with global tracker if lifecycle manager is available
+          val failureCount = totalFailedPodCreations.incrementAndGet()
+          if (executorPodsLifecycleManager != null) {
+            executorPodsLifecycleManager.registerPodCreationFailure()
+          }
+          logError(log"Failed to create executor pod ${MDC(LogKeys.EXECUTOR_ID, newExecutorId)}. " +
+            log"Total failures: ${MDC(LogKeys.TOTAL, failureCount)}", e)

Review Comment:
   Done



##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala:
##########
@@ -459,32 +462,53 @@ class ExecutorPodsAllocator(
         .build()
       val resources = replacePVCsIfNeeded(
        podWithAttachedContainer, resolvedExecutorSpec.executorKubernetesResources, reusablePVCs)
-      val createdExecutorPod =
-        kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create()
-      try {
-        addOwnerReference(createdExecutorPod, resources)
-        resources
-          .filter(_.getKind == "PersistentVolumeClaim")
-          .foreach { resource =>
-            if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) {
-              addOwnerReference(driverPod.get, Seq(resource))
-            }
-            val pvc = resource.asInstanceOf[PersistentVolumeClaim]
-            logInfo(log"Trying to create PersistentVolumeClaim " +
-              log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)} 
with " +
-              log"StorageClass ${MDC(LogKeys.CLASS_NAME, 
pvc.getSpec.getStorageClassName)}")
-            
kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create()
-            PVC_COUNTER.incrementAndGet()
-          }
-        newlyCreatedExecutors(newExecutorId) = (resourceProfileId, 
clock.getTimeMillis())
-        logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+      val optCreatedExecutorPod = try {
+        Some(kubernetesClient
+          .pods()
+          .inNamespace(namespace)
+          .resource(podWithAttachedContainer)
+          .create())
       } catch {
         case NonFatal(e) =>
-          kubernetesClient.pods()
-            .inNamespace(namespace)
-            .resource(createdExecutorPod)
-            .delete()
-          throw e
+          // Register failure with global tracker if lifecycle manager is available
+          val failureCount = totalFailedPodCreations.incrementAndGet()
+          if (executorPodsLifecycleManager != null) {
+            executorPodsLifecycleManager.registerPodCreationFailure()
+          }
+          logError(log"Failed to create executor pod ${MDC(LogKeys.EXECUTOR_ID, newExecutorId)}. " +
+            log"Total failures: ${MDC(LogKeys.TOTAL, failureCount)}", e)
+          None
+      }
+      optCreatedExecutorPod.foreach { createdExecutorPod =>
+        try {

Review Comment:
   I think throwing the exception here is fine. It is propagated up through `requestNewExecutors()` → `onNewSnapshots()`; the exception is caught at `ExecutorPodsSnapshotsStoreImpl.processSnapshotsInternal` and logged as a warning.
   The test `test("SPARK-41410: An exception during PVC creation should not increase PVC counter")` explicitly checks that an exception is thrown in this case. I modified the code to throw the exception and the test passed (as long as the exception is a `KubernetesClientException`).
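
   In other words, the catch block can register the failure and then rethrow rather than swallow the error. A sketch of that shape (not the exact diff):
   ```scala
   } catch {
     case NonFatal(e) =>
       // Record the failure for spark.executor.maxNumFailures accounting,
       // then rethrow so it propagates to onNewSnapshots() and is logged
       // by the snapshot store.
       totalFailedPodCreations.incrementAndGet()
       if (executorPodsLifecycleManager != null) {
         executorPodsLifecycleManager.registerPodCreationFailure()
       }
       throw e
   }
   ```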



##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/AbstractPodsAllocator.scala:
##########
@@ -35,6 +35,20 @@ import org.apache.spark.resource.ResourceProfile
  */
 @DeveloperApi
 abstract class AbstractPodsAllocator {
+  /*
+   * Optional lifecycle manager for tracking executor pod lifecycle events.
+   * Set via setExecutorPodsLifecycleManager for backward compatibility.
+   */
+  protected var executorPodsLifecycleManager: ExecutorPodsLifecycleManager = _

Review Comment:
   Done



##########
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala:
##########
@@ -75,6 +75,14 @@ private[spark] class ExecutorPodsLifecycleManager(
 
   protected[spark] def getNumExecutorsFailed: Int = failureTracker.numFailedExecutors
 
+  /**
+   * Register a pod creation failure. This increments the global executor failure count
+   * which is checked against spark.executor.maxNumFailures.
+   */
+  protected[spark] def registerPodCreationFailure(): Unit = {

Review Comment:
   Changed
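
   For reference, the body is expected to be a one-line delegation to the shared failure tracker. A sketch, assuming `ExecutorFailureTracker` exposes a `registerExecutorFailure()`-style method (the exact name may differ):
   ```scala
   protected[spark] def registerPodCreationFailure(): Unit = {
     // Counts toward the same limit enforced by spark.executor.maxNumFailures.
     failureTracker.registerExecutorFailure()
   }
   ```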



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

