From 7e199d6fdbdf52222bb3eadd056b9e5a2295f36e Mon Sep 17 00:00:00 2001 From: "Wang, Fei" Date: Wed, 16 Apr 2025 10:12:10 -0700 Subject: [PATCH] [KYUUBI #7025] [KYUUBI #6686][FOLLOWUP] Prefer terminated container app state than terminated pod state ### Why are the changes needed? I found the following for a Kyuubi batch on Kubernetes: 1. The batch had already reached `FINISHED`. 2. I then deleted the pod manually and checked the k8s-audit.log, and the appState had become `FAILED`. ``` 2025-04-15 11:16:30.453 INFO [-675216314-pool-44-thread-839] org.apache.kyuubi.engine.KubernetesApplicationAuditLogger: label=61e7d8c1-e5a9-46cd-83e7-c611003f0224 context=97 namespace=dls-prod pod=kyuubi-spark-61e7d8c1-e5a9-46cd-83e7-c611003f0224-driver podState=Running containers=[microvault->ContainerState(running=ContainerStateRunning(startedAt=2025-04-15T18:13:48Z, additionalProperties={}), terminated=null, waiting=null, additionalProperties={}),spark-kubernetes-driver->ContainerState(running=null, terminated=ContainerStateTerminated(containerID=containerd://72704f8e7ccb5e877c8f6b10bf6ad810d0c019e07e0cb5975be733e79762c1ec, exitCode=0, finishedAt=2025-04-15T18:14:22Z, message=null, reason=Completed, signal=null, startedAt=2025-04-15T18:13:49Z, additionalProperties={}), waiting=null, additionalProperties={})] appId=spark-228c62e0dc37402bacac189d01b871e4 appState=FINISHED appError='' :2025-04-15 11:16:30.854 INFO [-675216314-pool-44-thread-840] org.apache.kyuubi.engine.KubernetesApplicationAuditLogger: label=61e7d8c1-e5a9-46cd-83e7-c611003f0224 context=97 namespace=dls-prod pod=kyuubi-spark-61e7d8c1-e5a9-46cd-83e7-c611003f0224-driver podState=Failed containers=[microvault->ContainerState(running=null, terminated=ContainerStateTerminated(containerID=containerd://91654e3ee74e2c31218e14be201b50a4a604c2ad15d3afd84dc6f620e59894b7, exitCode=2, finishedAt=2025-04-15T18:16:30Z, message=null, reason=Error, signal=null, startedAt=2025-04-15T18:13:48Z, additionalProperties={}), waiting=null, 
additionalProperties={}),spark-kubernetes-driver->ContainerState(running=null, terminated=ContainerStateTerminated(containerID=containerd://72704f8e7ccb5e877c8f6b10bf6ad810d0c019e07e0cb5975be733e79762c1ec, exitCode=0, finishedAt=2025-04-15T18:14:22Z, message=null, reason=Completed, signal=null, startedAt=2025-04-15T18:13:49Z, additionalProperties={}), waiting=null, additionalProperties={})] appId=spark-228c62e0dc37402bacac189d01b871e4 appState=FAILED appError='{ ``` This PR is a follow-up to #6690, which ignores the container state if the pod is terminated. It is more reasonable to prefer a terminated container state over a terminated pod state. ### How was this patch tested? Integration testing. ``` :2025-04-15 13:53:24.551 INFO [-1077768163-pool-36-thread-3] org.apache.kyuubi.engine.KubernetesApplicationAuditLogger: eventType=DELETE label=e0eb4580-3cfa-43bf-bdcc-efeabcabc93c context=97 namespace=dls-prod pod=kyuubi-spark-e0eb4580-3cfa-43bf-bdcc-efeabcabc93c-driver podState=Failed containers=[microvault->ContainerState(running=null, terminated=ContainerStateTerminated(containerID=containerd://66c42206730950bd422774e3c1b0f426d7879731788cea609bbfe0daab24a763, exitCode=2, finishedAt=2025-04-15T20:53:22Z, message=null, reason=Error, signal=null, startedAt=2025-04-15T20:52:00Z, additionalProperties={}), waiting=null, additionalProperties={}),spark-kubernetes-driver->ContainerState(running=null, terminated=ContainerStateTerminated(containerID=containerd://9179a73d9d9e148dcd9c13ee6cc29dc3e257f95a33609065e061866bb611cb3b, exitCode=0, finishedAt=2025-04-15T20:52:28Z, message=null, reason=Completed, signal=null, startedAt=2025-04-15T20:52:01Z, additionalProperties={}), waiting=null, additionalProperties={})] appId=spark-578df0facbfd4958a07f8d1ae79107dc appState=FINISHED appError='' ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #7025 from turboFei/container_terminated. 
Closes #7025 Closes #6686 a3b2a5a56 [Wang, Fei] comments 4356d1bc9 [Wang, Fei] fix the app state logical Authored-by: Wang, Fei Signed-off-by: Wang, Fei --- .../engine/KubernetesApplicationOperation.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kyuubi-server/src/main/scala/org/apache/kyuubi/engine/KubernetesApplicationOperation.scala b/kyuubi-server/src/main/scala/org/apache/kyuubi/engine/KubernetesApplicationOperation.scala index c7ce750f2..2e57c722f 100644 --- a/kyuubi-server/src/main/scala/org/apache/kyuubi/engine/KubernetesApplicationOperation.scala +++ b/kyuubi-server/src/main/scala/org/apache/kyuubi/engine/KubernetesApplicationOperation.scala @@ -553,15 +553,18 @@ object KubernetesApplicationOperation extends Logging { } val podAppState = podStateToApplicationState(pod.getStatus.getPhase) - val containerAppState = containerStatusToBuildAppState + val containerAppStateOpt = containerStatusToBuildAppState .map(_.getState) .map(containerStateToApplicationState) - // When the pod app state is terminated, the container app state will be ignored - val applicationState = if (ApplicationState.isTerminated(podAppState)) { - podAppState - } else { - containerAppState.getOrElse(podAppState) + val applicationState = containerAppStateOpt match { + // for cases that spark container already terminated, but sidecar containers live + case Some(containerAppState) + if ApplicationState.isTerminated(containerAppState) => containerAppState + // we don't need to care about container state if pod is already terminated + case _ if ApplicationState.isTerminated(podAppState) => podAppState + case Some(containerAppState) => containerAppState + case None => podAppState } val applicationError = if (ApplicationState.isFailed(applicationState)) {