diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index 71ca042a9..08ee5c6f5 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -1924,7 +1924,20 @@ class LifecycleManager(appId: String, val conf: CelebornConf) extends RpcEndpoin : Unit = { val failedWorker = new ConcurrentHashMap[WorkerInfo, (StatusCode, Long)](failures) logInfo(s"Report Worker Failure: ${failedWorker.asScala}, current blacklist $blacklist") - blacklist.putAll(failedWorker) + failedWorker.asScala.foreach { case (worker, (statusCode, registerTime)) => + if (!blacklist.containsKey(worker)) { + blacklist.put(worker, (statusCode, registerTime)) + } else { + statusCode match { + case StatusCode.WORKER_SHUTDOWN | + StatusCode.NO_AVAILABLE_WORKING_DIR | + StatusCode.RESERVE_SLOTS_FAILED | + StatusCode.UNKNOWN_WORKER => + blacklist.put(worker, (statusCode, blacklist.get(worker)._2)) + case _ => // Not cover + } + } + } } def checkQuota(): Boolean = {