From 5eaad136a09e4783568ee0bdb2353ebaf84fff4f Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 5 Dec 2022 14:02:33 +0800 Subject: [PATCH] [CELEBORN-84][IMPROVEMENT] Blacklist critical reason should avoid been covered by normal reason (#1043) * [CELEBORN-84][IMPROVEMENT] Blacklist critical reason should avoid been covered by normal reason --- .../apache/celeborn/client/LifecycleManager.scala | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index 71ca042a9..08ee5c6f5 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -1924,7 +1924,20 @@ class LifecycleManager(appId: String, val conf: CelebornConf) extends RpcEndpoin : Unit = { val failedWorker = new ConcurrentHashMap[WorkerInfo, (StatusCode, Long)](failures) logInfo(s"Report Worker Failure: ${failedWorker.asScala}, current blacklist $blacklist") - blacklist.putAll(failedWorker) + failedWorker.asScala.foreach { case (worker, (statusCode, registerTime)) => + if (!blacklist.containsKey(worker)) { + blacklist.put(worker, (statusCode, registerTime)) + } else { + statusCode match { + case StatusCode.WORKER_SHUTDOWN | + StatusCode.NO_AVAILABLE_WORKING_DIR | + StatusCode.RESERVE_SLOTS_FAILED | + StatusCode.UNKNOWN_WORKER => + blacklist.put(worker, (statusCode, blacklist.get(worker)._2)) + case _ => // Not cover + } + } + } } def checkQuota(): Boolean = {