From f04ebccd4d34561335d7b54be189264e58d3f7e7 Mon Sep 17 00:00:00 2001 From: Aravind Patnam Date: Mon, 8 Apr 2024 15:11:35 +0800 Subject: [PATCH] [CELEBORN-1368] Log celeborn config for debugging purposes ### What changes were proposed in this pull request? Log celeborn config for debugging purposes. ### Why are the changes needed? Help with debugging ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? tested the patch internally. Closes #2442 from akpatnam25/CELEBORN-1368. Authored-by: Aravind Patnam Signed-off-by: mingji --- .../apache/celeborn/common/CelebornConf.scala | 22 ++++++++++ .../apache/celeborn/common/util/Utils.scala | 41 +++++++++++++++++++ docs/configuration/master.md | 2 + docs/configuration/worker.md | 2 + .../service/deploy/master/Master.scala | 4 ++ .../celeborn/server/common/HttpService.scala | 6 ++- .../service/deploy/worker/Worker.scala | 5 +++ 7 files changed, 80 insertions(+), 2 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index d8b482877..6ab67fb89 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -1281,6 +1281,10 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se // Rack Resolver // // ////////////////////////////////////////////////////// def rackResolverRefreshInterval = get(RACKRESOLVER_REFRESH_INTERVAL) + + def logCelebornConfEnabled = get(LOG_CELEBORN_CONF_ENABLED) + + def secretRedactionPattern = get(SECRET_REDACTION_PATTERN) } object CelebornConf extends Logging { @@ -4924,4 +4928,22 @@ object CelebornConf extends Logging { s"Invalid maxEncryptedBlockSize, must be a position number upto ${Int.MaxValue}") .createWithDefaultString("64k") + val SECRET_REDACTION_PATTERN = + buildConf("celeborn.redaction.regex") + .categories("master", "worker") + .doc("Regex to decide which Celeborn configuration properties and environment variables in " + + "master and worker environments contain sensitive information. When this regex matches " + + "a property key or value, the value is redacted from the logging.") + .version("0.5.0") + .regexConf + .createWithDefault("(?i)secret|password|token|access[.]key".r) + + val LOG_CELEBORN_CONF_ENABLED: ConfigEntry[Boolean] = + buildConf("celeborn.logConf.enabled") + .categories("master", "worker") + .version("0.5.0") + .doc("When `true`, log the CelebornConf for debugging purposes.") + .booleanConf + .createWithDefault(false) + } diff --git a/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala b/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala index bab5b5d8b..07e9fe30b 100644 --- a/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala +++ b/common/src/main/scala/org/apache/celeborn/common/util/Utils.scala @@ -34,6 +34,7 @@ import scala.io.Source import scala.reflect.ClassTag import scala.util.{Random => ScalaRandom, Try} import scala.util.control.{ControlThrowable, NonFatal} +import scala.util.matching.Regex import com.google.protobuf.{ByteString, GeneratedMessageV3} import io.netty.channel.unix.Errors.NativeIoException @@ -1101,4 +1102,44 @@ object Utils extends Logging { val host = components.dropRight(portsNum).mkString(":") Array(host) ++ portsArr } + + private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" + + /** + * Redact the sensitive values in the given map. If a map key matches the redaction pattern then + * its value is replaced with a dummy text. + */ + def redact(conf: CelebornConf, kvs: Seq[(String, String)]): Seq[(String, String)] = { + val redactionPattern = conf.secretRedactionPattern + redact(redactionPattern, kvs) + } + + private def redact[K, V](redactionPattern: Regex, kvs: Seq[(K, V)]): Seq[(K, V)] = { + // If the sensitive information regex matches with either the key or the value, redact the value + // While the original intent was to only redact the value if the key matched with the regex, + // we've found that especially in verbose mode, the value of the property may contain sensitive + // information like so: + // + // celeborn.dynamicConfig.store.db.hikari.password=secret_password ... + // + // And, in such cases, simply searching for the sensitive information regex in the key name is + // not sufficient. The values themselves have to be searched as well and redacted if matched. + // This does mean we may be accounting more false positives - for example, if the value of an + // arbitrary property contained the term 'password', we may redact the value from the UI and + // logs. In order to work around it, user would have to make the celeborn.redaction.regex property + // more specific. + kvs.map { + case (key: String, value: String) => + redactionPattern.findFirstIn(key) + .orElse(redactionPattern.findFirstIn(value)) + .map { _ => (key, REDACTION_REPLACEMENT_TEXT) } + .getOrElse((key, value)) + case (key, value: String) => + redactionPattern.findFirstIn(value) + .map { _ => (key, REDACTION_REPLACEMENT_TEXT) } + .getOrElse((key, value)) + case (key, value) => + (key, value) + }.asInstanceOf[Seq[(K, V)]] + } } diff --git a/docs/configuration/master.md b/docs/configuration/master.md index e2217868b..582f3f8b3 100644 --- a/docs/configuration/master.md +++ b/docs/configuration/master.md @@ -33,6 +33,7 @@ license: | | celeborn.dynamicConfig.store.db.hikari.username | | false | The username of db store backend. | 0.5.0 | | | celeborn.dynamicConfig.store.fs.path | <undefined> | false | The path of dynamic config file for fs store backend. The file format should be yaml. The default path is `${CELEBORN_CONF_DIR}/dynamicConfig.yaml`. | 0.5.0 | | | celeborn.internal.port.enabled | false | false | Whether to create a internal port on Masters/Workers for inter-Masters/Workers communication. This is beneficial when SASL authentication is enforced for all interactions between clients and Celeborn Services, but the services can exchange messages without being subject to SASL authentication. | 0.5.0 | | +| celeborn.logConf.enabled | false | false | When `true`, log the CelebornConf for debugging purposes. | 0.5.0 | | | celeborn.master.estimatedPartitionSize.initialSize | 64mb | false | Initial partition size for estimation, it will change according to runtime stats. | 0.3.0 | celeborn.shuffle.initialEstimatedPartitionSize | | celeborn.master.estimatedPartitionSize.maxSize | <undefined> | false | Max partition size for estimation. Default value should be celeborn.worker.shuffle.partitionSplit.max * 2. | 0.4.1 | | | celeborn.master.estimatedPartitionSize.minSize | 8mb | false | Ignore partition size smaller than this configuration of partition size for estimation. | 0.3.0 | celeborn.shuffle.minPartitionSizeToEstimate | @@ -60,6 +61,7 @@ license: | | celeborn.master.userResourceConsumption.update.interval | 30s | false | Time length for a window about compute user resource consumption. | 0.3.0 | | | celeborn.master.workerUnavailableInfo.expireTimeout | 1800s | false | Worker unavailable info would be cleared when the retention period is expired | 0.3.1 | | | celeborn.quota.enabled | true | false | When Master side sets to true, the master will enable to check the quota via QuotaManager. When Client side sets to true, LifecycleManager will request Master side to check whether the current user has enough quota before registration of shuffle. Fallback to the default shuffle service of Spark when Master side checks that there is no enough quota for current user. | 0.2.0 | | +| celeborn.redaction.regex | (?i)secret|password|token|access[.]key | false | Regex to decide which Celeborn configuration properties and environment variables in master and worker environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the logging. | 0.5.0 | | | celeborn.storage.availableTypes | HDD | false | Enabled storages. Available options: MEMORY,HDD,SSD,HDFS. Note: HDD and SSD would be treated as identical. | 0.3.0 | celeborn.storage.activeTypes | | celeborn.storage.hdfs.dir | <undefined> | false | HDFS base directory for Celeborn to store shuffle data. | 0.2.0 | | | celeborn.storage.hdfs.kerberos.keytab | <undefined> | false | Kerberos keytab file path for HDFS storage connection. | 0.3.2 | | diff --git a/docs/configuration/worker.md b/docs/configuration/worker.md index 675ec7064..63bd69b7a 100644 --- a/docs/configuration/worker.md +++ b/docs/configuration/worker.md @@ -33,9 +33,11 @@ license: | | celeborn.dynamicConfig.store.db.hikari.username | | false | The username of db store backend. | 0.5.0 | | | celeborn.dynamicConfig.store.fs.path | <undefined> | false | The path of dynamic config file for fs store backend. The file format should be yaml. The default path is `${CELEBORN_CONF_DIR}/dynamicConfig.yaml`. | 0.5.0 | | | celeborn.internal.port.enabled | false | false | Whether to create a internal port on Masters/Workers for inter-Masters/Workers communication. This is beneficial when SASL authentication is enforced for all interactions between clients and Celeborn Services, but the services can exchange messages without being subject to SASL authentication. | 0.5.0 | | +| celeborn.logConf.enabled | false | false | When `true`, log the CelebornConf for debugging purposes. | 0.5.0 | | | celeborn.master.endpoints | <localhost>:9097 | false | Endpoints of master nodes for celeborn client to connect, allowed pattern is: `:[,:]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If the port is omitted, 9097 will be used. | 0.2.0 | | | celeborn.master.estimatedPartitionSize.minSize | 8mb | false | Ignore partition size smaller than this configuration of partition size for estimation. | 0.3.0 | celeborn.shuffle.minPartitionSizeToEstimate | | celeborn.master.internal.endpoints | <localhost>:8097 | false | Endpoints of master nodes just for celeborn workers to connect, allowed pattern is: `:[,:]*`, e.g. `clb1:8097,clb2:8097,clb3:8097`. If the port is omitted, 8097 will be used. | 0.5.0 | | +| celeborn.redaction.regex | (?i)secret|password|token|access[.]key | false | Regex to decide which Celeborn configuration properties and environment variables in master and worker environments contain sensitive information. When this regex matches a property key or value, the value is redacted from the logging. | 0.5.0 | | | celeborn.shuffle.chunk.size | 8m | false | Max chunk size of reducer's merged shuffle data. For example, if a reducer's shuffle data is 128M and the data will need 16 fetch chunk requests to fetch. | 0.2.0 | | | celeborn.storage.availableTypes | HDD | false | Enabled storages. Available options: MEMORY,HDD,SSD,HDFS. Note: HDD and SSD would be treated as identical. | 0.3.0 | celeborn.storage.activeTypes | | celeborn.storage.hdfs.dir | <undefined> | false | HDFS base directory for Celeborn to store shuffle data. | 0.2.0 | | diff --git a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala index 736cc7e45..835ba96ab 100644 --- a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala +++ b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala @@ -82,6 +82,10 @@ private[celeborn] class Master( // Send ApplicationMeta to workers private var sendApplicationMetaExecutor: ExecutorService = _ + if (conf.logCelebornConfEnabled) { + logInfo(getConf) + } + override val rpcEnv: RpcEnv = if (!authEnabled) { RpcEnv.create( diff --git a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala index 18e016187..540b3eadd 100644 --- a/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala +++ b/service/src/main/scala/org/apache/celeborn/server/common/HttpService.scala @@ -23,6 +23,7 @@ import scala.collection.JavaConverters._ import org.apache.celeborn.common.CelebornConf import org.apache.celeborn.common.internal.Logging +import org.apache.celeborn.common.util.Utils import org.apache.celeborn.server.common.http.HttpServer import org.apache.celeborn.server.common.http.api.ApiRootResource import org.apache.celeborn.server.common.service.config.ConfigLevel @@ -35,8 +36,9 @@ abstract class HttpService extends Service with Logging { val sb = new StringBuilder sb.append("=========================== Configuration ============================\n") if (conf.getAll.nonEmpty) { - val maxKeyLength = conf.getAll.toMap.keys.map(_.length).max - conf.getAll.sortBy(_._1).foreach { case (key, value) => + val redactedConf = Utils.redact(conf, conf.getAll) + val maxKeyLength = redactedConf.toMap.keys.map(_.length).max + redactedConf.sortBy(_._1).foreach { case (key, value) => sb.append(config(key, value, maxKeyLength)) } } diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala index 0da4b5513..e1d4bf812 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala @@ -86,6 +86,11 @@ private[celeborn] class Worker( val workerStatusManager = new WorkerStatusManager(conf) private val authEnabled = conf.authEnabled private val secretRegistry = new WorkerSecretRegistryImpl(conf.workerApplicationRegistryCacheSize) + + if (conf.logCelebornConfEnabled) { + logInfo(getConf) + } + val rpcEnv: RpcEnv = if (!authEnabled) { RpcEnv.create(