[MINOR] Fix missing blanks in docs
### What changes were proposed in this pull request?

When looking into the source code, I found that some blank delimiters are missing in the ConfigEntry docs. In this PR, I go through all the ConfigEntry docs to fix the missing blanks in the descriptions.

### Why are the changes needed?

Fix typos.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

GA.

Closes #2917 from turboFei/nit_docs.

Authored-by: Wang, Fei <fwang12@ebay.com>
Signed-off-by: Cheng Pan <chengpan@apache.org>
This commit is contained in:
parent
def5254ec2
commit
7fbf0e2fa5
@ -2044,7 +2044,7 @@ object CelebornConf extends Logging {
|
||||
s"If setting <module> to `${TransportModuleConstants.DATA_MODULE}`, " +
|
||||
s"it works for shuffle client push and fetch data. " +
|
||||
s"If setting <module> to `${TransportModuleConstants.REPLICATE_MODULE}`, " +
|
||||
s"it works for replicate client of worker replicating data to peer worker." +
|
||||
s"it works for replicate client of worker replicating data to peer worker. " +
|
||||
s"If setting <module> to `${TransportModuleConstants.PUSH_MODULE}`, " +
|
||||
s"it works for Flink shuffle client push data.")
|
||||
.intConf
|
||||
@ -2058,7 +2058,7 @@ object CelebornConf extends Logging {
|
||||
s"If setting <module> to `${TransportModuleConstants.DATA_MODULE}`, " +
|
||||
s"it works for shuffle client push and fetch data. " +
|
||||
s"If setting <module> to `${TransportModuleConstants.REPLICATE_MODULE}`, " +
|
||||
s"it works for replicate client of worker replicating data to peer worker." +
|
||||
s"it works for replicate client of worker replicating data to peer worker. " +
|
||||
s"If setting <module> to `${TransportModuleConstants.PUSH_MODULE}`, " +
|
||||
s"it works for Flink shuffle client push data.")
|
||||
.version("0.2.0")
|
||||
@ -2169,11 +2169,11 @@ object CelebornConf extends Logging {
|
||||
s"If setting <module> to `${TransportModuleConstants.DATA_MODULE}`, " +
|
||||
s"it works for shuffle client push and fetch data. " +
|
||||
s"If setting <module> to `${TransportModuleConstants.REPLICATE_MODULE}`, " +
|
||||
s"it works for replicate client of worker replicating data to peer worker." +
|
||||
s"it works for replicate client of worker replicating data to peer worker. " +
|
||||
"If you are using the \"celeborn.client.heartbeat.interval\", " +
|
||||
"please use the new configs for each module according to your needs or " +
|
||||
"replace it with \"celeborn.rpc.heartbeat.interval\", " +
|
||||
"\"celeborn.data.heartbeat.interval\" and" +
|
||||
"\"celeborn.data.heartbeat.interval\" and " +
|
||||
"\"celeborn.replicate.heartbeat.interval\". ")
|
||||
.timeConf(TimeUnit.MILLISECONDS)
|
||||
.createWithDefaultString("60s")
|
||||
@ -2196,7 +2196,7 @@ object CelebornConf extends Logging {
|
||||
val MASTER_ENDPOINTS: ConfigEntry[Seq[String]] =
|
||||
buildConf("celeborn.master.endpoints")
|
||||
.categories("client", "worker")
|
||||
.doc("Endpoints of master nodes for celeborn clients to connect. Client uses resolver provided by" +
|
||||
.doc("Endpoints of master nodes for celeborn clients to connect. Client uses resolver provided by " +
|
||||
s"${MASTER_ENDPOINTS_RESOLVER.key} to resolve the master endpoints. By default Celeborn uses " +
|
||||
"`org.apache.celeborn.common.client.StaticMasterEndpointResolver` which take static master endpoints " +
|
||||
"as input. Allowed pattern: `<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. " +
|
||||
@ -3375,7 +3375,7 @@ object CelebornConf extends Logging {
|
||||
val WORKER_REPLICATE_FAST_FAIL_DURATION: ConfigEntry[Long] =
|
||||
buildConf("celeborn.worker.replicate.fastFail.duration")
|
||||
.categories("worker")
|
||||
.doc("If a replicate request not replied during the duration, worker will mark the replicate data request as failed." +
|
||||
.doc("If a replicate request not replied during the duration, worker will mark the replicate data request as failed. " +
|
||||
"It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`.")
|
||||
.version("0.2.0")
|
||||
.timeConf(TimeUnit.MILLISECONDS)
|
||||
@ -3845,7 +3845,7 @@ object CelebornConf extends Logging {
|
||||
buildConf("celeborn.worker.congestionControl.diskBuffer.high.watermark")
|
||||
.withAlternative("celeborn.worker.congestionControl.high.watermark")
|
||||
.categories("worker")
|
||||
.doc("If the total bytes in disk buffer exceeds this configure, will start to congest" +
|
||||
.doc("If the total bytes in disk buffer exceeds this configure, will start to congest " +
|
||||
"users whose produce rate is higher than the potential average consume rate. " +
|
||||
"The congestion will stop if the produce rate is lower or equal to the " +
|
||||
"average consume rate, or the total pending bytes lower than " +
|
||||
@ -4225,7 +4225,7 @@ object CelebornConf extends Logging {
|
||||
.withAlternative("celeborn.worker.excluded.expireTimeout")
|
||||
.categories("client")
|
||||
.version("0.3.0")
|
||||
.doc("Timeout time for LifecycleManager to clear reserved excluded worker. Default to be 1.5 * `celeborn.master.heartbeat.worker.timeout`" +
|
||||
.doc("Timeout time for LifecycleManager to clear reserved excluded worker. Default to be 1.5 * `celeborn.master.heartbeat.worker.timeout` " +
|
||||
"to cover worker heartbeat timeout check period")
|
||||
.timeConf(TimeUnit.MILLISECONDS)
|
||||
.createWithDefaultString("180s")
|
||||
@ -4235,7 +4235,7 @@ object CelebornConf extends Logging {
|
||||
.internal
|
||||
.categories("client")
|
||||
.version("0.3.0")
|
||||
.doc("When true, Celeborn will use local allocated workers as candidate being checked workers(check the workers" +
|
||||
.doc("When true, Celeborn will use local allocated workers as candidate being checked workers(check the workers " +
|
||||
"whether unKnown in master), this may be more useful for map partition to regenerate the lost data), " +
|
||||
"otherwise use local black list as candidate being checked workers.")
|
||||
.booleanConf
|
||||
@ -4441,7 +4441,7 @@ object CelebornConf extends Logging {
|
||||
buildConf("celeborn.client.push.limit.inFlight.timeout")
|
||||
.withAlternative("celeborn.push.limit.inFlight.timeout")
|
||||
.categories("client")
|
||||
.doc("Timeout for netty in-flight requests to be done." +
|
||||
.doc("Timeout for netty in-flight requests to be done. " +
|
||||
s"Default value should be `${CLIENT_PUSH_DATA_TIMEOUT.key} * 2`.")
|
||||
.version("0.3.0")
|
||||
.timeConf(TimeUnit.MILLISECONDS)
|
||||
@ -4577,8 +4577,9 @@ object CelebornConf extends Logging {
|
||||
val CLIENT_FETCH_EXCLUDED_WORKER_EXPIRE_TIMEOUT: ConfigEntry[Long] =
|
||||
buildConf("celeborn.client.fetch.excludedWorker.expireTimeout")
|
||||
.categories("client")
|
||||
.doc("ShuffleClient is a static object, it will be used in the whole lifecycle of Executor," +
|
||||
"We give a expire time for excluded workers to avoid a transient worker issues.")
|
||||
.doc(
|
||||
"ShuffleClient is a static object, it will be used in the whole lifecycle of Executor, " +
|
||||
"We give a expire time for excluded workers to avoid a transient worker issues.")
|
||||
.version("0.3.0")
|
||||
.fallbackConf(CLIENT_EXCLUDED_WORKER_EXPIRE_TIMEOUT)
|
||||
|
||||
@ -5257,7 +5258,7 @@ object CelebornConf extends Logging {
|
||||
val METRICS_WORKER_PAUSE_SPENT_TIME_FORCE_APPEND_THRESHOLD: ConfigEntry[Int] =
|
||||
buildConf("celeborn.metrics.worker.pauseSpentTime.forceAppend.threshold")
|
||||
.categories("metrics")
|
||||
.doc("Force append worker pause spent time even if worker still in pause serving state." +
|
||||
.doc("Force append worker pause spent time even if worker still in pause serving state. " +
|
||||
"Help user can find worker pause spent time increase, when worker always been pause state.")
|
||||
.intConf
|
||||
.createWithDefault(10)
|
||||
@ -5558,8 +5559,8 @@ object CelebornConf extends Logging {
|
||||
val CLIENT_INPUTSTREAM_CREATION_WINDOW: ConfigEntry[Int] =
|
||||
buildConf("celeborn.client.inputStream.creation.window")
|
||||
.categories("client")
|
||||
.doc(s"Window size that CelebornShuffleReader pre-creates CelebornInputStreams, for coalesced scenario" +
|
||||
s"where multiple Partitions are read")
|
||||
.doc("Window size that CelebornShuffleReader pre-creates CelebornInputStreams, for coalesced scenario " +
|
||||
"where multiple Partitions are read")
|
||||
.version("0.6.0")
|
||||
.intConf
|
||||
.createWithDefault(16)
|
||||
@ -5595,7 +5596,7 @@ object CelebornConf extends Logging {
|
||||
"Store backend for dynamic config service. The store backend can be specified in two ways:" +
|
||||
" - Using the short name of the store backend defined in the implementation of `ConfigStore#getName` " +
|
||||
"whose return value can be mapped to the corresponding backend implementation. Available options: FS, DB." +
|
||||
" - Using the service class name of the store backend implementation." +
|
||||
" - Using the service class name of the store backend implementation. " +
|
||||
"If not provided, it means that dynamic configuration is disabled.")
|
||||
.version("0.4.0")
|
||||
.stringConf
|
||||
|
||||
@ -27,11 +27,11 @@ license: |
|
||||
| celeborn.client.commitFiles.ignoreExcludedWorker | false | false | When true, LifecycleManager will skip workers which are in the excluded list. | 0.3.0 | |
|
||||
| celeborn.client.eagerlyCreateInputStream.threads | 32 | false | Threads count for streamCreatorPool in CelebornShuffleReader. | 0.3.1 | |
|
||||
| celeborn.client.excludePeerWorkerOnFailure.enabled | true | false | When true, Celeborn will exclude partition's peer worker on failure when push data to replica failed. | 0.3.0 | |
|
||||
| celeborn.client.excludedWorker.expireTimeout | 180s | false | Timeout time for LifecycleManager to clear reserved excluded worker. Default to be 1.5 * `celeborn.master.heartbeat.worker.timeout`to cover worker heartbeat timeout check period | 0.3.0 | celeborn.worker.excluded.expireTimeout |
|
||||
| celeborn.client.excludedWorker.expireTimeout | 180s | false | Timeout time for LifecycleManager to clear reserved excluded worker. Default to be 1.5 * `celeborn.master.heartbeat.worker.timeout` to cover worker heartbeat timeout check period | 0.3.0 | celeborn.worker.excluded.expireTimeout |
|
||||
| celeborn.client.fetch.buffer.size | 64k | false | Size of reducer partition buffer memory for shuffle reader. The fetched data will be buffered in memory before consuming. For performance consideration keep this buffer size not less than `celeborn.client.push.buffer.max.size`. | 0.4.0 | |
|
||||
| celeborn.client.fetch.dfsReadChunkSize | 8m | false | Max chunk size for DfsPartitionReader. | 0.3.1 | |
|
||||
| celeborn.client.fetch.excludeWorkerOnFailure.enabled | false | false | Whether to enable shuffle client-side fetch exclude workers on failure. | 0.3.0 | |
|
||||
| celeborn.client.fetch.excludedWorker.expireTimeout | <value of celeborn.client.excludedWorker.expireTimeout> | false | ShuffleClient is a static object, it will be used in the whole lifecycle of Executor,We give a expire time for excluded workers to avoid a transient worker issues. | 0.3.0 | |
|
||||
| celeborn.client.fetch.excludedWorker.expireTimeout | <value of celeborn.client.excludedWorker.expireTimeout> | false | ShuffleClient is a static object, it will be used in the whole lifecycle of Executor, We give a expire time for excluded workers to avoid a transient worker issues. | 0.3.0 | |
|
||||
| celeborn.client.fetch.maxReqsInFlight | 3 | false | Amount of in-flight chunk fetch request. | 0.3.0 | celeborn.fetch.maxReqsInFlight |
|
||||
| celeborn.client.fetch.maxRetriesForEachReplica | 3 | false | Max retry times of fetch chunk on each replica | 0.3.0 | celeborn.fetch.maxRetriesForEachReplica,celeborn.fetch.maxRetries |
|
||||
| celeborn.client.fetch.timeout | 600s | false | Timeout for a task to open stream and fetch chunk. | 0.3.0 | celeborn.fetch.timeout |
|
||||
@ -41,13 +41,13 @@ license: |
|
||||
| celeborn.client.flink.inputGate.supportFloatingBuffer | true | false | Whether to support floating buffer in Flink input gates. | 0.3.0 | remote-shuffle.job.support-floating-buffer-per-input-gate |
|
||||
| celeborn.client.flink.resultPartition.memory | 64m | false | Memory reserved for a result partition. | 0.3.0 | remote-shuffle.job.memory-per-partition |
|
||||
| celeborn.client.flink.resultPartition.supportFloatingBuffer | true | false | Whether to support floating buffer for result partitions. | 0.3.0 | remote-shuffle.job.support-floating-buffer-per-output-gate |
|
||||
| celeborn.client.inputStream.creation.window | 16 | false | Window size that CelebornShuffleReader pre-creates CelebornInputStreams, for coalesced scenariowhere multiple Partitions are read | 0.6.0 | |
|
||||
| celeborn.client.inputStream.creation.window | 16 | false | Window size that CelebornShuffleReader pre-creates CelebornInputStreams, for coalesced scenario where multiple Partitions are read | 0.6.0 | |
|
||||
| celeborn.client.mr.pushData.max | 32m | false | Max size for a push data sent from mr client. | 0.4.0 | |
|
||||
| celeborn.client.push.buffer.initial.size | 8k | false | | 0.3.0 | celeborn.push.buffer.initial.size |
|
||||
| celeborn.client.push.buffer.max.size | 64k | false | Max size of reducer partition buffer memory for shuffle hash writer. The pushed data will be buffered in memory before sending to Celeborn worker. For performance consideration keep this buffer size higher than 32K. Example: If reducer amount is 2000, buffer size is 64K, then each task will consume up to `64KiB * 2000 = 125MiB` heap memory. | 0.3.0 | celeborn.push.buffer.max.size |
|
||||
| celeborn.client.push.excludeWorkerOnFailure.enabled | false | false | Whether to enable shuffle client-side push exclude workers on failures. | 0.3.0 | |
|
||||
| celeborn.client.push.limit.inFlight.sleepInterval | 50ms | false | Sleep interval when check netty in-flight requests to be done. | 0.3.0 | celeborn.push.limit.inFlight.sleepInterval |
|
||||
| celeborn.client.push.limit.inFlight.timeout | <undefined> | false | Timeout for netty in-flight requests to be done.Default value should be `celeborn.client.push.timeout * 2`. | 0.3.0 | celeborn.push.limit.inFlight.timeout |
|
||||
| celeborn.client.push.limit.inFlight.timeout | <undefined> | false | Timeout for netty in-flight requests to be done. Default value should be `celeborn.client.push.timeout * 2`. | 0.3.0 | celeborn.push.limit.inFlight.timeout |
|
||||
| celeborn.client.push.limit.strategy | SIMPLE | false | The strategy used to control the push speed. Valid strategies are SIMPLE and SLOWSTART. The SLOWSTART strategy usually works with congestion control mechanism on the worker side. | 0.3.0 | |
|
||||
| celeborn.client.push.maxReqsInFlight.perWorker | 32 | false | Amount of Netty in-flight requests per worker. Default max memory of in flight requests per worker is `celeborn.client.push.maxReqsInFlight.perWorker` * `celeborn.client.push.buffer.max.size` * compression ratio(1 in worst case): 64KiB * 32 = 2MiB. The maximum memory will not exceed `celeborn.client.push.maxReqsInFlight.total`. | 0.3.0 | |
|
||||
| celeborn.client.push.maxReqsInFlight.total | 256 | false | Amount of total Netty in-flight requests. The maximum memory is `celeborn.client.push.maxReqsInFlight.total` * `celeborn.client.push.buffer.max.size` * compression ratio(1 in worst case): 64KiB * 256 = 16MiB | 0.3.0 | celeborn.push.maxReqsInFlight |
|
||||
@ -121,7 +121,7 @@ license: |
|
||||
| celeborn.client.spark.shuffle.forceFallback.enabled | false | false | Always use spark built-in shuffle implementation. This configuration is deprecated, consider configuring `celeborn.client.spark.shuffle.fallback.policy` instead. | 0.3.0 | celeborn.shuffle.forceFallback.enabled |
|
||||
| celeborn.client.spark.shuffle.writer | HASH | false | Celeborn supports the following kind of shuffle writers. 1. hash: hash-based shuffle writer works fine when shuffle partition count is normal; 2. sort: sort-based shuffle writer works fine when memory pressure is high or shuffle partition count is huge. This configuration only takes effect when celeborn.client.spark.push.dynamicWriteMode.enabled is false. | 0.3.0 | celeborn.shuffle.writer |
|
||||
| celeborn.client.tagsExpr | | false | Expression to filter workers by tags. The expression is a comma-separated list of tags. The expression is evaluated as a logical AND of all tags. For example, `prod,high-io` filters workers that have both the `prod` and `high-io` tags. | 0.6.0 | |
|
||||
| celeborn.master.endpoints | <localhost>:9097 | false | Endpoints of master nodes for celeborn clients to connect. Client uses resolver provided byceleborn.master.endpoints.resolver to resolve the master endpoints. By default Celeborn uses `org.apache.celeborn.common.client.StaticMasterEndpointResolver` which take static master endpoints as input. Allowed pattern: `<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If the port is omitted, 9097 will be used. If the master endpoints are not static then users can pass custom resolver implementation to discover master endpoints actively using celeborn.master.endpoints.resolver. | 0.2.0 | |
|
||||
| celeborn.master.endpoints | <localhost>:9097 | false | Endpoints of master nodes for celeborn clients to connect. Client uses resolver provided by celeborn.master.endpoints.resolver to resolve the master endpoints. By default Celeborn uses `org.apache.celeborn.common.client.StaticMasterEndpointResolver` which take static master endpoints as input. Allowed pattern: `<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If the port is omitted, 9097 will be used. If the master endpoints are not static then users can pass custom resolver implementation to discover master endpoints actively using celeborn.master.endpoints.resolver. | 0.2.0 | |
|
||||
| celeborn.master.endpoints.resolver | org.apache.celeborn.common.client.StaticMasterEndpointResolver | false | Resolver class that can be used for discovering and updating the master endpoints. This allows users to provide a custom master endpoint resolver implementation. This is useful in environments where the master nodes might change due to scaling operations or infrastructure updates. Clients need to ensure that provided resolver class should be present in the classpath. | 0.6.0 | |
|
||||
| celeborn.quota.enabled | true | false | When Master side sets to true, the master will enable to check the quota via QuotaManager. When Client side sets to true, LifecycleManager will request Master side to check whether the current user has enough quota before registration of shuffle. Fallback to the default shuffle service of Spark when Master side checks that there is no enough quota for current user. | 0.2.0 | |
|
||||
| celeborn.quota.identity.provider | org.apache.celeborn.common.identity.DefaultIdentityProvider | false | IdentityProvider class name. Default class is `org.apache.celeborn.common.identity.DefaultIdentityProvider`. Optional values: org.apache.celeborn.common.identity.HadoopBasedIdentityProvider user name will be obtained by UserGroupInformation.getUserName; org.apache.celeborn.common.identity.DefaultIdentityProvider user name and tenant id are default values or user-specific values. | 0.2.0 | |
|
||||
|
||||
@ -22,7 +22,7 @@ license: |
|
||||
| celeborn.cluster.name | default | false | Celeborn cluster name. | 0.5.0 | |
|
||||
| celeborn.container.info.provider | org.apache.celeborn.server.common.container.DefaultContainerInfoProvider | false | ContainerInfoProvider class name. Default class is `org.apache.celeborn.server.common.container.DefaultContainerInfoProvider`. | 0.6.0 | |
|
||||
| celeborn.dynamicConfig.refresh.interval | 120s | false | Interval for refreshing the corresponding dynamic config periodically. | 0.4.0 | |
|
||||
| celeborn.dynamicConfig.store.backend | <undefined> | false | Store backend for dynamic config service. The store backend can be specified in two ways: - Using the short name of the store backend defined in the implementation of `ConfigStore#getName` whose return value can be mapped to the corresponding backend implementation. Available options: FS, DB. - Using the service class name of the store backend implementation.If not provided, it means that dynamic configuration is disabled. | 0.4.0 | |
|
||||
| celeborn.dynamicConfig.store.backend | <undefined> | false | Store backend for dynamic config service. The store backend can be specified in two ways: - Using the short name of the store backend defined in the implementation of `ConfigStore#getName` whose return value can be mapped to the corresponding backend implementation. Available options: FS, DB. - Using the service class name of the store backend implementation. If not provided, it means that dynamic configuration is disabled. | 0.4.0 | |
|
||||
| celeborn.dynamicConfig.store.db.fetch.pageSize | 1000 | false | The page size for db store to query configurations. | 0.5.0 | |
|
||||
| celeborn.dynamicConfig.store.db.hikari.connectionTimeout | 30s | false | The connection timeout that a client will wait for a connection from the pool for db store backend. | 0.5.0 | |
|
||||
| celeborn.dynamicConfig.store.db.hikari.driverClassName | | false | The jdbc driver class name of db store backend. | 0.5.0 | |
|
||||
|
||||
@ -33,5 +33,5 @@ license: |
|
||||
| celeborn.metrics.sample.rate | 1.0 | false | It controls if Celeborn collect timer metrics for some operations. Its value should be in [0.0, 1.0]. | 0.2.0 | |
|
||||
| celeborn.metrics.timer.slidingWindow.size | 4096 | false | The sliding window size of timer metric. | 0.2.0 | |
|
||||
| celeborn.metrics.worker.app.topResourceConsumption.count | 50 | false | Size for top items about top resource consumption applications list of worker. The top resource consumption is determined by sum of diskBytesWritten and hdfsBytesWritten. The top resource consumption count prevents the total number of metrics from exceeding the metrics capacity. | 0.6.0 | |
|
||||
| celeborn.metrics.worker.pauseSpentTime.forceAppend.threshold | 10 | false | Force append worker pause spent time even if worker still in pause serving state.Help user can find worker pause spent time increase, when worker always been pause state. | | |
|
||||
| celeborn.metrics.worker.pauseSpentTime.forceAppend.threshold | 10 | false | Force append worker pause spent time even if worker still in pause serving state. Help user can find worker pause spent time increase, when worker always been pause state. | | |
|
||||
<!--end-include-->
|
||||
|
||||
@ -21,19 +21,19 @@ license: |
|
||||
| --- | ------- | --------- | ----------- | ----- | ---------- |
|
||||
| celeborn.<module>.fetch.timeoutCheck.interval | 5s | false | Interval for checking fetch data timeout. It only support setting <module> to `data` since it works for shuffle client fetch data. | 0.3.0 | |
|
||||
| celeborn.<module>.fetch.timeoutCheck.threads | 4 | false | Threads num for checking fetch data timeout. It only support setting <module> to `data` since it works for shuffle client fetch data. | 0.3.0 | |
|
||||
| celeborn.<module>.heartbeat.interval | 60s | false | The heartbeat interval between worker and client. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for replicate client of worker replicating data to peer worker.If you are using the "celeborn.client.heartbeat.interval", please use the new configs for each module according to your needs or replace it with "celeborn.rpc.heartbeat.interval", "celeborn.data.heartbeat.interval" and"celeborn.replicate.heartbeat.interval". | 0.3.0 | celeborn.client.heartbeat.interval |
|
||||
| celeborn.<module>.heartbeat.interval | 60s | false | The heartbeat interval between worker and client. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for replicate client of worker replicating data to peer worker. If you are using the "celeborn.client.heartbeat.interval", please use the new configs for each module according to your needs or replace it with "celeborn.rpc.heartbeat.interval", "celeborn.data.heartbeat.interval" and "celeborn.replicate.heartbeat.interval". | 0.3.0 | celeborn.client.heartbeat.interval |
|
||||
| celeborn.<module>.io.backLog | 0 | false | Requested maximum length of the queue of incoming connections. Default 0 for no backlog. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `push`, it works for worker receiving push data. If setting <module> to `replicate`, it works for replicate server of worker replicating data to peer worker. If setting <module> to `fetch`, it works for worker fetch server. | | |
|
||||
| celeborn.<module>.io.clientThreads | 0 | false | Number of threads used in the client thread pool. Default to 0, which is 2x#cores. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for replicate client of worker replicating data to peer worker. | | |
|
||||
| celeborn.<module>.io.connectTimeout | <value of celeborn.network.connect.timeout> | false | Socket connect timeout. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for the replicate client of worker replicating data to peer worker. | | |
|
||||
| celeborn.<module>.io.connectionTimeout | <value of celeborn.network.timeout> | false | Connection active timeout. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `push`, it works for worker receiving push data. If setting <module> to `replicate`, it works for replicate server or client of worker replicating data to peer worker. If setting <module> to `fetch`, it works for worker fetch server. | | |
|
||||
| celeborn.<module>.io.enableVerboseMetrics | false | false | Whether to track Netty memory detailed metrics. If true, the detailed metrics of Netty PoolByteBufAllocator will be gotten, otherwise only general memory usage will be tracked. | | |
|
||||
| celeborn.<module>.io.lazyFD | true | false | Whether to initialize FileDescriptor lazily or not. If true, file descriptors are created only when data is going to be transferred. This can reduce the number of open files. If setting <module> to `fetch`, it works for worker fetch server. | | |
|
||||
| celeborn.<module>.io.maxRetries | 3 | false | Max number of times we will try IO exceptions (such as connection timeouts) per request. If set to 0, we will not do any retries. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for replicate client of worker replicating data to peer worker.If setting <module> to `push`, it works for Flink shuffle client push data. | | |
|
||||
| celeborn.<module>.io.maxRetries | 3 | false | Max number of times we will try IO exceptions (such as connection timeouts) per request. If set to 0, we will not do any retries. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for replicate client of worker replicating data to peer worker. If setting <module> to `push`, it works for Flink shuffle client push data. | | |
|
||||
| celeborn.<module>.io.mode | NIO | false | Netty EventLoopGroup backend, available options: NIO, EPOLL. | | |
|
||||
| celeborn.<module>.io.numConnectionsPerPeer | 1 | false | Number of concurrent connections between two nodes. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for replicate client of worker replicating data to peer worker. | | |
|
||||
| celeborn.<module>.io.preferDirectBufs | true | false | If true, we will prefer allocating off-heap byte buffers within Netty. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `push`, it works for worker receiving push data. If setting <module> to `replicate`, it works for replicate server or client of worker replicating data to peer worker. If setting <module> to `fetch`, it works for worker fetch server. | | |
|
||||
| celeborn.<module>.io.receiveBuffer | 0b | false | Receive buffer size (SO_RCVBUF). Note: the optimal size for receive buffer and send buffer should be latency * network_bandwidth. Assuming latency = 1ms, network_bandwidth = 10Gbps buffer size should be ~ 1.25MB. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `push`, it works for worker receiving push data. If setting <module> to `replicate`, it works for replicate server or client of worker replicating data to peer worker. If setting <module> to `fetch`, it works for worker fetch server. | 0.2.0 | |
|
||||
| celeborn.<module>.io.retryWait | 5s | false | Time that we will wait in order to perform a retry after an IOException. Only relevant if maxIORetries > 0. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for replicate client of worker replicating data to peer worker.If setting <module> to `push`, it works for Flink shuffle client push data. | 0.2.0 | |
|
||||
| celeborn.<module>.io.retryWait | 5s | false | Time that we will wait in order to perform a retry after an IOException. Only relevant if maxIORetries > 0. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `replicate`, it works for replicate client of worker replicating data to peer worker. If setting <module> to `push`, it works for Flink shuffle client push data. | 0.2.0 | |
|
||||
| celeborn.<module>.io.saslTimeout | 30s | false | Timeout for a single round trip of auth message exchange, in milliseconds. | 0.5.0 | |
|
||||
| celeborn.<module>.io.sendBuffer | 0b | false | Send buffer size (SO_SNDBUF). If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `data`, it works for shuffle client push and fetch data. If setting <module> to `push`, it works for worker receiving push data. If setting <module> to `replicate`, it works for replicate server or client of worker replicating data to peer worker. If setting <module> to `fetch`, it works for worker fetch server. | 0.2.0 | |
|
||||
| celeborn.<module>.io.serverThreads | 0 | false | Number of threads used in the server thread pool. Default to 0, which is 2x#cores. If setting <module> to `rpc_app`, works for shuffle client. If setting <module> to `rpc_service`, works for master or worker. If setting <module> to `push`, it works for worker receiving push data. If setting <module> to `replicate`, it works for replicate server of worker replicating data to peer worker. If setting <module> to `fetch`, it works for worker fetch server. | | |
|
||||
|
||||
@ -22,7 +22,7 @@ license: |
|
||||
| celeborn.cluster.name | default | false | Celeborn cluster name. | 0.5.0 | |
|
||||
| celeborn.container.info.provider | org.apache.celeborn.server.common.container.DefaultContainerInfoProvider | false | ContainerInfoProvider class name. Default class is `org.apache.celeborn.server.common.container.DefaultContainerInfoProvider`. | 0.6.0 | |
|
||||
| celeborn.dynamicConfig.refresh.interval | 120s | false | Interval for refreshing the corresponding dynamic config periodically. | 0.4.0 | |
|
||||
| celeborn.dynamicConfig.store.backend | <undefined> | false | Store backend for dynamic config service. The store backend can be specified in two ways: - Using the short name of the store backend defined in the implementation of `ConfigStore#getName` whose return value can be mapped to the corresponding backend implementation. Available options: FS, DB. - Using the service class name of the store backend implementation. If not provided, it means that dynamic configuration is disabled. | 0.4.0 | |
|
||||
| celeborn.dynamicConfig.store.backend | <undefined> | false | Store backend for dynamic config service. The store backend can be specified in two ways: - Using the short name of the store backend defined in the implementation of `ConfigStore#getName` whose return value can be mapped to the corresponding backend implementation. Available options: FS, DB. - Using the service class name of the store backend implementation. If not provided, it means that dynamic configuration is disabled. | 0.4.0 | |
|
||||
| celeborn.dynamicConfig.store.db.fetch.pageSize | 1000 | false | The page size for db store to query configurations. | 0.5.0 | |
|
||||
| celeborn.dynamicConfig.store.db.hikari.connectionTimeout | 30s | false | The connection timeout that a client will wait for a connection from the pool for db store backend. | 0.5.0 | |
|
||||
| celeborn.dynamicConfig.store.db.hikari.driverClassName | | false | The jdbc driver class name of db store backend. | 0.5.0 | |
|
||||
@ -35,7 +35,7 @@ license: |
|
||||
| celeborn.dynamicConfig.store.fs.path | <undefined> | false | The path of dynamic config file for fs store backend. The file format should be yaml. The default path is `${CELEBORN_CONF_DIR}/dynamicConfig.yaml`. | 0.5.0 | |
|
||||
| celeborn.internal.port.enabled | false | false | Whether to create an internal port on Masters/Workers for inter-Masters/Workers communication. This is beneficial when SASL authentication is enforced for all interactions between clients and Celeborn Services, but the services can exchange messages without being subject to SASL authentication. | 0.5.0 | |
|
||||
| celeborn.logConf.enabled | false | false | When `true`, log the CelebornConf for debugging purposes. | 0.5.0 | |
|
||||
| celeborn.master.endpoints | <localhost>:9097 | false | Endpoints of master nodes for celeborn clients to connect. Client uses resolver provided by celeborn.master.endpoints.resolver to resolve the master endpoints. By default Celeborn uses `org.apache.celeborn.common.client.StaticMasterEndpointResolver` which take static master endpoints as input. Allowed pattern: `<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If the port is omitted, 9097 will be used. If the master endpoints are not static then users can pass custom resolver implementation to discover master endpoints actively using celeborn.master.endpoints.resolver. | 0.2.0 | |
|
||||
| celeborn.master.endpoints | <localhost>:9097 | false | Endpoints of master nodes for celeborn clients to connect. Client uses resolver provided by celeborn.master.endpoints.resolver to resolve the master endpoints. By default Celeborn uses `org.apache.celeborn.common.client.StaticMasterEndpointResolver` which take static master endpoints as input. Allowed pattern: `<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If the port is omitted, 9097 will be used. If the master endpoints are not static then users can pass custom resolver implementation to discover master endpoints actively using celeborn.master.endpoints.resolver. | 0.2.0 | |
|
||||
| celeborn.master.endpoints.resolver | org.apache.celeborn.common.client.StaticMasterEndpointResolver | false | Resolver class that can be used for discovering and updating the master endpoints. This allows users to provide a custom master endpoint resolver implementation. This is useful in environments where the master nodes might change due to scaling operations or infrastructure updates. Clients need to ensure that provided resolver class should be present in the classpath. | 0.6.0 | |
|
||||
| celeborn.master.estimatedPartitionSize.minSize | 8mb | false | Ignore partition size smaller than this configuration of partition size for estimation. | 0.3.0 | celeborn.shuffle.minPartitionSizeToEstimate |
|
||||
| celeborn.master.internal.endpoints | <localhost>:8097 | false | Endpoints of master nodes just for celeborn workers to connect, allowed pattern is: `<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:8097,clb2:8097,clb3:8097`. If the port is omitted, 8097 will be used. | 0.5.0 | |
|
||||
@ -59,7 +59,7 @@ license: |
|
||||
| celeborn.worker.commitFiles.timeout | 120s | false | Timeout for a Celeborn worker to commit files of a shuffle. It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`. | 0.3.0 | celeborn.worker.shuffle.commit.timeout |
|
||||
| celeborn.worker.commitFiles.wait.threads | 32 | false | Thread number of worker to wait for commit shuffle data files to finish. | 0.5.0 | |
|
||||
| celeborn.worker.congestionControl.check.interval | 10ms | false | Interval of worker checks congestion if celeborn.worker.congestionControl.enabled is true. | 0.3.2 | |
|
||||
| celeborn.worker.congestionControl.diskBuffer.high.watermark | 9223372036854775807b | false | If the total bytes in disk buffer exceeds this configure, will start to congest users whose produce rate is higher than the potential average consume rate. The congestion will stop if the produce rate is lower or equal to the average consume rate, or the total pending bytes lower than celeborn.worker.congestionControl.diskBuffer.low.watermark | 0.3.0 | celeborn.worker.congestionControl.high.watermark |
|
||||
| celeborn.worker.congestionControl.diskBuffer.high.watermark | 9223372036854775807b | false | If the total bytes in disk buffer exceeds this configure, will start to congest users whose produce rate is higher than the potential average consume rate. The congestion will stop if the produce rate is lower or equal to the average consume rate, or the total pending bytes lower than celeborn.worker.congestionControl.diskBuffer.low.watermark | 0.3.0 | celeborn.worker.congestionControl.high.watermark |
|
||||
| celeborn.worker.congestionControl.diskBuffer.low.watermark | 9223372036854775807b | false | Will stop congest users if the total pending bytes of disk buffer is lower than this configuration | 0.3.0 | celeborn.worker.congestionControl.low.watermark |
|
||||
| celeborn.worker.congestionControl.enabled | false | false | Whether to enable congestion control or not. | 0.3.0 | |
|
||||
| celeborn.worker.congestionControl.sample.time.window | 10s | false | The worker holds a time sliding list to calculate users' produce/consume rate | 0.3.0 | |
|
||||
@ -155,7 +155,7 @@ license: |
|
||||
| celeborn.worker.readBuffer.target.updateInterval | 100ms | false | The interval for memory manager to calculate new read buffer's target memory. | 0.3.0 | |
|
||||
| celeborn.worker.readBuffer.toTriggerReadMin | 32 | false | Min buffers count for map data partition to trigger read. | 0.3.0 | |
|
||||
| celeborn.worker.register.timeout | 180s | false | Worker register timeout. | 0.2.0 | |
|
||||
| celeborn.worker.replicate.fastFail.duration | 60s | false | If a replicate request not replied during the duration, worker will mark the replicate data request as failed. It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`. | 0.2.0 | |
|
||||
| celeborn.worker.replicate.fastFail.duration | 60s | false | If a replicate request not replied during the duration, worker will mark the replicate data request as failed. It's recommended to set at least `240s` when `HDFS` is enabled in `celeborn.storage.availableTypes`. | 0.2.0 | |
|
||||
| celeborn.worker.replicate.io.threads | <undefined> | false | Netty IO thread number of worker to replicate shuffle data. The default threads number is the number of flush thread. | 0.2.0 | |
|
||||
| celeborn.worker.replicate.port | 0 | false | Server port for Worker to receive replicate data request from other Workers. | 0.2.0 | |
|
||||
| celeborn.worker.replicate.randomConnection.enabled | true | false | Whether worker will create random connection to peer when replicate data. When false, worker tend to reuse the same cached TransportClient to a specific replicate worker; when true, worker tend to use different cached TransportClient. Netty will use the same thread to serve the same connection, so with more connections replicate server can leverage more netty threads | 0.2.1 | |
|
||||
|
||||
Loading…
Reference in New Issue
Block a user