From 05b6ad4a7b6af2ddfb1b196d95ce16064c223d56 Mon Sep 17 00:00:00 2001 From: Sanskar Modi Date: Tue, 11 Mar 2025 07:39:32 +0800 Subject: [PATCH] [MINOR] Change config versions ### What changes were proposed in this pull request? 0.6.0 -> 0.5.4 - `celeborn.rpc.retryWait` - `celeborn.client.rpc.retryWait` `empty` -> 0.5.4 - `celeborn.<module>.io.conflictAvoidChooser.enable` ### Why are the changes needed? ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Closes #3142 from s0nskar/config_rpc_retry. Authored-by: Sanskar Modi Signed-off-by: SteNicholas --- .../main/scala/org/apache/celeborn/common/CelebornConf.scala | 5 +++-- docs/configuration/client.md | 2 +- docs/configuration/network.md | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index 00c25b379..c4b8ffb54 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -1911,7 +1911,7 @@ object CelebornConf extends Logging { val RPC_RETRY_WAIT: ConfigEntry[Long] = buildConf("celeborn.rpc.retryWait") .categories("network") - .version("0.6.0") + .version("0.5.4") .doc("Time to wait before next retry on RpcTimeoutException.") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("1s") @@ -2094,6 +2094,7 @@ object CelebornConf extends Logging { val NETWORK_IO_CLIENT_CONFLICT_AVOID_CHOOSER_ENABLE: ConfigEntry[Boolean] = buildConf("celeborn.<module>.io.conflictAvoidChooser.enable") .categories("network") + .version("0.5.4") .doc("Whether to use conflict avoid event executor chooser in the client thread pool. " + s"If setting to `${TransportModuleConstants.RPC_APP_MODULE}`, " + s"works for shuffle client. 
" + @@ -4990,7 +4991,7 @@ object CelebornConf extends Logging { val CLIENT_RPC_RETRY_WAIT: ConfigEntry[Long] = buildConf("celeborn.client.rpc.retryWait") .categories("client") - .version("0.6.0") + .version("0.5.4") .doc("Client-specified time to wait before next retry on RpcTimeoutException.") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("1s") diff --git a/docs/configuration/client.md b/docs/configuration/client.md index d9e1a700c..6463edddc 100644 --- a/docs/configuration/client.md +++ b/docs/configuration/client.md @@ -85,7 +85,7 @@ license: | | celeborn.client.rpc.registerShuffle.askTimeout | <value of celeborn.rpc.askTimeout> | false | Timeout for ask operations during register shuffle. During this process, there are two times for retry opportunities for requesting slots, one request for establishing a connection with Worker and `celeborn.client.reserveSlots.maxRetries` times for retry opportunities for reserving slots. User can customize this value according to your setting. | 0.3.0 | celeborn.rpc.registerShuffle.askTimeout | | celeborn.client.rpc.requestPartition.askTimeout | <value of celeborn.rpc.askTimeout> | false | Timeout for ask operations during requesting change partition location, such as reviving or splitting partition. During this process, there are `celeborn.client.reserveSlots.maxRetries` times for retry opportunities for reserving slots. User can customize this value according to your setting. | 0.2.0 | | | celeborn.client.rpc.reserveSlots.askTimeout | <value of celeborn.rpc.askTimeout> | false | Timeout for LifecycleManager request reserve slots. | 0.3.0 | | -| celeborn.client.rpc.retryWait | 1s | false | Client-specified time to wait before next retry on RpcTimeoutException. | 0.6.0 | | +| celeborn.client.rpc.retryWait | 1s | false | Client-specified time to wait before next retry on RpcTimeoutException. | 0.5.4 | | | celeborn.client.rpc.shared.threads | 16 | false | Number of shared rpc threads in LifecycleManager. 
| 0.3.2 | | | celeborn.client.shuffle.batchHandleChangePartition.interval | 100ms | false | Interval for LifecycleManager to schedule handling change partition requests in batch. | 0.3.0 | celeborn.shuffle.batchHandleChangePartition.interval | | celeborn.client.shuffle.batchHandleChangePartition.partitionBuckets | 256 | false | Max number of change partition requests which can be concurrently processed. | 0.5.0 | | diff --git a/docs/configuration/network.md b/docs/configuration/network.md index e60f4f1a6..199a9328f 100644 --- a/docs/configuration/network.md +++ b/docs/configuration/network.md @@ -24,7 +24,7 @@ license: | | celeborn.<module>.heartbeat.interval | 60s | false | The heartbeat interval between worker and client. If setting to `rpc_app`, works for shuffle client. If setting to `rpc_service`, works for master or worker. If setting to `data`, it works for shuffle client push and fetch data. If setting to `replicate`, it works for replicate client of worker replicating data to peer worker. If you are using the "celeborn.client.heartbeat.interval", please use the new configs for each module according to your needs or replace it with "celeborn.rpc.heartbeat.interval", "celeborn.data.heartbeat.interval" and "celeborn.replicate.heartbeat.interval". | 0.3.0 | celeborn.client.heartbeat.interval | | celeborn.<module>.io.backLog | 0 | false | Requested maximum length of the queue of incoming connections. Default 0 for no backlog. If setting to `rpc_app`, works for shuffle client. If setting to `rpc_service`, works for master or worker. If setting to `push`, it works for worker receiving push data. If setting to `replicate`, it works for replicate server of worker replicating data to peer worker. If setting to `fetch`, it works for worker fetch server. | | | | celeborn.<module>.io.clientThreads | 0 | false | Number of threads used in the client thread pool. Default to 0, which is 2x#cores. If setting to `rpc_app`, works for shuffle client. 
If setting to `rpc_service`, works for master or worker. If setting to `data`, it works for shuffle client push and fetch data. If setting to `replicate`, it works for replicate client of worker replicating data to peer worker. | | | -| celeborn.<module>.io.conflictAvoidChooser.enable | false | false | Whether to use conflict avoid event executor chooser in the client thread pool. If setting to `rpc_app`, works for shuffle client. If setting to `rpc_service`, works for master or worker. If setting to `data`, it works for shuffle client push and fetch data. If setting to `replicate`, it works for replicate client of worker replicating data to peer worker. | | | +| celeborn.<module>.io.conflictAvoidChooser.enable | false | false | Whether to use conflict avoid event executor chooser in the client thread pool. If setting to `rpc_app`, works for shuffle client. If setting to `rpc_service`, works for master or worker. If setting to `data`, it works for shuffle client push and fetch data. If setting to `replicate`, it works for replicate client of worker replicating data to peer worker. | 0.5.4 | | | celeborn.<module>.io.connectTimeout | <value of celeborn.network.connect.timeout> | false | Socket connect timeout. If setting to `rpc_app`, works for shuffle client. If setting to `rpc_service`, works for master or worker. If setting to `data`, it works for shuffle client push and fetch data. If setting to `replicate`, it works for the replicate client of worker replicating data to peer worker. | | | | celeborn.<module>.io.connectionTimeout | <value of celeborn.network.timeout> | false | Connection active timeout. If setting to `rpc_app`, works for shuffle client. If setting to `rpc_service`, works for master or worker. If setting to `data`, it works for shuffle client push and fetch data. If setting to `push`, it works for worker receiving push data. If setting to `replicate`, it works for replicate server or client of worker replicating data to peer worker. 
If setting to `fetch`, it works for worker fetch server. | | | | celeborn.<module>.io.lazyFD | true | false | Whether to initialize FileDescriptor lazily or not. If true, file descriptors are created only when data is going to be transferred. This can reduce the number of open files. If setting to `fetch`, it works for worker fetch server. | | | @@ -56,7 +56,7 @@ license: | | celeborn.rpc.inbox.capacity | 0 | false | Specifies size of the in memory bounded capacity. | 0.5.0 | | | celeborn.rpc.io.threads | <undefined> | false | Netty IO thread number of NettyRpcEnv to handle RPC request. The default threads number is the number of runtime available processors. | 0.2.0 | | | celeborn.rpc.lookupTimeout | 30s | false | Timeout for RPC lookup operations. | 0.2.0 | | -| celeborn.rpc.retryWait | 1s | false | Time to wait before next retry on RpcTimeoutException. | 0.6.0 | | +| celeborn.rpc.retryWait | 1s | false | Time to wait before next retry on RpcTimeoutException. | 0.5.4 | | | celeborn.rpc.slow.interval | <undefined> | false | min interval (ms) for RPC framework to log slow RPC | 0.6.0 | | | celeborn.rpc.slow.threshold | 1s | false | threshold for RPC framework to log slow RPC | 0.6.0 | | | celeborn.shuffle.io.maxChunksBeingTransferred | <undefined> | false | The max number of chunks allowed to be transferred at the same time on shuffle service. Note that new incoming connections will be closed when the max number is hit. The client will retry according to the shuffle retry configs (see `celeborn.<module>.io.maxRetries` and `celeborn.<module>.io.retryWait`), if those limits are reached the task will fail with fetch failure. | 0.2.0 | |