Spark Conf temp

2020-12-31 15:02:21 +08:00 · 2020-12-31 15:02:21 +08:00 · d70aec651d
commit d70aec651d
parent a1bfcc1273
3 changed files with 302 additions and 8 deletions
--- a/conf/kyuubi-defaults.conf.template
+++ b/conf/kyuubi-defaults.conf.template
@@ -20,11 +20,151 @@
 # kyuubi.authentication           NONE
 # kyuubi.frontend.bind.port       10009
 #
+
 ## Spark Configurations, they will override those in $SPARK_HOME/conf/spark-defaults.conf
-#
-# spark.master                    local
-# spark.ui.enabled                false
-# spark.driver.extraJavaOptions   -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
+## Dummy Ones
+# spark.master                      local
+# spark.submit.deployMode           client
+# spark.ui.enabled                  false
+# spark.ui.port                     0
+# spark.driver.extraJavaOptions     -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
+# spark.scheduler.mode              FAIR
+# spark.serializer                  org.apache.spark.serializer.KryoSerializer
+# spark.kryoserializer.buffer.max   128m
+# spark.buffer.size                 131072
+# spark.local.dir                   ./local
+# spark.network.timeout             120s
+# spark.cleaner.periodicGC.interval 10min
+
+## Spark Driver / AM Sizing
+# spark.driver.cores            4
+# spark.driver.memory           8g
+# spark.driver.memoryOverhead   2048
+# spark.driver.extraJavaOptions -XX:MaxDirectMemorySize=2048m
+# spark.driver.maxResultSize    3g
+# spark.yarn.am.cores           4
+# spark.yarn.am.memory          2g
+# spark.yarn.am.memoryOverhead  1024
+
+## Spark Executor Sizing
+# spark.executor.instances        100
+# spark.executor.cores            4
+# spark.executor.memory           16g
+# spark.executor.memoryOverhead   4096
+# spark.executor.extraJavaOptions -XX:MaxDirectMemorySize=2048m
+
+## Executor Heartbeat
+# spark.storage.blockManagerHeartbeatTimeoutMs                       300s
+# spark.executor.heartbeatInterval                                   15s
+# spark.executor.heartbeat.maxFailures                               30
+
+
+## Event Queue Capacity
+# spark.scheduler.revive.interval                                    1s
+# spark.scheduler.listenerbus.eventqueue.capacity                    100000
+# spark.scheduler.listenerbus.eventqueue.executorManagement.capacity 100000
+# spark.scheduler.listenerbus.eventqueue.appStatus.capacity          100000
+# spark.scheduler.listenerbus.eventqueue.shared.capacity             100000
+# spark.scheduler.listenerbus.eventqueue.eventLog.capacity           20000
+
+## Dynamic Allocation
+# spark.dynamicAllocation.enabled                           true
+# spark.dynamicAllocation.initialExecutors                  10
+# spark.dynamicAllocation.minExecutors                      10
+# spark.dynamicAllocation.maxExecutors                      500
+# spark.dynamicAllocation.executorAllocationRatio           0.8
+# spark.dynamicAllocation.executorIdleTimeout               60s
+# spark.dynamicAllocation.cachedExecutorIdleTimeout         1h
+# spark.dynamicAllocation.shuffleTracking.enabled           false
+# spark.dynamicAllocation.shuffleTracking.timeout           30min
+# spark.dynamicAllocation.schedulerBacklogTimeout           1s
+# spark.dynamicAllocation.sustainedSchedulerBacklogTimeout  1s
+
+## External Shuffle Service
+# spark.shuffle.service.enabled                             true
+# spark.shuffle.service.fetch.rdd.enabled                   true
+# spark.shuffle.service.port                                7337
+
+## Speculation
+# spark.speculation                         true
+# spark.speculation.interval                1s
+# spark.speculation.multiplier              1.5
+# spark.speculation.quantile                0.9
+# spark.speculation.task.duration.threshold 10min
+
+## Shuffle Behavior
+# spark.shuffle.compress                                    true
+# spark.shuffle.detectCorrupt                               true
+# spark.shuffle.detectCorrupt.useExtraMemory                true
+# spark.shuffle.file.buffer                                 64k
+# spark.shuffle.unsafe.file.output.buffer                   64k
+# spark.shuffle.spill.diskWriteBufferSize                   8k
+# spark.shuffle.spill.compress                              true
+# spark.shuffle.mapOutput.dispatcher.numThreads             12
+# spark.shuffle.mapOutput.parallelAggregationThreshold      5000
+# spark.shuffle.readHostLocalDisk                           true
+# spark.shuffle.io.maxRetries                               10
+# spark.shuffle.io.retryWait                                6s
+# spark.shuffle.io.preferDirectBufs                         false
+# spark.shuffle.io.serverThreads                            8
+# spark.shuffle.io.clientThreads                            8
+# spark.shuffle.io.connectionTimeout                        240s
+# spark.shuffle.registration.timeout                        6000
+# spark.shuffle.registration.maxAttempts                    10
+# spark.shuffle.sync                                        false
+# spark.shuffle.useOldFetchProtocol                         true
+# spark.shuffle.unsafe.fastMergeEnabled                     true
+# spark.shuffle.minNumPartitionsToHighlyCompress            100
+# spark.network.maxRemoteBlockSizeFetchToMem                128m
+# spark.reducer.maxSizeInFlight                             48m
+# spark.reducer.maxReqsInFlight                             256
+# spark.reducer.maxBlocksInFlightPerAddress                 256
+
+## Data Locality for Task Schedule
+# spark.locality.wait                                       0s
+# spark.locality.wait.process                               0s
+# spark.locality.wait.node                                  0s
+# spark.locality.wait.rack                                  0s
+
+## Event Logging for History Server
+# spark.eventLog.enabled                            true
+# spark.eventLog.dir                                hdfs://hadoop-dfs/history
+# spark.eventLog.compress                           true
+# spark.eventLog.longForm.enabled                   true
+# spark.eventLog.rolling.enabled                    true
+# spark.yarn.historyServer.address                  http://historyserver:18080
+
+## SQL
+## General SQL Settings
+# spark.sql.shuffle.partitions                              8192
+# spark.sql.optimizer.inSetConversionThreshold              2
+# spark.sql.autoBroadcastJoinThreshold                      64m
+# spark.sql.broadcastTimeout                                600s
+# spark.sql.join.preferSortMergeJoin                        true
+# spark.sql.hive.metastorePartitionPruning                  true
+# spark.sql.parquet.filterPushdown                          true
+# spark.sql.parquet.recordLevelFilter.enabled               true
+# spark.sql.statistics.fallBackToHdfs                       true
+## Dynamic Partition Pruning
+# spark.sql.optimizer.dynamicPartitionPruning.enabled             true
+# spark.sql.optimizer.dynamicPartitionPruning.useStats            true
+# spark.sql.optimizer.dynamicPartitionPruning.fallbackFilterRatio 0.5
+# spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcastOnly  true
+
+## Adaptive Query Execution
+# spark.sql.adaptive.enabled                                true
+# spark.sql.adaptive.forceApply                             false
+# spark.sql.adaptive.logLevel                               info
+# spark.sql.adaptive.advisoryPartitionSizeInBytes           128m
+# spark.sql.adaptive.coalescePartitions.enabled             true
+# spark.sql.adaptive.coalescePartitions.minPartitionNum     64
+# spark.sql.adaptive.coalescePartitions.initialPartitionNum 8192
+# spark.sql.adaptive.fetchShuffleBlocksInBatch              true
+# spark.sql.adaptive.localShuffleReader.enabled             true
+# spark.sql.adaptive.skewJoin.enabled                       true
+# spark.sql.adaptive.skewJoin.skewedPartitionFactor         5
+# spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes 256m
+# spark.sql.adaptive.nonEmptyPartitionRatioForBroadcastJoin 0.2

 ## Hadoop Configurations, they will override those in $HADOOP_CONF_DIR
 #
--- a/conf/kyuubi-env.sh.template
+++ b/conf/kyuubi-env.sh.template
@@ -39,3 +39,10 @@
 # - SPARK_CONF_DIR          Optional directory where the Spark configuration lives.
 #                           (Default: $SPARK_HOME/conf)
 #
+
+
+## Examples ##
+
+# export JAVA_HOME=/usr/jdk64/jdk1.8.0_152
+# export HADOOP_CONF_DIR=/usr/ndp/current/mapreduce_client/conf
+# export KYUUBI_JAVA_OPTS="-Xmx10g -XX:+UnlockDiagnosticVMOptions -XX:ParGCCardsPerStrideChunk=4096 -XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:+CMSConcurrentMTEnabled -XX:CMSInitiatingOccupancyFraction=70 -XX:+UseCMSInitiatingOccupancyOnly -XX:+CMSClassUnloadingEnabled -XX:+CMSParallelRemarkEnabled -XX:+UseCondCardMark -XX:MaxDirectMemorySize=1024m  -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=./logs -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintTenuringDistribution -Xloggc:./logs/kyuubi-server-gc-%t.log -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=5M -XX:NewRatio=3 -XX:MetaspaceSize=512m"
--- a/docs/deployment/settings.md
+++ b/docs/deployment/settings.md
@@ -58,6 +58,13 @@ You can configure the environment variables in `$KYUUBI_HOME/conf/kyuubi-env.sh`
 # - SPARK_CONF_DIR          Optional directory where the Spark configuration lives.
 #                           (Default: $SPARK_HOME/conf)
 #
+
+
+## Examples ##
+
+# export JAVA_HOME=/usr/jdk64/jdk1.8.0_152
+# export HADOOP_CONF_DIR=/usr/ndp/current/mapreduce_client/conf
+# export KYUUBI_JAVA_OPTS="-Xmx10g -XX:+UnlockDiagnosticVMOptions -XX:ParGCCardsPerStrideChunk=4096 -XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:+CMSConcurrentMTEnabled -XX:CMSInitiatingOccupancyFraction=70 -XX:+UseCMSInitiatingOccupancyOnly -XX:+CMSClassUnloadingEnabled -XX:+CMSParallelRemarkEnabled -XX:+UseCondCardMark -XX:MaxDirectMemorySize=1024m  -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=./logs -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+PrintTenuringDistribution -Xloggc:./logs/kyuubi-server-gc-%t.log -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=5M -XX:NewRatio=3 -XX:MetaspaceSize=512m"
 ```
 ## Kyuubi Configurations

@@ -86,11 +93,151 @@ You can configure the Kyuubi properties in `$KYUUBI_HOME/conf/kyuubi-defaults.co
 # kyuubi.authentication           NONE
 # kyuubi.frontend.bind.port       10009
 #
+
 ## Spark Configurations, they will override those in $SPARK_HOME/conf/spark-defaults.conf
-#
-# spark.master                    local
-# spark.ui.enabled                false
-# spark.driver.extraJavaOptions   -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
+## Dummy Ones
+# spark.master                      local
+# spark.submit.deployMode           client
+# spark.ui.enabled                  false
+# spark.ui.port                     0
+# spark.driver.extraJavaOptions     -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
+# spark.scheduler.mode              FAIR
+# spark.serializer                  org.apache.spark.serializer.KryoSerializer
+# spark.kryoserializer.buffer.max   128m
+# spark.buffer.size                 131072
+# spark.local.dir                   ./local
+# spark.network.timeout             120s
+# spark.cleaner.periodicGC.interval 10min
+
+## Spark Driver / AM Sizing
+# spark.driver.cores            4
+# spark.driver.memory           8g
+# spark.driver.memoryOverhead   2048
+# spark.driver.extraJavaOptions -XX:MaxDirectMemorySize=2048m
+# spark.driver.maxResultSize    3g
+# spark.yarn.am.cores           4
+# spark.yarn.am.memory          2g
+# spark.yarn.am.memoryOverhead  1024
+
+## Spark Executor Sizing
+# spark.executor.instances        100
+# spark.executor.cores            4
+# spark.executor.memory           16g
+# spark.executor.memoryOverhead   4096
+# spark.executor.extraJavaOptions -XX:MaxDirectMemorySize=2048m
+
+## Executor Heartbeat
+# spark.storage.blockManagerHeartbeatTimeoutMs                       300s
+# spark.executor.heartbeatInterval                                   15s
+# spark.executor.heartbeat.maxFailures                               30
+
+
+## Event Queue Capacity
+# spark.scheduler.revive.interval                                    1s
+# spark.scheduler.listenerbus.eventqueue.capacity                    100000
+# spark.scheduler.listenerbus.eventqueue.executorManagement.capacity 100000
+# spark.scheduler.listenerbus.eventqueue.appStatus.capacity          100000
+# spark.scheduler.listenerbus.eventqueue.shared.capacity             100000
+# spark.scheduler.listenerbus.eventqueue.eventLog.capacity           20000
+
+## Dynamic Allocation
+# spark.dynamicAllocation.enabled                           true
+# spark.dynamicAllocation.initialExecutors                  10
+# spark.dynamicAllocation.minExecutors                      10
+# spark.dynamicAllocation.maxExecutors                      500
+# spark.dynamicAllocation.executorAllocationRatio           0.8
+# spark.dynamicAllocation.executorIdleTimeout               60s
+# spark.dynamicAllocation.cachedExecutorIdleTimeout         1h
+# spark.dynamicAllocation.shuffleTracking.enabled           false
+# spark.dynamicAllocation.shuffleTracking.timeout           30min
+# spark.dynamicAllocation.schedulerBacklogTimeout           1s
+# spark.dynamicAllocation.sustainedSchedulerBacklogTimeout  1s
+
+## External Shuffle Service
+# spark.shuffle.service.enabled                             true
+# spark.shuffle.service.fetch.rdd.enabled                   true
+# spark.shuffle.service.port                                7337
+
+## Speculation
+# spark.speculation                         true
+# spark.speculation.interval                1s
+# spark.speculation.multiplier              1.5
+# spark.speculation.quantile                0.9
+# spark.speculation.task.duration.threshold 10min
+
+## Shuffle Behavior
+# spark.shuffle.compress                                    true
+# spark.shuffle.detectCorrupt                               true
+# spark.shuffle.detectCorrupt.useExtraMemory                true
+# spark.shuffle.file.buffer                                 64k
+# spark.shuffle.unsafe.file.output.buffer                   64k
+# spark.shuffle.spill.diskWriteBufferSize                   8k
+# spark.shuffle.spill.compress                              true
+# spark.shuffle.mapOutput.dispatcher.numThreads             12
+# spark.shuffle.mapOutput.parallelAggregationThreshold      5000
+# spark.shuffle.readHostLocalDisk                           true
+# spark.shuffle.io.maxRetries                               10
+# spark.shuffle.io.retryWait                                6s
+# spark.shuffle.io.preferDirectBufs                         false
+# spark.shuffle.io.serverThreads                            8
+# spark.shuffle.io.clientThreads                            8
+# spark.shuffle.io.connectionTimeout                        240s
+# spark.shuffle.registration.timeout                        6000
+# spark.shuffle.registration.maxAttempts                    10
+# spark.shuffle.sync                                        false
+# spark.shuffle.useOldFetchProtocol                         true
+# spark.shuffle.unsafe.fastMergeEnabled                     true
+# spark.shuffle.minNumPartitionsToHighlyCompress            100
+# spark.network.maxRemoteBlockSizeFetchToMem                128m
+# spark.reducer.maxSizeInFlight                             48m
+# spark.reducer.maxReqsInFlight                             256
+# spark.reducer.maxBlocksInFlightPerAddress                 256
+
+## Data Locality for Task Schedule
+# spark.locality.wait                                       0s
+# spark.locality.wait.process                               0s
+# spark.locality.wait.node                                  0s
+# spark.locality.wait.rack                                  0s
+
+## Event Logging for History Server
+# spark.eventLog.enabled                            true
+# spark.eventLog.dir                                hdfs://hadoop-dfs/history
+# spark.eventLog.compress                           true
+# spark.eventLog.longForm.enabled                   true
+# spark.eventLog.rolling.enabled                    true
+# spark.yarn.historyServer.address                  http://historyserver:18080
+
+## SQL
+## General SQL Settings
+# spark.sql.shuffle.partitions                              8192
+# spark.sql.optimizer.inSetConversionThreshold              2
+# spark.sql.autoBroadcastJoinThreshold                      64m
+# spark.sql.broadcastTimeout                                600s
+# spark.sql.join.preferSortMergeJoin                        true
+# spark.sql.hive.metastorePartitionPruning                  true
+# spark.sql.parquet.filterPushdown                          true
+# spark.sql.parquet.recordLevelFilter.enabled               true
+# spark.sql.statistics.fallBackToHdfs                       true
+## Dynamic Partition Pruning
+# spark.sql.optimizer.dynamicPartitionPruning.enabled             true
+# spark.sql.optimizer.dynamicPartitionPruning.useStats            true
+# spark.sql.optimizer.dynamicPartitionPruning.fallbackFilterRatio 0.5
+# spark.sql.optimizer.dynamicPartitionPruning.reuseBroadcastOnly  true
+
+## Adaptive Query Execution
+# spark.sql.adaptive.enabled                                true
+# spark.sql.adaptive.forceApply                             false
+# spark.sql.adaptive.logLevel                               info
+# spark.sql.adaptive.advisoryPartitionSizeInBytes           128m
+# spark.sql.adaptive.coalescePartitions.enabled             true
+# spark.sql.adaptive.coalescePartitions.minPartitionNum     64
+# spark.sql.adaptive.coalescePartitions.initialPartitionNum 8192
+# spark.sql.adaptive.fetchShuffleBlocksInBatch              true
+# spark.sql.adaptive.localShuffleReader.enabled             true
+# spark.sql.adaptive.skewJoin.enabled                       true
+# spark.sql.adaptive.skewJoin.skewedPartitionFactor         5
+# spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes 256m
+# spark.sql.adaptive.nonEmptyPartitionRatioForBroadcastJoin 0.2

 ## Hadoop Configurations, they will override those in $HADOOP_CONF_DIR
 #