diff --git a/dev/kyuubi-tpcds/README.md b/dev/kyuubi-tpcds/README.md index a9a6487aa..717c1b0ed 100644 --- a/dev/kyuubi-tpcds/README.md +++ b/dev/kyuubi-tpcds/README.md @@ -48,14 +48,15 @@ $SPARK_HOME/bin/spark-submit \ Support options: -| key | default | description | -|-------------|------------------------|---------------------------------------------------------------| -| db | none(required) | the TPC-DS database | -| benchmark | tpcds-v2.4-benchmark | the name of application | -| iterations | 3 | the number of iterations to run | -| breakdown | false | whether to record breakdown results of an execution | -| filter | a | filter on the name of the queries to run, e.g. q1-v2.4 | -| results-dir | /spark/sql/performance | dir to store benchmark results, e.g. hdfs://hdfs-nn:9870/pref | +| key | default | description | +|-------------|------------------------|-------------------------------------------------------------------------------| +| db | none(required) | the TPC-DS database | +| benchmark | tpcds-v2.4-benchmark | the name of application | +| iterations | 3 | the number of iterations to run | +| breakdown | false | whether to record breakdown results of an execution | +| results-dir | /spark/sql/performance | dir to store benchmark results, e.g. hdfs://hdfs-nn:9870/pref | +| include | none(optional) | name of the queries to run, use comma to split multiple names, e.g. q1,q2 | +| exclude | none(optional) | name of the queries to exclude, use comma to split multiple names, e.g. q2,q4 | Example: the following command to benchmark TPC-DS sf10 with exists database `tpcds_sf10`. 
@@ -65,17 +66,52 @@ $SPARK_HOME/bin/spark-submit \ kyuubi-tpcds_*.jar --db tpcds_sf10 ``` -We also support run one of the TPC-DS query: +We also support running a specified subset of the TPC-DS queries: ```shell $SPARK_HOME/bin/spark-submit \ --class org.apache.kyuubi.tpcds.benchmark.RunBenchmark \ - kyuubi-tpcds_*.jar --db tpcds_sf10 --filter q1-v2.4 + kyuubi-tpcds_*.jar --db tpcds_sf10 --include q1,q2 ``` The result of TPC-DS benchmark like: -| name | minTimeMs | maxTimeMs | avgTimeMs | stdDev | stdDevPercent | -|---------|-----------|------------|------------|----------|----------------| -| q1-v2.4 | 50.522384 | 868.010383 | 323.398267 | 471.6482 | 145.8413108576 | +| name | minTimeMs | maxTimeMs | avgTimeMs | stdDev | stdDevPercent | +|---------|--------------|--------------|------------------|------------------|------------------| +| q1-v2.4 | 8329.884508 | 14159.307004 | 10537.235825 | 3161.74253777417 | 30.0054263782615 | +| q2-v2.4 | 16600.979609 | 18932.613523 | 18137.6516166666 | 1331.06332796139 | 7.33867512781137 | +If you want to exclude some queries, use the `--exclude` option: + +```shell +$SPARK_HOME/bin/spark-submit \ + --class org.apache.kyuubi.tpcds.benchmark.RunBenchmark \ + kyuubi-tpcds_*.jar --db tpcds_sf10 --exclude q2,q4 +``` + +The result of TPC-DS benchmark like: + +| name | minTimeMs | maxTimeMs | avgTimeMs | stdDev | stdDevPercent | +|----------|--------------|--------------|------------------|------------------|-------------------| +| q1-v2.4 | 8329.884508 | 14159.307004 | 10537.235825 | 3161.74253777417 | 30.0054263782615 | +| q3-v2.4 | 3841.009061 | 4685.16345 | 4128.583224 | 482.102016761038 | 11.6771781166603 | +| q5-v2.4 | 39405.654981 | 48845.359253 | 43530.6847113333 | 4830.98802198401 | 11.0978911864583 | +| q6-v2.4 | 2998.962221 | 7793.096796 | 4658.37355366666 | 2716.310089792 | 58.3102677039276 | +| ... | ... | ... | ... | ... | ... 
| +| q99-v2.4 | 11747.22389 | 11900.570288 | 11813.018609 | 78.9544389266673 | 0.668368022941351 | + +When both include and exclude are specified, the queries executed are those in include minus those in exclude: + +```shell +$SPARK_HOME/bin/spark-submit \ + --class org.apache.kyuubi.tpcds.benchmark.RunBenchmark \ + kyuubi-tpcds_*.jar --db tpcds_sf10 --include q1,q2,q3,q4,q5 --exclude q2,q4 +``` + +The result of TPC-DS benchmark like: + +| name | minTimeMs | maxTimeMs | avgTimeMs | stdDev | stdDevPercent | +|----------|--------------|--------------|------------------|------------------|-------------------| +| q1-v2.4 | 8329.884508 | 14159.307004 | 10537.235825 | 3161.74253777417 | 30.0054263782615 | +| q3-v2.4 | 3841.009061 | 4685.16345 | 4128.583224 | 482.102016761038 | 11.6771781166603 | +| q5-v2.4 | 39405.654981 | 48845.359253 | 43530.6847113333 | 4830.98802198401 | 11.0978911864583 | \ No newline at end of file diff --git a/dev/kyuubi-tpcds/src/main/scala/org/apache/kyuubi/tpcds/benchmark/RunBenchmark.scala b/dev/kyuubi-tpcds/src/main/scala/org/apache/kyuubi/tpcds/benchmark/RunBenchmark.scala index 3e2106cff..80f742294 100644 --- a/dev/kyuubi-tpcds/src/main/scala/org/apache/kyuubi/tpcds/benchmark/RunBenchmark.scala +++ b/dev/kyuubi-tpcds/src/main/scala/org/apache/kyuubi/tpcds/benchmark/RunBenchmark.scala @@ -26,11 +26,11 @@ import org.apache.spark.sql.functions._ case class RunConfig( db: String = null, benchmarkName: String = "tpcds-v2.4-benchmark", - filter: Option[String] = None, iterations: Int = 3, breakdown: Boolean = false, resultsDir: String = "/spark/sql/performance", - queries: Set[String] = Set.empty) + include: Set[String] = Set.empty, + exclude: Set[String] = Set.empty) // scalastyle:off /** @@ -54,9 +54,6 @@ object RunBenchmark { opt[String]('b', "benchmark") .action { (x, c) => c.copy(benchmarkName = x) } .text("the name of the benchmark to run") - opt[String]('f', "filter") - .action((x, c) => c.copy(filter = Some(x))) - .text("a filter on the 
name of the queries to run") opt[Boolean]('B', "breakdown") .action((x, c) => c.copy(breakdown = x)) .text("whether to record breakdown results of an execution") @@ -66,11 +63,16 @@ object RunBenchmark { opt[String]('r', "results-dir") .action((x, c) => c.copy(resultsDir = x)) .text("dir to store benchmark results, e.g. hdfs://hdfs-nn:9870/pref") - opt[String]('q', "queries") + opt[String]("include") .action { case (x, c) => - c.copy(queries = x.split(",").map(_.trim).filter(_.nonEmpty).toSet) + c.copy(include = x.split(",").map(_.trim).filter(_.nonEmpty).toSet) } - .text("name of the queries to run, use , split multiple name") + .text("name of the queries to run, use comma to split multiple names, e.g. q1,q2") + opt[String]("exclude") + .action { case (x, c) => + c.copy(exclude = x.split(",").map(_.trim).filter(_.nonEmpty).toSet) + } + .text("name of the queries to exclude, use comma to split multiple names, e.g. q2,q4") help("help") .text("prints this usage text") } @@ -96,19 +98,18 @@ object RunBenchmark { println(config.db) sparkSession.sql(s"use ${config.db}") - val allQueries = config.filter.map { f => - benchmark.tpcds2_4Queries.filter(_.name contains f) - } getOrElse { - benchmark.tpcds2_4Queries - } - - val runQueries = - if (config.queries.nonEmpty) { - allQueries.filter(q => config.queries.contains(q.name.split('-')(0))) + var runQueries = + if (config.include.nonEmpty) { + benchmark.tpcds2_4Queries.filter(q => config.include.contains(q.name.split('-')(0))) } else { - allQueries + benchmark.tpcds2_4Queries } + // runQueries = include - exclude + if (config.exclude.nonEmpty) { + runQueries = runQueries.filterNot(q => config.exclude.contains(q.name.split('-')(0))) + } + println("== QUERY LIST ==") runQueries.foreach(q => println(q.name))