From 3c1c9e9070db11a06ca3154a545faff5e3c15d77 Mon Sep 17 00:00:00 2001
From: Nico Poggi
Date: Mon, 17 Sep 2018 15:18:16 +0200
Subject: [PATCH] Rebase for PR 87: Add -m for custom master, use SBT_HOME if
 set (#169)

* Add -m for custom master

* Add ability to use own sbt jar, update readme to include -m option

* Add stddev percentage showing
---
 README.md                             |  2 ++
 build/sbt-launch-lib.bash             | 11 +++++++-
 .../spark/sql/perf/RunBenchmark.scala | 28 +++++++++++++------
 3 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 794537b..833a80a 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@
 Usage: spark-sql-perf [options]
   -b | --benchmark
         the name of the benchmark to run
+  -m | --master
+
   --filter
         a filter on the name of the queries to run
   -i | --iterations
diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash
index 615f848..2a39936 100755
--- a/build/sbt-launch-lib.bash
+++ b/build/sbt-launch-lib.bash
@@ -26,6 +26,13 @@
 else
   declare java_cmd=java
 fi
+if test -x "$SBT_HOME"; then
+  echo -e "Using $SBT_HOME as default SBT_HOME - should be the jar name!"
+  # Could be at /usr/share/sbt-launcher-packaging/bin/sbt-launch.jar
+  # so this would be export SBT_HOME=/usr/share/sbt-launcher-packaging/bin/sbt-launch.jar
+  sbt_jar=${SBT_HOME}
+fi
+
 echoerr () {
   echo 1>&2 "$@"
 }
@@ -165,7 +172,9 @@ process_args () {
 }

 run() {
-  # no jar? download it.
+  # first check SBT_HOME is present so we use what's already available
+  sbt_jar=$SBT_HOME
+  # if there's no jar let's download it.
   [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || {
     # still no jar? uh-oh.
     echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar"
diff --git a/src/main/scala/com/databricks/spark/sql/perf/RunBenchmark.scala b/src/main/scala/com/databricks/spark/sql/perf/RunBenchmark.scala
index ea557b0..f5c5a93 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/RunBenchmark.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/RunBenchmark.scala
@@ -17,14 +17,14 @@
 package com.databricks.spark.sql.perf

 import java.net.InetAddress
-
+import java.io.File
 import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.functions._
 import org.apache.spark.{SparkContext, SparkConf}
-
 import scala.util.Try

 case class RunConfig(
+    master: String = "local[*]",
     benchmarkName: String = null,
     filter: Option[String] = None,
     iterations: Int = 3,
@@ -37,6 +37,9 @@
   def main(args: Array[String]): Unit = {
     val parser = new scopt.OptionParser[RunConfig]("spark-sql-perf") {
       head("spark-sql-perf", "0.2.0")
+      opt[String]('m', "master")
+        .action { (x, c) => c.copy(master = x) }
+        .text("the Spark master to use, default to local[*]")
       opt[String]('b', "benchmark")
         .action { (x, c) => c.copy(benchmarkName = x) }
         .text("the name of the benchmark to run")
@@ -64,14 +67,16 @@

   def run(config: RunConfig): Unit = {
     val conf = new SparkConf()
-      .setMaster("local[*]")
-      .setAppName(getClass.getName)
+      .setMaster(config.master)
+      .setAppName(getClass.getName)

     val sc = SparkContext.getOrCreate(conf)
     val sqlContext = SQLContext.getOrCreate(sc)
     import sqlContext.implicits._

-    sqlContext.setConf("spark.sql.perf.results", new java.io.File("performance").toURI.toString)
+    sqlContext.setConf("spark.sql.perf.results",
+      new File("performance").toURI.toString)
+
     val benchmark = Try {
       Class.forName(config.benchmarkName)
         .newInstance()
@@ -102,7 +107,8 @@
     experiment.waitForFinish(1000 * 60 * 30)

     sqlContext.setConf("spark.sql.shuffle.partitions", "1")
-    experiment.getCurrentRuns()
+
+    val toShow = experiment.getCurrentRuns()
       .withColumn("result", explode($"results"))
       .select("result.*")
       .groupBy("name")
@@ -110,9 +116,13 @@
         min($"executionTime") as 'minTimeMs,
         max($"executionTime") as 'maxTimeMs,
         avg($"executionTime") as 'avgTimeMs,
-        stddev($"executionTime") as 'stdDev)
+        stddev($"executionTime") as 'stdDev,
+        (stddev($"executionTime") / avg($"executionTime") * 100) as 'stdDevPercent)
       .orderBy("name")
-      .show(truncate = false)
+
+    println("Showing at most 100 query results now")
+    toShow.show(100)
+
     println(s"""Results: sqlContext.read.json("${experiment.resultPath}")""")

     config.baseline.foreach { baseTimestamp =>
@@ -136,4 +146,4 @@
       data.show(truncate = false)
     }
   }
-}
\ No newline at end of file
+}