diff --git a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
new file mode 100644
index 0000000..ebcc24b
--- /dev/null
+++ b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
@@ -0,0 +1,13 @@
+output: /databricks/spark/sql/mllib-perf-ci
+timeoutSeconds: 1000 # This limit is for all benchmarks and should be bumped as more are added.
+common:
+  numExamples: 1000000
+  numTestExamples: 1000000
+  numFeatures: 4000
+  numPartitions: 64
+  randomSeed: [1, 1, 1] # Rerun 3 times to accumulate some info
+benchmarks:
+  - name: classification.DecisionTreeClassification
+    params:
+      depth: [5, 10]
+      numClasses: 4
diff --git a/src/main/scala/configs/mllib-small.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml
similarity index 100%
rename from src/main/scala/configs/mllib-small.yaml
rename to src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml
diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
index 80f73de..cfb7709 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
@@ -1,5 +1,7 @@
 package com.databricks.spark.sql.perf.mllib
 
+
+import scala.io.Source
 import scala.language.implicitConversions
 
 import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
@@ -34,6 +36,26 @@ object MLLib extends Logging {
     e.getCurrentResults()
   }
 
+  /**
+   * Loads a bundled config file from the classpath and returns its contents.
+   *
+   * @param resourcePath resource path, resolved relative to this object's package
+   * @return the full text of the resource
+   * @throws IllegalArgumentException if no such resource exists on the classpath
+   */
+  private def getConfig(resourcePath: String): String = {
+    val stream = getClass.getResourceAsStream(resourcePath)
+    // getResourceAsStream signals a missing resource with null rather than an exception.
+    require(stream != null, s"Config resource not found on classpath: $resourcePath")
+    // Close the stream once the contents are read so the handle is not leaked.
+    try Source.fromInputStream(stream, "UTF-8").mkString finally stream.close()
+  }
+
+  /** Contents of the bundled config/mllib-small.yaml resource. */
+  val smallConfig: String = getConfig("config/mllib-small.yaml")
+  /** Contents of the bundled config/mllib-large.yaml resource. */
+  val largeConfig: String = getConfig("config/mllib-large.yaml")
+
   /**
    * Entry point for running ML tests. Expects a single command-line argument: the path to
    * a YAML config file specifying which ML tests to run and their parameters.