Add decision tree benchmark (#140)

* Move mllib config file to resources.

* Add DecisionTreeClassification as first benchmark in mllib-large.yaml.

* Read config files as streams to be jar compatible.

* PR feedback #140.
This commit is contained in:
Bago Amirbekian 2018-05-08 21:44:11 -07:00 committed by Xiangrui Meng
parent ed9bbb01a5
commit 9ece11ff20
3 changed files with 23 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
output: /databricks/spark/sql/mllib-perf-ci
timeoutSeconds: 1000 # This limit is for all benchmarks and should be bumped as more are added.
common:
numExamples: 1000000
numTestExamples: 1000000
numFeatures: 4000
numPartitions: 64
randomSeed: [1, 1, 1] # Rerun 3 times to accumulate some info
benchmarks:
- name: classification.DecisionTreeClassification
params:
depth: [5, 10]
numClasses: 4

View File

@@ -1,5 +1,7 @@
package com.databricks.spark.sql.perf.mllib
import scala.io.Source
import scala.language.implicitConversions
import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
@@ -34,6 +36,14 @@ object MLLib extends Logging {
e.getCurrentResults()
}
/**
 * Reads a config file from the classpath as a string, so that configs bundled
 * inside the assembly jar can be loaded (plain file paths would not work there).
 *
 * @param resourcePath classpath-relative path of the YAML config resource
 * @return the full text content of the resource
 * @throws IllegalArgumentException if the resource is not on the classpath
 */
private def getConfig(resourcePath: String): String = {
  val stream = getClass.getResourceAsStream(resourcePath)
  // getResourceAsStream returns null (not an exception) when the resource is
  // missing; fail fast with a clear message instead of an opaque NPE below.
  require(stream != null, s"Config resource not found on classpath: $resourcePath")
  val source = Source.fromInputStream(stream)
  try {
    source.mkString
  } finally {
    // BufferedSource.close() also closes the underlying input stream,
    // fixing the resource leak in the original implementation.
    source.close()
  }
}
// YAML benchmark configs bundled as classpath resources (see getConfig), read
// eagerly at object initialization. NOTE(review): if a resource is missing,
// this fails when MLLib is first touched — presumably intentional fail-fast.
val smallConfig: String = getConfig("config/mllib-small.yaml")
val largeConfig: String = getConfig("config/mllib-large.yaml")
/**
* Entry point for running ML tests. Expects a single command-line argument: the path to
* a YAML config file specifying which ML tests to run and their parameters.