From 9ece11ff2047204df9e3a178a810580566c1e78e Mon Sep 17 00:00:00 2001
From: Bago Amirbekian <bago@databricks.com>
Date: Tue, 8 May 2018 21:44:11 -0700
Subject: [PATCH] Add decision tree benchmark (#140)

* Move mllib config file to resources.

* Add DecisionTreeClassification as first benchmark in mllib-large.yaml.

* Read config files as streams to be jar compatible.

* PR feedback #140.
---
 .../spark/sql/perf/mllib/config/mllib-large.yaml    | 13 +++++++++++++
 .../spark/sql/perf/mllib/config}/mllib-small.yaml   |  0
 .../com/databricks/spark/sql/perf/mllib/MLLib.scala | 10 ++++++++++
 3 files changed, 23 insertions(+)
 create mode 100644 src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
 rename src/main/{scala/configs => resources/com/databricks/spark/sql/perf/mllib/config}/mllib-small.yaml (100%)

diff --git a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
new file mode 100644
index 0000000..ebcc24b
--- /dev/null
+++ b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
@@ -0,0 +1,13 @@
+output: /databricks/spark/sql/mllib-perf-ci
+timeoutSeconds: 1000 # This limit is for all benchmarks and should be bumped as more are added.
+common:
+  numExamples: 1000000
+  numTestExamples: 1000000
+  numFeatures: 4000
+  numPartitions: 64
+  randomSeed: [1, 1, 1] # Same seed listed 3 times so each benchmark is rerun 3 times for repeated measurements
+benchmarks:
+  - name: classification.DecisionTreeClassification
+    params:
+      depth: [5, 10]
+      numClasses: 4
diff --git a/src/main/scala/configs/mllib-small.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml
similarity index 100%
rename from src/main/scala/configs/mllib-small.yaml
rename to src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml
diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
index 80f73de..cfb7709 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
@@ -1,5 +1,7 @@
 package com.databricks.spark.sql.perf.mllib
 
+
+import scala.io.Source
 import scala.language.implicitConversions
 
 import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
@@ -34,6 +36,30 @@ object MLLib extends Logging {
     e.getCurrentResults()
   }
 
+  /**
+   * Reads a classpath resource into a String.
+   *
+   * The resource is opened as a stream (rather than as a file) so that it can also be read
+   * when it is packaged inside a jar.
+   *
+   * @param resourcePath path handed to `getClass.getResourceAsStream`
+   * @return the full text of the resource, decoded as UTF-8
+   * @throws IllegalArgumentException if no resource exists at `resourcePath`
+   */
+  private def getConfig(resourcePath: String): String = {
+    // getResourceAsStream returns null for a missing resource; fail with a clear message.
+    val stream = Option(getClass.getResourceAsStream(resourcePath)).getOrElse(
+      throw new IllegalArgumentException(s"Config resource not found: $resourcePath"))
+    val source = Source.fromInputStream(stream, "UTF-8")
+    // Closing the source also closes the underlying stream.
+    try source.mkString finally source.close()
+  }
+
+  /** Text of the bundled `config/mllib-small.yaml` resource. */
+  val smallConfig: String = getConfig("config/mllib-small.yaml")
+  /** Text of the bundled `config/mllib-large.yaml` resource. */
+  val largeConfig: String = getConfig("config/mllib-large.yaml")
+
   /**
    * Entry point for running ML tests. Expects a single command-line argument: the path to
    * a YAML config file specifying which ML tests to run and their parameters.