From 9ece11ff2047204df9e3a178a810580566c1e78e Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Tue, 8 May 2018 21:44:11 -0700 Subject: [PATCH] Add decision tree benchmark (#140) * Move mllib config file to resources. * Add DecisionTreeClassification as first benchmark in mllib-large.yaml. * Read config files as streams to be jar compatible. * PR feedback #140. --- .../spark/sql/perf/mllib/config/mllib-large.yaml | 13 +++++++++++++ .../spark/sql/perf/mllib/config}/mllib-small.yaml | 0 .../com/databricks/spark/sql/perf/mllib/MLLib.scala | 10 ++++++++++ 3 files changed, 23 insertions(+) create mode 100644 src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml rename src/main/{scala/configs => resources/com/databricks/spark/sql/perf/mllib/config}/mllib-small.yaml (100%) diff --git a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml new file mode 100644 index 0000000..ebcc24b --- /dev/null +++ b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml @@ -0,0 +1,13 @@ +output: /databricks/spark/sql/mllib-perf-ci +timeoutSeconds: 1000 # This limit is for all benchmarks and should be bumped as more are added. 
+common:
+  numExamples: 1000000
+  numTestExamples: 1000000
+  numFeatures: 4000
+  numPartitions: 64
+  randomSeed: [1, 1, 1] # Rerun 3 times to accumulate some info
+benchmarks:
+  - name: classification.DecisionTreeClassification
+    params:
+      depth: [5, 10]
+      numClasses: 4
diff --git a/src/main/scala/configs/mllib-small.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml
similarity index 100%
rename from src/main/scala/configs/mllib-small.yaml
rename to src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml
diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
index 80f73de..cfb7709 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala
@@ -1,5 +1,7 @@
 package com.databricks.spark.sql.perf.mllib
 
+
+import scala.io.Source
 import scala.language.implicitConversions
 
 import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
@@ -34,6 +36,24 @@ object MLLib extends Logging {
     e.getCurrentResults()
   }
 
+  /**
+   * Reads a classpath resource fully into a string.
+   *
+   * @param resourcePath resource name handed to `getClass.getResourceAsStream`
+   * @return the complete contents of the resource
+   * @throws IllegalArgumentException if the resource cannot be found
+   */
+  private def getConfig(resourcePath: String): String = {
+    val stream = getClass.getResourceAsStream(resourcePath)
+    // getResourceAsStream reports a missing resource by returning null.
+    require(stream != null, s"Config resource not found: $resourcePath")
+    // The stream is not closed by reading it, so release it explicitly.
+    try Source.fromInputStream(stream).mkString finally stream.close()
+  }
+
+  val smallConfig: String = getConfig("config/mllib-small.yaml")
+  val largeConfig: String = getConfig("config/mllib-large.yaml")
+
   /**
    * Entry point for running ML tests. Expects a single command-line argument: the path to
    * a YAML config file specifying which ML tests to run and their parameters.