Refactor MLParams for spark-sql-perf (#114)

A case class (MLParams) is currently used to store/access parameters for ML tests in spark-sql-perf. With the addition of new ML tests to spark-sql-perf (in this PR: #112), the number of ML-related test params will be > 22, but Scala only allows up to 22 params in a case class. This PR addresses the issue by: * Introducing a new MLParameters class (class MLParameters) that provides access to the same parameters as MLParams, except as a class instead of a case class. * Replacing usages of MLParams with MLParameters * Storing the members of MLParameters in BenchmarkResult.parameters for logging/persistence. Tested by running default performance tests in src/main/scala/configs/mllib-small.yaml.
2017-08-28 13:23:59 -07:00 · 2017-08-28 13:23:59 -07:00 · 9febc34f66
commit 9febc34f66
parent d0de5ae8aa
7 changed files with 129 additions and 40 deletions
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchmarks.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchmarks.scala
@ -17,7 +17,7 @@ object MLBenchmarks {
  val benchmarks: Seq[MLTest] = List(
      MLTest(
        LogisticRegression,
-        MLParams(
+        new MLParams(
          numFeatures = 10,
          numExamples = 10,
          numTestExamples = 10,
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala
@ -80,17 +80,15 @@ class MLPipelineStageBenchmarkable(
      BenchmarkResult(
        name = name,
        mode = executionMode.toString,
-        parameters = Map.empty,
+        parameters = params.toMap,
        executionTime = Some(trainingTime.toMillis),
-        mlParams = Some(params),
        mlResult = Some(ml))
    } catch {
      case e: Exception =>
        BenchmarkResult(
          name = name,
          mode = executionMode.toString,
-          parameters = Map.empty,
-          mlParams = Some(params),
+          parameters = params.toMap,
          failure = Some(Failure(e.getClass.getSimpleName,
            e.getMessage + ":\n" + e.getStackTraceString)))
    } finally {
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/ReflectionUtils.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/ReflectionUtils.scala
@ -0,0 +1,39 @@
+package com.databricks.spark.sql.perf.mllib
+
+import scala.reflect.ClassTag
+import scala.reflect.runtime.universe._
+
+/** Exposes methods to simplify implementation of classes like MLParams. */
+private[perf] object ReflectionUtils {
+
+  private def getConstructor[T: TypeTag: ClassTag](obj: T): MethodSymbol = {
+    typeOf[T].declaration(nme.CONSTRUCTOR).asMethod
+  }
+
+  /**
+   * Given an instance [[obj]] of a class whose constructor arguments are all of type Option[Any],
+   * returns a map of key-value pairs (argName -> argValue) where argName is the name
+   * of a constructor argument with a defined (not None) value and argValue is the corresponding
+   * value.
+   */
+  def getConstructorArgs[T: TypeTag: ClassTag](obj: T): Map[String, Any] = {
+    // Get constructor of passed-in instance
+    val constructor = getConstructor(obj)
+    // Include each constructor argument not equal to None in the output map
+    constructor.paramss.flatten.flatMap { (param: Symbol) =>
+      // Get name and value of the constructor argument
+      val paramName = param.name.toString
+      val getter = obj.getClass.getDeclaredField(paramName)
+      getter.setAccessible(true)
+      val paramValue = getter.get(obj)
+      // If the constructor argument is defined, include it in our output map
+      paramValue match {
+        case value: Option[Any] => if (value.isDefined) Seq(paramName -> paramValue) else Seq.empty
+        case _ => throw new UnsupportedOperationException("ReflectionUtils.getConstructorArgs " +
+          "can only be called on instances of classes whose constructor arguments are all of " +
+          s"type Option[Any]; constructor argument ${paramName} had invalid type.")
+      }
+    }.toMap
+  }
+
+}
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/NaiveBayes.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/NaiveBayes.scala
@ -57,7 +57,7 @@ object NaiveBayes extends BenchmarkAlgorithm
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.classification.NaiveBayes()
-      .setSmoothing(naiveBayesSmoothing)
+      .setSmoothing(smoothing)
  }

  override protected def evaluator(ctx: MLBenchContext): Evaluator =
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala
@ -24,14 +24,14 @@ object LDA extends BenchmarkAlgorithm with TestFromTraining {
      numPartitions
    )
    val seed: Int = randomSeed
-    val docLength = ldaDocLength.get
-    val numVocab = ldaNumVocabulary.get
+    val docLen = docLength.get
+    val numVocab = vocabSize.get
    val data: RDD[(Long, Vector)] = rdd.mapPartitionsWithIndex { (idx, partition) =>
      val rng = new Well19937c(seed ^ idx)
      partition.map { docIndex =>
        var currentSize = 0
        val entries = MHashMap[Int, Int]()
-        while (currentSize < docLength) {
+        while (currentSize < docLen) {
          val index = rng.nextInt(numVocab)
          entries(index) = entries.getOrElse(index, 0) + 1
          currentSize += 1
--- a/src/main/scala/com/databricks/spark/sql/perf/results.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/results.scala
@ -16,6 +16,8 @@

 package com.databricks.spark.sql.perf

+import com.databricks.spark.sql.perf.mllib.ReflectionUtils
+
 /**
 * The performance results of all given queries for a single iteration.
 *
@ -81,7 +83,6 @@ case class BenchmarkResult(
    breakDown: Seq[BreakdownResult] = Nil,
    queryExecution: Option[String] = None,
    failure: Option[Failure] = None,
-    mlParams: Option[MLParams] = None,
    mlResult: Option[MLResult] = None)

 /**
@ -104,37 +105,88 @@ case class BreakdownResult(

 case class Failure(className: String, message: String)

-// KEEP ARGUMENTS SORTED BY NAME.
-// It simplifies lookup when checking if a parameter is here already.
-case class MLParams(
+/**
+ * Class wrapping parameters for ML tests.
+ *
+ * KEEP CONSTRUCTOR ARGUMENTS SORTED BY NAME.
+ * It simplifies lookup when checking if a parameter is here already.
+ */
+class MLParams(
    // *** Common to all algorithms ***
-    randomSeed: Option[Int] = Some(42),
-    numExamples: Option[Long] = None,
-    numTestExamples: Option[Long] = None,
-    numPartitions: Option[Int] = None,
+    val randomSeed: Option[Int] = Some(42),
+    val numExamples: Option[Long] = None,
+    val numTestExamples: Option[Long] = None,
+    val numPartitions: Option[Int] = None,
    // *** Specialized and sorted by name ***
-    bucketizerNumBuckets: Option[Int] = None,
-    depth: Option[Int] = None,
-    elasticNetParam: Option[Double] = None,
-    family: Option[String] = None,
-    k: Option[Int] = None,
-    ldaDocLength: Option[Int] = None,
-    ldaNumVocabulary: Option[Int] = None,
-    link: Option[String] = None,
-    maxIter: Option[Int] = None,
-    naiveBayesSmoothing: Option[Double] = None,
-    numClasses: Option[Int] = None,
-    numFeatures: Option[Int] = None,
-    numItems: Option[Int] = None,
-    numUsers: Option[Int] = None,
-    optimizer: Option[String] = None,
-    regParam: Option[Double] = None,
-    rank: Option[Int] = None,
-    tol: Option[Double] = None
-)
+    val bucketizerNumBuckets: Option[Int] = None,
+    val depth: Option[Int] = None,
+    val docLength: Option[Int] = None,
+    val elasticNetParam: Option[Double] = None,
+    val family: Option[String] = None,
+    val k: Option[Int] = None,
+    val link: Option[String] = None,
+    val maxIter: Option[Int] = None,
+    val numClasses: Option[Int] = None,
+    val numFeatures: Option[Int] = None,
+    val numItems: Option[Int] = None,
+    val numUsers: Option[Int] = None,
+    val optimizer: Option[String] = None,
+    val regParam: Option[Double] = None,
+    val rank: Option[Int] = None,
+    val smoothing: Option[Double] = None,
+    val tol: Option[Double] = None,
+    val vocabSize: Option[Int] = None) {
+
+  /**
+   * Returns a map of param names to string representations of their values. Only params that
+   * were defined (i.e., not equal to None) are included in the map.
+   */
+  def toMap: Map[String, String] = {
+    val allParams = ReflectionUtils.getConstructorArgs(this)
+    allParams.map { case (key: String, value: Any) =>
+      key -> value.toString
+    }
+  }
+
+  /** Returns a copy of the current MLParams instance */
+  def copy(
+      // *** Common to all algorithms ***
+      randomSeed: Option[Int] = randomSeed,
+      numExamples: Option[Long] = numExamples,
+      numTestExamples: Option[Long] = numTestExamples,
+      numPartitions: Option[Int] = numPartitions,
+      // *** Specialized and sorted by name ***
+      bucketizerNumBuckets: Option[Int] = bucketizerNumBuckets,
+      depth: Option[Int] = depth,
+      docLength: Option[Int] = docLength,
+      elasticNetParam: Option[Double] = elasticNetParam,
+      family: Option[String] = family,
+      k: Option[Int] = k,
+      link: Option[String] = link,
+      maxIter: Option[Int] = maxIter,
+      numClasses: Option[Int] = numClasses,
+      numFeatures: Option[Int] = numFeatures,
+      numItems: Option[Int] = numItems,
+      numUsers: Option[Int] = numUsers,
+      vocabSize: Option[Int] = vocabSize,
+      optimizer: Option[String] = optimizer,
+      regParam: Option[Double] = regParam,
+      rank: Option[Int] = rank,
+      smoothing: Option[Double] = smoothing,
+      tol: Option[Double] = tol): MLParams = {
+    new MLParams(randomSeed = randomSeed, numExamples = numExamples,
+      numTestExamples = numTestExamples, numPartitions = numPartitions,
+      bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
+      elasticNetParam = elasticNetParam, family = family, k = k, link = link, maxIter = maxIter,
+      numClasses = numClasses, numFeatures = numFeatures,
+      numItems = numItems, numUsers = numUsers, optimizer = optimizer, regParam = regParam,
+      rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
+  }
+}
+

 object MLParams {
-  val empty = MLParams()
+  val empty = new MLParams()
 }

 /**
--- a/src/main/scala/configs/mllib-small.yaml
+++ b/src/main/scala/configs/mllib-small.yaml
@ -17,8 +17,8 @@ benchmarks:
    params:
      numExamples: 10
      numTestExamples: 10
-      ldaDocLength: 20
-      ldaNumVocabulary: 4
+      docLength: 20
+      vocabSize: 4
      k: 5
      maxIter: 10
      optimizer:
@ -87,6 +87,6 @@ benchmarks:
  - name: classification.NaiveBayes
    params:
      numExamples: 100
-      naiveBayesSmoothing: 1.0
+      smoothing: 1.0
      numClasses: 10
      numFeatures: [10]