Refactor MLParams for spark-sql-perf (#114)

A case class (MLParams) is currently used to store/access parameters for ML tests in spark-sql-perf. With the addition of new ML tests to spark-sql-perf (in this PR: #112), the number of ML-related test params will be > 22, but Scala only allows up to 22 params in a case class.

This PR addresses the issue by:
* Introducing a new MLParameters class (class MLParameters) that provides access to the same parameters as MLParams, except as a class instead of a case class.
* Replacing usages of MLParams with MLParameters
* Storing the members of MLParameters in BenchmarkResult.parameters for logging/persistence.

Tested by running default performance tests in src/main/scala/configs/mllib-small.yaml.
This commit is contained in:
Siddharth Murching 2017-08-28 13:23:59 -07:00 committed by jkbradley
parent d0de5ae8aa
commit 9febc34f66
7 changed files with 129 additions and 40 deletions

View File

@ -17,7 +17,7 @@ object MLBenchmarks {
val benchmarks: Seq[MLTest] = List(
MLTest(
LogisticRegression,
MLParams(
new MLParams(
numFeatures = 10,
numExamples = 10,
numTestExamples = 10,

View File

@ -80,17 +80,15 @@ class MLPipelineStageBenchmarkable(
BenchmarkResult(
name = name,
mode = executionMode.toString,
parameters = Map.empty,
parameters = params.toMap,
executionTime = Some(trainingTime.toMillis),
mlParams = Some(params),
mlResult = Some(ml))
} catch {
case e: Exception =>
BenchmarkResult(
name = name,
mode = executionMode.toString,
parameters = Map.empty,
mlParams = Some(params),
parameters = params.toMap,
failure = Some(Failure(e.getClass.getSimpleName,
e.getMessage + ":\n" + e.getStackTraceString)))
} finally {

View File

@ -0,0 +1,39 @@
package com.databricks.spark.sql.perf.mllib
import scala.reflect.ClassTag
import scala.reflect.runtime.universe._
/** Exposes methods to simplify implementation of classes like MLParams. */
private[perf] object ReflectionUtils {
private def getConstructor[T: TypeTag: ClassTag](obj: T): MethodSymbol = {
typeOf[T].declaration(nme.CONSTRUCTOR).asMethod
}
/**
* Given an instance [[obj]] of a class whose constructor arguments are all of type Option[Any],
* returns a map of key-value pairs (argName -> argValue) where argName is the name
* of a constructor argument with a defined (not None) value and argValue is the corresponding
* value.
*/
def getConstructorArgs[T: TypeTag: ClassTag](obj: T): Map[String, Any] = {
// Get constructor of passed-in instance
val constructor = getConstructor(obj)
// Include each constructor argument not equal to None in the output map
constructor.paramss.flatten.flatMap { (param: Symbol) =>
// Get name and value of the constructor argument
val paramName = param.name.toString
val getter = obj.getClass.getDeclaredField(paramName)
getter.setAccessible(true)
val paramValue = getter.get(obj)
// If the constructor argument is defined, include it in our output map
paramValue match {
case value: Option[Any] => if (value.isDefined) Seq(paramName -> paramValue) else Seq.empty
case _ => throw new UnsupportedOperationException("ReflectionUtils.getConstructorArgs " +
"can only be called on instances of classes whose constructor arguments are all of " +
s"type Option[Any]; constructor argument ${paramName} had invalid type.")
}
}.toMap
}
}

View File

@ -57,7 +57,7 @@ object NaiveBayes extends BenchmarkAlgorithm
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new ml.classification.NaiveBayes()
.setSmoothing(naiveBayesSmoothing)
.setSmoothing(smoothing)
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =

View File

@ -24,14 +24,14 @@ object LDA extends BenchmarkAlgorithm with TestFromTraining {
numPartitions
)
val seed: Int = randomSeed
val docLength = ldaDocLength.get
val numVocab = ldaNumVocabulary.get
val docLen = docLength.get
val numVocab = vocabSize.get
val data: RDD[(Long, Vector)] = rdd.mapPartitionsWithIndex { (idx, partition) =>
val rng = new Well19937c(seed ^ idx)
partition.map { docIndex =>
var currentSize = 0
val entries = MHashMap[Int, Int]()
while (currentSize < docLength) {
while (currentSize < docLen) {
val index = rng.nextInt(numVocab)
entries(index) = entries.getOrElse(index, 0) + 1
currentSize += 1

View File

@ -16,6 +16,8 @@
package com.databricks.spark.sql.perf
import com.databricks.spark.sql.perf.mllib.ReflectionUtils
/**
* The performance results of all given queries for a single iteration.
*
@ -81,7 +83,6 @@ case class BenchmarkResult(
breakDown: Seq[BreakdownResult] = Nil,
queryExecution: Option[String] = None,
failure: Option[Failure] = None,
mlParams: Option[MLParams] = None,
mlResult: Option[MLResult] = None)
/**
@ -104,37 +105,88 @@ case class BreakdownResult(
case class Failure(className: String, message: String)
// KEEP ARGUMENTS SORTED BY NAME.
// It simplifies lookup when checking if a parameter is here already.
case class MLParams(
/**
* Class wrapping parameters for ML tests.
*
* KEEP CONSTRUCTOR ARGUMENTS SORTED BY NAME.
* It simplifies lookup when checking if a parameter is here already.
*/
class MLParams(
// *** Common to all algorithms ***
randomSeed: Option[Int] = Some(42),
numExamples: Option[Long] = None,
numTestExamples: Option[Long] = None,
numPartitions: Option[Int] = None,
val randomSeed: Option[Int] = Some(42),
val numExamples: Option[Long] = None,
val numTestExamples: Option[Long] = None,
val numPartitions: Option[Int] = None,
// *** Specialized and sorted by name ***
bucketizerNumBuckets: Option[Int] = None,
depth: Option[Int] = None,
elasticNetParam: Option[Double] = None,
family: Option[String] = None,
k: Option[Int] = None,
ldaDocLength: Option[Int] = None,
ldaNumVocabulary: Option[Int] = None,
link: Option[String] = None,
maxIter: Option[Int] = None,
naiveBayesSmoothing: Option[Double] = None,
numClasses: Option[Int] = None,
numFeatures: Option[Int] = None,
numItems: Option[Int] = None,
numUsers: Option[Int] = None,
optimizer: Option[String] = None,
regParam: Option[Double] = None,
rank: Option[Int] = None,
tol: Option[Double] = None
)
val bucketizerNumBuckets: Option[Int] = None,
val depth: Option[Int] = None,
val docLength: Option[Int] = None,
val elasticNetParam: Option[Double] = None,
val family: Option[String] = None,
val k: Option[Int] = None,
val link: Option[String] = None,
val maxIter: Option[Int] = None,
val numClasses: Option[Int] = None,
val numFeatures: Option[Int] = None,
val numItems: Option[Int] = None,
val numUsers: Option[Int] = None,
val optimizer: Option[String] = None,
val regParam: Option[Double] = None,
val rank: Option[Int] = None,
val smoothing: Option[Double] = None,
val tol: Option[Double] = None,
val vocabSize: Option[Int] = None) {
/**
* Returns a map of param names to string representations of their values. Only params that
* were defined (i.e., not equal to None) are included in the map.
*/
def toMap: Map[String, String] = {
val allParams = ReflectionUtils.getConstructorArgs(this)
allParams.map { case (key: String, value: Any) =>
key -> value.toString
}
}
/** Returns a copy of the current MLParams instance */
def copy(
// *** Common to all algorithms ***
randomSeed: Option[Int] = randomSeed,
numExamples: Option[Long] = numExamples,
numTestExamples: Option[Long] = numTestExamples,
numPartitions: Option[Int] = numPartitions,
// *** Specialized and sorted by name ***
bucketizerNumBuckets: Option[Int] = bucketizerNumBuckets,
depth: Option[Int] = depth,
docLength: Option[Int] = docLength,
elasticNetParam: Option[Double] = elasticNetParam,
family: Option[String] = family,
k: Option[Int] = k,
link: Option[String] = link,
maxIter: Option[Int] = maxIter,
numClasses: Option[Int] = numClasses,
numFeatures: Option[Int] = numFeatures,
numItems: Option[Int] = numItems,
numUsers: Option[Int] = numUsers,
vocabSize: Option[Int] = vocabSize,
optimizer: Option[String] = optimizer,
regParam: Option[Double] = regParam,
rank: Option[Int] = rank,
smoothing: Option[Double] = smoothing,
tol: Option[Double] = tol): MLParams = {
new MLParams(randomSeed = randomSeed, numExamples = numExamples,
numTestExamples = numTestExamples, numPartitions = numPartitions,
bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
elasticNetParam = elasticNetParam, family = family, k = k, link = link, maxIter = maxIter,
numClasses = numClasses, numFeatures = numFeatures,
numItems = numItems, numUsers = numUsers, optimizer = optimizer, regParam = regParam,
rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
}
}
object MLParams {
val empty = MLParams()
val empty = new MLParams()
}
/**

View File

@ -17,8 +17,8 @@ benchmarks:
params:
numExamples: 10
numTestExamples: 10
ldaDocLength: 20
ldaNumVocabulary: 4
docLength: 20
vocabSize: 4
k: 5
maxIter: 10
optimizer:
@ -87,6 +87,6 @@ benchmarks:
- name: classification.NaiveBayes
params:
numExamples: 100
naiveBayesSmoothing: 1.0
smoothing: 1.0
numClasses: 10
numFeatures: [10]