Refactor MLParams for spark-sql-perf (#114)
A case class (MLParams) is currently used to store and access parameters for ML tests in spark-sql-perf. With the addition of new ML tests to spark-sql-perf (in PR #112), the number of ML-related test params will exceed 22, but Scala only allows up to 22 params in a case class. This PR addresses the issue by: (1) introducing a new MLParameters class that provides access to the same parameters as MLParams, but as a regular class instead of a case class; (2) replacing usages of MLParams with MLParameters; and (3) storing the members of MLParameters in BenchmarkResult.parameters for logging/persistence. Tested by running the default performance tests in src/main/scala/configs/mllib-small.yaml.
This commit is contained in:
parent
d0de5ae8aa
commit
9febc34f66
@ -17,7 +17,7 @@ object MLBenchmarks {
|
||||
val benchmarks: Seq[MLTest] = List(
|
||||
MLTest(
|
||||
LogisticRegression,
|
||||
MLParams(
|
||||
new MLParams(
|
||||
numFeatures = 10,
|
||||
numExamples = 10,
|
||||
numTestExamples = 10,
|
||||
|
||||
@ -80,17 +80,15 @@ class MLPipelineStageBenchmarkable(
|
||||
BenchmarkResult(
|
||||
name = name,
|
||||
mode = executionMode.toString,
|
||||
parameters = Map.empty,
|
||||
parameters = params.toMap,
|
||||
executionTime = Some(trainingTime.toMillis),
|
||||
mlParams = Some(params),
|
||||
mlResult = Some(ml))
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
BenchmarkResult(
|
||||
name = name,
|
||||
mode = executionMode.toString,
|
||||
parameters = Map.empty,
|
||||
mlParams = Some(params),
|
||||
parameters = params.toMap,
|
||||
failure = Some(Failure(e.getClass.getSimpleName,
|
||||
e.getMessage + ":\n" + e.getStackTraceString)))
|
||||
} finally {
|
||||
|
||||
@ -0,0 +1,39 @@
|
||||
package com.databricks.spark.sql.perf.mllib
|
||||
|
||||
import scala.reflect.ClassTag
|
||||
import scala.reflect.runtime.universe._
|
||||
|
||||
/** Exposes methods to simplify implementation of classes like MLParams. */
private[perf] object ReflectionUtils {

  /**
   * Looks up the constructor declared by the static type `T`.
   *
   * `obj` is not inspected; it only lets callers write `getConstructor(instance)` and have
   * `T` inferred. NOTE(review): `asMethod` presumably fails if `T` declares more than one
   * constructor (overloaded symbol) -- confirm before using this on such classes.
   */
  private def getConstructor[T: TypeTag: ClassTag](obj: T): MethodSymbol = {
    typeOf[T].declaration(nme.CONSTRUCTOR).asMethod
  }

  /**
   * Given an instance `obj` of a class whose constructor arguments are all of type Option[Any],
   * returns a map of key-value pairs (argName -> argValue) where argName is the name
   * of a constructor argument with a defined (not None) value and argValue is the value
   * contained in that Option (i.e. `x` for an argument equal to `Some(x)`).
   *
   * Each constructor argument is read through the same-named field declared on the runtime
   * class of `obj`, so the arguments must be backed by fields (e.g. declared as `val`).
   *
   * @param obj instance whose constructor arguments are collected
   * @return map from argument name to unwrapped argument value; arguments equal to None
   *         are omitted
   * @throws UnsupportedOperationException if a constructor argument holds something other
   *                                       than an Option (this includes null)
   * @throws NoSuchFieldException if a constructor argument has no same-named field on the
   *                              runtime class of `obj`
   */
  def getConstructorArgs[T: TypeTag: ClassTag](obj: T): Map[String, Any] = {
    // Get constructor of passed-in instance
    val constructor = getConstructor(obj)
    // Include each constructor argument not equal to None in the output map
    constructor.paramss.flatten.flatMap { (param: Symbol) =>
      // Get name and value of the constructor argument
      val paramName = param.name.toString
      val field = obj.getClass.getDeclaredField(paramName)
      // The backing field of a constructor `val` is private at the JVM level
      field.setAccessible(true)
      // Match on Some/None rather than on the erased type Option[Any], and unwrap the value so
      // that callers receive `x` instead of `Some(x)`
      field.get(obj) match {
        case Some(value) => Seq(paramName -> value)
        case None => Seq.empty
        case _ => throw new UnsupportedOperationException("ReflectionUtils.getConstructorArgs " +
          "can only be called on instances of classes whose constructor arguments are all of " +
          s"type Option[Any]; constructor argument ${paramName} had invalid type.")
      }
    }.toMap
  }

}
|
||||
@ -57,7 +57,7 @@ object NaiveBayes extends BenchmarkAlgorithm
|
||||
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
|
||||
import ctx.params._
|
||||
new ml.classification.NaiveBayes()
|
||||
.setSmoothing(naiveBayesSmoothing)
|
||||
.setSmoothing(smoothing)
|
||||
}
|
||||
|
||||
override protected def evaluator(ctx: MLBenchContext): Evaluator =
|
||||
|
||||
@ -24,14 +24,14 @@ object LDA extends BenchmarkAlgorithm with TestFromTraining {
|
||||
numPartitions
|
||||
)
|
||||
val seed: Int = randomSeed
|
||||
val docLength = ldaDocLength.get
|
||||
val numVocab = ldaNumVocabulary.get
|
||||
val docLen = docLength.get
|
||||
val numVocab = vocabSize.get
|
||||
val data: RDD[(Long, Vector)] = rdd.mapPartitionsWithIndex { (idx, partition) =>
|
||||
val rng = new Well19937c(seed ^ idx)
|
||||
partition.map { docIndex =>
|
||||
var currentSize = 0
|
||||
val entries = MHashMap[Int, Int]()
|
||||
while (currentSize < docLength) {
|
||||
while (currentSize < docLen) {
|
||||
val index = rng.nextInt(numVocab)
|
||||
entries(index) = entries.getOrElse(index, 0) + 1
|
||||
currentSize += 1
|
||||
|
||||
@ -16,6 +16,8 @@
|
||||
|
||||
package com.databricks.spark.sql.perf
|
||||
|
||||
import com.databricks.spark.sql.perf.mllib.ReflectionUtils
|
||||
|
||||
/**
|
||||
* The performance results of all given queries for a single iteration.
|
||||
*
|
||||
@ -81,7 +83,6 @@ case class BenchmarkResult(
|
||||
breakDown: Seq[BreakdownResult] = Nil,
|
||||
queryExecution: Option[String] = None,
|
||||
failure: Option[Failure] = None,
|
||||
mlParams: Option[MLParams] = None,
|
||||
mlResult: Option[MLResult] = None)
|
||||
|
||||
/**
|
||||
@ -104,37 +105,88 @@ case class BreakdownResult(
|
||||
|
||||
case class Failure(className: String, message: String)
|
||||
|
||||
// KEEP ARGUMENTS SORTED BY NAME.
|
||||
// It simplifies lookup when checking if a parameter is here already.
|
||||
case class MLParams(
|
||||
/**
|
||||
* Class wrapping parameters for ML tests.
|
||||
*
|
||||
* KEEP CONSTRUCTOR ARGUMENTS SORTED BY NAME.
|
||||
* It simplifies lookup when checking if a parameter is here already.
|
||||
*/
|
||||
class MLParams(
|
||||
// *** Common to all algorithms ***
|
||||
randomSeed: Option[Int] = Some(42),
|
||||
numExamples: Option[Long] = None,
|
||||
numTestExamples: Option[Long] = None,
|
||||
numPartitions: Option[Int] = None,
|
||||
val randomSeed: Option[Int] = Some(42),
|
||||
val numExamples: Option[Long] = None,
|
||||
val numTestExamples: Option[Long] = None,
|
||||
val numPartitions: Option[Int] = None,
|
||||
// *** Specialized and sorted by name ***
|
||||
bucketizerNumBuckets: Option[Int] = None,
|
||||
depth: Option[Int] = None,
|
||||
elasticNetParam: Option[Double] = None,
|
||||
family: Option[String] = None,
|
||||
k: Option[Int] = None,
|
||||
ldaDocLength: Option[Int] = None,
|
||||
ldaNumVocabulary: Option[Int] = None,
|
||||
link: Option[String] = None,
|
||||
maxIter: Option[Int] = None,
|
||||
naiveBayesSmoothing: Option[Double] = None,
|
||||
numClasses: Option[Int] = None,
|
||||
numFeatures: Option[Int] = None,
|
||||
numItems: Option[Int] = None,
|
||||
numUsers: Option[Int] = None,
|
||||
optimizer: Option[String] = None,
|
||||
regParam: Option[Double] = None,
|
||||
rank: Option[Int] = None,
|
||||
tol: Option[Double] = None
|
||||
)
|
||||
val bucketizerNumBuckets: Option[Int] = None,
|
||||
val depth: Option[Int] = None,
|
||||
val docLength: Option[Int] = None,
|
||||
val elasticNetParam: Option[Double] = None,
|
||||
val family: Option[String] = None,
|
||||
val k: Option[Int] = None,
|
||||
val link: Option[String] = None,
|
||||
val maxIter: Option[Int] = None,
|
||||
val numClasses: Option[Int] = None,
|
||||
val numFeatures: Option[Int] = None,
|
||||
val numItems: Option[Int] = None,
|
||||
val numUsers: Option[Int] = None,
|
||||
val optimizer: Option[String] = None,
|
||||
val regParam: Option[Double] = None,
|
||||
val rank: Option[Int] = None,
|
||||
val smoothing: Option[Double] = None,
|
||||
val tol: Option[Double] = None,
|
||||
val vocabSize: Option[Int] = None) {
|
||||
|
||||
/**
|
||||
* Returns a map of param names to string representations of their values. Only params that
|
||||
* were defined (i.e., not equal to None) are included in the map.
|
||||
*/
|
||||
def toMap: Map[String, String] = {
|
||||
val allParams = ReflectionUtils.getConstructorArgs(this)
|
||||
allParams.map { case (key: String, value: Any) =>
|
||||
key -> value.toString
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns a copy of the current MLParams instance */
|
||||
def copy(
|
||||
// *** Common to all algorithms ***
|
||||
randomSeed: Option[Int] = randomSeed,
|
||||
numExamples: Option[Long] = numExamples,
|
||||
numTestExamples: Option[Long] = numTestExamples,
|
||||
numPartitions: Option[Int] = numPartitions,
|
||||
// *** Specialized and sorted by name ***
|
||||
bucketizerNumBuckets: Option[Int] = bucketizerNumBuckets,
|
||||
depth: Option[Int] = depth,
|
||||
docLength: Option[Int] = docLength,
|
||||
elasticNetParam: Option[Double] = elasticNetParam,
|
||||
family: Option[String] = family,
|
||||
k: Option[Int] = k,
|
||||
link: Option[String] = link,
|
||||
maxIter: Option[Int] = maxIter,
|
||||
numClasses: Option[Int] = numClasses,
|
||||
numFeatures: Option[Int] = numFeatures,
|
||||
numItems: Option[Int] = numItems,
|
||||
numUsers: Option[Int] = numUsers,
|
||||
vocabSize: Option[Int] = vocabSize,
|
||||
optimizer: Option[String] = optimizer,
|
||||
regParam: Option[Double] = regParam,
|
||||
rank: Option[Int] = rank,
|
||||
smoothing: Option[Double] = smoothing,
|
||||
tol: Option[Double] = tol): MLParams = {
|
||||
new MLParams(randomSeed = randomSeed, numExamples = numExamples,
|
||||
numTestExamples = numTestExamples, numPartitions = numPartitions,
|
||||
bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
|
||||
elasticNetParam = elasticNetParam, family = family, k = k, link = link, maxIter = maxIter,
|
||||
numClasses = numClasses, numFeatures = numFeatures,
|
||||
numItems = numItems, numUsers = numUsers, optimizer = optimizer, regParam = regParam,
|
||||
rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
object MLParams {
|
||||
val empty = MLParams()
|
||||
val empty = new MLParams()
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -17,8 +17,8 @@ benchmarks:
|
||||
params:
|
||||
numExamples: 10
|
||||
numTestExamples: 10
|
||||
ldaDocLength: 20
|
||||
ldaNumVocabulary: 4
|
||||
docLength: 20
|
||||
vocabSize: 4
|
||||
k: 5
|
||||
maxIter: 10
|
||||
optimizer:
|
||||
@ -87,6 +87,6 @@ benchmarks:
|
||||
- name: classification.NaiveBayes
|
||||
params:
|
||||
numExamples: 100
|
||||
naiveBayesSmoothing: 1.0
|
||||
smoothing: 1.0
|
||||
numClasses: 10
|
||||
numFeatures: [10]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user