Additional method test for some ML algos (#139)
Add additional method test for some ML algos. In this PR, I add `associationRules` in `FPGrowth` and `findSynonyms`. After the design is accepted, I will add other methods later. Add an interface in `BenchmarkableAlgorithm`: ``` def testAdditionalMethods(ctx: MLBenchContext, model: Transformer): Map[String, () => _] ```
This commit is contained in:
parent
5af9f6dfc2
commit
be4459fe41
@ -1,7 +1,6 @@
|
||||
package com.databricks.spark.sql.perf.mllib
|
||||
|
||||
import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
|
||||
|
||||
import org.apache.spark.ml.attribute.{NominalAttribute, NumericAttribute}
|
||||
import org.apache.spark.ml.{Estimator, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.evaluation.Evaluator
|
||||
@ -44,6 +43,17 @@ trait BenchmarkAlgorithm extends Logging {
|
||||
def name: String = {
|
||||
this.getClass.getCanonicalName.replace("$", "")
|
||||
}
|
||||
|
||||
/**
|
||||
* Test additional methods for some algorithms.
|
||||
*
|
||||
* @param transformer The transformer which includes additional methods.
|
||||
* @return A map which key is the additional method name, and value is a function which runs
|
||||
* the corresponding method.
|
||||
*/
|
||||
def testAdditionalMethods(
|
||||
ctx: MLBenchContext,
|
||||
transformer: Transformer): Map[String, () => _] = null
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -80,12 +80,18 @@ class MLPipelineStageBenchmarkable(
|
||||
s" s, Scored training dataset in ${scoreTrainTime.toMillis / 1000.0} s," +
|
||||
s" test dataset in ${scoreTestTime.toMillis / 1000.0} s")
|
||||
|
||||
val additionalTests = test.testAdditionalMethods(param, model).map {
|
||||
tuple =>
|
||||
val (additionalMethodTime, _) = measureTime { tuple._2() }
|
||||
tuple._1 -> additionalMethodTime.toMillis.toDouble
|
||||
}
|
||||
|
||||
val ml = MLResult(
|
||||
trainingTime = Some(trainingTime.toMillis),
|
||||
trainingMetric = Some(scoreTraining),
|
||||
testTime = Some(scoreTestTime.toMillis),
|
||||
testMetric = Some(scoreTest / testDataCount.get))
|
||||
testMetric = Some(scoreTest / testDataCount.get),
|
||||
additionalTests = additionalTests)
|
||||
|
||||
BenchmarkResult(
|
||||
name = name,
|
||||
|
||||
@ -1,10 +1,15 @@
|
||||
package com.databricks.spark.sql.perf.mllib.feature
|
||||
|
||||
import scala.util.Random
|
||||
|
||||
import org.apache.spark.ml
|
||||
import org.apache.spark.ml.PipelineStage
|
||||
import org.apache.spark.ml.{PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.feature.Word2VecModel
|
||||
import org.apache.spark.ml.linalg.Vectors
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.functions.{col, split}
|
||||
|
||||
import com.databricks.spark.sql.perf.MLResult
|
||||
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
|
||||
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
|
||||
@ -31,4 +36,18 @@ object Word2Vec extends BenchmarkAlgorithm with TestFromTraining {
|
||||
new ml.feature.Word2Vec().setInputCol("text")
|
||||
}
|
||||
|
||||
override def testAdditionalMethods(
|
||||
ctx: MLBenchContext,
|
||||
model: Transformer): Map[String, () => _] = {
|
||||
import ctx.params._
|
||||
|
||||
val rng = new Random(ctx.seed())
|
||||
val word2vecModel = model.asInstanceOf[Word2VecModel]
|
||||
val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian()))
|
||||
|
||||
Map("findSynonyms" -> (() => {
|
||||
word2vecModel.findSynonyms(testWord, numSynonymsToFind)
|
||||
}))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
package com.databricks.spark.sql.perf.mllib.fpm
|
||||
|
||||
import org.apache.spark.ml
|
||||
import org.apache.spark.ml.PipelineStage
|
||||
import org.apache.spark.ml.{PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.fpm.FPGrowthModel
|
||||
import org.apache.spark.sql.DataFrame
|
||||
|
||||
import com.databricks.spark.sql.perf.mllib._
|
||||
@ -28,4 +29,14 @@ object FPGrowth extends BenchmarkAlgorithm with TestFromTraining {
|
||||
new ml.fpm.FPGrowth()
|
||||
.setItemsCol("items")
|
||||
}
|
||||
|
||||
override def testAdditionalMethods(
|
||||
ctx: MLBenchContext,
|
||||
model: Transformer): Map[String, () => _] = {
|
||||
|
||||
val fpModel = model.asInstanceOf[FPGrowthModel]
|
||||
Map("associationRules" -> (() => {
|
||||
fpModel.associationRules.count()
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
@ -131,6 +131,7 @@ class MLParams(
|
||||
val numClasses: Option[Int] = None,
|
||||
val numFeatures: Option[Int] = None,
|
||||
val numHashTables: Option[Int] = Some(1),
|
||||
val numSynonymsToFind: Option[Int] = None,
|
||||
val numInputCols: Option[Int] = None,
|
||||
val numItems: Option[Int] = None,
|
||||
val numUsers: Option[Int] = None,
|
||||
@ -173,6 +174,7 @@ class MLParams(
|
||||
numClasses: Option[Int] = numClasses,
|
||||
numFeatures: Option[Int] = numFeatures,
|
||||
numHashTables: Option[Int] = numHashTables,
|
||||
numSynonymsToFind: Option[Int] = numSynonymsToFind,
|
||||
numInputCols: Option[Int] = numInputCols,
|
||||
numItems: Option[Int] = numItems,
|
||||
numUsers: Option[Int] = numUsers,
|
||||
@ -188,8 +190,8 @@ class MLParams(
|
||||
elasticNetParam = elasticNetParam, family = family, featureArity = featureArity,
|
||||
itemSetSize = itemSetSize, k = k, link = link, maxIter = maxIter,
|
||||
numClasses = numClasses, numFeatures = numFeatures, numHashTables = numHashTables,
|
||||
numInputCols = numInputCols, numItems = numItems, numUsers = numUsers,
|
||||
optimizer = optimizer, regParam = regParam,
|
||||
numInputCols = numInputCols, numItems = numItems, numSynonymsToFind = numSynonymsToFind,
|
||||
numUsers = numUsers, optimizer = optimizer, regParam = regParam,
|
||||
rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
|
||||
}
|
||||
}
|
||||
@ -207,9 +209,11 @@ object MLParams {
|
||||
* @param testTime (MLlib) Test time (for prediction on test set, or on training set if there
|
||||
* is no test set).
|
||||
* @param testMetric (MLlib) Test metric, such as accuracy
|
||||
* @param additionalTests (MLlib) Additional methods test results.
|
||||
*/
|
||||
case class MLResult(
|
||||
trainingTime: Option[Double] = None,
|
||||
trainingMetric: Option[Double] = None,
|
||||
testTime: Option[Double] = None,
|
||||
testMetric: Option[Double] = None)
|
||||
testMetric: Option[Double] = None,
|
||||
additionalTests: Map[String, Double])
|
||||
|
||||
@ -120,6 +120,7 @@ benchmarks:
|
||||
numExamples: 100
|
||||
vocabSize: 100
|
||||
docLength: 10
|
||||
numSynonymsToFind: 3
|
||||
- name: recommendation.ALS
|
||||
params:
|
||||
numExamples: 100
|
||||
|
||||
Loading…
Reference in New Issue
Block a user