[ML-2847] Add new tests for (DecisionTree, RandomForest)Regression, GMM, HashingTF (#116)

This PR follows up on #112, adding new performance tests for DecisionTreeRegression, RandomForestRegression, GMM, and HashingTF.

Summary of changes:
* Added new performance tests
* Updated configs in mllib-small.yaml
** Alphabetized configs
** Added new configs for: RandomForestRegression, DecisionTreeRegression, GMM, HashingTF
* Refactored TreeOrForestClassification into a trait (TreeOrForestEstimator) exposing methods for all tree/forest estimator performance tests.
** Copied code from DecisionTreeClassification.scala into TreeOrForestEstimator.scala

I tested this PR by running the performance tests specified in mllib-small.yaml.
This commit is contained in:
Siddharth Murching 2017-09-03 22:26:20 -07:00 committed by jkbradley
parent 19c41464c7
commit 3e1bbd00ed
11 changed files with 282 additions and 134 deletions

View File

@ -0,0 +1,74 @@
package com.databricks.spark.sql.perf.mllib
import org.apache.spark.ml.{ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator,
RegressionEvaluator}
import org.apache.spark.sql.DataFrame
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
/**
 * Base trait for BenchmarkAlgorithm objects testing a tree or forest estimator.
 * Centralizes the shared training-data generation so classifier and regressor
 * benchmarks only need to supply an evaluator and a ground-truth model.
 */
private[mllib] trait TreeOrForestEstimator
  extends TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
  self: BenchmarkAlgorithm =>

  /**
   * Generates a dataset of mixed categorical/continuous features and attaches the
   * categorical-arity metadata that Spark's tree algorithms read from the schema.
   */
  override protected def initialData(ctx: MLBenchContext) = {
    import ctx.params._
    val arity: Array[Int] = TreeOrForestEstimator.getFeatureArity(ctx)
    val generated: DataFrame = DataGenerator.generateMixedFeatures(
      ctx.sqlContext, numExamples, ctx.seed(), numPartitions, arity)
    TreeUtils.setMetadata(generated, "features", arity)
  }
}
/** Base trait for BenchmarkAlgorithm objects testing a tree or forest classifier. */
private[mllib] trait TreeOrForestClassifier extends TreeOrForestEstimator {
  self: BenchmarkAlgorithm =>

  /** Classification benchmarks score with a multiclass evaluator. */
  override protected def evaluator(ctx: MLBenchContext): Evaluator =
    new MulticlassClassificationEvaluator()

  /** Ground-truth decision-tree model used to label the generated training set. */
  override protected def trueModel(ctx: MLBenchContext): Transformer =
    ModelBuilder.newDecisionTreeClassificationModel(
      ctx.params.depth,
      ctx.params.numClasses,
      TreeOrForestEstimator.getFeatureArity(ctx),
      ctx.seed())
}
/** Base trait for BenchmarkAlgorithm objects testing a tree or forest regressor. */
private[mllib] trait TreeOrForestRegressor extends TreeOrForestEstimator {
  self: BenchmarkAlgorithm =>

  /** Regression benchmarks score with a regression evaluator. */
  override protected def evaluator(ctx: MLBenchContext): Evaluator =
    new RegressionEvaluator()

  /** Ground-truth decision-tree model used to label the generated training set. */
  override protected def trueModel(ctx: MLBenchContext): Transformer =
    ModelBuilder.newDecisionTreeRegressionModel(
      ctx.params.depth,
      TreeOrForestEstimator.getFeatureArity(ctx),
      ctx.seed())
}
private[mllib] object TreeOrForestEstimator {

  /**
   * Get feature arity for tree and tree ensemble tests.
   * Currently, this is hard-coded as:
   *  - 1/4 binary features
   *  - 1/4 high-arity (20-category) features
   *  - 1/2 continuous features
   *
   * @return Array of length numFeatures, where 0 indicates continuous feature and
   *         value > 0 indicates a categorical feature of that arity.
   */
  def getFeatureArity(ctx: MLBenchContext): Array[Int] = {
    val total = ctx.params.numFeatures
    val quarter = total / 4
    val binary = Array.fill(quarter)(2)                  // low-arity categorical
    val highArity = Array.fill(quarter)(20)              // high-arity categorical
    val continuous = Array.fill(total - 2 * quarter)(0)  // continuous (remainder)
    binary ++ highArity ++ continuous
  }
}

View File

@ -2,37 +2,11 @@ package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml._
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql.DataFrame
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
abstract class TreeOrForestClassification extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
import TreeOrForestClassification.getFeatureArity
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val featureArity: Array[Int] = getFeatureArity(ctx)
val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples,
ctx.seed(), numPartitions, featureArity)
TreeUtils.setMetadata(data, "features", featureArity)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
getFeatureArity(ctx), ctx.seed())
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}
object DecisionTreeClassification extends TreeOrForestClassification {
object DecisionTreeClassification extends BenchmarkAlgorithm with TreeOrForestClassifier {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
@ -41,24 +15,3 @@ object DecisionTreeClassification extends TreeOrForestClassification {
.setSeed(ctx.seed())
}
}
object TreeOrForestClassification {
/**
* Get feature arity for tree and tree ensemble tests.
* Currently, this is hard-coded as:
* - 1/2 binary features
* - 1/2 high-arity (20-category) features
* - 1/2 continuous features
*
* @return Array of length numFeatures, where 0 indicates continuous feature and
* value > 0 indicates a categorical feature of that arity.
*/
def getFeatureArity(ctx: MLBenchContext): Array[Int] = {
val numFeatures = ctx.params.numFeatures
val fourthFeatures = numFeatures / 4
Array.fill[Int](fourthFeatures)(2) ++ // low-arity categorical
Array.fill[Int](fourthFeatures)(20) ++ // high-arity categorical
Array.fill[Int](numFeatures - 2 * fourthFeatures)(0) // continuous
}
}

View File

@ -1,27 +1,14 @@
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer, TreeUtils}
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql._
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
object GBTClassification extends BenchmarkAlgorithm with TreeOrForestClassifier {
object GBTClassification extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {
import TreeOrForestClassification.getFeatureArity
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val featureArity: Array[Int] = getFeatureArity(ctx)
val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples,
ctx.seed(), numPartitions, featureArity)
TreeUtils.setMetadata(data, "features", featureArity)
}
import TreeOrForestEstimator.getFeatureArity
override protected def trueModel(ctx: MLBenchContext): Transformer = {
import ctx.params._
@ -41,6 +28,4 @@ object GBTClassification extends BenchmarkAlgorithm
.setSeed(ctx.seed())
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}

View File

@ -7,7 +7,7 @@ import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
object RandomForestClassification extends TreeOrForestClassification {
object RandomForestClassification extends BenchmarkAlgorithm with TreeOrForestClassifier {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._

View File

@ -0,0 +1,30 @@
package com.databricks.spark.sql.perf.mllib.clustering
import org.apache.spark.ml
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql.DataFrame
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
/** Performance test for Gaussian mixture model clustering (training only). */
object GaussianMixture extends BenchmarkAlgorithm with TestFromTraining {

  /** Generates synthetic training data drawn from k Gaussian centers. */
  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    DataGenerator.generateGaussianMixtureData(
      ctx.sqlContext,
      numCenters = k,
      numExamples = numExamples,
      seed = ctx.seed(),
      numPartitions = numPartitions,
      numFeatures = numFeatures)
  }

  /** Configures the GaussianMixture estimator under test from the benchmark params. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val estimator = new ml.clustering.GaussianMixture()
    estimator
      .setK(k)
      .setSeed(randomSeed.toLong)
      .setMaxIter(maxIter)
      .setTol(tol)
  }

  // TODO(?) add a scoring method here.
}

View File

@ -1,7 +1,7 @@
package com.databricks.spark.sql.perf.mllib.clustering
import org.apache.spark.ml
import org.apache.spark.ml.{Estimator, PipelineStage}
import org.apache.spark.ml.{PipelineStage}
import org.apache.spark.sql._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
@ -23,6 +23,7 @@ object KMeans extends BenchmarkAlgorithm with TestFromTraining {
.setK(k)
.setSeed(randomSeed.toLong)
.setMaxIter(maxIter)
.setTol(tol)
}
// TODO(?) add a scoring method here.

View File

@ -0,0 +1,42 @@
package com.databricks.spark.sql.perf.mllib.feature
import scala.util.Random
import org.apache.spark.ml
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._
import org.apache.spark.sql.functions.split
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
/** Performance test for the HashingTF feature transformer. */
object HashingTF extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer {

  /**
   * Sample a random sentence of length up to maxLen from the provided array of words.
   *
   * @param rng source of randomness
   * @param maxLen maximum sentence length (inclusive); must be >= 1
   * @param dictionary pool of words to sample from (assumed non-empty)
   * @return array of 1 to maxLen words sampled uniformly (with replacement)
   */
  private def randomSentence(rng: Random, maxLen: Int, dictionary: Array[String]): Array[String] = {
    require(maxLen >= 1, s"maxLen must be at least 1, got $maxLen")
    // nextInt(maxLen) yields 0..maxLen-1, so +1 gives 1..maxLen. The previous
    // nextInt(maxLen - 1) + 1 capped lengths at maxLen - 1 (contradicting the doc)
    // and threw IllegalArgumentException for maxLen == 1.
    val length = rng.nextInt(maxLen) + 1
    val dictLength = dictionary.length
    Array.tabulate[String](length)(_ => dictionary(rng.nextInt(dictLength)))
  }

  /** Generates documents as space-split word arrays in column `inputCol`. */
  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    // To test HashingTF, we generate arrays of (on average) docLength strings, where
    // each string is selected from a pool of vocabSize strings
    // The expected # of occurrences of each word in our vocabulary is
    // (docLength * numExamples) / vocabSize
    val df = DataGenerator.generateDoc(ctx.sqlContext, numExamples = numExamples, seed = ctx.seed(),
      numPartitions = numPartitions, vocabSize = vocabSize, avgDocLength = docLength,
      dataColName = inputCol)
    df.withColumn(inputCol, split(df(inputCol), " "))
  }

  /** Configures the HashingTF stage under test. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.feature.HashingTF()
      .setInputCol(inputCol)
      .setNumFeatures(numFeatures)
  }
}

View File

@ -0,0 +1,18 @@
package com.databricks.spark.sql.perf.mllib.regression
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.regression.DecisionTreeRegressor
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
/** Performance test for decision-tree regression training and scoring. */
object DecisionTreeRegression extends BenchmarkAlgorithm with TreeOrForestRegressor {

  /** Builds the DecisionTreeRegressor stage under test from the benchmark params. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val regressor = new DecisionTreeRegressor()
    regressor
      .setMaxDepth(depth)
      .setSeed(ctx.seed())
  }
}

View File

@ -0,0 +1,18 @@
package com.databricks.spark.sql.perf.mllib.regression
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.regression.RandomForestRegressor
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext,
TreeOrForestRegressor}
/** Performance test for random-forest regression training and scoring. */
object RandomForestRegression extends BenchmarkAlgorithm with TreeOrForestRegressor {

  /** Builds the RandomForestRegressor stage under test from the benchmark params. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val forest = new RandomForestRegressor()
    forest
      .setMaxDepth(depth)
      .setNumTrees(maxIter)
      .setSeed(ctx.seed())
  }
}

View File

@ -181,14 +181,14 @@ class MLParams(
new MLParams(randomSeed = randomSeed, numExamples = numExamples,
numTestExamples = numTestExamples, numPartitions = numPartitions,
bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
elasticNetParam = elasticNetParam, family = family, featureArity = featureArity, k = k, link = link, maxIter = maxIter,
elasticNetParam = elasticNetParam, family = family, featureArity = featureArity, k = k,
link = link, maxIter = maxIter,
numClasses = numClasses, numFeatures = numFeatures, numInputCols = numInputCols,
numItems = numItems, numUsers = numUsers, optimizer = optimizer, regParam = regParam,
rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
}
}
object MLParams {
val empty = new MLParams()
}

View File

@ -7,12 +7,62 @@ common:
numPartitions: 3
randomSeed: [1]
benchmarks:
- name: classification.DecisionTreeClassification
params:
numExamples: 100
numTestExamples: 10
depth: 3
numClasses: 4
numFeatures: 5
- name: classification.GBTClassification
params:
numExamples: 100
numTestExamples: 10
depth: 3
numClasses: 2
numFeatures: 5
maxIter: 3
- name: classification.LinearSVC
params:
numExamples: 100
numFeatures: 10
regParam: 0.1
tol: 0.001
maxIter: 10
- name: classification.LogisticRegression
params:
numFeatures: 100
regParam: 0.1
tol: [0.2, 0.1]
maxIter: 10
- name: classification.NaiveBayes
params:
numExamples: 100
smoothing: 1.0
numClasses: 10
numFeatures: [10]
- name: classification.RandomForestClassification
params:
numExamples: 100
numTestExamples: 10
depth: 3
numClasses: 4
numFeatures: 5
maxIter: 3
- name: clustering.GaussianMixture
params:
numExamples: 10
numTestExamples: 10
k: 5
maxIter: 10
tol: 0.01
- name: clustering.KMeans
params:
numExamples: 10
numTestExamples: 10
k: 5
maxIter: 10
tol: 1e-4
- name: clustering.LDA
params:
numExamples: 10
@ -24,72 +74,15 @@ benchmarks:
optimizer:
- em
- online
- name: clustering.KMeans
params:
numExamples: 10
numTestExamples: 10
k: 5
maxIter: 10
- name: regression.GLMRegression
params:
numExamples: 100
numTestExamples: 10
numFeatures: 5
link: log
family: gaussian
tol: 0.0
maxIter: 10
regParam: 0.1
- name: classification.DecisionTreeClassification
params:
numExamples: 100
numTestExamples: 10
depth: 3
numClasses: 4
numFeatures: 5
- name: classification.RandomForestClassification
params:
numExamples: 100
numTestExamples: 10
depth: 3
numClasses: 4
numFeatures: 5
maxIter: 3
- name: classification.GBTClassification
params:
numExamples: 100
numTestExamples: 10
depth: 3
numClasses: 2
numFeatures: 5
maxIter: 3
- name: regression.LinearRegression
params:
numExamples: 100
numTestExamples: 100
numFeatures: 100
regParam: 0.1
tol: [0.0]
maxIter: 10
- name: recommendation.ALS
params:
numExamples: 100
numTestExamples: 100
numUsers: 100
numItems: 100
regParam: 0.1
rank: 10
maxIter: 6
- name: feature.Bucketizer
params:
numExamples: 100
bucketizerNumBuckets: 10
- name: classification.NaiveBayes
- name: feature.HashingTF
params:
numExamples: 100
smoothing: 1.0
numClasses: 10
numFeatures: [10]
docLength: 20
vocabSize: 4
- name: feature.OneHotEncoder
params:
numExamples: 100
@ -112,10 +105,44 @@ benchmarks:
params:
numExamples: 100
numFeatures: 10
- name: classification.LinearSVC
- name: recommendation.ALS
params:
numExamples: 100
numFeatures: 10
numTestExamples: 100
numUsers: 100
numItems: 100
regParam: 0.1
tol: 0.001
maxIter: 10
rank: 10
maxIter: 6
- name: regression.DecisionTreeRegression
params:
numExamples: 100
numTestExamples: 10
depth: 3
numClasses: 4
numFeatures: 5
- name: regression.GLMRegression
params:
numExamples: 100
numTestExamples: 10
numFeatures: 5
link: log
family: gaussian
tol: 1e-6
maxIter: 10
regParam: 0.1
- name: regression.LinearRegression
params:
numExamples: 100
numTestExamples: 100
numFeatures: 100
regParam: 0.1
tol: [1e-6]
maxIter: 10
- name: regression.RandomForestRegression
params:
numExamples: 100
numTestExamples: 10
depth: 3
numFeatures: 5
maxIter: 3