Update tests to run with Spark 2.2, add NaiveBayes & Bucketizer ML tests (#110)

* Made small updates in Benchmark.scala and Query.scala for Spark 2.2
* Added tests for NaiveBayesModel and Bucketizer
* Changed BenchmarkAlgorithm.getEstimator() -> BenchmarkAlgorithm.getPipelineStage() to allow for the benchmarking of Estimators and Transformers instead of just Estimators

Commits:
* Changes made so that spark-sql-perf compiles with Spark 2.2

* Updates for running ML tests from the command line + added Naive Bayes test

* Add Bucketizer test as example of Featurizer test; change getEstimator() to getPipelineStage() in
BenchmarkAlgorithm to allow for testing of transformers in addition to estimators.

* Add comment for main method in MLlib.scala

* Rename MLTransformerBenchmarkable --> MLPipelineStageBenchmarkable, fix issue with NaiveBayes param

* Add UnaryTransformer trait for common data/methods to be shared across all objects
testing featurizers that operate on a single column (StringIndexer, OneHotEncoder, Bucketizer, HashingTF, etc)

* Respond to review comments:

* bin/run-ml: Add newline at EOF
* Query.scala: organized imports
* MLlib.scala: organized imports, fixed SparkContext initialization
* NaiveBayes.scala: removed unused temp val, improved probability calculation in trueModel()
* Bucketizer.scala: use DataGenerator.generateContinuousFeatures instead of generating data on the driver

* Fix bug in Bucketizer.scala

* Precompute log of sum of unnormalized probabilities in NaiveBayes.scala, add NaiveBayes and Bucketizer tests to mllib-small.yaml

* Update Query.scala to use p() to access SparkPlans under a given SparkPlan

* Update README to indicate that spark-sql-perf only works with Spark 2.2+ after this PR
This commit is contained in:
Siddharth Murching 2017-08-21 15:07:46 -07:00 committed by jkbradley
parent b3a6ed79b3
commit d0de5ae8aa
24 changed files with 231 additions and 54 deletions

View File

@ -2,7 +2,7 @@
[![Build Status](https://travis-ci.org/databricks/spark-sql-perf.svg)](https://travis-ci.org/databricks/spark-sql-perf)
This is a performance testing framework for [Spark SQL](https://spark.apache.org/sql/) in [Apache Spark](https://spark.apache.org/) 1.6+.
This is a performance testing framework for [Spark SQL](https://spark.apache.org/sql/) in [Apache Spark](https://spark.apache.org/) 2.2+.
**Note: This README is still under development. Please also check our source code for more information.**
@ -26,6 +26,11 @@ Usage: spark-sql-perf [options]
$ bin/run --benchmark DatasetPerformance
```
### MLlib tests
To run MLlib tests, run `bin/run-ml yamlfile`, where `yamlfile` is the path to a YAML configuration
file describing tests to run and their parameters.
# TPC-DS
## How to use it

8
bin/run-ml Executable file
View File

@ -0,0 +1,8 @@
#!/bin/bash
# Runs spark-sql-perf ML tests from the current directory.
# Accepts a single command-line argument: the path to a YAML configuration file
# specifying the tests to be run & their params.
#
# Usage: bin/run-ml <yaml-config-file>

# Fail fast with a usage message instead of launching sbt with no config file.
if [ "$#" -lt 1 ]; then
  echo "usage: $0 <yaml-config-file>" >&2
  exit 1
fi

# sbt input tasks take a single space-delimited argument string; "$*" (not "$@")
# is the correct form for joining the arguments into one word.
build/sbt "runMLBenchmark $*"

View File

@ -14,7 +14,7 @@ sparkPackageName := "databricks/spark-sql-perf"
// All Spark Packages need a license
licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
sparkVersion := "2.0.1"
sparkVersion := "2.2.0"
sparkComponents ++= Seq("sql", "hive", "mllib")
@ -72,6 +72,19 @@ runBenchmark := {
streams.value.log)
}
val runMLBenchmark = inputKey[Unit]("runs an ML benchmark")
runMLBenchmark := {
import complete.DefaultParsers._
val args = spaceDelimited("[args]").parsed
val scalaRun = (runner in run).value
val classpath = (fullClasspath in Compile).value
scalaRun.run("com.databricks.spark.sql.perf.mllib.MLLib", classpath.map(_.data), args,
streams.value.log)
}
import ReleaseTransformations._
/** Push to the team directory instead of the user's homedir for releases. */

View File

@ -334,7 +334,7 @@ object Benchmark {
.flatMap { query =>
try {
query.newDataFrame().queryExecution.logical.collect {
case UnresolvedRelation(t, _) => t.table
case UnresolvedRelation(t) => t.table
}
} catch {
// ignore the queries that can't be parsed

View File

@ -16,9 +16,9 @@
package com.databricks.spark.sql.perf
import scala.language.implicitConversions
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.language.implicitConversions
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
@ -54,7 +54,7 @@ class Query(
}
lazy val tablesInvolved = buildDataFrame.queryExecution.logical collect {
case UnresolvedRelation(tableIdentifier, _) => {
case UnresolvedRelation(tableIdentifier) => {
// We are ignoring the database name.
tableIdentifier.table
}
@ -85,14 +85,14 @@ class Query(
val breakdownResults = if (includeBreakdown) {
val depth = queryExecution.executedPlan.collect { case p: SparkPlan => p }.size
val physicalOperators = (0 until depth).map(i => (i, queryExecution.executedPlan(i)))
val physicalOperators = (0 until depth).map(i => (i, queryExecution.executedPlan.p(i)))
val indexMap = physicalOperators.map { case (index, op) => (op, index) }.toMap
val timeMap = new mutable.HashMap[Int, Double]
physicalOperators.reverse.map {
case (index, node) =>
messages += s"Breakdown: ${node.simpleString}"
val newNode = buildDataFrame.queryExecution.executedPlan(index)
val newNode = buildDataFrame.queryExecution.executedPlan.p(index)
val executionTime = measureTimeMs {
newNode.execute().foreach((row: Any) => Unit)
}
@ -100,7 +100,6 @@ class Query(
val childIndexes = node.children.map(indexMap)
val childTime = childIndexes.map(timeMap).sum
messages += s"Breakdown time: $executionTime (+${executionTime - childTime})"
BreakdownResult(

View File

@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib
import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
import org.apache.spark.ml.attribute.{NominalAttribute, NumericAttribute}
import org.apache.spark.ml.{Estimator, Transformer}
import org.apache.spark.ml.{Estimator, PipelineStage, Transformer}
import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
@ -27,9 +27,9 @@ trait BenchmarkAlgorithm extends Logging {
def testDataSet(ctx: MLBenchContext): DataFrame
/**
* Create an [[Estimator]] with params set from the given [[MLBenchContext]].
* Create an [[Estimator]] or [[Transformer]] with params set from the given [[MLBenchContext]].
*/
def getEstimator(ctx: MLBenchContext): Estimator[_]
def getPipelineStage(ctx: MLBenchContext): PipelineStage
/**
* The unnormalized score of the training procedure on a dataset. The normalization is

View File

@ -30,8 +30,8 @@ object MLBenchmarks {
val context = SparkContext.getOrCreate()
val sqlContext: SQLContext = SQLContext.getOrCreate(context)
def benchmarkObjects: Seq[MLTransformerBenchmarkable] = benchmarks.map { mlb =>
new MLTransformerBenchmarkable(mlb.params, mlb.benchmark, sqlContext)
def benchmarkObjects: Seq[MLPipelineStageBenchmarkable] = benchmarks.map { mlb =>
new MLPipelineStageBenchmarkable(mlb.params, mlb.benchmark, sqlContext)
}
}

View File

@ -2,12 +2,12 @@ package com.databricks.spark.sql.perf.mllib
import scala.language.implicitConversions
import com.databricks.spark.sql.perf._
import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import com.databricks.spark.sql.perf._
class MLLib(@transient sqlContext: SQLContext)
@ -34,6 +34,16 @@ object MLLib extends Logging {
e.getCurrentResults()
}
/**
 * Entry point for running ML tests. Expects a single command-line argument: the path to
 * a YAML config file specifying which ML tests to run and their parameters.
 * @param args command line args; args(0) must be the path to the YAML config file
 */
def main(args: Array[String]): Unit = {
  // Fail fast with a clear usage message rather than an opaque
  // ArrayIndexOutOfBoundsException when no config file is given.
  require(args.length >= 1, "usage: MLLib <path-to-yaml-config-file>")
  val configFile = args(0)
  run(yamlFile = configFile)
}
/**
* Runs all the experiments and blocks on completion
*
@ -46,13 +56,15 @@ object MLLib extends Logging {
require(yamlConfig != null)
YamlConfig.readString(yamlConfig)
}
val sc = SparkContext.getOrCreate()
val sparkConf = new SparkConf().setAppName("MLlib QA").setMaster("local[2]")
val sc = SparkContext.getOrCreate(sparkConf)
sc.setLogLevel("INFO")
val b = new com.databricks.spark.sql.perf.mllib.MLLib()
val sqlContext = com.databricks.spark.sql.perf.mllib.MLBenchmarks.sqlContext
val benchmarksDescriptions = conf.runnableBenchmarks
val benchmarks = benchmarksDescriptions.map { mlb =>
new MLTransformerBenchmarkable(mlb.params, mlb.benchmark, sqlContext)
new MLPipelineStageBenchmarkable(mlb.params, mlb.benchmark, sqlContext)
}
println(s"${benchmarks.size} benchmarks identified:")
val str = benchmarks.map(_.prettyPrint).mkString("\n")

View File

@ -1,20 +1,21 @@
package com.databricks.spark.sql.perf.mllib
import com.databricks.spark.sql.perf._
import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
import org.apache.spark.sql._
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.ml.Transformer
import com.typesafe.scalalogging.slf4j.{LazyLogging => Logging}
class MLTransformerBenchmarkable(
import org.apache.spark.ml.{Estimator, Transformer}
import org.apache.spark.sql._
import com.databricks.spark.sql.perf._
class MLPipelineStageBenchmarkable(
params: MLParams,
test: BenchmarkAlgorithm,
sqlContext: SQLContext)
extends Benchmarkable with Serializable with Logging {
import MLTransformerBenchmarkable._
import MLPipelineStageBenchmarkable._
private var testData: DataFrame = null
private var trainingData: DataFrame = null
@ -48,17 +49,27 @@ class MLTransformerBenchmarkable(
try {
val (trainingTime, model: Transformer) = measureTime {
logger.info(s"$this: train: trainingSet=${trainingData.schema}")
val estimator = test.getEstimator(param)
estimator.fit(trainingData)
test.getPipelineStage(param) match {
case est: Estimator[_] => est.fit(trainingData)
case transformer: Transformer =>
transformer.transform(trainingData)
transformer
case other: Any => throw new UnsupportedOperationException("Algorithm to benchmark must" +
s" be an estimator or transformer, found ${other.getClass} instead.")
}
}
logger.info(s"model: $model")
val (_, scoreTraining) = measureTime {
val (scoreTrainTime, scoreTraining) = measureTime {
test.score(param, trainingData, model)
}
val (scoreTestTime, scoreTest) = measureTime {
test.score(param, testData, model)
}
logger.info(s"$this doBenchmark: Trained model in ${trainingTime.toMillis / 1000.0}" +
s" s, Scored training dataset in ${scoreTrainTime.toMillis / 1000.0} s," +
s" test dataset in ${scoreTestTime.toMillis / 1000.0} s")
val ml = MLResult(
trainingTime = Some(trainingTime.toMillis),
@ -96,7 +107,7 @@ class MLTransformerBenchmarkable(
}
object MLTransformerBenchmarkable {
object MLPipelineStageBenchmarkable {
private def pprint(p: AnyRef): Seq[String] = {
val m = getCCParams(p)
m.flatMap {

View File

@ -1,6 +1,6 @@
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml._
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql.DataFrame
@ -34,7 +34,7 @@ abstract class TreeOrForestClassification extends BenchmarkAlgorithm
object DecisionTreeClassification extends TreeOrForestClassification {
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new DecisionTreeClassifier()
.setMaxDepth(depth)

View File

@ -1,12 +1,12 @@
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer, TreeUtils}
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.sql._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
@ -31,7 +31,7 @@ object GBTClassification extends BenchmarkAlgorithm
ctx.seed())
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
// TODO: subsamplingRate, featureSubsetStrategy
// TODO: cacheNodeIds, checkpoint?

View File

@ -3,9 +3,8 @@ package com.databricks.spark.sql.perf.mllib.classification
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer}
import org.apache.spark.ml.{Estimator, ModelBuilder, PipelineStage, Transformer}
import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors
@ -32,7 +31,7 @@ object LogisticRegression extends BenchmarkAlgorithm
ModelBuilder.newLogisticRegressionModel(coefficients, intercept)
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new ml.classification.LogisticRegression()
.setTol(tol)

View File

@ -0,0 +1,65 @@
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
/** Object containing methods used in performance tests for (multinomial) NaiveBayesModels */
object NaiveBayes extends BenchmarkAlgorithm
with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {

// Generates the raw feature data: numExamples rows of categorical features whose
// per-feature arity is drawn uniformly from [2, maxFeatureArity).
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
val rng = ctx.newGenerator()
// Max possible arity of a feature in generated training/test data for NaiveBayes models
val maxFeatureArity = 20
// All features for Naive Bayes must be categorical, i.e. have arity >= 2
val featureArity = 0.until(numFeatures).map(_ => 2 + rng.nextInt(maxFeatureArity - 2)).toArray
DataGenerator.generateMixedFeatures(
ctx.sqlContext,
numExamples,
ctx.seed(),
numPartitions,
featureArity)
}

// Builds a ground-truth NaiveBayesModel used to label the generated data.
override protected def trueModel(ctx: MLBenchContext): Transformer = {
import ctx.params._
val rng = ctx.newGenerator()
// pi = log of class priors, whose dimension is C (number of classes)
// theta = log of class conditional probabilities, whose dimension is C (number of classes)
// by D (number of features)
// Class priors: draw positive unnormalized weights (the +1e-5 keeps log() finite),
// then normalize in log space: log(p_i) = log(w_i) - log(sum(w)).
val unnormalizedProbs = 0.until(numClasses).map(_ => rng.nextDouble() + 1e-5).toArray
val logProbSum = math.log(unnormalizedProbs.sum)
val piArray = unnormalizedProbs.map(prob => math.log(prob) - logProbSum)
// For class i, set the class-conditional probability of feature i to 0.7, and split up the
// remaining probability mass across the other features
// NOTE(review): probs(i) = currClassProb indexes features by class, which assumes
// numClasses <= numFeatures; baseProbMass also divides by (numFeatures - 1), which
// assumes numFeatures > 1 — TODO confirm the YAML configs guarantee both.
val currClassProb = 0.7
val thetaArray = Array.tabulate(numClasses) { i: Int =>
val baseProbMass = (1 - currClassProb) / (numFeatures - 1)
val probs = Array.fill[Double](numFeatures)(baseProbMass)
probs(i) = currClassProb
probs
}.map(_.map(math.log))
// Initialize new Naive Bayes model
val pi = Vectors.dense(piArray)
// isTransposed = true: thetaArray.flatten is laid out row-major (one row per class)
val theta = new DenseMatrix(numClasses, numFeatures, thetaArray.flatten, true)
ModelBuilder.newNaiveBayesModel(pi, theta)
}

// The stage under benchmark: a fresh NaiveBayes estimator configured from the test params.
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new ml.classification.NaiveBayes()
.setSmoothing(naiveBayesSmoothing)
}

// Scores predictions with the default MulticlassClassificationEvaluator metric.
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new MulticlassClassificationEvaluator()
}

View File

@ -1,6 +1,6 @@
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.{Estimator, PipelineStage}
import org.apache.spark.ml.classification.RandomForestClassifier
import com.databricks.spark.sql.perf.mllib._
@ -9,7 +9,7 @@ import com.databricks.spark.sql.perf.mllib.OptionImplicits._
object RandomForestClassification extends TreeOrForestClassification {
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
// TODO: subsamplingRate, featureSubsetStrategy
// TODO: cacheNodeIds, checkpoint?

View File

@ -1,7 +1,7 @@
package com.databricks.spark.sql.perf.mllib.clustering
import org.apache.spark.ml
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.{Estimator, PipelineStage}
import org.apache.spark.sql._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
@ -17,7 +17,7 @@ object KMeans extends BenchmarkAlgorithm with TestFromTraining {
numPartitions, numFeatures)
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new ml.clustering.KMeans()
.setK(k)

View File

@ -4,7 +4,7 @@ import scala.collection.mutable.{HashMap => MHashMap}
import org.apache.commons.math3.random.Well19937c
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.{Estimator, PipelineStage}
import org.apache.spark.ml
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
@ -44,7 +44,7 @@ object LDA extends BenchmarkAlgorithm with TestFromTraining {
ctx.sqlContext.createDataFrame(data).toDF("docIndex", "features")
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new ml.clustering.LDA()
.setK(k)

View File

@ -0,0 +1,42 @@
package com.databricks.spark.sql.perf.mllib.feature
import scala.util.Random
import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
/** Object for testing Bucketizer performance */
object Bucketizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer {

  /**
   * Generates training data for the Bucketizer: a single DataFrame column (named
   * [[inputCol]]) of random doubles, produced on the executors by DataGenerator
   * (seeded with ctx.seed()), not on the driver.
   */
  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    import ctx.sqlContext.implicits._
    // For a bucketizer, training data consists of a single column of random doubles.
    // No driver-side RNG is needed here; the removed ctx.newGenerator() val was unused.
    DataGenerator.generateContinuousFeatures(ctx.sqlContext,
      numExamples, ctx.seed(), numPartitions, numFeatures = 1).rdd.map { case Row(vec: Vector) =>
        vec(0) // extract the single generated double value for each row
    }.toDF(inputCol)
  }

  /** Builds a Bucketizer with bucketizerNumBuckets randomly-placed buckets over the reals. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val rng = ctx.newGenerator()
    // Generate an array of (finite) splitting points in [-1, 1) for the Bucketizer;
    // n buckets require n + 1 split boundaries, so draw bucketizerNumBuckets - 1 interior points.
    val splitPoints = 0.until(bucketizerNumBuckets - 1).map { _ =>
      2 * rng.nextDouble() - 1
    }.sorted.toArray
    // Final array of splits contains +/- infinity so every input value lands in some bucket
    val splits = Array(Double.NegativeInfinity) ++ splitPoints ++ Array(Double.PositiveInfinity)
    new ml.feature.Bucketizer()
      .setSplits(splits)
      .setInputCol(inputCol)
  }
}

View File

@ -0,0 +1,6 @@
package com.databricks.spark.sql.perf.mllib.feature
/** Trait defining common state/methods for featurizers taking a single input col */
private[feature] trait UnaryTransformer {
// Shared name of the single input column, so that the generated training data
// and the pipeline stage under test (e.g. Bucketizer) agree on the column name.
private[feature] val inputCol = "inputCol"
}

View File

@ -1,13 +1,13 @@
package com.databricks.spark.sql.perf.mllib.recommendation
import org.apache.spark.ml
import org.apache.spark.ml.evaluation.{RegressionEvaluator, Evaluator}
import org.apache.spark.ml.{Transformer, Estimator}
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{ScoringWithEvaluator, BenchmarkAlgorithm, MLBenchContext}
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, ScoringWithEvaluator}
object ALS extends BenchmarkAlgorithm with ScoringWithEvaluator {
@ -37,7 +37,7 @@ object ALS extends BenchmarkAlgorithm with ScoringWithEvaluator {
ctx.seed())._2
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new ml.recommendation.ALS()
.setSeed(ctx.seed())

View File

@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib.regression
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer}
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
@ -36,7 +36,7 @@ object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with
m
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new GeneralizedLinearRegression()
.setLink(link)

View File

@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib.regression
import org.apache.spark.ml
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer}
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
@ -32,7 +32,7 @@ object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with
ModelBuilder.newLinearRegressionModel(coefficients, intercept)
}
override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
import ctx.params._
new ml.regression.LinearRegression()
.setSolver("l-bfgs")

View File

@ -113,6 +113,7 @@ case class MLParams(
numTestExamples: Option[Long] = None,
numPartitions: Option[Int] = None,
// *** Specialized and sorted by name ***
bucketizerNumBuckets: Option[Int] = None,
depth: Option[Int] = None,
elasticNetParam: Option[Double] = None,
family: Option[String] = None,
@ -121,6 +122,7 @@ case class MLParams(
ldaNumVocabulary: Option[Int] = None,
link: Option[String] = None,
maxIter: Option[Int] = None,
naiveBayesSmoothing: Option[Double] = None,
numClasses: Option[Int] = None,
numFeatures: Option[Int] = None,
numItems: Option[Int] = None,

View File

@ -80,3 +80,13 @@ benchmarks:
regParam: 0.1
rank: 10
maxIter: 6
- name: feature.Bucketizer
params:
numExamples: 100
bucketizerNumBuckets: 10
- name: classification.NaiveBayes
params:
numExamples: 100
naiveBayesSmoothing: 1.0
numClasses: 10
numFeatures: [10]

View File

@ -1,8 +1,8 @@
package org.apache.spark.ml
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, LogisticRegressionModel}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.regression.{LinearRegressionModel, GeneralizedLinearRegressionModel, DecisionTreeRegressionModel}
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, LogisticRegressionModel, NaiveBayesModel}
import org.apache.spark.ml.linalg.{Matrix, Vector}
import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, GeneralizedLinearRegressionModel, LinearRegressionModel}
import org.apache.spark.ml.tree._
import org.apache.spark.mllib.random.RandomDataGenerator
import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
@ -51,6 +51,11 @@ object ModelBuilder {
featureArity = featureArity, seed = seed)
new DecisionTreeRegressionModel(rootNode, numFeatures = featureArity.length)
}
/**
 * Constructs a multinomial NaiveBayesModel from precomputed parameters.
 * Per the caller in NaiveBayes.scala, `pi` holds log class priors and `theta` holds
 * log class-conditional probabilities (numClasses x numFeatures) — confirm for new callers.
 * @param pi class-prior vector (length = number of classes)
 * @param theta class-conditional probability matrix (classes x features)
 * @return the constructed model, with modelType set to "multinomial"
 */
def newNaiveBayesModel(pi: Vector, theta: Matrix): NaiveBayesModel = {
val model = new NaiveBayesModel("naivebayes-uid", pi, theta)
// Param.set returns this.type, so this expression yields `model` itself
model.set(model.modelType, "multinomial")
}
}
/**