[ML-3342] Bug fixes to make mllib benchmarks work with dbr-4.0. (#125)

In Spark 2.3, some default param values were moved from Models to their matching Estimators. I added explicit sets for these values in our tests to avoid errors. Also renamed ModelBuilder to ModelBuilderSSP to avoid a name conflict with dbml-local, which is included in Databricks Runtime.
This commit is contained in:
Bago Amirbekian 2018-03-02 09:12:38 -08:00 committed by jkbradley
parent 91604a3ab0
commit 6d01ac94a1
8 changed files with 23 additions and 17 deletions

View File

@ -1,6 +1,6 @@
package com.databricks.spark.sql.perf.mllib package com.databricks.spark.sql.perf.mllib
import org.apache.spark.ml.{ModelBuilder, Transformer, TreeUtils} import org.apache.spark.ml.{ModelBuilderSSP, Transformer, TreeUtils}
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator, import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator,
RegressionEvaluator} RegressionEvaluator}
import org.apache.spark.sql.DataFrame import org.apache.spark.sql.DataFrame
@ -31,7 +31,7 @@ private[mllib] trait TreeOrForestClassifier extends TreeOrForestEstimator {
} }
override protected def trueModel(ctx: MLBenchContext): Transformer = { override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses, ModelBuilderSSP.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed()) TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed())
} }
} }
@ -45,7 +45,7 @@ private[mllib] trait TreeOrForestRegressor extends TreeOrForestEstimator {
} }
override protected def trueModel(ctx: MLBenchContext): Transformer = { override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeRegressionModel(ctx.params.depth, ModelBuilderSSP.newDecisionTreeRegressionModel(ctx.params.depth,
TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed()) TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed())
} }

View File

@ -1,7 +1,7 @@
package com.databricks.spark.sql.perf.mllib.classification package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.classification.GBTClassifier import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer} import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib._
@ -14,7 +14,7 @@ object GBTClassification extends BenchmarkAlgorithm with TreeOrForestClassifier
import ctx.params._ import ctx.params._
// We add +1 to the depth to make it more likely that many iterations of boosting are needed // We add +1 to the depth to make it more likely that many iterations of boosting are needed
// to model the true tree. // to model the true tree.
ModelBuilder.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx), ModelBuilderSSP.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx),
ctx.seed()) ctx.seed())
} }

View File

@ -1,7 +1,7 @@
package com.databricks.spark.sql.perf.mllib.classification package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer} import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.Vectors
@ -28,7 +28,7 @@ object LinearSVC extends BenchmarkAlgorithm
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data. // Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1) val intercept = 0.01 * (2 * rng.nextDouble - 1)
ModelBuilder.newLinearSVCModel(coefficients, intercept) ModelBuilderSSP.newLinearSVCModel(coefficients, intercept)
} }
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {

View File

@ -4,7 +4,7 @@ import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{Estimator, ModelBuilder, PipelineStage, Transformer} import org.apache.spark.ml.{Estimator, ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.Vectors
@ -28,7 +28,7 @@ object LogisticRegression extends BenchmarkAlgorithm
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data. // Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1) val intercept = 0.01 * (2 * rng.nextDouble - 1)
ModelBuilder.newLogisticRegressionModel(coefficients, intercept) ModelBuilderSSP.newLogisticRegressionModel(coefficients, intercept)
} }
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {

View File

@ -1,7 +1,7 @@
package com.databricks.spark.sql.perf.mllib.classification package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml import org.apache.spark.ml
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer} import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator} import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
@ -51,7 +51,7 @@ object NaiveBayes extends BenchmarkAlgorithm
// Initialize new Naive Bayes model // Initialize new Naive Bayes model
val pi = Vectors.dense(piArray) val pi = Vectors.dense(piArray)
val theta = new DenseMatrix(numClasses, numFeatures, thetaArray.flatten, true) val theta = new DenseMatrix(numClasses, numFeatures, thetaArray.flatten, true)
ModelBuilder.newNaiveBayesModel(pi, theta) ModelBuilderSSP.newNaiveBayesModel(pi, theta)
} }
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {

View File

@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib.regression
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer} import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib._
@ -30,7 +30,7 @@ object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data. // Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1) val intercept = 0.01 * (2 * rng.nextDouble - 1)
val m = ModelBuilder.newGLR(coefficients, intercept) val m = ModelBuilderSSP.newGLR(coefficients, intercept)
m.set(m.link, link.get) m.set(m.link, link.get)
m.set(m.family, family.get) m.set(m.family, family.get)
m m

View File

@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib.regression
import org.apache.spark.ml import org.apache.spark.ml
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer} import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib._
@ -29,7 +29,7 @@ object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data. // Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1) val intercept = 0.01 * (2 * rng.nextDouble - 1)
ModelBuilder.newLinearRegressionModel(coefficients, intercept) ModelBuilderSSP.newLinearRegressionModel(coefficients, intercept)
} }
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {

View File

@ -11,18 +11,24 @@ import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
/** /**
* Helper for creating MLlib models which have private constructors. * Helper for creating MLlib models which have private constructors.
*/ */
object ModelBuilder { object ModelBuilderSSP {
def newLogisticRegressionModel( def newLogisticRegressionModel(
coefficients: Vector, coefficients: Vector,
intercept: Double): LogisticRegressionModel = { intercept: Double): LogisticRegressionModel = {
new LogisticRegressionModel("lr", coefficients, intercept) new LogisticRegressionModel("lr", coefficients, intercept)
.setThreshold(.5)
} }
def newLinearRegressionModel( def newLinearRegressionModel(
coefficients: Vector, coefficients: Vector,
intercept: Double): LinearRegressionModel = { intercept: Double): LinearRegressionModel = {
new LinearRegressionModel("linr", coefficients, intercept) val model = new LinearRegressionModel("linr", coefficients, intercept)
if (model.hasParam("loss")) {
model.set(model.getParam("loss"), "squaredError")
}
model
} }
def newGLR( def newGLR(