[ML-3342] Bug fixes to make mllib benchmarks work with dbr-4.0. (#125)

In Spark 2.3, some default param values were moved from Models to matching Estimators. I added explicit sets for these values in our tests to avoid errors. Also renamed ModelBuilder to ModelBuilderSSP to avoid a name conflict with dbml-local, which is included in Databricks Runtime.
This commit is contained in:
Bago Amirbekian 2018-03-02 09:12:38 -08:00 committed by jkbradley
parent 91604a3ab0
commit 6d01ac94a1
8 changed files with 23 additions and 17 deletions

View File

@ -1,6 +1,6 @@
package com.databricks.spark.sql.perf.mllib
import org.apache.spark.ml.{ModelBuilder, Transformer, TreeUtils}
import org.apache.spark.ml.{ModelBuilderSSP, Transformer, TreeUtils}
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator,
RegressionEvaluator}
import org.apache.spark.sql.DataFrame
@ -31,7 +31,7 @@ private[mllib] trait TreeOrForestClassifier extends TreeOrForestEstimator {
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
ModelBuilderSSP.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed())
}
}
@ -45,7 +45,7 @@ private[mllib] trait TreeOrForestRegressor extends TreeOrForestEstimator {
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
ModelBuilder.newDecisionTreeRegressionModel(ctx.params.depth,
ModelBuilderSSP.newDecisionTreeRegressionModel(ctx.params.depth,
TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed())
}

View File

@ -1,7 +1,7 @@
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.classification.GBTClassifier
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
@ -14,7 +14,7 @@ object GBTClassification extends BenchmarkAlgorithm with TreeOrForestClassifier
import ctx.params._
// We add +1 to the depth to make it more likely that many iterations of boosting are needed
// to model the true tree.
ModelBuilder.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx),
ModelBuilderSSP.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx),
ctx.seed())
}

View File

@ -1,7 +1,7 @@
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors
@ -28,7 +28,7 @@ object LinearSVC extends BenchmarkAlgorithm
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1)
ModelBuilder.newLinearSVCModel(coefficients, intercept)
ModelBuilderSSP.newLinearSVCModel(coefficients, intercept)
}
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {

View File

@ -4,7 +4,7 @@ import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.{Estimator, ModelBuilder, PipelineStage, Transformer}
import org.apache.spark.ml.{Estimator, ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml
import org.apache.spark.ml.linalg.Vectors
@ -28,7 +28,7 @@ object LogisticRegression extends BenchmarkAlgorithm
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1)
ModelBuilder.newLogisticRegressionModel(coefficients, intercept)
ModelBuilderSSP.newLogisticRegressionModel(coefficients, intercept)
}
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {

View File

@ -1,7 +1,7 @@
package com.databricks.spark.sql.perf.mllib.classification
import org.apache.spark.ml
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
@ -51,7 +51,7 @@ object NaiveBayes extends BenchmarkAlgorithm
// Initialize new Naive Bayes model
val pi = Vectors.dense(piArray)
val theta = new DenseMatrix(numClasses, numFeatures, thetaArray.flatten, true)
ModelBuilder.newNaiveBayesModel(pi, theta)
ModelBuilderSSP.newNaiveBayesModel(pi, theta)
}
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {

View File

@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib.regression
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
@ -30,7 +30,7 @@ object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1)
val m = ModelBuilder.newGLR(coefficients, intercept)
val m = ModelBuilderSSP.newGLR(coefficients, intercept)
m.set(m.link, link.get)
m.set(m.family, family.get)
m

View File

@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib.regression
import org.apache.spark.ml
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
@ -29,7 +29,7 @@ object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1)
ModelBuilder.newLinearRegressionModel(coefficients, intercept)
ModelBuilderSSP.newLinearRegressionModel(coefficients, intercept)
}
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {

View File

@ -11,18 +11,24 @@ import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
/**
* Helper for creating MLlib models which have private constructors.
*/
object ModelBuilder {
object ModelBuilderSSP {
def newLogisticRegressionModel(
coefficients: Vector,
intercept: Double): LogisticRegressionModel = {
new LogisticRegressionModel("lr", coefficients, intercept)
.setThreshold(.5)
}
def newLinearRegressionModel(
coefficients: Vector,
intercept: Double): LinearRegressionModel = {
new LinearRegressionModel("linr", coefficients, intercept)
val model = new LinearRegressionModel("linr", coefficients, intercept)
if (model.hasParam("loss")) {
model.set(model.getParam("loss"), "squaredError")
}
model
}
def newGLR(