[ML-3342] Bug fixes to make mllib benchmarks work with dbr-4.0. (#125)
In Spark 2.3, some default param values were moved from Models to their matching Estimators. I added explicit sets for these values in our tests to avoid errors. Also renamed ModelBuilder to ModelBuilderSSP to avoid a name conflict with dbml-local, which is included in the Databricks runtime.
This commit is contained in:
parent
91604a3ab0
commit
6d01ac94a1
@ -1,6 +1,6 @@
|
||||
package com.databricks.spark.sql.perf.mllib
|
||||
|
||||
import org.apache.spark.ml.{ModelBuilder, Transformer, TreeUtils}
|
||||
import org.apache.spark.ml.{ModelBuilderSSP, Transformer, TreeUtils}
|
||||
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator,
|
||||
RegressionEvaluator}
|
||||
import org.apache.spark.sql.DataFrame
|
||||
@ -31,7 +31,7 @@ private[mllib] trait TreeOrForestClassifier extends TreeOrForestEstimator {
|
||||
}
|
||||
|
||||
override protected def trueModel(ctx: MLBenchContext): Transformer = {
|
||||
ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
|
||||
ModelBuilderSSP.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses,
|
||||
TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed())
|
||||
}
|
||||
}
|
||||
@ -45,7 +45,7 @@ private[mllib] trait TreeOrForestRegressor extends TreeOrForestEstimator {
|
||||
}
|
||||
|
||||
override protected def trueModel(ctx: MLBenchContext): Transformer = {
|
||||
ModelBuilder.newDecisionTreeRegressionModel(ctx.params.depth,
|
||||
ModelBuilderSSP.newDecisionTreeRegressionModel(ctx.params.depth,
|
||||
TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed())
|
||||
}
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
package com.databricks.spark.sql.perf.mllib.classification
|
||||
|
||||
import org.apache.spark.ml.classification.GBTClassifier
|
||||
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
|
||||
|
||||
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||
import com.databricks.spark.sql.perf.mllib._
|
||||
@ -14,7 +14,7 @@ object GBTClassification extends BenchmarkAlgorithm with TreeOrForestClassifier
|
||||
import ctx.params._
|
||||
// We add +1 to the depth to make it more likely that many iterations of boosting are needed
|
||||
// to model the true tree.
|
||||
ModelBuilder.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx),
|
||||
ModelBuilderSSP.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx),
|
||||
ctx.seed())
|
||||
}
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
package com.databricks.spark.sql.perf.mllib.classification
|
||||
|
||||
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
|
||||
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml
|
||||
import org.apache.spark.ml.linalg.Vectors
|
||||
|
||||
@ -28,7 +28,7 @@ object LinearSVC extends BenchmarkAlgorithm
|
||||
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
|
||||
// Small intercept to prevent some skew in the data.
|
||||
val intercept = 0.01 * (2 * rng.nextDouble - 1)
|
||||
ModelBuilder.newLinearSVCModel(coefficients, intercept)
|
||||
ModelBuilderSSP.newLinearSVCModel(coefficients, intercept)
|
||||
}
|
||||
|
||||
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
|
||||
|
||||
@ -4,7 +4,7 @@ import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||
import com.databricks.spark.sql.perf.mllib._
|
||||
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
|
||||
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
|
||||
import org.apache.spark.ml.{Estimator, ModelBuilder, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.{Estimator, ModelBuilderSSP, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml
|
||||
import org.apache.spark.ml.linalg.Vectors
|
||||
|
||||
@ -28,7 +28,7 @@ object LogisticRegression extends BenchmarkAlgorithm
|
||||
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
|
||||
// Small intercept to prevent some skew in the data.
|
||||
val intercept = 0.01 * (2 * rng.nextDouble - 1)
|
||||
ModelBuilder.newLogisticRegressionModel(coefficients, intercept)
|
||||
ModelBuilderSSP.newLogisticRegressionModel(coefficients, intercept)
|
||||
}
|
||||
|
||||
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
package com.databricks.spark.sql.perf.mllib.classification
|
||||
|
||||
import org.apache.spark.ml
|
||||
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator}
|
||||
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
|
||||
|
||||
@ -51,7 +51,7 @@ object NaiveBayes extends BenchmarkAlgorithm
|
||||
// Initialize new Naive Bayes model
|
||||
val pi = Vectors.dense(piArray)
|
||||
val theta = new DenseMatrix(numClasses, numFeatures, thetaArray.flatten, true)
|
||||
ModelBuilder.newNaiveBayesModel(pi, theta)
|
||||
ModelBuilderSSP.newNaiveBayesModel(pi, theta)
|
||||
}
|
||||
|
||||
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
|
||||
|
||||
@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib.regression
|
||||
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
|
||||
import org.apache.spark.ml.linalg.Vectors
|
||||
import org.apache.spark.ml.regression.GeneralizedLinearRegression
|
||||
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
|
||||
|
||||
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||
import com.databricks.spark.sql.perf.mllib._
|
||||
@ -30,7 +30,7 @@ object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with
|
||||
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
|
||||
// Small intercept to prevent some skew in the data.
|
||||
val intercept = 0.01 * (2 * rng.nextDouble - 1)
|
||||
val m = ModelBuilder.newGLR(coefficients, intercept)
|
||||
val m = ModelBuilderSSP.newGLR(coefficients, intercept)
|
||||
m.set(m.link, link.get)
|
||||
m.set(m.family, family.get)
|
||||
m
|
||||
|
||||
@ -3,7 +3,7 @@ package com.databricks.spark.sql.perf.mllib.regression
|
||||
import org.apache.spark.ml
|
||||
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
|
||||
import org.apache.spark.ml.linalg.Vectors
|
||||
import org.apache.spark.ml.{ModelBuilder, PipelineStage, Transformer}
|
||||
import org.apache.spark.ml.{ModelBuilderSSP, PipelineStage, Transformer}
|
||||
|
||||
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||
import com.databricks.spark.sql.perf.mllib._
|
||||
@ -29,7 +29,7 @@ object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with
|
||||
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
|
||||
// Small intercept to prevent some skew in the data.
|
||||
val intercept = 0.01 * (2 * rng.nextDouble - 1)
|
||||
ModelBuilder.newLinearRegressionModel(coefficients, intercept)
|
||||
ModelBuilderSSP.newLinearRegressionModel(coefficients, intercept)
|
||||
}
|
||||
|
||||
override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
|
||||
|
||||
@ -11,18 +11,24 @@ import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
|
||||
/**
|
||||
* Helper for creating MLlib models which have private constructors.
|
||||
*/
|
||||
object ModelBuilder {
|
||||
object ModelBuilderSSP {
|
||||
|
||||
def newLogisticRegressionModel(
|
||||
coefficients: Vector,
|
||||
intercept: Double): LogisticRegressionModel = {
|
||||
new LogisticRegressionModel("lr", coefficients, intercept)
|
||||
.setThreshold(.5)
|
||||
|
||||
}
|
||||
|
||||
def newLinearRegressionModel(
|
||||
coefficients: Vector,
|
||||
intercept: Double): LinearRegressionModel = {
|
||||
new LinearRegressionModel("linr", coefficients, intercept)
|
||||
val model = new LinearRegressionModel("linr", coefficients, intercept)
|
||||
if (model.hasParam("loss")) {
|
||||
model.set(model.getParam("loss"), "squaredError")
|
||||
}
|
||||
model
|
||||
}
|
||||
|
||||
def newGLR(
|
||||
Loading…
Reference in New Issue
Block a user