From def20479a1c64f92821c66b9ad4748f9333ad3b5 Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 5 Jul 2016 13:42:56 -0700 Subject: [PATCH 1/3] linear regression --- .../mllib/regression/LinearRegression.scala | 45 +++++++++++++++++++ src/main/scala/configs/mllib-small.yaml | 6 +++ .../org/apache/spark/ml/ModelBuilder.scala | 9 +++- 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala new file mode 100644 index 0000000..4f81f4c --- /dev/null +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala @@ -0,0 +1,45 @@ +package com.databricks.spark.sql.perf.mllib.regression + +import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.regression.{LinearRegression, GeneralizedLinearRegression} +import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer} + +import com.databricks.spark.sql.perf.mllib.OptionImplicits._ +import com.databricks.spark.sql.perf.mllib._ +import com.databricks.spark.sql.perf.mllib.data.DataGenerator + + +object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with + TrainingSetFromTransformer with ScoringWithEvaluator { + + override protected def initialData(ctx: MLBenchContext) = { + import ctx.params._ + DataGenerator.generateContinuousFeatures( + ctx.sqlContext, + numExamples, + ctx.seed(), + numPartitions, + numFeatures) + } + + override protected def trueModel(ctx: MLBenchContext): Transformer = { + val rng = ctx.newGenerator() + val coefficients = + Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) + // Small intercept to prevent some skew in the data. + val intercept = 0.01 * (2 * rng.nextDouble - 1) + ModelBuilder.newLinearRegressionModel(coefficients, intercept) + } + + override def getEstimator(ctx: MLBenchContext): Estimator[_] = { + import ctx.params._ + new LinearRegression() + .setRegParam(regParam) + .setMaxIter(maxIter) + .setTol(tol) + } + + override protected def evaluator(ctx: MLBenchContext): Evaluator = + new RegressionEvaluator() +} diff --git a/src/main/scala/configs/mllib-small.yaml b/src/main/scala/configs/mllib-small.yaml index 6f1b023..8851bec 100644 --- a/src/main/scala/configs/mllib-small.yaml +++ b/src/main/scala/configs/mllib-small.yaml @@ -63,3 +63,9 @@ benchmarks: numClasses: 4 numFeatures: 5 maxIter: 3 + - name: classification.LogisticRegression + params: + numFeatures: 100 + regParam: 0.1 + tol: [0.2, 0.1] + maxIter: 10 diff --git a/src/main/scala/org/apache/spark/ml/ModelBuilder.scala b/src/main/scala/org/apache/spark/ml/ModelBuilder.scala index 7d0143c..ccddb57 100644 --- a/src/main/scala/org/apache/spark/ml/ModelBuilder.scala +++ b/src/main/scala/org/apache/spark/ml/ModelBuilder.scala @@ -2,8 +2,7 @@ package org.apache.spark.ml import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, LogisticRegressionModel} import org.apache.spark.ml.linalg.Vector -import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel -import org.apache.spark.ml.regression.DecisionTreeRegressionModel +import org.apache.spark.ml.regression.{LinearRegressionModel, GeneralizedLinearRegressionModel, DecisionTreeRegressionModel} import org.apache.spark.ml.tree._ import org.apache.spark.mllib.random.RandomDataGenerator import org.apache.spark.mllib.tree.impurity.ImpurityCalculator @@ -20,6 +19,12 @@ object ModelBuilder { new LogisticRegressionModel("lr", coefficients, intercept) } + def newLinearRegressionModel( + coefficients: Vector, + intercept: Double): LinearRegressionModel = { + new LinearRegressionModel("linr", coefficients, intercept) + } + def newGLR( coefficients: Vector, intercept: Double): GeneralizedLinearRegressionModel = From ce7e20ae6d5a85856f1b82c55581bbf586aca9ae Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 5 Jul 2016 13:46:19 -0700 Subject: [PATCH 2/3] set the solver --- .../spark/sql/perf/mllib/regression/LinearRegression.scala | 5 +++-- src/main/scala/configs/mllib-small.yaml | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala index 4f81f4c..8acbb51 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala @@ -1,8 +1,8 @@ package com.databricks.spark.sql.perf.mllib.regression +import org.apache.spark.ml import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.regression.{LinearRegression, GeneralizedLinearRegression} import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ @@ -34,7 +34,8 @@ object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with override def getEstimator(ctx: MLBenchContext): Estimator[_] = { import ctx.params._ - new LinearRegression() + new ml.regression.LinearRegression() + .setSolver("l-bfgs") .setRegParam(regParam) .setMaxIter(maxIter) .setTol(tol) diff --git a/src/main/scala/configs/mllib-small.yaml b/src/main/scala/configs/mllib-small.yaml index 8851bec..f9896f0 100644 --- a/src/main/scala/configs/mllib-small.yaml +++ b/src/main/scala/configs/mllib-small.yaml @@ -63,9 +63,9 @@ benchmarks: numClasses: 4 numFeatures: 5 maxIter: 3 - - name: classification.LogisticRegression + - name: regression.LinearRegression params: numFeatures: 100 regParam: 0.1 - tol: [0.2, 0.1] + tol: [0.0] maxIter: 10 From 40e97ca3c05ca3f3d135d297a7b23a80e3e6ccea Mon Sep 17 00:00:00 2001 From: Timothy Hunter Date: Tue, 5 Jul 2016 15:01:50 -0700 Subject: [PATCH 3/3] comment --- src/main/scala/configs/mllib-small.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/configs/mllib-small.yaml b/src/main/scala/configs/mllib-small.yaml index f9896f0..24408f8 100644 --- a/src/main/scala/configs/mllib-small.yaml +++ b/src/main/scala/configs/mllib-small.yaml @@ -65,6 +65,8 @@ benchmarks: maxIter: 3 - name: regression.LinearRegression params: + numExamples: 100 + numTestExamples: 100 numFeatures: 100 regParam: 0.1 tol: [0.0]