diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLTransformerBenchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLTransformerBenchmarkable.scala index 61886fe..57c051b 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLTransformerBenchmarkable.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLTransformerBenchmarkable.scala @@ -50,7 +50,6 @@ class MLTransformerBenchmarkable( logger.info(s"$this: train: trainingSet=${trainingData.schema}") val estimator = test.getEstimator(param) estimator.fit(trainingData) - //test.train(param, trainingData) } logger.info(s"model: $model") val (_, scoreTraining) = measureTime { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala index dbccf3f..a6daf4b 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala @@ -1,7 +1,7 @@ package com.databricks.spark.sql.perf.mllib.clustering -import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} -import com.databricks.spark.sql.perf.mllib.OptionImplicits._ +import scala.collection.mutable.{HashMap => MHashMap} + import org.apache.commons.math3.random.Well19937c import org.apache.spark.ml.Estimator @@ -9,7 +9,10 @@ import org.apache.spark.ml import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.ml.linalg.{Vector, Vectors} -import scala.collection.mutable.{HashMap => MHashMap} + +import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} +import com.databricks.spark.sql.perf.mllib.OptionImplicits._ + object LDA extends BenchmarkAlgorithm with TestFromTraining { // The LDA model is package private, no need to expose it. @@ -51,4 +54,4 @@ object LDA extends BenchmarkAlgorithm with TestFromTraining { } // TODO(?) add a scoring method here. -} \ No newline at end of file +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/data/data_generation.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/data/dataGeneration.scala similarity index 89% rename from src/main/scala/com/databricks/spark/sql/perf/mllib/data/data_generation.scala rename to src/main/scala/com/databricks/spark/sql/perf/mllib/data/dataGeneration.scala index d34a828..2461fbc 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/data/data_generation.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/data/dataGeneration.scala @@ -21,6 +21,11 @@ object DataGenerator { sql.createDataFrame(rdd.map(Tuple1.apply)).toDF("features") } + /** + * Generate a mix of continuous and categorical features. + * @param featureArity Array of length numFeatures, where 0 indicates a continuous feature and + * a value > 0 indicates a categorical feature with that arity. + */ def generateMixedFeatures( sql: SQLContext, numExamples: Long, diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GLMRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GLMRegression.scala index 36d0e36..aea75fd 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GLMRegression.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GLMRegression.scala @@ -1,14 +1,14 @@ package com.databricks.spark.sql.perf.mllib.regression -import com.databricks.spark.sql.perf.mllib.OptionImplicits._ -import com.databricks.spark.sql.perf.mllib._ -import com.databricks.spark.sql.perf.mllib.data.DataGenerator - import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GeneralizedLinearRegression import org.apache.spark.ml.{Estimator, ModelBuilder, Transformer} +import com.databricks.spark.sql.perf.mllib.OptionImplicits._ +import com.databricks.spark.sql.perf.mllib._ +import com.databricks.spark.sql.perf.mllib.data.DataGenerator + object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator {