diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/BucketedRandomProjectionLSH.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/BucketedRandomProjectionLSH.scala
new file mode 100644
index 0000000..2a27667
--- /dev/null
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/BucketedRandomProjectionLSH.scala
@@ -0,0 +1,35 @@
+package com.databricks.spark.sql.perf.mllib.feature
+
+import org.apache.spark.ml
+import org.apache.spark.ml.PipelineStage
+import org.apache.spark.sql._
+
+import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
+import com.databricks.spark.sql.perf.mllib.OptionImplicits._
+import com.databricks.spark.sql.perf.mllib.data.DataGenerator
+
+/** Object for testing BucketedRandomProjectionLSH performance */
+object BucketedRandomProjectionLSH extends BenchmarkAlgorithm with TestFromTraining {
+
+  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
+    import ctx.params._
+
+    val df = DataGenerator.generateContinuousFeatures(
+      ctx.sqlContext,
+      numExamples,
+      ctx.seed(),
+      numPartitions,
+      numFeatures
+    )
+    df
+  }
+
+  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
+    import ctx.params._
+
+    new ml.feature.BucketedRandomProjectionLSH()
+      .setInputCol("features")
+      .setNumHashTables(numHashTables)
+  }
+
+}
diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/MinHashLSH.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/MinHashLSH.scala
new file mode 100644
index 0000000..a68bc30
--- /dev/null
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/MinHashLSH.scala
@@ -0,0 +1,35 @@
+package com.databricks.spark.sql.perf.mllib.feature
+
+import org.apache.spark.ml
+import org.apache.spark.ml.PipelineStage
+import org.apache.spark.sql._
+
+import com.databricks.spark.sql.perf.mllib.OptionImplicits._
+import com.databricks.spark.sql.perf.mllib.data.DataGenerator
+import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
+
+/** Object for testing MinHashLSH performance */
+object MinHashLSH extends BenchmarkAlgorithm with TestFromTraining {
+
+  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
+    import ctx.params._
+
+    val df = DataGenerator.generateMixedFeatures(
+      ctx.sqlContext,
+      numExamples,
+      ctx.seed(),
+      numPartitions,
+      Array.fill(numFeatures)(2)
+    )
+    df
+  }
+
+  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
+    import ctx.params._
+
+    new ml.feature.MinHashLSH()
+      .setInputCol("features")
+      .setNumHashTables(numHashTables)
+  }
+
+}
diff --git a/src/main/scala/com/databricks/spark/sql/perf/results.scala b/src/main/scala/com/databricks/spark/sql/perf/results.scala
index a9f46c1..ab0fb24 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/results.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/results.scala
@@ -130,6 +130,7 @@ class MLParams(
     val maxIter: Option[Int] = None,
     val numClasses: Option[Int] = None,
     val numFeatures: Option[Int] = None,
+    val numHashTables: Option[Int] = Some(1),
     val numInputCols: Option[Int] = None,
     val numItems: Option[Int] = None,
     val numUsers: Option[Int] = None,
@@ -171,6 +172,7 @@ class MLParams(
      maxIter: Option[Int] = maxIter,
      numClasses: Option[Int] = numClasses,
      numFeatures: Option[Int] = numFeatures,
+     numHashTables: Option[Int] = numHashTables,
      numInputCols: Option[Int] = numInputCols,
      numItems: Option[Int] = numItems,
      numUsers: Option[Int] = numUsers,
@@ -185,8 +187,9 @@ class MLParams(
       bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
       elasticNetParam = elasticNetParam, family = family, featureArity = featureArity,
       itemSetSize = itemSetSize, k = k, link = link, maxIter = maxIter,
-      numClasses = numClasses, numFeatures = numFeatures, numInputCols = numInputCols,
-      numItems = numItems, numUsers = numUsers, optimizer = optimizer, regParam = regParam,
+      numClasses = numClasses, numFeatures = numFeatures, numHashTables = numHashTables,
+      numInputCols = numInputCols, numItems = numItems, numUsers = numUsers,
+      optimizer = optimizer, regParam = regParam,
       rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
   }
 }
diff --git a/src/main/scala/configs/mllib-small.yaml b/src/main/scala/configs/mllib-small.yaml
index d01f2be..9241be7 100644
--- a/src/main/scala/configs/mllib-small.yaml
+++ b/src/main/scala/configs/mllib-small.yaml
@@ -74,6 +74,11 @@ benchmarks:
     optimizer:
       - em
       - online
+  - name: feature.BucketedRandomProjectionLSH
+    params:
+      numExamples: 100
+      numFeatures: 10
+      numHashTables: 1
   - name: feature.Bucketizer
     params:
       numExamples: 100
@@ -83,6 +88,11 @@ benchmarks:
       numExamples: 100
       docLength: 20
       vocabSize: 4
+  - name: feature.MinHashLSH
+    params:
+      numExamples: 100
+      numFeatures: 10
+      numHashTables: 1
   - name: feature.OneHotEncoder
     params:
       numExamples: 100