MinHashLSH and BucketedRandomProjectionLSH benchmark #128
Added MinHashLSH and BucketedRandomProjectionLSH benchmarks. Open questions: * whether to improve test-data generation for MinHashLSH (and add more control parameters, such as the max/min number of elements in each input set) * whether to add benchmarks for approxNearestNeighbors and approxSimilarityJoin
This commit is contained in:
parent
6d01ac94a1
commit
93a34553f0
@ -0,0 +1,35 @@
|
||||
package com.databricks.spark.sql.perf.mllib.feature
|
||||
|
||||
import org.apache.spark.ml
|
||||
import org.apache.spark.ml.PipelineStage
|
||||
import org.apache.spark.sql._
|
||||
|
||||
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
|
||||
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
|
||||
|
||||
/** Object for testing BucketedRandomProjectionLSH performance */
|
||||
/** Benchmark object measuring BucketedRandomProjectionLSH training performance. */
object BucketedRandomProjectionLSH extends BenchmarkAlgorithm with TestFromTraining {

  /**
   * Generates the DataFrame of continuous feature vectors that the LSH model is fit on.
   * Row count, partitioning, and dimensionality come from the benchmark context's params.
   */
  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  /** Builds the pipeline stage under test, configured from the benchmark params. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    // NOTE(review): bucketLength is a required BucketedRandomProjectionLSH param with no
    // default in Spark ML; fit() may fail unless it is set somewhere — confirm whether
    // MLParams should expose a bucketLength parameter.
    new ml.feature.BucketedRandomProjectionLSH()
      .setInputCol("features")
      .setNumHashTables(numHashTables)
  }
}
|
||||
@ -0,0 +1,35 @@
|
||||
package com.databricks.spark.sql.perf.mllib.feature
|
||||
|
||||
import org.apache.spark.ml
|
||||
import org.apache.spark.ml.PipelineStage
|
||||
import org.apache.spark.sql._
|
||||
|
||||
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
|
||||
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
|
||||
|
||||
/** Object for testing MinHashLSH performance */
|
||||
/** Benchmark object measuring MinHashLSH training performance. */
object MinHashLSH extends BenchmarkAlgorithm with TestFromTraining {

  /**
   * Generates the DataFrame the MinHash model is fit on.
   * Each of the numFeatures columns is generated with arity 2 — presumably yielding
   * binary {0, 1} features as a set representation; verify against DataGenerator.
   */
  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    val featureArities = Array.fill(numFeatures)(2)
    DataGenerator.generateMixedFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      featureArities)
  }

  /** Builds the pipeline stage under test, configured from the benchmark params. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.feature.MinHashLSH()
      .setInputCol("features")
      .setNumHashTables(numHashTables)
  }
}
|
||||
@ -130,6 +130,7 @@ class MLParams(
|
||||
val maxIter: Option[Int] = None,
|
||||
val numClasses: Option[Int] = None,
|
||||
val numFeatures: Option[Int] = None,
|
||||
val numHashTables: Option[Int] = Some(1),
|
||||
val numInputCols: Option[Int] = None,
|
||||
val numItems: Option[Int] = None,
|
||||
val numUsers: Option[Int] = None,
|
||||
@ -171,6 +172,7 @@ class MLParams(
|
||||
maxIter: Option[Int] = maxIter,
|
||||
numClasses: Option[Int] = numClasses,
|
||||
numFeatures: Option[Int] = numFeatures,
|
||||
numHashTables: Option[Int] = numHashTables,
|
||||
numInputCols: Option[Int] = numInputCols,
|
||||
numItems: Option[Int] = numItems,
|
||||
numUsers: Option[Int] = numUsers,
|
||||
@ -185,8 +187,9 @@ class MLParams(
|
||||
bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
|
||||
elasticNetParam = elasticNetParam, family = family, featureArity = featureArity,
|
||||
itemSetSize = itemSetSize, k = k, link = link, maxIter = maxIter,
|
||||
numClasses = numClasses, numFeatures = numFeatures, numInputCols = numInputCols,
|
||||
numItems = numItems, numUsers = numUsers, optimizer = optimizer, regParam = regParam,
|
||||
numClasses = numClasses, numFeatures = numFeatures, numHashTables = numHashTables,
|
||||
numInputCols = numInputCols, numItems = numItems, numUsers = numUsers,
|
||||
optimizer = optimizer, regParam = regParam,
|
||||
rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
|
||||
}
|
||||
}
|
||||
|
||||
@ -74,6 +74,11 @@ benchmarks:
|
||||
optimizer:
|
||||
- em
|
||||
- online
|
||||
- name: feature.BucketedRandomProjectionLSH
|
||||
params:
|
||||
numExamples: 100
|
||||
numFeatures: 10
|
||||
numHashTables: 1
|
||||
- name: feature.Bucketizer
|
||||
params:
|
||||
numExamples: 100
|
||||
@ -83,6 +88,11 @@ benchmarks:
|
||||
numExamples: 100
|
||||
docLength: 20
|
||||
vocabSize: 4
|
||||
- name: feature.MinHashLSH
|
||||
params:
|
||||
numExamples: 100
|
||||
numFeatures: 10
|
||||
numHashTables: 1
|
||||
- name: feature.OneHotEncoder
|
||||
params:
|
||||
numExamples: 100
|
||||
|
||||
Loading…
Reference in New Issue
Block a user