MinHashLSH and BucketedRandomProjectionLSH benchmark #128
Adds benchmarks for MinHashLSH and BucketedRandomProjectionLSH. Open questions: * whether to improve the test-data generation for MinHashLSH (e.g., add control parameters such as the minimum/maximum number of elements per input set) * whether to add benchmarks for approxNearestNeighbors and approxSimilarityJoin
This commit is contained in:
parent
6d01ac94a1
commit
93a34553f0
@ -0,0 +1,35 @@
|
|||||||
|
package com.databricks.spark.sql.perf.mllib.feature
|
||||||
|
|
||||||
|
import org.apache.spark.ml
|
||||||
|
import org.apache.spark.ml.PipelineStage
|
||||||
|
import org.apache.spark.sql._
|
||||||
|
|
||||||
|
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
|
||||||
|
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||||
|
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
|
||||||
|
|
||||||
|
/**
 * Benchmark entry for Spark ML's BucketedRandomProjectionLSH feature transformer.
 *
 * Training data and pipeline-stage configuration are driven entirely by the
 * benchmark parameters carried on the [[MLBenchContext]].
 */
object BucketedRandomProjectionLSH extends BenchmarkAlgorithm with TestFromTraining {

  /**
   * Generates the training DataFrame: `numExamples` rows of continuous
   * feature vectors of width `numFeatures`, split into `numPartitions`.
   */
  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures)
  }

  /** Builds the LSH stage to be fit on the training data. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.feature.BucketedRandomProjectionLSH()
      .setInputCol("features")
      .setNumHashTables(numHashTables)
      // BUG FIX: bucketLength is a required param with no default in Spark ML;
      // fitting without it throws at runtime. Use a fixed, reasonable value so
      // the benchmark can run.
      // TODO(review): consider exposing bucketLength as an MLParams field.
      .setBucketLength(2.0)
  }
}
|
||||||
@ -0,0 +1,35 @@
|
|||||||
|
package com.databricks.spark.sql.perf.mllib.feature
|
||||||
|
|
||||||
|
import org.apache.spark.ml
|
||||||
|
import org.apache.spark.ml.PipelineStage
|
||||||
|
import org.apache.spark.sql._
|
||||||
|
|
||||||
|
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
|
||||||
|
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
|
||||||
|
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
|
||||||
|
|
||||||
|
/**
 * Benchmark entry for Spark ML's MinHashLSH feature transformer.
 *
 * Data generation and stage configuration are controlled by the benchmark
 * parameters on the [[MLBenchContext]].
 */
object MinHashLSH extends BenchmarkAlgorithm with TestFromTraining {

  /** Generates the training DataFrame used to fit the MinHashLSH stage. */
  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    // Every feature has arity 2 — presumably yielding 0/1 values suitable as
    // set-membership input for MinHash; confirm against DataGenerator.
    val featureArities = Array.fill(numFeatures)(2)
    DataGenerator.generateMixedFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      featureArities)
  }

  /** Builds the MinHashLSH stage configured from the benchmark params. */
  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    val stage = new ml.feature.MinHashLSH()
    stage
      .setInputCol("features")
      .setNumHashTables(numHashTables)
  }
}
|
||||||
@ -130,6 +130,7 @@ class MLParams(
|
|||||||
val maxIter: Option[Int] = None,
|
val maxIter: Option[Int] = None,
|
||||||
val numClasses: Option[Int] = None,
|
val numClasses: Option[Int] = None,
|
||||||
val numFeatures: Option[Int] = None,
|
val numFeatures: Option[Int] = None,
|
||||||
|
val numHashTables: Option[Int] = Some(1),
|
||||||
val numInputCols: Option[Int] = None,
|
val numInputCols: Option[Int] = None,
|
||||||
val numItems: Option[Int] = None,
|
val numItems: Option[Int] = None,
|
||||||
val numUsers: Option[Int] = None,
|
val numUsers: Option[Int] = None,
|
||||||
@ -171,6 +172,7 @@ class MLParams(
|
|||||||
maxIter: Option[Int] = maxIter,
|
maxIter: Option[Int] = maxIter,
|
||||||
numClasses: Option[Int] = numClasses,
|
numClasses: Option[Int] = numClasses,
|
||||||
numFeatures: Option[Int] = numFeatures,
|
numFeatures: Option[Int] = numFeatures,
|
||||||
|
numHashTables: Option[Int] = numHashTables,
|
||||||
numInputCols: Option[Int] = numInputCols,
|
numInputCols: Option[Int] = numInputCols,
|
||||||
numItems: Option[Int] = numItems,
|
numItems: Option[Int] = numItems,
|
||||||
numUsers: Option[Int] = numUsers,
|
numUsers: Option[Int] = numUsers,
|
||||||
@ -185,8 +187,9 @@ class MLParams(
|
|||||||
bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
|
bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
|
||||||
elasticNetParam = elasticNetParam, family = family, featureArity = featureArity,
|
elasticNetParam = elasticNetParam, family = family, featureArity = featureArity,
|
||||||
itemSetSize = itemSetSize, k = k, link = link, maxIter = maxIter,
|
itemSetSize = itemSetSize, k = k, link = link, maxIter = maxIter,
|
||||||
numClasses = numClasses, numFeatures = numFeatures, numInputCols = numInputCols,
|
numClasses = numClasses, numFeatures = numFeatures, numHashTables = numHashTables,
|
||||||
numItems = numItems, numUsers = numUsers, optimizer = optimizer, regParam = regParam,
|
numInputCols = numInputCols, numItems = numItems, numUsers = numUsers,
|
||||||
|
optimizer = optimizer, regParam = regParam,
|
||||||
rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
|
rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -74,6 +74,11 @@ benchmarks:
|
|||||||
optimizer:
|
optimizer:
|
||||||
- em
|
- em
|
||||||
- online
|
- online
|
||||||
|
- name: feature.BucketedRandomProjectionLSH
|
||||||
|
params:
|
||||||
|
numExamples: 100
|
||||||
|
numFeatures: 10
|
||||||
|
numHashTables: 1
|
||||||
- name: feature.Bucketizer
|
- name: feature.Bucketizer
|
||||||
params:
|
params:
|
||||||
numExamples: 100
|
numExamples: 100
|
||||||
@ -83,6 +88,11 @@ benchmarks:
|
|||||||
numExamples: 100
|
numExamples: 100
|
||||||
docLength: 20
|
docLength: 20
|
||||||
vocabSize: 4
|
vocabSize: 4
|
||||||
|
- name: feature.MinHashLSH
|
||||||
|
params:
|
||||||
|
numExamples: 100
|
||||||
|
numFeatures: 10
|
||||||
|
numHashTables: 1
|
||||||
- name: feature.OneHotEncoder
|
- name: feature.OneHotEncoder
|
||||||
params:
|
params:
|
||||||
numExamples: 100
|
numExamples: 100
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user