MinHashLSH and BucketedRandomProjectionLSH benchmark #128

Adds benchmarks for MinHashLSH and BucketedRandomProjectionLSH.

Future questions:
* Whether we should improve test data generation for MinHashLSH (and add more control parameters, such as the max/min number of elements in each input set; a hypothetical sketch follows this list)
* Whether we should add benchmarks for approxNearestNeighbors and approxSimilarityJoin
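A rough sketch of what such a generator control could look like. This is hypothetical: the object name `SetSizedDataGenerator`, the method, and the `minSetSize`/`maxSetSize` parameters are illustrative and not part of this commit or of DataGenerator.

import scala.util.Random

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, SQLContext}

// Hypothetical generator: each row is a sparse binary vector whose number of
// non-zero elements is drawn uniformly from [minSetSize, maxSetSize].
object SetSizedDataGenerator {

  def generateBinarySets(
      sqlContext: SQLContext,
      numExamples: Long,
      seed: Long,
      numPartitions: Int,
      numFeatures: Int,
      minSetSize: Int,
      maxSetSize: Int): DataFrame = {
    require(1 <= minSetSize && minSetSize <= maxSetSize && maxSetSize <= numFeatures)
    import sqlContext.implicits._
    sqlContext.sparkContext
      .parallelize(0L until numExamples, numPartitions)
      .map { i =>
        val rng = new Random(seed ^ i)
        val size = minSetSize + rng.nextInt(maxSetSize - minSetSize + 1)
        // Pick `size` distinct indices; MinHashLSH reads non-zero entries as
        // set membership, so all values are 1.0.
        val indices = rng.shuffle((0 until numFeatures).toList).take(size).sorted
        Tuple1(Vectors.sparse(numFeatures, indices.toArray, Array.fill(size)(1.0)))
      }
      .toDF("features")
  }
}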
WeichenXu 2018-03-03 07:21:37 +08:00 committed by jkbradley
parent 6d01ac94a1
commit 93a34553f0
4 changed files with 85 additions and 2 deletions

New file: BucketedRandomProjectionLSH.scala

@@ -0,0 +1,35 @@
package com.databricks.spark.sql.perf.mllib.feature

import org.apache.spark.ml
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._

import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator

/** Object for testing BucketedRandomProjectionLSH performance */
object BucketedRandomProjectionLSH extends BenchmarkAlgorithm with TestFromTraining {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    val df = DataGenerator.generateContinuousFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      numFeatures
    )
    df
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.feature.BucketedRandomProjectionLSH()
      .setInputCol("features")
      .setNumHashTables(numHashTables)
      // bucketLength has no default in Spark ML and fit() throws if it is
      // unset; the value here is illustrative.
      .setBucketLength(10.0)
  }
}
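For reference, a minimal standalone sketch (not part of this commit) of how such a stage behaves once fit, assuming a running SparkSession named `spark`, e.g. in spark-shell:

import org.apache.spark.ml.feature.BucketedRandomProjectionLSH
import org.apache.spark.ml.linalg.Vectors

val df = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(1.0, 1.0)),
  Tuple1(Vectors.dense(1.0, -1.0)),
  Tuple1(Vectors.dense(-1.0, -1.0))
)).toDF("features")

val brp = new BucketedRandomProjectionLSH()
  .setInputCol("features")
  .setOutputCol("hashes")
  .setNumHashTables(3)
  .setBucketLength(2.0)  // width of each quantization bucket

// fit() draws the random projection directions; transform() appends an
// Array[Vector] column with one hash value per hash table.
val model = brp.fit(df)
model.transform(df).show(false)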

New file: MinHashLSH.scala

@@ -0,0 +1,35 @@
package com.databricks.spark.sql.perf.mllib.feature

import org.apache.spark.ml
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql._

import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}

/** Object for testing MinHashLSH performance */
object MinHashLSH extends BenchmarkAlgorithm with TestFromTraining {

  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
    import ctx.params._
    val df = DataGenerator.generateMixedFeatures(
      ctx.sqlContext,
      numExamples,
      ctx.seed(),
      numPartitions,
      Array.fill(numFeatures)(2)
    )
    df
  }

  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
    import ctx.params._
    new ml.feature.MinHashLSH()
      .setInputCol("features")
      .setNumHashTables(numHashTables)
  }
}
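And a minimal sketch (again not part of this commit, assuming a SparkSession `spark`) of the approxSimilarityJoin call mentioned in the future questions above:

import org.apache.spark.ml.feature.MinHashLSH
import org.apache.spark.ml.linalg.Vectors

// Binary vectors: MinHashLSH treats non-zero entries as set membership and
// requires at least one non-zero entry per row.
val df = spark.createDataFrame(Seq(
  (0, Vectors.sparse(6, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
  (1, Vectors.sparse(6, Seq((2, 1.0), (3, 1.0), (4, 1.0)))),
  (2, Vectors.sparse(6, Seq((0, 1.0), (2, 1.0), (4, 1.0))))
)).toDF("id", "features")

val model = new MinHashLSH()
  .setInputCol("features")
  .setOutputCol("hashes")
  .setNumHashTables(1)
  .fit(df)

// Self-join returning row pairs whose estimated Jaccard distance is < 0.8.
model.approxSimilarityJoin(df, df, 0.8, distCol = "jaccardDist")
  .select("datasetA.id", "datasetB.id", "jaccardDist")
  .show()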

MLParams.scala

@@ -130,6 +130,7 @@ class MLParams(
    val maxIter: Option[Int] = None,
    val numClasses: Option[Int] = None,
    val numFeatures: Option[Int] = None,
+   val numHashTables: Option[Int] = Some(1),
    val numInputCols: Option[Int] = None,
    val numItems: Option[Int] = None,
    val numUsers: Option[Int] = None,
@@ -171,6 +172,7 @@ class MLParams(
    maxIter: Option[Int] = maxIter,
    numClasses: Option[Int] = numClasses,
    numFeatures: Option[Int] = numFeatures,
+   numHashTables: Option[Int] = numHashTables,
    numInputCols: Option[Int] = numInputCols,
    numItems: Option[Int] = numItems,
    numUsers: Option[Int] = numUsers,
@@ -185,8 +187,9 @@ class MLParams(
    bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength,
    elasticNetParam = elasticNetParam, family = family, featureArity = featureArity,
    itemSetSize = itemSetSize, k = k, link = link, maxIter = maxIter,
-   numClasses = numClasses, numFeatures = numFeatures, numInputCols = numInputCols,
-   numItems = numItems, numUsers = numUsers, optimizer = optimizer, regParam = regParam,
+   numClasses = numClasses, numFeatures = numFeatures, numHashTables = numHashTables,
+   numInputCols = numInputCols, numItems = numItems, numUsers = numUsers,
+   optimizer = optimizer, regParam = regParam,
    rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize)
  }
}

Benchmark configuration (YAML)

@@ -74,6 +74,11 @@ benchmarks:
    optimizer:
      - em
      - online
+- name: feature.BucketedRandomProjectionLSH
+  params:
+    numExamples: 100
+    numFeatures: 10
+    numHashTables: 1
 - name: feature.Bucketizer
   params:
     numExamples: 100
@@ -83,6 +88,11 @@ benchmarks:
     numExamples: 100
     docLength: 20
     vocabSize: 4
+- name: feature.MinHashLSH
+  params:
+    numExamples: 100
+    numFeatures: 10
+    numHashTables: 1
 - name: feature.OneHotEncoder
   params:
     numExamples: 100
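The entries above use smoke-test sizes (numExamples: 100). A hypothetical scaled-up entry in the same YAML conventions, with illustrative values; assuming list-valued params are expanded into one run per value, as the optimizer entry above suggests, this would sweep numHashTables:

- name: feature.MinHashLSH
  params:
    numExamples: 1000000
    numFeatures: 1000
    numHashTables:
      - 1
      - 3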