From 3786a8391ebf67c3195316772aa6fe424070a3e1 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 18 May 2018 02:55:00 +0800 Subject: [PATCH] Quantile discretizer benchmark (#135) QuantileDiscretizer benchmark --- .../sql/perf/mllib/config/mllib-small.yaml | 5 +++ .../mllib/feature/QuantileDiscretizer.scala | 37 ++++++++++++++++ .../databricks/spark/sql/perf/results.scala | 44 ++++++++++++++----- 3 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 src/main/scala/com/databricks/spark/sql/perf/mllib/feature/QuantileDiscretizer.scala diff --git a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml index efbef73..8de1235 100644 --- a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml +++ b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-small.yaml @@ -97,6 +97,11 @@ benchmarks: params: numExamples: 100 featureArity: 10 + - name: feature.QuantileDiscretizer + params: + numExamples: 100 + bucketizerNumBuckets: 2 + relativeError: 0.001 - name: feature.StringIndexer params: numExamples: 100 diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/QuantileDiscretizer.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/QuantileDiscretizer.scala new file mode 100644 index 0000000..47674f7 --- /dev/null +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/QuantileDiscretizer.scala @@ -0,0 +1,37 @@ +package com.databricks.spark.sql.perf.mllib.feature + +import org.apache.spark.ml +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.PipelineStage +import org.apache.spark.sql._ + +import com.databricks.spark.sql.perf.mllib.OptionImplicits._ +import com.databricks.spark.sql.perf.mllib.data.DataGenerator +import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} + +/** Object for testing QuantileDiscretizer performance */ +object QuantileDiscretizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer { + + override def trainingDataSet(ctx: MLBenchContext): DataFrame = { + import ctx.params._ + import ctx.sqlContext.implicits._ + + DataGenerator.generateContinuousFeatures( + ctx.sqlContext, + numExamples, + ctx.seed(), + numPartitions, + 1 + ).rdd.map { case Row(vec: Vector) => + vec(0) // extract the single generated double value for each row + }.toDF(inputCol) + } + + override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { + import ctx.params._ + new ml.feature.QuantileDiscretizer() + .setInputCol(inputCol) + .setNumBuckets(bucketizerNumBuckets) + .setRelativeError(relativeError) + } +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/results.scala b/src/main/scala/com/databricks/spark/sql/perf/results.scala index 487d7a6..4ac99d7 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/results.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/results.scala @@ -137,6 +137,7 @@ class MLParams( val numUsers: Option[Int] = None, val optimizer: Option[String] = None, val regParam: Option[Double] = None, + val relativeError: Option[Double] = Some(0.001), val rank: Option[Int] = None, val smoothing: Option[Double] = None, val tol: Option[Double] = None, @@ -178,21 +179,42 @@ class MLParams( numInputCols: Option[Int] = numInputCols, numItems: Option[Int] = numItems, numUsers: Option[Int] = numUsers, - vocabSize: Option[Int] = vocabSize, optimizer: Option[String] = optimizer, regParam: Option[Double] = regParam, + relativeError: Option[Double] = relativeError, rank: Option[Int] = rank, smoothing: Option[Double] = smoothing, - tol: Option[Double] = tol): MLParams = { - new MLParams(randomSeed = randomSeed, numExamples = numExamples, - numTestExamples = numTestExamples, numPartitions = numPartitions, - bucketizerNumBuckets = bucketizerNumBuckets, depth = depth, docLength = docLength, - elasticNetParam = elasticNetParam, family = family, featureArity = featureArity, - itemSetSize = itemSetSize, k = k, link = link, maxIter = maxIter, - numClasses = numClasses, numFeatures = numFeatures, numHashTables = numHashTables, - numInputCols = numInputCols, numItems = numItems, numSynonymsToFind = numSynonymsToFind, - numUsers = numUsers, optimizer = optimizer, regParam = regParam, - rank = rank, smoothing = smoothing, tol = tol, vocabSize = vocabSize) + tol: Option[Double] = tol, + vocabSize: Option[Int] = vocabSize): MLParams = { + new MLParams( + randomSeed = randomSeed, + numExamples = numExamples, + numTestExamples = numTestExamples, + numPartitions = numPartitions, + bucketizerNumBuckets = bucketizerNumBuckets, + depth = depth, + docLength = docLength, + elasticNetParam = elasticNetParam, + family = family, + featureArity = featureArity, + itemSetSize = itemSetSize, + k = k, + link = link, + maxIter = maxIter, + numClasses = numClasses, + numFeatures = numFeatures, + numHashTables = numHashTables, + numSynonymsToFind = numSynonymsToFind, + numInputCols = numInputCols, + numItems = numItems, + numUsers = numUsers, + optimizer = optimizer, + regParam = regParam, + relativeError = relativeError, + rank = rank, + smoothing = smoothing, + tol = tol, + vocabSize = vocabSize) } }