From 5af9f6dfc20ba175c588b401b25de107b5c6bbbf Mon Sep 17 00:00:00 2001
From: WeichenXu
Date: Fri, 16 Mar 2018 04:10:04 +0800
Subject: [PATCH] Word2Vec benchmark (#127)

* init pr

* update

* use builtin split fun
---
Reviewer notes (placed after the "---" marker, so not part of the commit message):

  Word2Vec.scala
    trainingDataSet builds a DataFrame with DataGenerator.generateDoc, passing
    numExamples, ctx.seed(), numPartitions, vocabSize and docLength, with the
    generated column named "text". It then selects split(col("text"), " ")
    under the same name "text".
    getPipelineStage returns a new ml.feature.Word2Vec whose input column is
    "text"; no other parameter is set on it in this patch.

  mllib-small.yaml
    Registers a feature.Word2Vec entry with numExamples 100, vocabSize 100 and
    docLength 10.

  NOTE(review): the line breaks and leading whitespace of this patch were
  reconstructed from a copy whose newlines had been collapsed (2-space Scala
  and YAML indentation assumed) -- confirm against the upstream commit before
  applying. The author e-mail address is absent from the From: header.

 .../sql/perf/mllib/feature/Word2Vec.scala     | 34 +++++++++++++++++++
 src/main/scala/configs/mllib-small.yaml       |  5 +++
 2 files changed, 39 insertions(+)
 create mode 100644 src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Word2Vec.scala

diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Word2Vec.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Word2Vec.scala
new file mode 100644
index 0000000..ca30dcf
--- /dev/null
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Word2Vec.scala
@@ -0,0 +1,34 @@
+package com.databricks.spark.sql.perf.mllib.feature
+
+import org.apache.spark.ml
+import org.apache.spark.ml.PipelineStage
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.{col, split}
+
+import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining}
+import com.databricks.spark.sql.perf.mllib.OptionImplicits._
+import com.databricks.spark.sql.perf.mllib.data.DataGenerator
+
+/** Object for testing Word2Vec performance */
+object Word2Vec extends BenchmarkAlgorithm with TestFromTraining {
+
+  override def trainingDataSet(ctx: MLBenchContext): DataFrame = {
+    import ctx.params._
+
+    val df = DataGenerator.generateDoc(
+      ctx.sqlContext,
+      numExamples,
+      ctx.seed(),
+      numPartitions,
+      vocabSize,
+      docLength,
+      "text"
+    )
+    df.select(split(col("text"), " ").as("text"))
+  }
+
+  override def getPipelineStage(ctx: MLBenchContext): PipelineStage = {
+    new ml.feature.Word2Vec().setInputCol("text")
+  }
+
+}
diff --git a/src/main/scala/configs/mllib-small.yaml b/src/main/scala/configs/mllib-small.yaml
index 9241be7..b0392d5 100644
--- a/src/main/scala/configs/mllib-small.yaml
+++ b/src/main/scala/configs/mllib-small.yaml
@@ -115,6 +115,11 @@ benchmarks:
     params:
       numExamples: 100
       numFeatures: 10
+  - name: feature.Word2Vec
+    params:
+      numExamples: 100
+      vocabSize: 100
+      docLength: 10
   - name: recommendation.ALS
     params:
       numExamples: 100