From 51786921a671b06f2a16762a6e06bc694279e593 Mon Sep 17 00:00:00 2001
From: ludatabricks <38018689+ludatabricks@users.noreply.github.com>
Date: Tue, 12 Jun 2018 17:31:30 -0700
Subject: [PATCH] [ML-3583] Add benchmarks to mllib-large.yaml for featurization (#152)

Benchmark for featurization is added to mllib-large.yaml.
Cannot run QuantileDiscretizer with spark 2.3. Leave this as future work:
https://databricks.atlassian.net/browse/ML-3869
---
Notes (ignored by git am/apply):
    Line breaks and YAML indentation in this patch were reconstructed from a
    whitespace-collapsed copy. Hunk line counts match the @@ headers, but the
    exact leading whitespace of each YAML line is assumed (2-space nesting) --
    TODO confirm against the upstream commit before applying.

 .../sql/perf/mllib/config/mllib-large.yaml | 38 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
index 26a2970..1f4c3ee 100644
--- a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
+++ b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
@@ -59,6 +59,42 @@ benchmarks:
       optimizer:
         - em
         - online
+  - name: feature.BucketedRandomProjectionLSH
+    params:
+      numHashTables: 20
+  - name: feature.Bucketizer
+    params:
+      bucketizerNumBuckets: 20
+  - name: feature.HashingTF
+    params:
+      docLength: 2000
+      vocabSize: 200
+  - name: feature.MinHashLSH
+    params:
+      numFeatures: 10000
+      numHashTables: 10
+  - name: feature.OneHotEncoder
+    params:
+      featureArity: 10000
+  - name: feature.StringIndexer
+    params:
+      vocabSize: 10000
+  - name: feature.Tokenizer
+    params:
+      vocabSize: 2000
+      docLength: 10000
+  - name: feature.VectorAssembler
+    params:
+      numInputCols: 20
+  - name: feature.VectorSlicer
+    params:
+      numFeatures: 5000
+  - name: feature.Word2Vec
+    params:
+      numExamples: 10000
+      vocabSize: 100
+      docLength: 1000
+      numSynonymsToFind: 3
   - name: fpm.FPGrowth
     params:
       numItems: 10000
@@ -93,4 +129,4 @@ benchmarks:
   - name: regression.RandomForestRegression
     params:
       depth: 10
-      maxIter: 4
\ No newline at end of file
+      maxIter: 4