From 9ab2a8bb14550da406a51a16f2e3a9f04f825971 Mon Sep 17 00:00:00 2001 From: ludatabricks <38018689+ludatabricks@users.noreply.github.com> Date: Fri, 8 Jun 2018 12:06:52 -0700 Subject: [PATCH] [ML-3585] Added benchmarks to mllib-large.yaml for clustering (#149) Benchmarks for clustering are added to mllib-large.yaml. GaussianMixture, KMeans, and LDA are added. BisectingKMeans is currently missing from spark-sql-perf; this needs to be fixed in the follow-up JIRA: https://databricks.atlassian.net/browse/ML-3834 The parameters are based on the previous benchmarks for the Spark 2.2 QA. --- .../sql/perf/mllib/config/mllib-large.yaml | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml index b9eb226..741359d 100644 --- a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml +++ b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml @@ -37,6 +37,28 @@ benchmarks: numFeatures: 5000 numClasses: 2 smoothing: 1.0 + - name: clustering.GaussianMixture + params: + numExamples: 100000 + numTestExamples: 100000 + numFeatures: 1000 + k: 10 + maxIter: 10 + tol: 0.01 + - name: clustering.KMeans + params: + k: 50 + maxIter: 20 + tol: 1e-3 + - name: clustering.LDA + params: + docLength: 100 + vocabSize: 5000 + k: 60 + maxIter: 20 + optimizer: + - em + - online - name: recommendation.ALS params: numExamples: 50000000