From 9ab2a8bb14550da406a51a16f2e3a9f04f825971 Mon Sep 17 00:00:00 2001
From: ludatabricks <38018689+ludatabricks@users.noreply.github.com>
Date: Fri, 8 Jun 2018 12:06:52 -0700
Subject: [PATCH] [ML-3585] Added benchmarks to mllib-large.yaml for clustering
 (#149)

Benchmarks for clustering are added to mllib-large.yaml.
GaussianMixture, KMeans, and LDA are added. BisectingKMeans is currently missing from spark-sql-perf; this needs to be fixed in the follow-up JIRA: https://databricks.atlassian.net/browse/ML-3834
The parameters are based on the previous benchmarks for the Spark 2.2 QA.
---
 .../sql/perf/mllib/config/mllib-large.yaml    | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
index b9eb226..741359d 100644
--- a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
+++ b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml
@@ -37,6 +37,28 @@ benchmarks:
       numFeatures: 5000
       numClasses: 2
       smoothing: 1.0
+  - name: clustering.GaussianMixture
+    params:
+      numExamples: 100000
+      numTestExamples: 100000
+      numFeatures: 1000
+      k: 10
+      maxIter: 10
+      tol: 0.01
+  - name: clustering.KMeans
+    params:
+      k: 50
+      maxIter: 20
+      tol: 1e-3
+  - name: clustering.LDA
+    params:
+      docLength: 100
+      vocabSize: 5000
+      k: 60
+      maxIter: 20
+      optimizer:
+        - em
+        - online
   - name: recommendation.ALS
     params:
       numExamples: 50000000