From 107495afe279e2f89dfde8e5331a20b6bc46b2af Mon Sep 17 00:00:00 2001 From: Joseph Bradley Date: Mon, 9 Jul 2018 17:41:44 -0700 Subject: [PATCH] [ML-4069] Improve timing of estimators (#161) This gives the following running times: ``` recommendation.ALS 72.083s classification.DecisionTreeClassification 37.125s classification.DecisionTreeClassification 33.274s regression.DecisionTreeRegression 31.252s regression.DecisionTreeRegression 63.35s fpm.FPGrowth 6.219s fpm.FPGrowth 5.342s classification.GBTClassification 46.154s regression.GBTRegression 45.832s clustering.GaussianMixture 18.936s regression.GLMRegression 20.342s clustering.KMeans 32.473s clustering.LDA 44.574s clustering.LDA 24.658s classification.LinearSVC 39.84s regression.LinearRegression 43.335s classification.LogisticRegression 41.637s classification.LogisticRegression 37.711s classification.NaiveBayes 23.351s classification.RandomForestClassification 20.781s regression.RandomForestRegression 39.971s feature.Word2Vec 51.892s ``` --- .../sql/perf/mllib/config/mllib-large.yaml | 74 ++++++++++--------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml index 381b824..c6dea11 100644 --- a/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml +++ b/src/main/resources/com/databricks/spark/sql/perf/mllib/config/mllib-large.yaml @@ -9,53 +9,58 @@ common: benchmarks: - name: classification.DecisionTreeClassification params: - depth: [5, 10] + depth: [5, 12] numClasses: 4 - name: classification.GBTClassification params: - numFeatures: 3000 - depth: 5 + numFeatures: 1500 + depth: 4 numClasses: 4 - maxIter: 10 + maxIter: 8 - name: classification.RandomForestClassification params: depth: 10 + numFeatures: 1000 numClasses: 4 - maxIter: 200 # number of trees + maxIter: 100 - name: classification.LogisticRegression params: + numExamples: 700000 + numFeatures: 5000 + elasticNetParam: [0.0, 0.5] regParam: 0.01 tol: 0.0 - maxIter: 20 + maxIter: 10 - name: classification.LinearSVC params: + numExamples: 500000 regParam: 0.01 tol: 0 - maxIter: 20 + maxIter: 10 - name: classification.NaiveBayes params: + numExamples: 2000000 numFeatures: 5000 - numClasses: 2 + numClasses: 10 smoothing: 1.0 - name: clustering.GaussianMixture params: - numExamples: 100000 - numTestExamples: 100000 - numFeatures: 1000 - k: 10 - maxIter: 10 - tol: 0.01 + numFeatures: 30 + k: 15 + maxIter: 15 + tol: 0.0 - name: clustering.KMeans params: - k: 50 + k: 20 maxIter: 20 tol: 1e-3 - name: clustering.LDA params: + numExamples: 200000 docLength: 100 vocabSize: 5000 - k: 60 - maxIter: 20 + k: 20 + maxIter: 10 optimizer: - em - online @@ -93,45 +98,48 @@ benchmarks: params: numExamples: 10000 vocabSize: 100 - docLength: 1000 + docLength: 300 numSynonymsToFind: 3 - name: fpm.FPGrowth params: + numExamples: 10000000 numItems: 10000 - itemSetSize: [4, 10] + itemSetSize: [4, 14] - name: recommendation.ALS params: - numExamples: 50000000 + numExamples: 20000000 numTestExamples: 50000000 - numUsers: 6000000 - numItems: 6000000 + numUsers: 4000000 + numItems: 4000000 regParam: 0.01 rank: 10 - maxIter: 10 + maxIter: 8 - name: regression.DecisionTreeRegression params: - depth: [5, 10] + depth: [4, 7] - name: regression.GBTRegression params: - numFeatures: 2000 - depth: 5 - maxIter: 5 + numFeatures: 1000 + depth: 4 + maxIter: 8 - name: regression.GLMRegression params: - numExamples: 500000 + numExamples: 400000 numTestExamples: 500000 - numFeatures: 1000 + numFeatures: 500 link: log family: gaussian tol: 0.0 - maxIter: 10 + maxIter: 8 regParam: 0.1 - name: regression.LinearRegression params: + numFeatures: 5000 regParam: 0.01 tol: 0.0 - maxIter: 20 + maxIter: 9 - name: regression.RandomForestRegression params: - depth: 10 - maxIter: 4 + depth: 7 + numFeatures: 500 + maxIter: 16