From 85bbfd4ca22b386b3216af5434b44ad3b6a32b58 Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Fri, 9 Nov 2018 16:21:22 -0800 Subject: [PATCH] [ML-5437] Build with spark-2.4.0 and resolve build issues (#174) We made some changes related to new APIs in spark2.4. These APIs were reverted because they were breaking changes, so we need to revert our changes. --- build.sbt | 4 +- .../org/apache/spark/ml/ModelBuilderSSP.scala | 53 +++---------------- .../spark/sql/perf/mllib/MLLibSuite.scala | 16 ++++++ 3 files changed, 25 insertions(+), 48 deletions(-) diff --git a/build.sbt b/build.sbt index 654f155..31b4f50 100644 --- a/build.sbt +++ b/build.sbt @@ -14,7 +14,7 @@ sparkPackageName := "databricks/spark-sql-perf" // All Spark Packages need a license licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")) -sparkVersion := "2.4.0-SNAPSHOT" +sparkVersion := "2.4.0" sparkComponents ++= Seq("sql", "hive", "mllib") @@ -44,8 +44,6 @@ libraryDependencies += "org.yaml" % "snakeyaml" % "1.17" libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2" -resolvers += "Apache Development Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots" - fork := true // Your username to login to Databricks Cloud diff --git a/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala b/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala index 39cebc0..511b645 100644 --- a/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala +++ b/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala @@ -45,7 +45,6 @@ object ModelBuilderSSP { s" but was given $numClasses") val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = numClasses, featureArity = featureArity, seed = seed) - .asInstanceOf[ClassificationNode] new DecisionTreeClassificationModel(rootNode, numFeatures = featureArity.length, numClasses = numClasses) } @@ -56,7 +55,6 @@ object ModelBuilderSSP { seed: Long): 
DecisionTreeRegressionModel = { val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = 0, featureArity = featureArity, seed = seed) - .asInstanceOf[RegressionNode] new DecisionTreeRegressionModel(rootNode, numFeatures = featureArity.length) } @@ -167,45 +165,12 @@ object TreeBuilder { ImpurityCalculator.getCalculator("gini", Array.fill[Double](labelType)(0.0)) } - randomBalancedDecisionTreeHelper(isRegression, depth, featureArity, impurityCalculator, + randomBalancedDecisionTreeHelper(depth, featureArity, impurityCalculator, labelGenerator, Set.empty, rng) } - private def createLeafNode( - isRegression: Boolean, - prediction: Double, - impurity: Double, - impurityStats: ImpurityCalculator): LeafNode = { - if (isRegression) { - new RegressionLeafNode(prediction, impurity, impurityStats) - } else { - new ClassificationLeafNode(prediction, impurity, impurityStats) - } - } - - private def createInternalNode( - isRegression: Boolean, - prediction: Double, - impurity: Double, - gain: Double, - leftChild: Node, - rightChild: Node, - split: Split, - impurityStats: ImpurityCalculator): InternalNode = { - if (isRegression) { - new RegressionInternalNode(prediction, impurity, gain, - leftChild.asInstanceOf[RegressionNode], rightChild.asInstanceOf[RegressionNode], - split, impurityStats) - } else { - new ClassificationInternalNode(prediction, impurity, gain, - leftChild.asInstanceOf[ClassificationNode], rightChild.asInstanceOf[ClassificationNode], - split, impurityStats) - } - } - /** * Create an internal node. Either create the leaf nodes beneath it, or recurse as needed. - * @param isRegression Whether the tree is a regressor or not (classifier) * @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node. * @param featureArity Indicates feature type. Value 0 indicates continuous feature. 
* Other values >= 2 indicate a categorical feature, @@ -217,7 +182,6 @@ object TreeBuilder { * @return */ private def randomBalancedDecisionTreeHelper( - isRegression: Boolean, subtreeDepth: Int, featureArity: Array[Int], impurityCalculator: ImpurityCalculator, @@ -227,7 +191,7 @@ object TreeBuilder { if (subtreeDepth == 0) { // This case only happens for a depth 0 tree. - createLeafNode(isRegression, prediction = 0.0, impurity = 0.0, impurityCalculator) + return new LeafNode(prediction = 0.0, impurity = 0.0, impurityStats = impurityCalculator) } val numFeatures = featureArity.length @@ -257,20 +221,19 @@ object TreeBuilder { val (leftChild: Node, rightChild: Node) = if (subtreeDepth == 1) { // Add leaf nodes. Assign these jointly so they make different predictions. val predictions = labelGenerator.nextValue() - val leftChild = createLeafNode(isRegression, prediction = predictions._1, impurity = 0.0, + val leftChild = new LeafNode(prediction = predictions._1, impurity = 0.0, impurityStats = impurityCalculator) - val rightChild = createLeafNode(isRegression, prediction = predictions._2, impurity = 0.0, + val rightChild = new LeafNode(prediction = predictions._2, impurity = 0.0, impurityStats = impurityCalculator) (leftChild, rightChild) } else { - val leftChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity, + val leftChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity, impurityCalculator, labelGenerator, usedFeatures + feature, rng) - val rightChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity, + val rightChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity, impurityCalculator, labelGenerator, usedFeatures + feature, rng) (leftChild, rightChild) } - createInternalNode(isRegression, prediction = 0.0, impurity = 0.0, gain = 0.0, - leftChild = leftChild, rightChild = rightChild, split = split, - impurityStats = impurityCalculator) + new InternalNode(prediction = 
0.0, impurity = 0.0, gain = 0.0, leftChild = leftChild, + rightChild = rightChild, split = split, impurityStats = impurityCalculator) } } diff --git a/src/test/scala/com/databricks/spark/sql/perf/mllib/MLLibSuite.scala b/src/test/scala/com/databricks/spark/sql/perf/mllib/MLLibSuite.scala index 7e72f5e..377156c 100644 --- a/src/test/scala/com/databricks/spark/sql/perf/mllib/MLLibSuite.scala +++ b/src/test/scala/com/databricks/spark/sql/perf/mllib/MLLibSuite.scala @@ -2,18 +2,34 @@ package com.databricks.spark.sql.perf.mllib import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.apache.log4j.{Level, Logger} + import org.apache.spark.sql.{Row, SparkSession} class MLLibSuite extends FunSuite with BeforeAndAfterAll { private var sparkSession: SparkSession = _ + var savedLevels: Map[String, Level] = _ override def beforeAll(): Unit = { super.beforeAll() sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate() + + // Travis limits the size of the log file produced by a build. Because we do run a small + // version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the + // log level to ERROR, just for this suite, to avoid displeasing travis. + savedLevels = Seq("akka", "org", "com.databricks").map { name => + val logger = Logger.getLogger(name) + val curLevel = logger.getLevel + logger.setLevel(Level.ERROR) + name -> curLevel + }.toMap } override def afterAll(): Unit = { + savedLevels.foreach { case (name, level) => + Logger.getLogger(name).setLevel(level) + } try { if (sparkSession != null) { sparkSession.stop()