[ML-5437] Build with spark-2.4.0 and resolve build issues (#174)

We made some changes related to new APIs in Spark 2.4. These APIs were reverted because they were breaking changes, so we need to revert our changes.
This commit is contained in:
Bago Amirbekian 2018-11-09 16:21:22 -08:00 committed by Joseph Bradley
parent d44caec277
commit 85bbfd4ca2
3 changed files with 25 additions and 48 deletions

View File

@ -14,7 +14,7 @@ sparkPackageName := "databricks/spark-sql-perf"
// All Spark Packages need a license
licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
sparkVersion := "2.4.0-SNAPSHOT"
sparkVersion := "2.4.0"
sparkComponents ++= Seq("sql", "hive", "mllib")
@ -44,8 +44,6 @@ libraryDependencies += "org.yaml" % "snakeyaml" % "1.17"
libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2"
resolvers += "Apache Development Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots"
fork := true
// Your username to login to Databricks Cloud

View File

@ -45,7 +45,6 @@ object ModelBuilderSSP {
s" but was given $numClasses")
val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = numClasses,
featureArity = featureArity, seed = seed)
.asInstanceOf[ClassificationNode]
new DecisionTreeClassificationModel(rootNode, numFeatures = featureArity.length,
numClasses = numClasses)
}
@ -56,7 +55,6 @@ object ModelBuilderSSP {
seed: Long): DecisionTreeRegressionModel = {
val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = 0,
featureArity = featureArity, seed = seed)
.asInstanceOf[RegressionNode]
new DecisionTreeRegressionModel(rootNode, numFeatures = featureArity.length)
}
@ -167,45 +165,12 @@ object TreeBuilder {
ImpurityCalculator.getCalculator("gini", Array.fill[Double](labelType)(0.0))
}
randomBalancedDecisionTreeHelper(isRegression, depth, featureArity, impurityCalculator,
randomBalancedDecisionTreeHelper(depth, featureArity, impurityCalculator,
labelGenerator, Set.empty, rng)
}
private def createLeafNode(
isRegression: Boolean,
prediction: Double,
impurity: Double,
impurityStats: ImpurityCalculator): LeafNode = {
if (isRegression) {
new RegressionLeafNode(prediction, impurity, impurityStats)
} else {
new ClassificationLeafNode(prediction, impurity, impurityStats)
}
}
private def createInternalNode(
isRegression: Boolean,
prediction: Double,
impurity: Double,
gain: Double,
leftChild: Node,
rightChild: Node,
split: Split,
impurityStats: ImpurityCalculator): InternalNode = {
if (isRegression) {
new RegressionInternalNode(prediction, impurity, gain,
leftChild.asInstanceOf[RegressionNode], rightChild.asInstanceOf[RegressionNode],
split, impurityStats)
} else {
new ClassificationInternalNode(prediction, impurity, gain,
leftChild.asInstanceOf[ClassificationNode], rightChild.asInstanceOf[ClassificationNode],
split, impurityStats)
}
}
/**
* Create an internal node. Either create the leaf nodes beneath it, or recurse as needed.
* @param isRegression Whether the tree is a regressor or not (classifier)
* @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node.
* @param featureArity Indicates feature type. Value 0 indicates continuous feature.
* Other values >= 2 indicate a categorical feature,
@ -217,7 +182,6 @@ object TreeBuilder {
* @return
*/
private def randomBalancedDecisionTreeHelper(
isRegression: Boolean,
subtreeDepth: Int,
featureArity: Array[Int],
impurityCalculator: ImpurityCalculator,
@ -227,7 +191,7 @@ object TreeBuilder {
if (subtreeDepth == 0) {
// This case only happens for a depth 0 tree.
createLeafNode(isRegression, prediction = 0.0, impurity = 0.0, impurityCalculator)
return new LeafNode(prediction = 0.0, impurity = 0.0, impurityStats = impurityCalculator)
}
val numFeatures = featureArity.length
@ -257,20 +221,19 @@ object TreeBuilder {
val (leftChild: Node, rightChild: Node) = if (subtreeDepth == 1) {
// Add leaf nodes. Assign these jointly so they make different predictions.
val predictions = labelGenerator.nextValue()
val leftChild = createLeafNode(isRegression, prediction = predictions._1, impurity = 0.0,
val leftChild = new LeafNode(prediction = predictions._1, impurity = 0.0,
impurityStats = impurityCalculator)
val rightChild = createLeafNode(isRegression, prediction = predictions._2, impurity = 0.0,
val rightChild = new LeafNode(prediction = predictions._2, impurity = 0.0,
impurityStats = impurityCalculator)
(leftChild, rightChild)
} else {
val leftChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity,
val leftChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity,
impurityCalculator, labelGenerator, usedFeatures + feature, rng)
val rightChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity,
val rightChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity,
impurityCalculator, labelGenerator, usedFeatures + feature, rng)
(leftChild, rightChild)
}
createInternalNode(isRegression, prediction = 0.0, impurity = 0.0, gain = 0.0,
leftChild = leftChild, rightChild = rightChild, split = split,
impurityStats = impurityCalculator)
new InternalNode(prediction = 0.0, impurity = 0.0, gain = 0.0, leftChild = leftChild,
rightChild = rightChild, split = split, impurityStats = impurityCalculator)
}
}

View File

@ -2,18 +2,34 @@ package com.databricks.spark.sql.perf.mllib
import org.scalatest.{BeforeAndAfterAll, FunSuite}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
class MLLibSuite extends FunSuite with BeforeAndAfterAll {
private var sparkSession: SparkSession = _
var savedLevels: Map[String, Level] = _
override def beforeAll(): Unit = {
super.beforeAll()
sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate()
// Travis limits the size of the log file produced by a build. Because we do run a small
// version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the
// log level to ERROR, just for this suite, to avoid displeasing travis.
savedLevels = Seq("akka", "org", "com.databricks").map { name =>
val logger = Logger.getLogger(name)
val curLevel = logger.getLevel
logger.setLevel(Level.ERROR)
name -> curLevel
}.toMap
}
override def afterAll(): Unit = {
savedLevels.foreach { case (name, level) =>
Logger.getLogger(name).setLevel(level)
}
try {
if (sparkSession != null) {
sparkSession.stop()