From 85bbfd4ca22b386b3216af5434b44ad3b6a32b58 Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Fri, 9 Nov 2018 16:21:22 -0800 Subject: [PATCH] [ML-5437] Build with spark-2.4.0 and resolve build issues (#174) We made some changes related to new APIs in spark2.4. These APIs were reverted because they were breaking changes, so we need to revert our changes. --- build.sbt | 4 +- .../org/apache/spark/ml/ModelBuilderSSP.scala | 53 +++---------------- .../spark/sql/perf/mllib/MLLibSuite.scala | 16 ++++++ 3 files changed, 25 insertions(+), 48 deletions(-) diff --git a/build.sbt b/build.sbt index 654f155..31b4f50 100644 --- a/build.sbt +++ b/build.sbt @@ -14,7 +14,7 @@ sparkPackageName := "databricks/spark-sql-perf" // All Spark Packages need a license licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")) -sparkVersion := "2.4.0-SNAPSHOT" +sparkVersion := "2.4.0" sparkComponents ++= Seq("sql", "hive", "mllib") @@ -44,8 +44,6 @@ libraryDependencies += "org.yaml" % "snakeyaml" % "1.17" libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2" -resolvers += "Apache Development Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots" - fork := true // Your username to login to Databricks Cloud diff --git a/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala b/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala index 39cebc0..511b645 100644 --- a/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala +++ b/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala @@ -45,7 +45,6 @@ object ModelBuilderSSP { s" but was given $numClasses") val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = numClasses, featureArity = featureArity, seed = seed) - .asInstanceOf[ClassificationNode] new DecisionTreeClassificationModel(rootNode, numFeatures = featureArity.length, numClasses = numClasses) } @@ -56,7 +55,6 @@ object ModelBuilderSSP { seed: Long): 
DecisionTreeRegressionModel = { val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = 0, featureArity = featureArity, seed = seed) - .asInstanceOf[RegressionNode] new DecisionTreeRegressionModel(rootNode, numFeatures = featureArity.length) } @@ -167,45 +165,12 @@ object TreeBuilder { ImpurityCalculator.getCalculator("gini", Array.fill[Double](labelType)(0.0)) } - randomBalancedDecisionTreeHelper(isRegression, depth, featureArity, impurityCalculator, + randomBalancedDecisionTreeHelper(depth, featureArity, impurityCalculator, labelGenerator, Set.empty, rng) } - private def createLeafNode( - isRegression: Boolean, - prediction: Double, - impurity: Double, - impurityStats: ImpurityCalculator): LeafNode = { - if (isRegression) { - new RegressionLeafNode(prediction, impurity, impurityStats) - } else { - new ClassificationLeafNode(prediction, impurity, impurityStats) - } - } - - private def createInternalNode( - isRegression: Boolean, - prediction: Double, - impurity: Double, - gain: Double, - leftChild: Node, - rightChild: Node, - split: Split, - impurityStats: ImpurityCalculator): InternalNode = { - if (isRegression) { - new RegressionInternalNode(prediction, impurity, gain, - leftChild.asInstanceOf[RegressionNode], rightChild.asInstanceOf[RegressionNode], - split, impurityStats) - } else { - new ClassificationInternalNode(prediction, impurity, gain, - leftChild.asInstanceOf[ClassificationNode], rightChild.asInstanceOf[ClassificationNode], - split, impurityStats) - } - } - /** * Create an internal node. Either create the leaf nodes beneath it, or recurse as needed. - * @param isRegression Whether the tree is a regressor or not (classifier) * @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node. * @param featureArity Indicates feature type. Value 0 indicates continuous feature. 
* Other values >= 2 indicate a categorical feature, @@ -217,7 +182,6 @@ object TreeBuilder { * @return */ private def randomBalancedDecisionTreeHelper( - isRegression: Boolean, subtreeDepth: Int, featureArity: Array[Int], impurityCalculator: ImpurityCalculator, @@ -227,7 +191,7 @@ object TreeBuilder { if (subtreeDepth == 0) { // This case only happens for a depth 0 tree. - createLeafNode(isRegression, prediction = 0.0, impurity = 0.0, impurityCalculator) + return new LeafNode(prediction = 0.0, impurity = 0.0, impurityStats = impurityCalculator) } val numFeatures = featureArity.length @@ -257,20 +221,19 @@ object TreeBuilder { val (leftChild: Node, rightChild: Node) = if (subtreeDepth == 1) { // Add leaf nodes. Assign these jointly so they make different predictions. val predictions = labelGenerator.nextValue() - val leftChild = createLeafNode(isRegression, prediction = predictions._1, impurity = 0.0, + val leftChild = new LeafNode(prediction = predictions._1, impurity = 0.0, impurityStats = impurityCalculator) - val rightChild = createLeafNode(isRegression, prediction = predictions._2, impurity = 0.0, + val rightChild = new LeafNode(prediction = predictions._2, impurity = 0.0, impurityStats = impurityCalculator) (leftChild, rightChild) } else { - val leftChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity, + val leftChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity, impurityCalculator, labelGenerator, usedFeatures + feature, rng) - val rightChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity, + val rightChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity, impurityCalculator, labelGenerator, usedFeatures + feature, rng) (leftChild, rightChild) } - createInternalNode(isRegression, prediction = 0.0, impurity = 0.0, gain = 0.0, - leftChild = leftChild, rightChild = rightChild, split = split, - impurityStats = impurityCalculator) + new InternalNode(prediction = 
0.0, impurity = 0.0, gain = 0.0, leftChild = leftChild, + rightChild = rightChild, split = split, impurityStats = impurityCalculator) } } diff --git a/src/test/scala/com/databricks/spark/sql/perf/mllib/MLLibSuite.scala b/src/test/scala/com/databricks/spark/sql/perf/mllib/MLLibSuite.scala index 7e72f5e..377156c 100644 --- a/src/test/scala/com/databricks/spark/sql/perf/mllib/MLLibSuite.scala +++ b/src/test/scala/com/databricks/spark/sql/perf/mllib/MLLibSuite.scala @@ -2,18 +2,34 @@ package com.databricks.spark.sql.perf.mllib import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.apache.log4j.{Level, Logger} + import org.apache.spark.sql.{Row, SparkSession} class MLLibSuite extends FunSuite with BeforeAndAfterAll { private var sparkSession: SparkSession = _ + var savedLevels: Map[String, Level] = _ override def beforeAll(): Unit = { super.beforeAll() sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate() + + // Travis limits the size of the log file produced by a build. Because we do run a small + // version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the + // log level to ERROR, just for this suite, to avoid displeasing travis. + savedLevels = Seq("akka", "org", "com.databricks").map { name => + val logger = Logger.getLogger(name) + val curLevel = logger.getLevel + logger.setLevel(Level.ERROR) + name -> curLevel + }.toMap } override def afterAll(): Unit = { + savedLevels.foreach { case (name, level) => + Logger.getLogger(name).setLevel(level) + } try { if (sparkSession != null) { sparkSession.stop()