[ML-5437] Build with spark-2.4.0 and resolve build issues (#174)

We made some changes related to new APIs in Spark 2.4. These APIs were reverted because they were breaking changes, so we need to revert our changes.
This commit is contained in:
Bago Amirbekian 2018-11-09 16:21:22 -08:00 committed by Joseph Bradley
parent d44caec277
commit 85bbfd4ca2
3 changed files with 25 additions and 48 deletions

View File

@ -14,7 +14,7 @@ sparkPackageName := "databricks/spark-sql-perf"
// All Spark Packages need a license
licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
sparkVersion := "2.4.0-SNAPSHOT"
sparkVersion := "2.4.0"
sparkComponents ++= Seq("sql", "hive", "mllib")
@ -44,8 +44,6 @@ libraryDependencies += "org.yaml" % "snakeyaml" % "1.17"
libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2"
resolvers += "Apache Development Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots"
fork := true
// Your username to login to Databricks Cloud

View File

@ -45,7 +45,6 @@ object ModelBuilderSSP {
s" but was given $numClasses")
val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = numClasses,
featureArity = featureArity, seed = seed)
.asInstanceOf[ClassificationNode]
new DecisionTreeClassificationModel(rootNode, numFeatures = featureArity.length,
numClasses = numClasses)
}
@ -56,7 +55,6 @@ object ModelBuilderSSP {
seed: Long): DecisionTreeRegressionModel = {
val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = 0,
featureArity = featureArity, seed = seed)
.asInstanceOf[RegressionNode]
new DecisionTreeRegressionModel(rootNode, numFeatures = featureArity.length)
}
@ -167,45 +165,12 @@ object TreeBuilder {
ImpurityCalculator.getCalculator("gini", Array.fill[Double](labelType)(0.0))
}
randomBalancedDecisionTreeHelper(isRegression, depth, featureArity, impurityCalculator,
randomBalancedDecisionTreeHelper(depth, featureArity, impurityCalculator,
labelGenerator, Set.empty, rng)
}
private def createLeafNode(
isRegression: Boolean,
prediction: Double,
impurity: Double,
impurityStats: ImpurityCalculator): LeafNode = {
if (isRegression) {
new RegressionLeafNode(prediction, impurity, impurityStats)
} else {
new ClassificationLeafNode(prediction, impurity, impurityStats)
}
}
private def createInternalNode(
isRegression: Boolean,
prediction: Double,
impurity: Double,
gain: Double,
leftChild: Node,
rightChild: Node,
split: Split,
impurityStats: ImpurityCalculator): InternalNode = {
if (isRegression) {
new RegressionInternalNode(prediction, impurity, gain,
leftChild.asInstanceOf[RegressionNode], rightChild.asInstanceOf[RegressionNode],
split, impurityStats)
} else {
new ClassificationInternalNode(prediction, impurity, gain,
leftChild.asInstanceOf[ClassificationNode], rightChild.asInstanceOf[ClassificationNode],
split, impurityStats)
}
}
/**
* Create an internal node. Either create the leaf nodes beneath it, or recurse as needed.
* @param isRegression Whether the tree is a regressor or not (classifier)
* @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node.
* @param featureArity Indicates feature type. Value 0 indicates continuous feature.
* Other values >= 2 indicate a categorical feature,
@ -217,7 +182,6 @@ object TreeBuilder {
* @return
*/
private def randomBalancedDecisionTreeHelper(
isRegression: Boolean,
subtreeDepth: Int,
featureArity: Array[Int],
impurityCalculator: ImpurityCalculator,
@ -227,7 +191,7 @@ object TreeBuilder {
if (subtreeDepth == 0) {
// This case only happens for a depth 0 tree.
createLeafNode(isRegression, prediction = 0.0, impurity = 0.0, impurityCalculator)
return new LeafNode(prediction = 0.0, impurity = 0.0, impurityStats = impurityCalculator)
}
val numFeatures = featureArity.length
@ -257,20 +221,19 @@ object TreeBuilder {
val (leftChild: Node, rightChild: Node) = if (subtreeDepth == 1) {
// Add leaf nodes. Assign these jointly so they make different predictions.
val predictions = labelGenerator.nextValue()
val leftChild = createLeafNode(isRegression, prediction = predictions._1, impurity = 0.0,
val leftChild = new LeafNode(prediction = predictions._1, impurity = 0.0,
impurityStats = impurityCalculator)
val rightChild = createLeafNode(isRegression, prediction = predictions._2, impurity = 0.0,
val rightChild = new LeafNode(prediction = predictions._2, impurity = 0.0,
impurityStats = impurityCalculator)
(leftChild, rightChild)
} else {
val leftChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity,
val leftChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity,
impurityCalculator, labelGenerator, usedFeatures + feature, rng)
val rightChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity,
val rightChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity,
impurityCalculator, labelGenerator, usedFeatures + feature, rng)
(leftChild, rightChild)
}
createInternalNode(isRegression, prediction = 0.0, impurity = 0.0, gain = 0.0,
leftChild = leftChild, rightChild = rightChild, split = split,
impurityStats = impurityCalculator)
new InternalNode(prediction = 0.0, impurity = 0.0, gain = 0.0, leftChild = leftChild,
rightChild = rightChild, split = split, impurityStats = impurityCalculator)
}
}

View File

@ -2,18 +2,34 @@ package com.databricks.spark.sql.perf.mllib
import org.scalatest.{BeforeAndAfterAll, FunSuite}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
class MLLibSuite extends FunSuite with BeforeAndAfterAll {
private var sparkSession: SparkSession = _
var savedLevels: Map[String, Level] = _
override def beforeAll(): Unit = {
super.beforeAll()
sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate()
// Travis limits the size of the log file produced by a build. Because we do run a small
// version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the
// log level to ERROR, just for this suite, to avoid displeasing travis.
savedLevels = Seq("akka", "org", "com.databricks").map { name =>
val logger = Logger.getLogger(name)
val curLevel = logger.getLevel
logger.setLevel(Level.ERROR)
name -> curLevel
}.toMap
}
override def afterAll(): Unit = {
savedLevels.foreach { case (name, level) =>
Logger.getLogger(name).setLevel(level)
}
try {
if (sparkSession != null) {
sparkSession.stop()