[ML-5437] Build with spark-2.4.0 and resolve build issues (#174)
We made some changes related to new APIs in Spark 2.4. These APIs were reverted upstream because they were breaking changes, so we need to revert our changes as well.
This commit is contained in:
parent
d44caec277
commit
85bbfd4ca2
@ -14,7 +14,7 @@ sparkPackageName := "databricks/spark-sql-perf"
|
||||
// All Spark Packages need a license
|
||||
licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
|
||||
|
||||
sparkVersion := "2.4.0-SNAPSHOT"
|
||||
sparkVersion := "2.4.0"
|
||||
|
||||
sparkComponents ++= Seq("sql", "hive", "mllib")
|
||||
|
||||
@ -44,8 +44,6 @@ libraryDependencies += "org.yaml" % "snakeyaml" % "1.17"
|
||||
|
||||
libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2"
|
||||
|
||||
resolvers += "Apache Development Snapshot Repository" at "https://repository.apache.org/content/repositories/snapshots"
|
||||
|
||||
fork := true
|
||||
|
||||
// Your username to login to Databricks Cloud
|
||||
|
||||
@ -45,7 +45,6 @@ object ModelBuilderSSP {
|
||||
s" but was given $numClasses")
|
||||
val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = numClasses,
|
||||
featureArity = featureArity, seed = seed)
|
||||
.asInstanceOf[ClassificationNode]
|
||||
new DecisionTreeClassificationModel(rootNode, numFeatures = featureArity.length,
|
||||
numClasses = numClasses)
|
||||
}
|
||||
@ -56,7 +55,6 @@ object ModelBuilderSSP {
|
||||
seed: Long): DecisionTreeRegressionModel = {
|
||||
val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = 0,
|
||||
featureArity = featureArity, seed = seed)
|
||||
.asInstanceOf[RegressionNode]
|
||||
new DecisionTreeRegressionModel(rootNode, numFeatures = featureArity.length)
|
||||
}
|
||||
|
||||
@ -167,45 +165,12 @@ object TreeBuilder {
|
||||
ImpurityCalculator.getCalculator("gini", Array.fill[Double](labelType)(0.0))
|
||||
}
|
||||
|
||||
randomBalancedDecisionTreeHelper(isRegression, depth, featureArity, impurityCalculator,
|
||||
randomBalancedDecisionTreeHelper(depth, featureArity, impurityCalculator,
|
||||
labelGenerator, Set.empty, rng)
|
||||
}
|
||||
|
||||
private def createLeafNode(
|
||||
isRegression: Boolean,
|
||||
prediction: Double,
|
||||
impurity: Double,
|
||||
impurityStats: ImpurityCalculator): LeafNode = {
|
||||
if (isRegression) {
|
||||
new RegressionLeafNode(prediction, impurity, impurityStats)
|
||||
} else {
|
||||
new ClassificationLeafNode(prediction, impurity, impurityStats)
|
||||
}
|
||||
}
|
||||
|
||||
private def createInternalNode(
|
||||
isRegression: Boolean,
|
||||
prediction: Double,
|
||||
impurity: Double,
|
||||
gain: Double,
|
||||
leftChild: Node,
|
||||
rightChild: Node,
|
||||
split: Split,
|
||||
impurityStats: ImpurityCalculator): InternalNode = {
|
||||
if (isRegression) {
|
||||
new RegressionInternalNode(prediction, impurity, gain,
|
||||
leftChild.asInstanceOf[RegressionNode], rightChild.asInstanceOf[RegressionNode],
|
||||
split, impurityStats)
|
||||
} else {
|
||||
new ClassificationInternalNode(prediction, impurity, gain,
|
||||
leftChild.asInstanceOf[ClassificationNode], rightChild.asInstanceOf[ClassificationNode],
|
||||
split, impurityStats)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an internal node. Either create the leaf nodes beneath it, or recurse as needed.
|
||||
* @param isRegression Whether the tree is a regressor or not (classifier)
|
||||
* @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node.
|
||||
* @param featureArity Indicates feature type. Value 0 indicates continuous feature.
|
||||
* Other values >= 2 indicate a categorical feature,
|
||||
@ -217,7 +182,6 @@ object TreeBuilder {
|
||||
* @return
|
||||
*/
|
||||
private def randomBalancedDecisionTreeHelper(
|
||||
isRegression: Boolean,
|
||||
subtreeDepth: Int,
|
||||
featureArity: Array[Int],
|
||||
impurityCalculator: ImpurityCalculator,
|
||||
@ -227,7 +191,7 @@ object TreeBuilder {
|
||||
|
||||
if (subtreeDepth == 0) {
|
||||
// This case only happens for a depth 0 tree.
|
||||
createLeafNode(isRegression, prediction = 0.0, impurity = 0.0, impurityCalculator)
|
||||
return new LeafNode(prediction = 0.0, impurity = 0.0, impurityStats = impurityCalculator)
|
||||
}
|
||||
|
||||
val numFeatures = featureArity.length
|
||||
@ -257,20 +221,19 @@ object TreeBuilder {
|
||||
val (leftChild: Node, rightChild: Node) = if (subtreeDepth == 1) {
|
||||
// Add leaf nodes. Assign these jointly so they make different predictions.
|
||||
val predictions = labelGenerator.nextValue()
|
||||
val leftChild = createLeafNode(isRegression, prediction = predictions._1, impurity = 0.0,
|
||||
val leftChild = new LeafNode(prediction = predictions._1, impurity = 0.0,
|
||||
impurityStats = impurityCalculator)
|
||||
val rightChild = createLeafNode(isRegression, prediction = predictions._2, impurity = 0.0,
|
||||
val rightChild = new LeafNode(prediction = predictions._2, impurity = 0.0,
|
||||
impurityStats = impurityCalculator)
|
||||
(leftChild, rightChild)
|
||||
} else {
|
||||
val leftChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity,
|
||||
val leftChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity,
|
||||
impurityCalculator, labelGenerator, usedFeatures + feature, rng)
|
||||
val rightChild = randomBalancedDecisionTreeHelper(isRegression, subtreeDepth - 1, featureArity,
|
||||
val rightChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity,
|
||||
impurityCalculator, labelGenerator, usedFeatures + feature, rng)
|
||||
(leftChild, rightChild)
|
||||
}
|
||||
createInternalNode(isRegression, prediction = 0.0, impurity = 0.0, gain = 0.0,
|
||||
leftChild = leftChild, rightChild = rightChild, split = split,
|
||||
impurityStats = impurityCalculator)
|
||||
new InternalNode(prediction = 0.0, impurity = 0.0, gain = 0.0, leftChild = leftChild,
|
||||
rightChild = rightChild, split = split, impurityStats = impurityCalculator)
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,18 +2,34 @@ package com.databricks.spark.sql.perf.mllib
|
||||
|
||||
import org.scalatest.{BeforeAndAfterAll, FunSuite}
|
||||
|
||||
import org.apache.log4j.{Level, Logger}
|
||||
|
||||
import org.apache.spark.sql.{Row, SparkSession}
|
||||
|
||||
class MLLibSuite extends FunSuite with BeforeAndAfterAll {
|
||||
|
||||
private var sparkSession: SparkSession = _
|
||||
var savedLevels: Map[String, Level] = _
|
||||
|
||||
override def beforeAll(): Unit = {
|
||||
super.beforeAll()
|
||||
sparkSession = SparkSession.builder.master("local[2]").appName("MLlib QA").getOrCreate()
|
||||
|
||||
// Travis limits the size of the log file produced by a build. Because we do run a small
|
||||
// version of all the ML benchmarks in this suite, we produce a ton of logs. Here we set the
|
||||
// log level to ERROR, just for this suite, to avoid displeasing Travis.
|
||||
savedLevels = Seq("akka", "org", "com.databricks").map { name =>
|
||||
val logger = Logger.getLogger(name)
|
||||
val curLevel = logger.getLevel
|
||||
logger.setLevel(Level.ERROR)
|
||||
name -> curLevel
|
||||
}.toMap
|
||||
}
|
||||
|
||||
override def afterAll(): Unit = {
|
||||
savedLevels.foreach { case (name, level) =>
|
||||
Logger.getLogger(name).setLevel(level)
|
||||
}
|
||||
try {
|
||||
if (sparkSession != null) {
|
||||
sparkSession.stop()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user