From 33a1e55366ba9ecb601577a4e69a050c5c5d406b Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Wed, 29 Jun 2016 17:06:27 -0700 Subject: [PATCH] partly done adding decision tree tests --- .../DecisionTreeClassification.scala | 45 ++++ .../databricks/spark/sql/perf/results.scala | 4 +- .../org/apache/spark/ml/ModelBuilder.scala | 198 +++++++++++++++++- 3 files changed, 244 insertions(+), 3 deletions(-) create mode 100644 src/main/scala/com/databricks/spark/sql/perf/mllib/classification/DecisionTreeClassification.scala diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/DecisionTreeClassification.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/DecisionTreeClassification.scala new file mode 100644 index 0000000..4eebd74 --- /dev/null +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/DecisionTreeClassification.scala @@ -0,0 +1,45 @@ +package com.databricks.spark.sql.perf.mllib.classification + +import org.apache.spark.ml.ModelBuilder +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.classification.DecisionTreeClassifier +import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, Evaluator} +import org.apache.spark.sql.DataFrame + +import com.databricks.spark.sql.perf.mllib.OptionImplicits._ +import com.databricks.spark.sql.perf.mllib._ +import com.databricks.spark.sql.perf.mllib.data.DataGenerator + + +object DecisionTreeClassification extends BenchmarkAlgorithm + with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { + + override protected def initialData(ctx: MLBenchContext) = { + import ctx.params._ + DataGenerator.generateFeatures(ctx.sqlContext, numExamples, ctx.seed(), numPartitions, + numFeatures) + } + + override protected def trueModel(ctx: MLBenchContext): Transformer = { + //val rng = ctx.newGenerator() + val numFeatures = ctx.params.numFeatures.get + val fourthFeatures = numFeatures / 4 + val featureArity: Array[Int] = + Array.fill[Int](fourthFeatures)(2) ++ // low-arity categorical + Array.fill[Int](fourthFeatures)(20) ++ // high-arity categorical + Array.fill[Int](numFeatures - 2 * fourthFeatures)(0) // continuous + ModelBuilder.newDecisionTreeClassificationModel(ctx.params.depth.get, ctx.params.numClasses.get, + featureArity, ctx.seed()) + } + + override def train(ctx: MLBenchContext, trainingSet: DataFrame): Transformer = { + logger.info(s"$this: train: trainingSet=${trainingSet.schema}") + import ctx.params._ + new DecisionTreeClassifier() + .setMaxDepth(depth.get) + .fit(trainingSet) + } + + override protected def evaluator(ctx: MLBenchContext): Evaluator = + new MulticlassClassificationEvaluator() +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/results.scala b/src/main/scala/com/databricks/spark/sql/perf/results.scala index 1effa9c..62a2435 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/results.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/results.scala @@ -113,13 +113,15 @@ case class MLParams( numTestExamples: Option[Long] = None, numPartitions: Option[Int] = None, // *** Specialized and sorted by name *** + depth: Option[Int] = None, elasticNetParam: Option[Double] = None, family: Option[String] = None, - link: Option[String] = None, k: Option[Int] = None, ldaDocLength: Option[Int] = None, ldaNumVocabulary: Option[Int] = None, + link: Option[String] = None, maxIter: Option[Int] = None, + numClasses: Option[Int] = None, numFeatures: Option[Int] = None, optimizer: Option[String] = None, regParam: Option[Double] = None, diff --git a/src/main/scala/org/apache/spark/ml/ModelBuilder.scala b/src/main/scala/org/apache/spark/ml/ModelBuilder.scala index 089376a..0466ba6 100644 --- a/src/main/scala/org/apache/spark/ml/ModelBuilder.scala +++ b/src/main/scala/org/apache/spark/ml/ModelBuilder.scala @@ -1,8 +1,13 @@ package org.apache.spark.ml -import org.apache.spark.ml.classification.LogisticRegressionModel +import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, LogisticRegressionModel} import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel +import org.apache.spark.ml.regression.DecisionTreeRegressionModel +import org.apache.spark.ml.tree._ +import org.apache.spark.mllib.random.RandomDataGenerator +import org.apache.spark.mllib.tree.impurity.ImpurityCalculator + /** * Helper for creating MLlib models which have private constructors. @@ -19,4 +24,193 @@ object ModelBuilder { coefficients: Vector, intercept: Double): GeneralizedLinearRegressionModel = new GeneralizedLinearRegressionModel("glr-uid", coefficients, intercept) -} \ No newline at end of file + + def newDecisionTreeClassificationModel( + depth: Int, + numClasses: Int, + featureArity: Array[Int], + seed: Long): DecisionTreeClassificationModel = { + require(numClasses >= 2, s"DecisionTreeClassificationModel requires numClasses >= 2," + + s" but was given $numClasses") + val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = numClasses, + featureArity = featureArity, seed = seed) + new DecisionTreeClassificationModel(rootNode, numFeatures = featureArity.length, + numClasses = numClasses) + } + + def newDecisionTreeRegressionModel( + depth: Int, + featureArity: Array[Int], + seed: Long): DecisionTreeRegressionModel = { + val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = 0, + featureArity = featureArity, seed = seed) + new DecisionTreeRegressionModel(rootNode, numFeatures = featureArity.length) + } +} + +/** + * Helpers for creating random decision trees. + */ +object TreeBuilder { + + /** + * Generator for a pair of distinct class labels from the set {0,...,numClasses-1}. + * Pairs are useful for trees to make sure sibling leaf nodes make different predictions. + * @param numClasses Number of classes. + */ + private class ClassLabelPairGenerator(val numClasses: Int) + extends RandomDataGenerator[Pair[Double, Double]] { + + require(numClasses >= 2, + s"ClassLabelPairGenerator given label numClasses = $numClasses, but numClasses should be >= 2.") + + private val rng = new java.util.Random() + + override def nextValue(): Pair[Double, Double] = { + val left = rng.nextInt(numClasses) + var right = rng.nextInt(numClasses) + while (right == left) { + right = rng.nextInt(numClasses) + } + new Pair[Double, Double](left, right) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): ClassLabelPairGenerator = new ClassLabelPairGenerator(numClasses) + } + + + /** + * Generator for a pair of real-valued labels. + * Pairs are useful for trees to make sure sibling leaf nodes make different predictions. + */ + private class RealLabelPairGenerator() extends RandomDataGenerator[Pair[Double, Double]] { + + private val rng = new java.util.Random() + + override def nextValue(): Pair[Double, Double] = + new Pair[Double, Double](rng.nextDouble(), rng.nextDouble()) + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): RealLabelPairGenerator = new RealLabelPairGenerator() + } + + /** + * Creates a random decision tree structure. + * @param depth Depth of tree to build. Must be <= numFeatures. + * @param labelType Value 0 indicates regression. Integers >= 2 indicate numClasses for + * classification. + * @param featureArity Array of length numFeatures indicating feature type. + * Value 0 indicates continuous feature. + * Other values >= 2 indicate a categorical feature, + * where the value is the number of categories. + * @return root node of tree + */ + def randomBalancedDecisionTree( + depth: Int, + labelType: Int, + featureArity: Array[Int], + seed: Long): Node = { + require(depth >= 0, s"randomBalancedDecisionTree given depth < 0.") + val numFeatures = featureArity.length + require(depth <= numFeatures, + s"randomBalancedDecisionTree requires depth <= featureArity.size," + + s" but depth = $depth and featureArity.size = $numFeatures") + val isRegression = labelType == 0 + if (!isRegression) { + require(labelType >= 2, s"labelType must be >= 2 for classification. 0 indicates regression.") + } + + val rng = new scala.util.Random() + rng.setSeed(seed) + + val labelGenerator = if (isRegression) { + new RealLabelPairGenerator() + } else { + new ClassLabelPairGenerator(labelType) + } + // We use a dummy impurityCalculator for all nodes. + val impurityCalculator = if (isRegression) { + ImpurityCalculator.getCalculator("variance", Array.fill[Double](3)(0.0)) + } else { + ImpurityCalculator.getCalculator("gini", Array.fill[Double](labelType)(0.0)) + } + + randomBalancedDecisionTreeHelper(depth, featureArity, impurityCalculator, + labelGenerator, Set.empty, rng) + } + + /** + * Create an internal node. Either create the leaf nodes beneath it, or recurse as needed. + * @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node. + * @param featureArity Indicates feature type. Value 0 indicates continuous feature. + * Other values >= 2 indicate a categorical feature, + * where the value is the number of categories. + * @param impurityCalculator Dummy impurity calculator to use at all tree nodes + * @param usedFeatures Features appearing in the path from the tree root to the node + * being constructed. + * @param labelGenerator Generates pairs of distinct labels. + * @return + */ + private def randomBalancedDecisionTreeHelper( + subtreeDepth: Int, + featureArity: Array[Int], + impurityCalculator: ImpurityCalculator, + labelGenerator: RandomDataGenerator[Pair[Double, Double]], + usedFeatures: Set[Int], + rng: scala.util.Random): Node = { + + if (subtreeDepth == 0) { + // This case only happens for a depth 0 tree. + return new LeafNode(prediction = 0.0, impurity = 0.0, impurityStats = impurityCalculator) + } + + val numFeatures = featureArity.length + // Should not happen. + assert(usedFeatures.size < numFeatures, s"randomBalancedDecisionTreeSplitNode ran out of " + + s"features for splits.") + + // Make node internal. + var feature: Int = rng.nextInt(numFeatures) + while (usedFeatures.contains(feature)) { + feature = rng.nextInt(numFeatures) + } + val split: Split = if (featureArity(feature) == 0) { + // continuous feature + new ContinuousSplit(featureIndex = feature, threshold = rng.nextDouble()) + } else { + // categorical feature + // Put nCatsSplit categories on left, and the rest on the right. + // nCatsSplit is in {1,...,arity-1}. + val nCatsSplit = rng.nextInt(featureArity(feature) - 1) + 1 + val splitCategories: Array[Double] = + rng.shuffle(Range(0,featureArity(feature))).map(_.toDouble).toArray.take(nCatsSplit) + new CategoricalSplit(featureIndex = feature, + _leftCategories = splitCategories, numCategories = featureArity(feature)) + } + + val (leftChild: Node, rightChild: Node) = if (subtreeDepth == 1) { + // Add leaf nodes. Assign these jointly so they make different predictions. + val predictions = labelGenerator.nextValue() + val leftChild = new LeafNode(prediction = predictions._1, impurity = 0.0, + impurityStats = impurityCalculator) + val rightChild = new LeafNode(prediction = predictions._2, impurity = 0.0, + impurityStats = impurityCalculator) + (leftChild, rightChild) + } else { + val leftChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity, + impurityCalculator, labelGenerator, usedFeatures + feature, rng) + val rightChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity, + impurityCalculator, labelGenerator, usedFeatures + feature, rng) + (leftChild, rightChild) + } + new InternalNode(prediction = 0.0, impurity = 0.0, gain = 0.0, leftChild = leftChild, + rightChild = rightChild, split = split, impurityStats = impurityCalculator) + } +}