work on glm, and some notebooks

This commit is contained in:
Timothy Hunter 2016-06-23 12:13:11 -07:00
parent 1388722b81
commit 87dc42a466
10 changed files with 100 additions and 12 deletions

View File

@ -45,9 +45,9 @@ trait Benchmarkable extends Logging {
sparkContext.setJobDescription(s"Execution: $name, $description")
beforeBenchmark()
val result = if (forkThread) {
runBenchmarkForked(includeBreakdown, description, messages, timeout)
} else {
doBenchmark(includeBreakdown, description, messages)
} else {
runBenchmarkForked(includeBreakdown, description, messages, timeout)
}
afterBenchmark(sqlContext.sparkContext)
result
@ -81,7 +81,13 @@ trait Benchmarkable extends Logging {
} catch {
case e: Throwable =>
logger.info(s"$that: failure in runBenchmark: $e")
throw e
println(s"$that: failure in runBenchmark: $e")
result = BenchmarkResult(
name = name,
mode = executionMode.toString,
parameters = Map.empty,
failure = Some(Failure(e.getClass.getSimpleName,
e.getMessage + ":\n" + e.getStackTraceString)))
}
}
}

View File

@ -35,6 +35,10 @@ trait BenchmarkAlgorithm extends Logging {
ctx: MLBenchContext,
testSet: DataFrame,
model: Transformer): Double = -1.0 // Not putting NaN because it is not valid JSON.
def name: String = {
this.getClass.getCanonicalName.replace("$", "")
}
}
/**
@ -82,7 +86,9 @@ trait TestFromTraining {
// Copy the context with a new seed.
val ctx2 = ctx.params.randomSeed match {
case Some(x) =>
val p = ctx.params.copy(randomSeed = Some(x + 1))
// Also set the number of examples to the number of test examples.
assert(ctx.params.numTestExamples.nonEmpty, "You must specify test examples")
val p = ctx.params.copy(randomSeed = Some(x + 1), numExamples = ctx.params.numTestExamples)
ctx.copy(params = p)
case None =>
// Making a full copy to reset the internal seed.

View File

@ -54,9 +54,9 @@ object MLLib extends Logging {
val benchmarks = benchmarksDescriptions.map { mlb =>
new MLTransformerBenchmarkable(mlb.params, mlb.benchmark, sqlContext)
}
logger.info(s"${benchmarks.size} benchmarks identified:")
println(s"${benchmarks.size} benchmarks identified:")
val str = benchmarks.map(_.prettyPrint).mkString("\n")
logger.info(str)
println(str)
logger.info("Starting experiments")
val e = b.runExperiment(
executionsToRun = benchmarks,

View File

@ -18,7 +18,7 @@ class MLTransformerBenchmarkable(
private var trainingData: DataFrame = null
private val param = MLBenchContext(params, sqlContext)
override val name = test.getClass.getCanonicalName
override val name = test.name
override protected val executionMode: ExecutionMode = ExecutionMode.SparkPerfResults

View File

@ -0,0 +1,55 @@
package com.databricks.spark.sql.perf.mllib.regression
import com.databricks.spark.sql.perf.mllib.OptionImplicits._
import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.data.DataGenerator
import org.apache.spark.ml.evaluation.{Evaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.ml.{ModelBuilder, Transformer}
import org.apache.spark.sql._
object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with
TrainingSetFromTransformer with ScoringWithEvaluator {
override protected def initialData(ctx: MLBenchContext) = {
import ctx.params._
DataGenerator.generateFeatures(
ctx.sqlContext,
numExamples,
ctx.seed(),
numPartitions,
numFeatures)
}
override protected def trueModel(ctx: MLBenchContext): Transformer = {
import ctx.params._
val rng = ctx.newGenerator()
val coefficients =
Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1))
// Small intercept to prevent some skew in the data.
val intercept = 0.01 * (2 * rng.nextDouble - 1)
val m = ModelBuilder.newGLR(coefficients, intercept)
m.set(m.link, link.get)
m.set(m.family, family.get)
m
}
override def train(
ctx: MLBenchContext,
trainingSet: DataFrame): Transformer = {
logger.info(s"$this: train: trainingSet=${trainingSet.schema}")
import ctx.params._
val glr = new GeneralizedLinearRegression()
.setLink(link)
.setFamily(family)
.setRegParam(regParam)
.setMaxIter(maxIter)
.setTol(tol)
glr.fit(trainingSet)
}
override protected def evaluator(ctx: MLBenchContext): Evaluator =
new RegressionEvaluator()
}

View File

@ -111,12 +111,11 @@ object YamlConfig {
}
// Some ugly internals to make simple constructs
package object ccFromMap {
object ccFromMap {
// Builds a case class from a map.
// (taken from stack overflow)
// if strict, will report an error if some unknown arguments are passed to the constructor
def fromMap[T: TypeTag: ClassTag](m: Map[String,_], strict: Boolean) = {
scala.reflect.runtime.universe
val rm = runtimeMirror(classTag[T].runtimeClass.getClassLoader)
val classTest = typeOf[T].typeSymbol.asClass
@ -144,7 +143,8 @@ package object ccFromMap {
m.get(paramName).getOrElse(throw new IllegalArgumentException("Map is missing required parameter named " + paramName))
})
constructorMirror(constructorArgs:_*).asInstanceOf[T]
val res = constructorMirror(constructorArgs:_*).asInstanceOf[T]
res
}
// TODO: handle scala.reflect.internal.MissingRequirementError

View File

@ -113,9 +113,12 @@ case class MLParams(
numTestExamples: Option[Long] = None,
numPartitions: Option[Int] = None,
// *** Specialized and sorted by name ***
elasticNetParam: Option[Double] = None,
family: Option[String] = None,
link: Option[String] = None,
k: Option[Int] = None,
ldaDocLength: Option[Int] = None,
ldaNumVocabulary: Option[Int] = None,
k: Option[Int] = None,
maxIter: Option[Int] = None,
numFeatures: Option[Int] = None,
optimizer: Option[String] = None,

View File

@ -3,6 +3,7 @@ timeoutSeconds: 1000
common:
numFeatures: 10
numExamples: [1, 3]
numTestExamples: 100
numPartitions: 3
randomSeed: [1, 2, 3]
benchmarks:
@ -14,6 +15,7 @@ benchmarks:
- name: clustering.LDA
params:
numExamples: 10
numTestExamples: 10
ldaDocLength: 20
ldaNumVocabulary: 4
k: 5
@ -21,3 +23,13 @@ benchmarks:
optimizer:
- em
- online
- name: regression.GLMRegression
params:
numExamples: 100
numTestExamples: 10
numFeatures: 5
link: log
family: gaussian
tol: 0.0
maxIter: 10
regParam: 0.1

View File

@ -2,6 +2,7 @@ package org.apache.spark.ml
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel
/**
* Helper for creating MLlib models which have private constructors.
@ -13,4 +14,9 @@ object ModelBuilder {
intercept: Double): LogisticRegressionModel = {
new LogisticRegressionModel("lr", coefficients, intercept)
}
def newGLR(
coefficients: Vector,
intercept: Double): GeneralizedLinearRegressionModel =
new GeneralizedLinearRegressionModel("glr-uid", coefficients, intercept)
}

View File

@ -1 +1 @@
version in ThisBuild := "0.4.9-SNAPSHOT"
version in ThisBuild := "0.4.10-SNAPSHOT"