Add more tests for join performance

Author: Michael Armbrust <michael@databricks.com>

Closes #17 from marmbrus/joinPerf.
This commit is contained in:
Michael Armbrust 2015-09-09 21:56:47 -07:00
parent 08cb68ca20
commit e2dc749480

View File

@@ -1,11 +1,15 @@
package com.databricks.spark.sql.perf
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
trait JoinPerformance extends Benchmark {
// 1.5 mb, 1 file
import ExecutionMode._
import sqlContext.implicits._
private val table = sqlContext.table _
val x = Table(
"1milints",
@@ -15,7 +19,7 @@ trait JoinPerformance extends Benchmark {
val joinTables = Seq(
// 143.542mb, 10 files
Table(
"1bilints",
"100milints",
sqlContext.range(0, 100000000)
.repartition(10)),
@@ -42,4 +46,26 @@ trait JoinPerformance extends Benchmark {
}
}
}
}
// Self-joins of the 100M-int table with a padding column of increasing width,
// to measure how join cost scales with per-row payload size (key cardinality
// is held constant across the series).
val varyDataSize = for (payloadWidth <- Seq(1, 128, 256, 512, 1024)) yield {
  val padded = table("100milints")
    .select($"id", lit("*" * payloadWidth).as(s"data$payloadWidth"))
  new Query(
    s"join - datasize: $payloadWidth",
    padded.as("a").join(padded.as("b"), $"a.id" === $"b.id"))
}
// Self-joins where the join key is cast to different primitive types, to
// compare the cost of key comparison/hashing across StringType, IntegerType,
// LongType and DoubleType.
val varyKeyType = for (targetType <- Seq(StringType, IntegerType, LongType, DoubleType)) yield {
  val keyed = table("100milints").select($"id".cast(targetType).as("id"))
  new Query(
    s"join - keytype: $targetType",
    keyed.as("a").join(keyed.as("b"), $"a.id" === $"b.id"))
}
// Joins where the left side is the base table unioned with itself N times,
// so each key on the right matches N rows — measures sensitivity to join
// fan-out (number of matches per key).
val varyNumMatches = for (fanOut <- Seq(1, 2, 4, 8, 16)) yield {
  val base = table("100milints")
  val duplicated = Seq.fill(fanOut)(base).reduce(_ unionAll _)
  new Query(
    s"join - numMatches: $fanOut",
    duplicated.as("a").join(base.as("b"), $"a.id" === $"b.id"))
}
}