From bcda8fc1e5d7bba89f8c608c836f3566330ffdec Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Mon, 4 Sep 2017 18:05:42 +0200 Subject: [PATCH] Coalesce non-partitioned tables. (#118) In #109, coalescing of non-partitioned tables into 1 file seems to have been accidentally removed. Put it back, but only when clusterByPartitionColumns == true. Considering that we coalesce partitions only when that setting is true, it seems consistent to use it for non-partitioned tables as well. It may be better to change the name of the parameter, but that changes the interface, and possibly should be left for some future cleanup. --- src/main/scala/com/databricks/spark/sql/perf/Tables.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/databricks/spark/sql/perf/Tables.scala b/src/main/scala/com/databricks/spark/sql/perf/Tables.scala index 368ea5d..775dfd8 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Tables.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Tables.scala @@ -211,7 +211,12 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, data.write } } else { - data.write + if (clusterByPartitionColumns) { + // treat non-partitioned tables as "one partition" that we want to coalesce + data.coalesce(1).write + } else { + data.write + } } writer.format(format).mode(mode) if (partitionColumns.nonEmpty) {