Add remaining tpcds tables.

Author: Nong Li <nongli@gmail.com>

Closes #34 from nongli/tpcds.
This commit is contained in:
Nong Li 2015-11-19 13:50:00 -08:00
parent e2073129cf
commit 1aa5bfc838
2 changed files with 257 additions and 9 deletions

View File

@ -10,7 +10,9 @@ This is a performance testing framework for [Spark SQL](https://spark.apache.org
The rest of this document uses the TPC-DS benchmark as an example. In the future, we will add content explaining how to use other benchmarks and how to add support for a new benchmark dataset.
### Setup a benchmark
Before running any query, a dataset needs to be set up by creating a `Benchmark` object.
Before running any query, a dataset needs to be set up by creating a `Benchmark` object. Generating
the TPC-DS data requires dsdgen to be built and available on the machines. We have a fork of dsdgen that
you will need. It can be found [here](https://github.com/davies/tpcds-kit).
```
import com.databricks.spark.sql.perf.tpcds.Tables

View File

@ -16,15 +16,12 @@
package com.databricks.spark.sql.perf.tpcds
import java.io.File
import scala.sys.process._
import org.apache.spark.Logging
import org.apache.spark.sql.{SaveMode, SQLContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extends Serializable with Logging {
import sqlContext.implicits._
@ -256,13 +253,77 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
}
val tables = Seq(
/* This is another large table that we don't build yet.
Table("catalog_sales",
partitionColumns = "cs_sold_date_sk" :: Nil,
'cs_sold_date_sk .int,
'cs_sold_time_sk .int,
'cs_ship_date_sk .int,
'cs_bill_customer_sk .int,
'cs_bill_cdemo_sk .int,
'cs_bill_hdemo_sk .int,
'cs_bill_addr_sk .int,
'cs_ship_customer_sk .int,
'cs_ship_cdemo_sk .int,
'cs_ship_hdemo_sk .int,
'cs_ship_addr_sk .int,
'cs_call_center_sk .int,
'cs_catalog_page_sk .int,
'cs_ship_mode_sk .int,
'cs_warehouse_sk .int,
'cs_item_sk .int,
'cs_promo_sk .int,
'cs_order_number .int,
'cs_quantity .int,
'cs_wholesale_cost .decimal(7,2),
'cs_list_price .decimal(7,2),
'cs_sales_price .decimal(7,2),
'cs_ext_discount_amt .decimal(7,2),
'cs_ext_sales_price .decimal(7,2),
'cs_ext_wholesale_cost .decimal(7,2),
'cs_ext_list_price .decimal(7,2),
'cs_ext_tax .decimal(7,2),
'cs_coupon_amt .decimal(7,2),
'cs_ext_ship_cost .decimal(7,2),
'cs_net_paid .decimal(7,2),
'cs_net_paid_inc_tax .decimal(7,2),
'cs_net_paid_inc_ship .decimal(7,2),
'cs_net_paid_inc_ship_tax .decimal(7,2),
'cs_net_profit .decimal(7,2)),
Table("catalog_returns",
partitionColumns = "cr_returned_date_sk" :: Nil,
'cr_returned_date_sk .int,
'cr_returned_time_sk .int,
'cr_item_sk .int,
'cr_refunded_customer_sk .int,
'cr_refunded_cdemo_sk .int,
'cr_refunded_hdemo_sk .int,
'cr_refunded_addr_sk .int,
'cr_returning_customer_sk .int,
'cr_returning_cdemo_sk .int,
'cr_returning_hdemo_sk .int,
'cr_returning_addr_sk .int,
'cr_call_center_sk .int,
'cr_catalog_page_sk .int,
'cr_ship_mode_sk .int,
'cr_warehouse_sk .int,
'cr_reason_sk .int,
'cr_order_number .int,
'cr_return_quantity .int,
'cr_return_amount .decimal(7,2),
'cr_return_tax .decimal(7,2),
'cr_return_amt_inc_tax .decimal(7,2),
'cr_fee .decimal(7,2),
'cr_return_ship_cost .decimal(7,2),
'cr_refunded_cash .decimal(7,2),
'cr_reversed_charge .decimal(7,2),
'cr_store_credit .decimal(7,2),
'cr_net_loss .decimal(7,2)),
Table("inventory",
PartitionedTable("inv_date_sk"),
partitionColumns = "inv_date_sk" :: Nil,
'inv_date_sk .int,
'inv_item_sk .int,
'inv_warehouse_sk .int,
'inv_quantity_on_hand .int),*/
'inv_quantity_on_hand .int),
Table("store_sales",
partitionColumns = "ss_sold_date_sk" :: Nil,
'ss_sold_date_sk .int,
@ -310,6 +371,112 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
'sr_reversed_charge .decimal(7,2),
'sr_store_credit .decimal(7,2),
'sr_net_loss .decimal(7,2)),
Table("web_sales",
partitionColumns = "ws_sold_date_sk" :: Nil,
'ws_sold_date_sk .int,
'ws_sold_time_sk .int,
'ws_ship_date_sk .int,
'ws_item_sk .int,
'ws_bill_customer_sk .int,
'ws_bill_cdemo_sk .int,
'ws_bill_hdemo_sk .int,
'ws_bill_addr_sk .int,
'ws_ship_customer_sk .int,
'ws_ship_cdemo_sk .int,
'ws_ship_hdemo_sk .int,
'ws_ship_addr_sk .int,
'ws_web_page_sk .int,
'ws_web_site_sk .int,
'ws_ship_mode_sk .int,
'ws_warehouse_sk .int,
'ws_promo_sk .int,
'ws_order_number .int,
'ws_quantity .int,
'ws_wholesale_cost .decimal(7,2),
'ws_list_price .decimal(7,2),
'ws_sales_price .decimal(7,2),
'ws_ext_discount_amt .decimal(7,2),
'ws_ext_sales_price .decimal(7,2),
'ws_ext_wholesale_cost .decimal(7,2),
'ws_ext_list_price .decimal(7,2),
'ws_ext_tax .decimal(7,2),
'ws_coupon_amt .decimal(7,2),
'ws_ext_ship_cost .decimal(7,2),
'ws_net_paid .decimal(7,2),
'ws_net_paid_inc_tax .decimal(7,2),
'ws_net_paid_inc_ship .decimal(7,2),
'ws_net_paid_inc_ship_tax .decimal(7,2),
'ws_net_profit .decimal(7,2)),
// Schema for the web_returns table, partitioned by wr_returned_date_sk.
// NOTE(review): every key, order-number, and quantity column here is declared
// .long, whereas the parallel catalog_returns and web_sales entries in this
// list declare the corresponding columns as .int. Confirm whether the wider
// type is intentional; joining these keys against .int dimension keys would
// presumably require a cast.
Table("web_returns",
partitionColumns = "wr_returned_date_sk" ::Nil,
'wr_returned_date_sk .long,
'wr_returned_time_sk .long,
'wr_item_sk .long,
'wr_refunded_customer_sk .long,
'wr_refunded_cdemo_sk .long,
'wr_refunded_hdemo_sk .long,
'wr_refunded_addr_sk .long,
'wr_returning_customer_sk .long,
'wr_returning_cdemo_sk .long,
'wr_returning_hdemo_sk .long,
'wr_returning_addr_sk .long,
'wr_web_page_sk .long,
'wr_reason_sk .long,
'wr_order_number .long,
'wr_return_quantity .long,
// Monetary columns use the same decimal(7,2) type as the other returns tables.
'wr_return_amt .decimal(7,2),
'wr_return_tax .decimal(7,2),
'wr_return_amt_inc_tax .decimal(7,2),
'wr_fee .decimal(7,2),
'wr_return_ship_cost .decimal(7,2),
'wr_refunded_cash .decimal(7,2),
'wr_reversed_charge .decimal(7,2),
'wr_account_credit .decimal(7,2),
'wr_net_loss .decimal(7,2)),
Table("call_center",
partitionColumns = Nil,
'cc_call_center_sk .int,
'cc_call_center_id .string,
'cc_rec_start_date .date,
'cc_rec_end_date .date,
'cc_closed_date_sk .int,
'cc_open_date_sk .int,
'cc_name .string,
'cc_class .string,
'cc_employees .int,
'cc_sq_ft .int,
'cc_hours .string,
'cc_manager .string,
'cc_mkt_id .int,
'cc_mkt_class .string,
'cc_mkt_desc .string,
'cc_market_manager .string,
'cc_division .int,
'cc_division_name .string,
'cc_company .int,
'cc_company_name .string,
'cc_street_number .string,
'cc_street_name .string,
'cc_street_type .string,
'cc_suite_number .string,
'cc_city .string,
'cc_county .string,
'cc_state .string,
'cc_zip .string,
'cc_country .string,
'cc_gmt_offset .decimal(5,2),
'cc_tax_percentage .decimal(5,2)),
Table("catalog_page",
partitionColumns = Nil,
'cp_catalog_page_sk .int,
'cp_catalog_page_id .string,
'cp_start_date_sk .int,
'cp_end_date_sk .int,
'cp_department .string,
'cp_catalog_number .int,
'cp_catalog_page_number .int,
'cp_description .string,
'cp_type .string),
Table("customer",
partitionColumns = Nil,
'c_customer_sk .int,
@ -393,6 +560,11 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
'hd_buy_potential .string,
'hd_dep_count .int,
'hd_vehicle_count .int),
Table("income_band",
partitionColumns = Nil,
'ib_income_band_sk .int,
'ib_lower_bound .int,
'ib_upper_bound .int),
Table("item",
partitionColumns = Nil,
'i_item_sk .int,
@ -438,6 +610,19 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
'p_channel_details .string,
'p_purpose .string,
'p_discount_active .string),
Table("reason",
partitionColumns = Nil,
'r_reason_sk .int,
'r_reason_id .string,
'r_reason_desc .string),
Table("ship_mode",
partitionColumns = Nil,
'sm_ship_mode_sk .int,
'sm_ship_mode_id .string,
'sm_type .string,
'sm_code .string,
'sm_carrier .string,
'sm_contract .string),
Table("store",
partitionColumns = Nil,
's_store_sk .int,
@ -480,5 +665,66 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
't_am_pm .string,
't_shift .string,
't_sub_shift .string,
't_meal_time .string))
't_meal_time .string),
Table("warehouse",
partitionColumns = Nil,
'w_warehouse_sk .int,
'w_warehouse_id .string,
'w_warehouse_name .string,
'w_warehouse_sq_ft .int,
'w_street_number .string,
'w_street_name .string,
'w_street_type .string,
'w_suite_number .string,
'w_city .string,
'w_county .string,
'w_state .string,
'w_zip .string,
'w_country .string,
'w_gmt_offset .decimal(5,2)),
Table("web_page",
partitionColumns = Nil,
'wp_web_page_sk .int,
'wp_web_page_id .string,
'wp_rec_start_date .date,
'wp_rec_end_date .date,
'wp_creation_date_sk .int,
'wp_access_date_sk .int,
'wp_autogen_flag .string,
'wp_customer_sk .int,
'wp_url .string,
'wp_type .string,
'wp_char_count .int,
'wp_link_count .int,
'wp_image_count .int,
'wp_max_ad_count .int),
// Schema for the web_site table (not partitioned).
// web_gmt_offset is declared decimal(5,2), the same type used for
// cc_gmt_offset (call_center) and w_gmt_offset (warehouse) in this list and
// the type the TPC-DS schema gives for this column. It was previously declared
// as a string, which made numeric comparisons on the column behave as string
// comparisons.
Table("web_site",
partitionColumns = Nil,
'web_site_sk .int,
'web_site_id .string,
'web_rec_start_date .date,
'web_rec_end_date .date,
'web_name .string,
'web_open_date_sk .int,
'web_close_date_sk .int,
'web_class .string,
'web_manager .string,
'web_mkt_id .int,
'web_mkt_class .string,
'web_mkt_desc .string,
'web_market_manager .string,
'web_company_id .int,
'web_company_name .string,
'web_street_number .string,
'web_street_name .string,
'web_street_type .string,
'web_suite_number .string,
'web_city .string,
'web_county .string,
'web_state .string,
'web_zip .string,
'web_country .string,
'web_gmt_offset .decimal(5,2),
'web_tax_percentage .decimal(5,2))
)
}