diff --git a/README.md b/README.md index 7c42dbb..4d2a97d 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,9 @@ This is a performance testing framework for [Spark SQL](https://spark.apache.org The rest of document will use TPC-DS benchmark as an example. We will add contents to explain how to use other benchmarks add the support of a new benchmark dataset in future. ### Setup a benchmark -Before running any query, a dataset needs to be setup by creating a `Benchmark` object. +Before running any query, a dataset needs to be set up by creating a `Benchmark` object. Generating +the TPC-DS data requires dsdgen to be built and available on the machines. We have a fork of dsdgen that +you will need. It can be found [here](https://github.com/davies/tpcds-kit). ``` import com.databricks.spark.sql.perf.tpcds.Tables diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/Tables.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/Tables.scala index 2e52552..6807dba 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/Tables.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/Tables.scala @@ -16,15 +16,12 @@ package com.databricks.spark.sql.perf.tpcds -import java.io.File - import scala.sys.process._ import org.apache.spark.Logging -import org.apache.spark.sql.{SaveMode, SQLContext} -import org.apache.spark.sql.Row import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Row, SQLContext, SaveMode} class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extends Serializable with Logging { import sqlContext.implicits._ @@ -256,13 +253,77 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend } val tables = Seq( - /* This is another large table that we don't build yet. 
+ Table("catalog_sales", + partitionColumns = "cs_sold_date_sk" :: Nil, + 'cs_sold_date_sk .int, + 'cs_sold_time_sk .int, + 'cs_ship_date_sk .int, + 'cs_bill_customer_sk .int, + 'cs_bill_cdemo_sk .int, + 'cs_bill_hdemo_sk .int, + 'cs_bill_addr_sk .int, + 'cs_ship_customer_sk .int, + 'cs_ship_cdemo_sk .int, + 'cs_ship_hdemo_sk .int, + 'cs_ship_addr_sk .int, + 'cs_call_center_sk .int, + 'cs_catalog_page_sk .int, + 'cs_ship_mode_sk .int, + 'cs_warehouse_sk .int, + 'cs_item_sk .int, + 'cs_promo_sk .int, + 'cs_order_number .int, + 'cs_quantity .int, + 'cs_wholesale_cost .decimal(7,2), + 'cs_list_price .decimal(7,2), + 'cs_sales_price .decimal(7,2), + 'cs_ext_discount_amt .decimal(7,2), + 'cs_ext_sales_price .decimal(7,2), + 'cs_ext_wholesale_cost .decimal(7,2), + 'cs_ext_list_price .decimal(7,2), + 'cs_ext_tax .decimal(7,2), + 'cs_coupon_amt .decimal(7,2), + 'cs_ext_ship_cost .decimal(7,2), + 'cs_net_paid .decimal(7,2), + 'cs_net_paid_inc_tax .decimal(7,2), + 'cs_net_paid_inc_ship .decimal(7,2), + 'cs_net_paid_inc_ship_tax .decimal(7,2), + 'cs_net_profit .decimal(7,2)), + Table("catalog_returns", + partitionColumns = "cr_returned_date_sk" :: Nil, + 'cr_returned_date_sk .int, + 'cr_returned_time_sk .int, + 'cr_item_sk .int, + 'cr_refunded_customer_sk .int, + 'cr_refunded_cdemo_sk .int, + 'cr_refunded_hdemo_sk .int, + 'cr_refunded_addr_sk .int, + 'cr_returning_customer_sk .int, + 'cr_returning_cdemo_sk .int, + 'cr_returning_hdemo_sk .int, + 'cr_returning_addr_sk .int, + 'cr_call_center_sk .int, + 'cr_catalog_page_sk .int, + 'cr_ship_mode_sk .int, + 'cr_warehouse_sk .int, + 'cr_reason_sk .int, + 'cr_order_number .int, + 'cr_return_quantity .int, + 'cr_return_amount .decimal(7,2), + 'cr_return_tax .decimal(7,2), + 'cr_return_amt_inc_tax .decimal(7,2), + 'cr_fee .decimal(7,2), + 'cr_return_ship_cost .decimal(7,2), + 'cr_refunded_cash .decimal(7,2), + 'cr_reversed_charge .decimal(7,2), + 'cr_store_credit .decimal(7,2), + 'cr_net_loss .decimal(7,2)), Table("inventory", - 
PartitionedTable("inv_date_sk"), + partitionColumns = "inv_date_sk" :: Nil, 'inv_date_sk .int, 'inv_item_sk .int, 'inv_warehouse_sk .int, - 'inv_quantity_on_hand .int),*/ + 'inv_quantity_on_hand .int), Table("store_sales", partitionColumns = "ss_sold_date_sk" :: Nil, 'ss_sold_date_sk .int, @@ -310,6 +371,112 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend 'sr_reversed_charge .decimal(7,2), 'sr_store_credit .decimal(7,2), 'sr_net_loss .decimal(7,2)), + Table("web_sales", + partitionColumns = "ws_sold_date_sk" :: Nil, + 'ws_sold_date_sk .int, + 'ws_sold_time_sk .int, + 'ws_ship_date_sk .int, + 'ws_item_sk .int, + 'ws_bill_customer_sk .int, + 'ws_bill_cdemo_sk .int, + 'ws_bill_hdemo_sk .int, + 'ws_bill_addr_sk .int, + 'ws_ship_customer_sk .int, + 'ws_ship_cdemo_sk .int, + 'ws_ship_hdemo_sk .int, + 'ws_ship_addr_sk .int, + 'ws_web_page_sk .int, + 'ws_web_site_sk .int, + 'ws_ship_mode_sk .int, + 'ws_warehouse_sk .int, + 'ws_promo_sk .int, + 'ws_order_number .int, + 'ws_quantity .int, + 'ws_wholesale_cost .decimal(7,2), + 'ws_list_price .decimal(7,2), + 'ws_sales_price .decimal(7,2), + 'ws_ext_discount_amt .decimal(7,2), + 'ws_ext_sales_price .decimal(7,2), + 'ws_ext_wholesale_cost .decimal(7,2), + 'ws_ext_list_price .decimal(7,2), + 'ws_ext_tax .decimal(7,2), + 'ws_coupon_amt .decimal(7,2), + 'ws_ext_ship_cost .decimal(7,2), + 'ws_net_paid .decimal(7,2), + 'ws_net_paid_inc_tax .decimal(7,2), + 'ws_net_paid_inc_ship .decimal(7,2), + 'ws_net_paid_inc_ship_tax .decimal(7,2), + 'ws_net_profit .decimal(7,2)), + Table("web_returns", + partitionColumns = "wr_returned_date_sk" ::Nil, + 'wr_returned_date_sk .long, + 'wr_returned_time_sk .long, + 'wr_item_sk .long, + 'wr_refunded_customer_sk .long, + 'wr_refunded_cdemo_sk .long, + 'wr_refunded_hdemo_sk .long, + 'wr_refunded_addr_sk .long, + 'wr_returning_customer_sk .long, + 'wr_returning_cdemo_sk .long, + 'wr_returning_hdemo_sk .long, + 'wr_returning_addr_sk .long, + 'wr_web_page_sk .long, + 
'wr_reason_sk .long, + 'wr_order_number .long, + 'wr_return_quantity .long, + 'wr_return_amt .decimal(7,2), + 'wr_return_tax .decimal(7,2), + 'wr_return_amt_inc_tax .decimal(7,2), + 'wr_fee .decimal(7,2), + 'wr_return_ship_cost .decimal(7,2), + 'wr_refunded_cash .decimal(7,2), + 'wr_reversed_charge .decimal(7,2), + 'wr_account_credit .decimal(7,2), + 'wr_net_loss .decimal(7,2)), + Table("call_center", + partitionColumns = Nil, + 'cc_call_center_sk .int, + 'cc_call_center_id .string, + 'cc_rec_start_date .date, + 'cc_rec_end_date .date, + 'cc_closed_date_sk .int, + 'cc_open_date_sk .int, + 'cc_name .string, + 'cc_class .string, + 'cc_employees .int, + 'cc_sq_ft .int, + 'cc_hours .string, + 'cc_manager .string, + 'cc_mkt_id .int, + 'cc_mkt_class .string, + 'cc_mkt_desc .string, + 'cc_market_manager .string, + 'cc_division .int, + 'cc_division_name .string, + 'cc_company .int, + 'cc_company_name .string, + 'cc_street_number .string, + 'cc_street_name .string, + 'cc_street_type .string, + 'cc_suite_number .string, + 'cc_city .string, + 'cc_county .string, + 'cc_state .string, + 'cc_zip .string, + 'cc_country .string, + 'cc_gmt_offset .decimal(5,2), + 'cc_tax_percentage .decimal(5,2)), + Table("catalog_page", + partitionColumns = Nil, + 'cp_catalog_page_sk .int, + 'cp_catalog_page_id .string, + 'cp_start_date_sk .int, + 'cp_end_date_sk .int, + 'cp_department .string, + 'cp_catalog_number .int, + 'cp_catalog_page_number .int, + 'cp_description .string, + 'cp_type .string), Table("customer", partitionColumns = Nil, 'c_customer_sk .int, @@ -393,6 +560,11 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend 'hd_buy_potential .string, 'hd_dep_count .int, 'hd_vehicle_count .int), + Table("income_band", + partitionColumns = Nil, + 'ib_income_band_sk .int, + 'ib_lower_bound .int, + 'ib_upper_bound .int), Table("item", partitionColumns = Nil, 'i_item_sk .int, @@ -438,6 +610,19 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: 
Int) extend 'p_channel_details .string, 'p_purpose .string, 'p_discount_active .string), + Table("reason", + partitionColumns = Nil, + 'r_reason_sk .int, + 'r_reason_id .string, + 'r_reason_desc .string), + Table("ship_mode", + partitionColumns = Nil, + 'sm_ship_mode_sk .int, + 'sm_ship_mode_id .string, + 'sm_type .string, + 'sm_code .string, + 'sm_carrier .string, + 'sm_contract .string), Table("store", partitionColumns = Nil, 's_store_sk .int, @@ -480,5 +665,66 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend 't_am_pm .string, 't_shift .string, 't_sub_shift .string, - 't_meal_time .string)) + 't_meal_time .string), + Table("warehouse", + partitionColumns = Nil, + 'w_warehouse_sk .int, + 'w_warehouse_id .string, + 'w_warehouse_name .string, + 'w_warehouse_sq_ft .int, + 'w_street_number .string, + 'w_street_name .string, + 'w_street_type .string, + 'w_suite_number .string, + 'w_city .string, + 'w_county .string, + 'w_state .string, + 'w_zip .string, + 'w_country .string, + 'w_gmt_offset .decimal(5,2)), + Table("web_page", + partitionColumns = Nil, + 'wp_web_page_sk .int, + 'wp_web_page_id .string, + 'wp_rec_start_date .date, + 'wp_rec_end_date .date, + 'wp_creation_date_sk .int, + 'wp_access_date_sk .int, + 'wp_autogen_flag .string, + 'wp_customer_sk .int, + 'wp_url .string, + 'wp_type .string, + 'wp_char_count .int, + 'wp_link_count .int, + 'wp_image_count .int, + 'wp_max_ad_count .int), + Table("web_site", + partitionColumns = Nil, + 'web_site_sk .int, + 'web_site_id .string, + 'web_rec_start_date .date, + 'web_rec_end_date .date, + 'web_name .string, + 'web_open_date_sk .int, + 'web_close_date_sk .int, + 'web_class .string, + 'web_manager .string, + 'web_mkt_id .int, + 'web_mkt_class .string, + 'web_mkt_desc .string, + 'web_market_manager .string, + 'web_company_id .int, + 'web_company_name .string, + 'web_street_number .string, + 'web_street_name .string, + 'web_street_type .string, + 'web_suite_number .string, + 'web_city 
.string, + 'web_county .string, + 'web_state .string, + 'web_zip .string, + 'web_country .string, + 'web_gmt_offset .string, + 'web_tax_percentage .decimal(5,2)) + ) }