Add remaining tpcds tables.
Author: Nong Li <nongli@gmail.com> Closes #34 from nongli/tpcds.
This commit is contained in:
parent
e2073129cf
commit
1aa5bfc838
@ -10,7 +10,9 @@ This is a performance testing framework for [Spark SQL](https://spark.apache.org
|
||||
The rest of this document uses the TPC-DS benchmark as an example. In the future, we will add content explaining how to use other benchmarks and how to add support for a new benchmark dataset.
|
||||
|
||||
### Setup a benchmark
|
||||
Before running any query, a dataset needs to be setup by creating a `Benchmark` object.
|
||||
Before running any query, a dataset needs to be set up by creating a `Benchmark` object. Generating
|
||||
the TPCDS data requires dsdgen to be built and available on the machines. We have a fork of dsdgen that
|
||||
you will need. It can be found [here](https://github.com/davies/tpcds-kit).
|
||||
|
||||
```
|
||||
import com.databricks.spark.sql.perf.tpcds.Tables
|
||||
|
||||
@ -16,15 +16,12 @@
|
||||
|
||||
package com.databricks.spark.sql.perf.tpcds
|
||||
|
||||
import java.io.File
|
||||
|
||||
import scala.sys.process._
|
||||
|
||||
import org.apache.spark.Logging
|
||||
import org.apache.spark.sql.{SaveMode, SQLContext}
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.types._
|
||||
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
|
||||
|
||||
class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extends Serializable with Logging {
|
||||
import sqlContext.implicits._
|
||||
@ -256,13 +253,77 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
|
||||
}
|
||||
|
||||
val tables = Seq(
|
||||
/* This is another large table that we don't build yet.
|
||||
Table("catalog_sales",
|
||||
partitionColumns = "cs_sold_date_sk" :: Nil,
|
||||
'cs_sold_date_sk .int,
|
||||
'cs_sold_time_sk .int,
|
||||
'cs_ship_date_sk .int,
|
||||
'cs_bill_customer_sk .int,
|
||||
'cs_bill_cdemo_sk .int,
|
||||
'cs_bill_hdemo_sk .int,
|
||||
'cs_bill_addr_sk .int,
|
||||
'cs_ship_customer_sk .int,
|
||||
'cs_ship_cdemo_sk .int,
|
||||
'cs_ship_hdemo_sk .int,
|
||||
'cs_ship_addr_sk .int,
|
||||
'cs_call_center_sk .int,
|
||||
'cs_catalog_page_sk .int,
|
||||
'cs_ship_mode_sk .int,
|
||||
'cs_warehouse_sk .int,
|
||||
'cs_item_sk .int,
|
||||
'cs_promo_sk .int,
|
||||
'cs_order_number .int,
|
||||
'cs_quantity .int,
|
||||
'cs_wholesale_cost .decimal(7,2),
|
||||
'cs_list_price .decimal(7,2),
|
||||
'cs_sales_price .decimal(7,2),
|
||||
'cs_ext_discount_amt .decimal(7,2),
|
||||
'cs_ext_sales_price .decimal(7,2),
|
||||
'cs_ext_wholesale_cost .decimal(7,2),
|
||||
'cs_ext_list_price .decimal(7,2),
|
||||
'cs_ext_tax .decimal(7,2),
|
||||
'cs_coupon_amt .decimal(7,2),
|
||||
'cs_ext_ship_cost .decimal(7,2),
|
||||
'cs_net_paid .decimal(7,2),
|
||||
'cs_net_paid_inc_tax .decimal(7,2),
|
||||
'cs_net_paid_inc_ship .decimal(7,2),
|
||||
'cs_net_paid_inc_ship_tax .decimal(7,2),
|
||||
'cs_net_profit .decimal(7,2)),
|
||||
Table("catalog_returns",
|
||||
partitionColumns = "cr_returned_date_sk" :: Nil,
|
||||
'cr_returned_date_sk .int,
|
||||
'cr_returned_time_sk .int,
|
||||
'cr_item_sk .int,
|
||||
'cr_refunded_customer_sk .int,
|
||||
'cr_refunded_cdemo_sk .int,
|
||||
'cr_refunded_hdemo_sk .int,
|
||||
'cr_refunded_addr_sk .int,
|
||||
'cr_returning_customer_sk .int,
|
||||
'cr_returning_cdemo_sk .int,
|
||||
'cr_returning_hdemo_sk .int,
|
||||
'cr_returning_addr_sk .int,
|
||||
'cr_call_center_sk .int,
|
||||
'cr_catalog_page_sk .int,
|
||||
'cr_ship_mode_sk .int,
|
||||
'cr_warehouse_sk .int,
|
||||
'cr_reason_sk .int,
|
||||
'cr_order_number .int,
|
||||
'cr_return_quantity .int,
|
||||
'cr_return_amount .decimal(7,2),
|
||||
'cr_return_tax .decimal(7,2),
|
||||
'cr_return_amt_inc_tax .decimal(7,2),
|
||||
'cr_fee .decimal(7,2),
|
||||
'cr_return_ship_cost .decimal(7,2),
|
||||
'cr_refunded_cash .decimal(7,2),
|
||||
'cr_reversed_charge .decimal(7,2),
|
||||
'cr_store_credit .decimal(7,2),
|
||||
'cr_net_loss .decimal(7,2)),
|
||||
Table("inventory",
|
||||
PartitionedTable("inv_date_sk"),
|
||||
partitionColumns = "inv_date_sk" :: Nil,
|
||||
'inv_date_sk .int,
|
||||
'inv_item_sk .int,
|
||||
'inv_warehouse_sk .int,
|
||||
'inv_quantity_on_hand .int),*/
|
||||
'inv_quantity_on_hand .int),
|
||||
Table("store_sales",
|
||||
partitionColumns = "ss_sold_date_sk" :: Nil,
|
||||
'ss_sold_date_sk .int,
|
||||
@ -310,6 +371,112 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
|
||||
'sr_reversed_charge .decimal(7,2),
|
||||
'sr_store_credit .decimal(7,2),
|
||||
'sr_net_loss .decimal(7,2)),
|
||||
Table("web_sales",
|
||||
partitionColumns = "ws_sold_date_sk" :: Nil,
|
||||
'ws_sold_date_sk .int,
|
||||
'ws_sold_time_sk .int,
|
||||
'ws_ship_date_sk .int,
|
||||
'ws_item_sk .int,
|
||||
'ws_bill_customer_sk .int,
|
||||
'ws_bill_cdemo_sk .int,
|
||||
'ws_bill_hdemo_sk .int,
|
||||
'ws_bill_addr_sk .int,
|
||||
'ws_ship_customer_sk .int,
|
||||
'ws_ship_cdemo_sk .int,
|
||||
'ws_ship_hdemo_sk .int,
|
||||
'ws_ship_addr_sk .int,
|
||||
'ws_web_page_sk .int,
|
||||
'ws_web_site_sk .int,
|
||||
'ws_ship_mode_sk .int,
|
||||
'ws_warehouse_sk .int,
|
||||
'ws_promo_sk .int,
|
||||
'ws_order_number .int,
|
||||
'ws_quantity .int,
|
||||
'ws_wholesale_cost .decimal(7,2),
|
||||
'ws_list_price .decimal(7,2),
|
||||
'ws_sales_price .decimal(7,2),
|
||||
'ws_ext_discount_amt .decimal(7,2),
|
||||
'ws_ext_sales_price .decimal(7,2),
|
||||
'ws_ext_wholesale_cost .decimal(7,2),
|
||||
'ws_ext_list_price .decimal(7,2),
|
||||
'ws_ext_tax .decimal(7,2),
|
||||
'ws_coupon_amt .decimal(7,2),
|
||||
'ws_ext_ship_cost .decimal(7,2),
|
||||
'ws_net_paid .decimal(7,2),
|
||||
'ws_net_paid_inc_tax .decimal(7,2),
|
||||
'ws_net_paid_inc_ship .decimal(7,2),
|
||||
'ws_net_paid_inc_ship_tax .decimal(7,2),
|
||||
'ws_net_profit .decimal(7,2)),
|
||||
Table("web_returns",
|
||||
partitionColumns = "wr_returned_date_sk" ::Nil,
|
||||
'wr_returned_date_sk .long,
|
||||
'wr_returned_time_sk .long,
|
||||
'wr_item_sk .long,
|
||||
'wr_refunded_customer_sk .long,
|
||||
'wr_refunded_cdemo_sk .long,
|
||||
'wr_refunded_hdemo_sk .long,
|
||||
'wr_refunded_addr_sk .long,
|
||||
'wr_returning_customer_sk .long,
|
||||
'wr_returning_cdemo_sk .long,
|
||||
'wr_returning_hdemo_sk .long,
|
||||
'wr_returning_addr_sk .long,
|
||||
'wr_web_page_sk .long,
|
||||
'wr_reason_sk .long,
|
||||
'wr_order_number .long,
|
||||
'wr_return_quantity .long,
|
||||
'wr_return_amt .decimal(7,2),
|
||||
'wr_return_tax .decimal(7,2),
|
||||
'wr_return_amt_inc_tax .decimal(7,2),
|
||||
'wr_fee .decimal(7,2),
|
||||
'wr_return_ship_cost .decimal(7,2),
|
||||
'wr_refunded_cash .decimal(7,2),
|
||||
'wr_reversed_charge .decimal(7,2),
|
||||
'wr_account_credit .decimal(7,2),
|
||||
'wr_net_loss .decimal(7,2)),
|
||||
Table("call_center",
|
||||
partitionColumns = Nil,
|
||||
'cc_call_center_sk .int,
|
||||
'cc_call_center_id .string,
|
||||
'cc_rec_start_date .date,
|
||||
'cc_rec_end_date .date,
|
||||
'cc_closed_date_sk .int,
|
||||
'cc_open_date_sk .int,
|
||||
'cc_name .string,
|
||||
'cc_class .string,
|
||||
'cc_employees .int,
|
||||
'cc_sq_ft .int,
|
||||
'cc_hours .string,
|
||||
'cc_manager .string,
|
||||
'cc_mkt_id .int,
|
||||
'cc_mkt_class .string,
|
||||
'cc_mkt_desc .string,
|
||||
'cc_market_manager .string,
|
||||
'cc_division .int,
|
||||
'cc_division_name .string,
|
||||
'cc_company .int,
|
||||
'cc_company_name .string,
|
||||
'cc_street_number .string,
|
||||
'cc_street_name .string,
|
||||
'cc_street_type .string,
|
||||
'cc_suite_number .string,
|
||||
'cc_city .string,
|
||||
'cc_county .string,
|
||||
'cc_state .string,
|
||||
'cc_zip .string,
|
||||
'cc_country .string,
|
||||
'cc_gmt_offset .decimal(5,2),
|
||||
'cc_tax_percentage .decimal(5,2)),
|
||||
Table("catalog_page",
|
||||
partitionColumns = Nil,
|
||||
'cp_catalog_page_sk .int,
|
||||
'cp_catalog_page_id .string,
|
||||
'cp_start_date_sk .int,
|
||||
'cp_end_date_sk .int,
|
||||
'cp_department .string,
|
||||
'cp_catalog_number .int,
|
||||
'cp_catalog_page_number .int,
|
||||
'cp_description .string,
|
||||
'cp_type .string),
|
||||
Table("customer",
|
||||
partitionColumns = Nil,
|
||||
'c_customer_sk .int,
|
||||
@ -393,6 +560,11 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
|
||||
'hd_buy_potential .string,
|
||||
'hd_dep_count .int,
|
||||
'hd_vehicle_count .int),
|
||||
Table("income_band",
|
||||
partitionColumns = Nil,
|
||||
'ib_income_band_sk .int,
|
||||
'ib_lower_bound .int,
|
||||
'ib_upper_bound .int),
|
||||
Table("item",
|
||||
partitionColumns = Nil,
|
||||
'i_item_sk .int,
|
||||
@ -438,6 +610,19 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
|
||||
'p_channel_details .string,
|
||||
'p_purpose .string,
|
||||
'p_discount_active .string),
|
||||
Table("reason",
|
||||
partitionColumns = Nil,
|
||||
'r_reason_sk .int,
|
||||
'r_reason_id .string,
|
||||
'r_reason_desc .string),
|
||||
Table("ship_mode",
|
||||
partitionColumns = Nil,
|
||||
'sm_ship_mode_sk .int,
|
||||
'sm_ship_mode_id .string,
|
||||
'sm_type .string,
|
||||
'sm_code .string,
|
||||
'sm_carrier .string,
|
||||
'sm_contract .string),
|
||||
Table("store",
|
||||
partitionColumns = Nil,
|
||||
's_store_sk .int,
|
||||
@ -480,5 +665,66 @@ class Tables(sqlContext: SQLContext, dsdgenDir: String, scaleFactor: Int) extend
|
||||
't_am_pm .string,
|
||||
't_shift .string,
|
||||
't_sub_shift .string,
|
||||
't_meal_time .string))
|
||||
't_meal_time .string),
|
||||
Table("warehouse",
|
||||
partitionColumns = Nil,
|
||||
'w_warehouse_sk .int,
|
||||
'w_warehouse_id .string,
|
||||
'w_warehouse_name .string,
|
||||
'w_warehouse_sq_ft .int,
|
||||
'w_street_number .string,
|
||||
'w_street_name .string,
|
||||
'w_street_type .string,
|
||||
'w_suite_number .string,
|
||||
'w_city .string,
|
||||
'w_county .string,
|
||||
'w_state .string,
|
||||
'w_zip .string,
|
||||
'w_country .string,
|
||||
'w_gmt_offset .decimal(5,2)),
|
||||
Table("web_page",
|
||||
partitionColumns = Nil,
|
||||
'wp_web_page_sk .int,
|
||||
'wp_web_page_id .string,
|
||||
'wp_rec_start_date .date,
|
||||
'wp_rec_end_date .date,
|
||||
'wp_creation_date_sk .int,
|
||||
'wp_access_date_sk .int,
|
||||
'wp_autogen_flag .string,
|
||||
'wp_customer_sk .int,
|
||||
'wp_url .string,
|
||||
'wp_type .string,
|
||||
'wp_char_count .int,
|
||||
'wp_link_count .int,
|
||||
'wp_image_count .int,
|
||||
'wp_max_ad_count .int),
|
||||
Table("web_site",
|
||||
partitionColumns = Nil,
|
||||
'web_site_sk .int,
|
||||
'web_site_id .string,
|
||||
'web_rec_start_date .date,
|
||||
'web_rec_end_date .date,
|
||||
'web_name .string,
|
||||
'web_open_date_sk .int,
|
||||
'web_close_date_sk .int,
|
||||
'web_class .string,
|
||||
'web_manager .string,
|
||||
'web_mkt_id .int,
|
||||
'web_mkt_class .string,
|
||||
'web_mkt_desc .string,
|
||||
'web_market_manager .string,
|
||||
'web_company_id .int,
|
||||
'web_company_name .string,
|
||||
'web_street_number .string,
|
||||
'web_street_name .string,
|
||||
'web_street_type .string,
|
||||
'web_suite_number .string,
|
||||
'web_city .string,
|
||||
'web_county .string,
|
||||
'web_state .string,
|
||||
'web_zip .string,
|
||||
'web_country .string,
|
||||
'web_gmt_offset .string,
|
||||
'web_tax_percentage .decimal(5,2))
|
||||
)
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user