瀏覽代碼

parallelize NULL loading in TPC-DS

Gopal V 9 年之前
父節點
當前提交
d36b4bbbfa

+ 35 - 2
ddl-tpcds/bin_partitioned/catalog_returns.sql

@@ -35,6 +35,37 @@ create table catalog_returns
 partitioned by (cr_returned_date_sk bigint)
 stored as ${FILE};
 
+from ${SOURCE}.catalog_returns cr
+insert overwrite table catalog_returns partition(cr_returned_date_sk) 
+select
+        cr.cr_returned_time_sk,
+        cr.cr_item_sk,
+        cr.cr_refunded_customer_sk,
+        cr.cr_refunded_cdemo_sk,
+        cr.cr_refunded_hdemo_sk,
+        cr.cr_refunded_addr_sk,
+        cr.cr_returning_customer_sk,
+        cr.cr_returning_cdemo_sk,
+        cr.cr_returning_hdemo_sk,
+        cr.cr_returning_addr_sk,
+        cr.cr_call_center_sk,
+        cr.cr_catalog_page_sk,
+        cr.cr_ship_mode_sk,
+        cr.cr_warehouse_sk,
+        cr.cr_reason_sk,
+        cr.cr_order_number,
+        cr.cr_return_quantity,
+        cr.cr_return_amount,
+        cr.cr_return_tax,
+        cr.cr_return_amt_inc_tax,
+        cr.cr_fee,
+        cr.cr_return_ship_cost,
+        cr.cr_refunded_cash,
+        cr.cr_reversed_charge,
+        cr.cr_store_credit,
+        cr.cr_net_loss,
+        cr.cr_returned_date_sk
+      where cr.cr_returned_date_sk is not null
 insert overwrite table catalog_returns partition (cr_returned_date_sk) 
 select
         cr.cr_returned_time_sk,
@@ -63,5 +94,7 @@ select
         cr.cr_reversed_charge,
         cr.cr_store_credit,
         cr.cr_net_loss,
-		cr.cr_returned_date_sk
-      from ${SOURCE}.catalog_returns cr;
+        cr.cr_returned_date_sk
+      where cr.cr_returned_date_sk is null
+      sort by cr_returned_date_sk
+;

+ 41 - 1
ddl-tpcds/bin_partitioned/catalog_sales.sql

@@ -42,6 +42,7 @@ create table catalog_sales
 partitioned by (cs_sold_date_sk bigint)
 stored as ${FILE};
 
+from ${SOURCE}.catalog_sales cs
 insert overwrite table catalog_sales partition (cs_sold_date_sk) 
 select
         cs.cs_sold_time_sk,
@@ -78,4 +79,43 @@ select
         cs.cs_net_paid_inc_ship_tax,
         cs.cs_net_profit,
         cs.cs_sold_date_sk
-      from ${SOURCE}.catalog_sales cs;
+        where cs.cs_sold_date_sk is not null
+insert overwrite table catalog_sales partition (cs_sold_date_sk) 
+select
+        cs.cs_sold_time_sk,
+        cs.cs_ship_date_sk,
+        cs.cs_bill_customer_sk,
+        cs.cs_bill_cdemo_sk,
+        cs.cs_bill_hdemo_sk,
+        cs.cs_bill_addr_sk,
+        cs.cs_ship_customer_sk,
+        cs.cs_ship_cdemo_sk,
+        cs.cs_ship_hdemo_sk,
+        cs.cs_ship_addr_sk,
+        cs.cs_call_center_sk,
+        cs.cs_catalog_page_sk,
+        cs.cs_ship_mode_sk,
+        cs.cs_warehouse_sk,
+        cs.cs_item_sk,
+        cs.cs_promo_sk,
+        cs.cs_order_number,
+        cs.cs_quantity,
+        cs.cs_wholesale_cost,
+        cs.cs_list_price,
+        cs.cs_sales_price,
+        cs.cs_ext_discount_amt,
+        cs.cs_ext_sales_price,
+        cs.cs_ext_wholesale_cost,
+        cs.cs_ext_list_price,
+        cs.cs_ext_tax,
+        cs.cs_coupon_amt,
+        cs.cs_ext_ship_cost,
+        cs.cs_net_paid,
+        cs.cs_net_paid_inc_tax,
+        cs.cs_net_paid_inc_ship,
+        cs.cs_net_paid_inc_ship_tax,
+        cs.cs_net_profit,
+        cs.cs_sold_date_sk
+        where cs.cs_sold_date_sk is null
+        sort by cs.cs_sold_date_sk
+ ;

+ 26 - 1
ddl-tpcds/bin_partitioned/store_returns.sql

@@ -28,6 +28,7 @@ create table store_returns
 partitioned by (sr_returned_date_sk bigint)
 stored as ${FILE};
 
+from ${SOURCE}.store_returns sr
 insert overwrite table store_returns partition (sr_returned_date_sk) 
 select
         sr.sr_return_time_sk,
@@ -50,4 +51,28 @@ select
         sr.sr_store_credit,
         sr.sr_net_loss,
         sr.sr_returned_date_sk
-      from ${SOURCE}.store_returns sr;
+        where sr.sr_returned_date_sk is not null
+insert overwrite table store_returns partition (sr_returned_date_sk) 
+select
+        sr.sr_return_time_sk,
+        sr.sr_item_sk,
+        sr.sr_customer_sk,
+        sr.sr_cdemo_sk,
+        sr.sr_hdemo_sk,
+        sr.sr_addr_sk,
+        sr.sr_store_sk,
+        sr.sr_reason_sk,
+        sr.sr_ticket_number,
+        sr.sr_return_quantity,
+        sr.sr_return_amt,
+        sr.sr_return_tax,
+        sr.sr_return_amt_inc_tax,
+        sr.sr_fee,
+        sr.sr_return_ship_cost,
+        sr.sr_refunded_cash,
+        sr.sr_reversed_charge,
+        sr.sr_store_credit,
+        sr.sr_net_loss,
+        sr.sr_returned_date_sk
+        where sr.sr_returned_date_sk is null
+        sort by sr.sr_returned_date_sk

+ 30 - 1
ddl-tpcds/bin_partitioned/store_sales.sql

@@ -31,6 +31,7 @@ create table store_sales
 partitioned by (ss_sold_date_sk bigint)
 stored as ${FILE};
 
+from ${SOURCE}.store_sales ss
 insert overwrite table store_sales partition (ss_sold_date_sk) 
 select
         ss.ss_sold_time_sk,
@@ -56,4 +57,32 @@ select
         ss.ss_net_paid_inc_tax,
         ss.ss_net_profit,
         ss.ss_sold_date_sk
-      from ${SOURCE}.store_sales ss;
+        where ss.ss_sold_date_sk is not null
+insert overwrite table store_sales partition (ss_sold_date_sk) 
+select
+        ss.ss_sold_time_sk,
+        ss.ss_item_sk,
+        ss.ss_customer_sk,
+        ss.ss_cdemo_sk,
+        ss.ss_hdemo_sk,
+        ss.ss_addr_sk,
+        ss.ss_store_sk,
+        ss.ss_promo_sk,
+        ss.ss_ticket_number,
+        ss.ss_quantity,
+        ss.ss_wholesale_cost,
+        ss.ss_list_price,
+        ss.ss_sales_price,
+        ss.ss_ext_discount_amt,
+        ss.ss_ext_sales_price,
+        ss.ss_ext_wholesale_cost,
+        ss.ss_ext_list_price,
+        ss.ss_ext_tax,
+        ss.ss_coupon_amt,
+        ss.ss_net_paid,
+        ss.ss_net_paid_inc_tax,
+        ss.ss_net_profit,
+        ss.ss_sold_date_sk
+        where ss.ss_sold_date_sk is null
+        sort by ss.ss_sold_date_sk
+;

+ 31 - 1
ddl-tpcds/bin_partitioned/web_returns.sql

@@ -32,6 +32,7 @@ create table web_returns
 partitioned by (wr_returned_date_sk       bigint)
 stored as ${FILE};
 
+from ${SOURCE}.web_returns wr
 insert overwrite table web_returns partition (wr_returned_date_sk)
 select
         wr.wr_returned_time_sk,
@@ -58,4 +59,33 @@ select
         wr.wr_account_credit,
         wr.wr_net_loss,
 		wr.wr_returned_date_sk
-      from ${SOURCE}.web_returns wr;
+        where wr.wr_returned_date_sk is not null
+insert overwrite table web_returns partition (wr_returned_date_sk)
+select
+        wr.wr_returned_time_sk,
+        wr.wr_item_sk,
+        wr.wr_refunded_customer_sk,
+        wr.wr_refunded_cdemo_sk,
+        wr.wr_refunded_hdemo_sk,
+        wr.wr_refunded_addr_sk,
+        wr.wr_returning_customer_sk,
+        wr.wr_returning_cdemo_sk,
+        wr.wr_returning_hdemo_sk,
+        wr.wr_returning_addr_sk,
+        wr.wr_web_page_sk,
+        wr.wr_reason_sk,
+        wr.wr_order_number,
+        wr.wr_return_quantity,
+        wr.wr_return_amt,
+        wr.wr_return_tax,
+        wr.wr_return_amt_inc_tax,
+        wr.wr_fee,
+        wr.wr_return_ship_cost,
+        wr.wr_refunded_cash,
+        wr.wr_reversed_charge,
+        wr.wr_account_credit,
+        wr.wr_net_loss,
+		wr.wr_returned_date_sk
+        where wr.wr_returned_date_sk is null
+        sort by wr.wr_returned_date_sk 
+;

+ 41 - 1
ddl-tpcds/bin_partitioned/web_sales.sql

@@ -42,6 +42,7 @@ create table web_sales
 partitioned by (ws_sold_date_sk           bigint)
 stored as ${FILE};
 
+from ${SOURCE}.web_sales ws
 insert overwrite table web_sales partition (ws_sold_date_sk) 
 select
         ws.ws_sold_time_sk,
@@ -78,4 +79,43 @@ select
         ws.ws_net_paid_inc_ship_tax,
         ws.ws_net_profit,
         ws.ws_sold_date_sk
-      from ${SOURCE}.web_sales ws;
+        where ws.ws_sold_date_sk is not null
+insert overwrite table web_sales partition (ws_sold_date_sk) 
+select
+        ws.ws_sold_time_sk,
+        ws.ws_ship_date_sk,
+        ws.ws_item_sk,
+        ws.ws_bill_customer_sk,
+        ws.ws_bill_cdemo_sk,
+        ws.ws_bill_hdemo_sk,
+        ws.ws_bill_addr_sk,
+        ws.ws_ship_customer_sk,
+        ws.ws_ship_cdemo_sk,
+        ws.ws_ship_hdemo_sk,
+        ws.ws_ship_addr_sk,
+        ws.ws_web_page_sk,
+        ws.ws_web_site_sk,
+        ws.ws_ship_mode_sk,
+        ws.ws_warehouse_sk,
+        ws.ws_promo_sk,
+        ws.ws_order_number,
+        ws.ws_quantity,
+        ws.ws_wholesale_cost,
+        ws.ws_list_price,
+        ws.ws_sales_price,
+        ws.ws_ext_discount_amt,
+        ws.ws_ext_sales_price,
+        ws.ws_ext_wholesale_cost,
+        ws.ws_ext_list_price,
+        ws.ws_ext_tax,
+        ws.ws_coupon_amt,
+        ws.ws_ext_ship_cost,
+        ws.ws_net_paid,
+        ws.ws_net_paid_inc_tax,
+        ws.ws_net_paid_inc_ship,
+        ws.ws_net_paid_inc_ship_tax,
+        ws.ws_net_profit,
+        ws.ws_sold_date_sk
+        where ws.ws_sold_date_sk is null
+        sort by ws.ws_sold_date_sk
+;

+ 22 - 20
tpcds-setup.sh

@@ -75,39 +75,41 @@ runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB
 if [ "X$FORMAT" = "X" ]; then
 	FORMAT=orc
 fi
+
+LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
+SILENCE="2> /dev/null 1> /dev/null" 
+if [ "X$DEBUG_SCRIPT" != "X" ]; then
+	SILENCE=""
+fi
+
+echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
+
 i=1
 total=24
 DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
-for t in ${FACTS}
+
+# Populate the smaller tables.
+for t in ${DIMS}
 do
-	echo "Optimizing table $t ($i/$total)."
 	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
+	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
             -d SCALE=${SCALE} \
-	    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
-	    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE=${FORMAT}"
-	runcommand "$COMMAND"
-	if [ $? -ne 0 ]; then
-		echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running"
-		exit 1
-	fi
+	    -d FILE=${FORMAT}"
+	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
 	i=`expr $i + 1`
 done
 
-# Populate the smaller tables.
-for t in ${DIMS}
+for t in ${FACTS}
 do
-	echo "Optimizing table $t ($i/$total)."
 	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
+	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
             -d SCALE=${SCALE} \
-	    -d FILE=${FORMAT}"
-	runcommand "$COMMAND"
-	if [ $? -ne 0 ]; then
-		echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running"
-		exit 1
-	fi
+	    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
+	    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE=${FORMAT}"
+	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
 	i=`expr $i + 1`
 done
 
+make -j 2 -f $LOAD_FILE
+
 echo "Data loaded into database ${DATABASE}."