1
0
View source

Putting clustering back in.

cartershanklin 11 years ago
parent
commit
efb79aa98d

+ 0 - 2
ddl/bin_partitioned/analyze.sql

@@ -1,4 +1,3 @@
-
 use ${DB};
 ADD JAR file://${mysql_jar};
 
@@ -13,7 +12,6 @@ ANALYZE TABLE store COMPUTE STATISTICS;
 ANALYZE TABLE promotion COMPUTE STATISTICS;
 ANALYZE TABLE web_site COMPUTE STATISTICS;
 
-
 ANALYZE TABLE inventory partition(inv_date) COMPUTE STATISTICS;
 ANALYZE TABLE store_sales partition(ss_sold_date) COMPUTE STATISTICS;
 ANALYZE TABLE store_returns partition(sr_returned_date) COMPUTE STATISTICS;

+ 2 - 7
ddl/bin_partitioned/catalog_returns.sql

@@ -1,9 +1,3 @@
-set hive.enforce.bucketing=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-set hive.exec.max.dynamic.partitions=4096;
-set hive.exec.max.dynamic.partitions.pernode=4096;
-set mapred.job.reduce.input.buffer.percent=0.0;
-
 create database if not exists ${DB};
 use ${DB};
 
@@ -40,6 +34,7 @@ create table catalog_returns
     cr_net_loss               float
 )
 partitioned by (cr_returned_date string)
+clustered by (cr_item_sk) sorted by (cr_item_sk) into ${RETURN_BUCKETS} buckets
 stored as ${FILE};
 
 insert overwrite table catalog_returns partition (cr_returned_date) 
@@ -73,5 +68,5 @@ select
         cr.cr_net_loss,
         dd.d_date as cr_returned_date
       from ${SOURCE}.catalog_returns cr
-      left outer join ${SOURCE}.date_dim dd
+      join ${SOURCE}.date_dim dd
       on (cr.cr_returned_date_sk = dd.d_date_sk);

+ 2 - 7
ddl/bin_partitioned/catalog_sales.sql

@@ -1,9 +1,3 @@
-set hive.enforce.bucketing=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-set hive.exec.max.dynamic.partitions=4096;
-set hive.exec.max.dynamic.partitions.pernode=4096;
-set mapred.job.reduce.input.buffer.percent=0.0;
-
 create database if not exists ${DB};
 use ${DB};
 
@@ -47,6 +41,7 @@ create table catalog_sales
     cs_net_profit             float
 )
 partitioned by (cs_sold_date string)
+clustered by (cs_item_sk) sorted by (cs_item_sk) into ${BUCKETS} buckets
 stored as ${FILE};
 
 insert overwrite table catalog_sales partition (cs_sold_date) 
@@ -87,5 +82,5 @@ select
         cs.cs_net_profit,
         dd.d_date as cs_sold_date
       from ${SOURCE}.catalog_sales cs
-      left outer join ${SOURCE}.date_dim dd
+      join ${SOURCE}.date_dim dd
       on (cs.cs_sold_date_sk = dd.d_date_sk);

+ 3 - 8
ddl/bin_partitioned/inventory.sql

@@ -1,9 +1,3 @@
-set hive.enforce.bucketing=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-set hive.exec.max.dynamic.partitions=4096;
-set hive.exec.max.dynamic.partitions.pernode=4096;
-set mapred.job.reduce.input.buffer.percent=0.0;
-
 create database if not exists ${DB};
 use ${DB};
 
@@ -17,15 +11,16 @@ create table inventory
     inv_quantity_on_hand	int
 )
 partitioned by (inv_date string)
+clustered by (inv_item_sk) sorted by (inv_item_sk) into ${BUCKETS} buckets
 stored as ${FILE};
 
 insert overwrite table inventory partition (inv_date)
-from (select
+select
 	i.inv_date_sk,
 	i.inv_item_sk,
 	i.inv_warehouse_sk,
 	i.inv_quantity_on_hand,
 	d.d_date as inv_date
   from ${SOURCE}.inventory i
-  left outer join ${SOURCE}.date_dim d
+  join ${SOURCE}.date_dim d
   on (d.d_date_sk = i.inv_date_sk);

+ 0 - 6
ddl/bin_partitioned/item.sql

@@ -1,9 +1,3 @@
-set hive.enforce.bucketing=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-set hive.exec.max.dynamic.partitions=4096;
-set hive.exec.max.dynamic.partitions.pernode=4096;
-set mapred.job.reduce.input.buffer.percent=0.0;
-
 create database if not exists ${DB};
 use ${DB};
 

+ 2 - 7
ddl/bin_partitioned/store_returns.sql

@@ -1,9 +1,3 @@
-set hive.enforce.bucketing=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-set hive.exec.max.dynamic.partitions=4096;
-set hive.exec.max.dynamic.partitions.pernode=4096;
-set mapred.job.reduce.input.buffer.percent=0.0;
-
 create database if not exists ${DB};
 use ${DB};
 
@@ -33,6 +27,7 @@ create table store_returns
     sr_net_loss               float
 )
 partitioned by (sr_returned_date string)
+clustered by (sr_item_sk) sorted by (sr_item_sk) into ${RETURN_BUCKETS} buckets
 stored as ${FILE};
 
 insert overwrite table store_returns partition (sr_returned_date) 
@@ -59,5 +54,5 @@ select
         sr.sr_net_loss,
         dd.d_date as sr_returned_date
       from ${SOURCE}.store_returns sr
-      left outer join ${SOURCE}.date_dim dd
+      join ${SOURCE}.date_dim dd
       on (sr.sr_returned_date_sk = dd.d_date_sk);

+ 2 - 7
ddl/bin_partitioned/store_sales.sql

@@ -1,9 +1,3 @@
-set hive.enforce.bucketing=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-set hive.exec.max.dynamic.partitions=4096;
-set hive.exec.max.dynamic.partitions.pernode=4096;
-set mapred.job.reduce.input.buffer.percent=0.0;
-
 create database if not exists ${DB};
 use ${DB};
 
@@ -36,6 +30,7 @@ create table store_sales
     ss_net_profit             float
 )
 partitioned by (ss_sold_date string)
+clustered by (ss_item_sk) sorted by (ss_item_sk) into ${BUCKETS} buckets
 stored as ${FILE};
 
 insert overwrite table store_sales partition (ss_sold_date) 
@@ -65,5 +60,5 @@ select
         ss.ss_net_profit,
         dd.d_date as ss_sold_date
       from ${SOURCE}.store_sales ss
-      left outer join ${SOURCE}.date_dim dd
+      join ${SOURCE}.date_dim dd
       on (ss.ss_sold_date_sk = dd.d_date_sk);

+ 2 - 7
ddl/bin_partitioned/web_returns.sql

@@ -1,9 +1,3 @@
-set hive.enforce.bucketing=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-set hive.exec.max.dynamic.partitions=4096;
-set hive.exec.max.dynamic.partitions.pernode=4096;
-set mapred.job.reduce.input.buffer.percent=0.0;
-
 create database if not exists ${DB};
 use ${DB};
 
@@ -37,6 +31,7 @@ create table web_returns
     wr_net_loss               float
 )
 partitioned by (wr_returned_date string)
+clustered by (wr_item_sk) sorted by (wr_item_sk) into ${RETURN_BUCKETS} buckets
 stored as ${FILE};
 
 insert overwrite table web_returns partition (wr_returned_date)
@@ -67,5 +62,5 @@ select
         wr.wr_net_loss,
         dd.d_date as wr_returned_date
       from ${SOURCE}.web_returns wr
-      left outer join ${SOURCE}.date_dim dd
+      join ${SOURCE}.date_dim dd
       on (wr.wr_returned_date_sk = dd.d_date_sk);

+ 2 - 7
ddl/bin_partitioned/web_sales.sql

@@ -1,9 +1,3 @@
-set hive.enforce.bucketing=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-set hive.exec.max.dynamic.partitions=4096;
-set hive.exec.max.dynamic.partitions.pernode=4096;
-set mapred.job.reduce.input.buffer.percent=0.0;
-
 create database if not exists ${DB};
 use ${DB};
 
@@ -47,6 +41,7 @@ create table web_sales
     ws_net_profit             float
 )
 partitioned by (ws_sold_date string)
+clustered by (ws_item_sk) sorted by (ws_item_sk) into ${BUCKETS} buckets
 stored as ${FILE};
 
 insert overwrite table web_sales partition (ws_sold_date) 
@@ -87,5 +82,5 @@ select
         ws.ws_net_profit,
         dd.d_date as ws_sold_date
       from ${SOURCE}.web_sales ws
-      left outer join ${SOURCE}.date_dim dd
+      join ${SOURCE}.date_dim dd
       on (ws.ws_sold_date_sk = dd.d_date_sk);