Browse Source

Use beeline for the TPC-DS dataload

(cherry picked from commit 06ba6c5897760957b7cd99a36e0c09b0afc973c9)
Gopal V 7 years ago
parent
commit
ff001c4f67

+ 3 - 1
ddl-tpcds/bin_partitioned/customer.sql

@@ -5,4 +5,6 @@ drop table if exists customer;
 
 create table customer
 stored as ${FILE}
-as select * from ${SOURCE}.customer;
+as select * from ${SOURCE}.customer
+CLUSTER BY c_customer_sk
+;

+ 3 - 1
ddl-tpcds/bin_partitioned/customer_address.sql

@@ -5,4 +5,6 @@ drop table if exists customer_address;
 
 create table customer_address
 stored as ${FILE}
-as select * from ${SOURCE}.customer_address;
+as select * from ${SOURCE}.customer_address 
+CLUSTER BY ca_address_sk
+;

+ 3 - 1
ddl-tpcds/bin_partitioned/item.sql

@@ -5,4 +5,6 @@ drop table if exists item;
 
 create table item
 stored as ${FILE}
-as select * from ${SOURCE}.item;
+as select * from ${SOURCE}.item
+CLUSTER BY i_item_sk
+;

+ 3 - 1
ddl-tpcds/bin_partitioned/store.sql

@@ -5,4 +5,6 @@ drop table if exists store;
 
 create table store
 stored as ${FILE}
-as select * from ${SOURCE}.store;
+as select * from ${SOURCE}.store
+CLUSTER BY s_store_sk
+;

+ 19 - 13
tpcds-setup.sh

@@ -25,7 +25,8 @@ fi
 
 # Tables in the TPC-DS schema.
 DIMS="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page web_site"
-FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_returns inventory"
+FACTS=""
+#FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_returns inventory"
 
 # Get the parameters.
 SCALE=$1
@@ -65,11 +66,16 @@ if [ $? -ne 0 ]; then
 	echo "Data generation failed, exiting."
 	exit 1
 fi
+
+hadoop fs -chmod -R 777  /${DIR}/${SCALE}
+
 echo "TPC-DS text data generation complete."
 
+HIVE="beeline -n hive -u 'jdbc:hive2://localhost:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2?tez.queue.name=default' "
+
 # Create the text/flat tables as external tables. These will later be converted to ORCFile.
 echo "Loading text data into external tables."
-runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
+runcommand "$HIVE  -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql --hivevar DB=tpcds_text_${SCALE} --hivevar LOCATION=${DIR}/${SCALE}"
 
 # Create the partitioned and bucketed tables.
 if [ "X$FORMAT" = "X" ]; then
@@ -93,26 +99,26 @@ REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo $
 # Populate the smaller tables.
 for t in ${DIMS}
 do
-	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
-            -d SCALE=${SCALE} \
-	    -d REDUCERS=${REDUCERS} \
-	    -d FILE=${FORMAT}"
+	COMMAND="$HIVE  -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
+	    --hivevar DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} --hivevar SOURCE=tpcds_text_${SCALE} \
+            --hivevar SCALE=${SCALE} \
+	    --hivevar REDUCERS=${REDUCERS} \
+	    --hivevar FILE=${FORMAT}"
 	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
 	i=`expr $i + 1`
 done
 
 for t in ${FACTS}
 do
-	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
-            -d SCALE=${SCALE} \
-	    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
-	    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d REDUCERS=${REDUCERS} -d FILE=${FORMAT}"
+	COMMAND="$HIVE  -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
+	    --hivevar DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
+            --hivevar SCALE=${SCALE} \
+	    --hivevar SOURCE=tpcds_text_${SCALE} --hivevar BUCKETS=${BUCKETS} \
+	    --hivevar RETURN_BUCKETS=${RETURN_BUCKETS} --hivevar REDUCERS=${REDUCERS} --hivevar FILE=${FORMAT}"
 	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
 	i=`expr $i + 1`
 done
 
-make -j 2 -f $LOAD_FILE
+make -j 1 -f $LOAD_FILE
 
 echo "Data loaded into database ${DATABASE}."