@@ -16,12 +16,12 @@ if [ $? -ne 0 ]; then
 fi
 
 # Tables in the TPC-DS schema.
-LIST="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page inventory store_sales store_returns web_sales web_returns web_site catalog_sales catalog_returns"
+LIST="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page web_site"
+FACTS="web_returns store_sales store_returns web_sales catalog_sales catalog_returns inventory"
 
 # Get the parameters.
 SCALE=$1
 DIR=$2
-MODE=$3
 
 # Ensure arguments exist.
 if [ X"$SCALE" = "X" ]; then
@@ -30,9 +30,6 @@ fi
 if [ X"$DIR" = "X" ]; then
   DIR=/tmp/tpcds-generate
 fi
-if [ X"$MODE" = "X" ]; then
-  MODE=partitioned
-fi
 
 # Sanity checking.
 if [ $SCALE -eq 1 ]; then
@@ -43,9 +40,6 @@ fi
 BUCKETS=13
 RETURN_BUCKETS=1
 SPLIT=16
-STORE_CLAUSES=( "orc" )
-FILE_FORMATS=( "orc" )
-SERDES=( "org.apache.hadoop.hive.ql.io.orc.OrcSerde" )
 
 set -x
 set -e
@@ -55,34 +49,33 @@ hadoop dfs -ls ${DIR}/${SCALE} || (cd tpcds-gen; hadoop jar target/*.jar -d ${DI
 hadoop dfs -ls ${DIR}/${SCALE}
 
 # Generate the text/flat tables. These will be later be converted to ORCFile.
-hive -i settings/load-flat.sql -f ddl/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}
+# hive -i settings/load-flat.sql -f ddl/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}
 
-# Generate the binary forms of the data.
-if [ $MODE = "partitioned" ]; then
-  i=0
-  for file in "${STORE_CLAUSES[@]}"
-  do
-    for t in ${LIST}
-    do
-      hive -i settings/load-partitioned.sql -f ddl/bin_partitioned/${t}.sql \
-        -d DB=tpcds_bin_partitioned_${FILE_FORMATS[$i]}_${SCALE} \
-        -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
-        -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE="${file}" \
-        -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
-    done
-    i=$((i+1))
-  done
-else
-  i=0
-  for file in "${STORE_CLAUSES[@]}"
-  do
-    for t in ${LIST}
-    do
-      hive -i settings/load-flat.sql -f ddl/bin_flat/${t}.sql \
-        -d DB=tpcds_bin_flat_${FILE_FORMATS[$i]}_${SCALE} \
-        -d SOURCE=tpcds_text_${SCALE} -d FILE="${file}" \
-        -d SERDE=${SERDES[$i]}
-    done
-    i=$((i+1))
-  done
-fi
+# Populate the smaller tables.
+#for t in ${LIST}
+#do
+#  hive -i settings/load-partitioned.sql -f ddl/bin_partitioned/${t}.sql \
+#    -d DB=tpcds_bin_partitioned_orc_${SCALE} \
+#    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
+#    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE="${file}" \
+#    -d SERDE=org.apache.hadoop.hive.ql.io.orc.OrcSerde -d SPLIT=${SPLIT}
+#done
+
+# Create the partitioned tables.
+for t in ${FACTS}
+do
+  hive -i settings/load-partitioned.sql -f ddl/bin_partitioned/${t}.sql \
+    -d DB=tpcds_bin_partitioned_orc_${SCALE} \
+    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
+    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE=orc \
+    -d SERDE=org.apache.hadoop.hive.ql.io.orc.OrcSerde -d SPLIT=${SPLIT}
+done
+
+# Populate the partitioned tables.
+for t in ${FACTS}
+do
+  hadoop jar tpcds-parts-1.0-SNAPSHOT.jar -t ${t} \
+    -i ${DIR}/${t}/ \
+    -o /apps/hive/warehouse/tpcds_bin_partitioned_orc_${SCALE}.db/${t}
+  hive -e "use tpcds_bin_partitioned_orc_${SCALE}; msck repair table ${t}"
+done