@@ -30,40 +30,43 @@ FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_ret
 # Get the parameters.
 SCALE=$1
 DIR=$2
+TEXT_ONLY=$3
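+# Optional third argument: pass "text" to stop after generating the raw text data.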
+
 if [ "X$BUCKET_DATA" != "X" ]; then
-  BUCKETS=13
-  RETURN_BUCKETS=13
+  BUCKETS=13
+  RETURN_BUCKETS=13
 else
-  BUCKETS=1
-  RETURN_BUCKETS=1
+  BUCKETS=1
+  RETURN_BUCKETS=1
 fi
 if [ "X$DEBUG_SCRIPT" != "X" ]; then
-  set -x
+  set -x
 fi
 
 # Sanity checking.
 if [ X"$SCALE" = "X" ]; then
-  usage
+  usage
 fi
 if [ X"$DIR" = "X" ]; then
-  DIR=/tmp/tpcds-generate
+  DIR=/tmp/tpcds-generate
 fi
 if [ $SCALE -eq 1 ]; then
-  echo "Scale factor must be greater than 1"
-  exit 1
+  echo "Scale factor must be greater than 1"
+  exit 1
 fi
 
 # Do the actual data load.
 hdfs dfs -mkdir -p ${DIR}
 hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
 if [ $? -ne 0 ]; then
-  echo "Generating data at scale factor $SCALE."
-  (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
+  echo "Generating data at scale factor $SCALE."
+  (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
 fi
 hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
 if [ $? -ne 0 ]; then
-  echo "Data generation failed, exiting."
-  exit 1
+  echo "Data generation failed, exiting."
+  exit 1
 fi
 echo "TPC-DS text data generation complete."
 
@@ -71,48 +74,55 @@ echo "TPC-DS text data generation complete."
 echo "Loading text data into external tables."
 runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
 
-# Create the partitioned and bucketed tables.
-if [ "X$FORMAT" = "X" ]; then
-  FORMAT=orc
-fi
-
-LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
-SILENCE="2> /dev/null 1> /dev/null"
-if [ "X$DEBUG_SCRIPT" != "X" ]; then
-  SILENCE=""
-fi
-
-echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
-
-i=1
-total=24
-DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
-MAX_REDUCERS=2500 # maximum number of useful reducers for any scale
-REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
-
-# Populate the smaller tables.
-for t in ${DIMS}
-do
-  COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
-    -d SCALE=${SCALE} \
-    -d REDUCERS=${REDUCERS} \
-    -d FILE=${FORMAT}"
-  echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
-  i=`expr $i + 1`
-done
-
-for t in ${FACTS}
-do
-  COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
-    -d SCALE=${SCALE} \
-    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
-    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d REDUCERS=${REDUCERS} -d FILE=${FORMAT}"
-  echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
-  i=`expr $i + 1`
-done
-
-make -j 2 -f $LOAD_FILE
-
-echo "Data loaded into database ${DATABASE}."
+if [ "$TEXT_ONLY" = "text" ];then
+  echo "Text data has been generated. Exiting."
+  exit 0
+else
+  # Create the partitioned and bucketed tables.
+  if [ "X$FORMAT" = "X" ]; then
+    FORMAT=orc
+  fi
+
+  LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
+  SILENCE="2> /dev/null 1> /dev/null"
+  if [ "X$DEBUG_SCRIPT" != "X" ]; then
+    SILENCE=""
+  fi
+
+  echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
+
+  i=1
+  total=24
+  DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
+  MAX_REDUCERS=2500 # maximum number of useful reducers for any scale
+  REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
+
+  # Populate the smaller tables.
+  for t in ${DIMS}
+  do
+    COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
+      -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
+      -d SCALE=${SCALE} \
+      -d REDUCERS=${REDUCERS} \
+      -d FILE=${FORMAT}"
+    echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
+    i=`expr $i + 1`
+  done
+
+  for t in ${FACTS}
+  do
+    COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
+      -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
+      -d SCALE=${SCALE} \
+      -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
+      -d RETURN_BUCKETS=${RETURN_BUCKETS} -d REDUCERS=${REDUCERS} -d FILE=${FORMAT}"
+    echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
+    i=`expr $i + 1`
+  done
+
+
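+  # Drive the per-table load targets through make, two tables at a time.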
+  make -j 2 -f $LOAD_FILE
+
+  echo "Data loaded into database ${DATABASE}."
+fi