#!/bin/bash

function usage {
	echo "Usage: tpcds-setup.sh scale_factor [temp_directory]"
	exit 1
}

# Run a command, suppressing its stderr unless DEBUG_SCRIPT is set in the
# environment.
function runcommand {
	if [ "X$DEBUG_SCRIPT" != "X" ]; then
		$1
	else
		$1 2>/dev/null
	fi
}
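
# Example (illustrative): export DEBUG_SCRIPT to any non-empty value to see
# the full output of every runcommand invocation, plus a shell trace:
#   DEBUG_SCRIPT=1 ./tpcds-setup.sh 100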

# The data generator jar must exist before we can do anything.
if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
	echo "Please build the data generator with ./tpcds-build.sh first"
	exit 1
fi

# The Hive CLI must be on the PATH.
if ! command -v hive > /dev/null 2>&1; then
	echo "Script must be run where Hive is installed"
	exit 1
fi

# Tables in the TPC-DS schema: 17 dimension tables and 7 fact tables.
DIMS="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page web_site"
FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_returns inventory"
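
# Note: each table name above must have a matching DDL file under
# ddl-tpcds/bin_partitioned/, since the load loops below build the hive -f
# path from these names.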

# Get the parameters.
SCALE=$1
DIR=$2

# The fact tables are bucketed only when BUCKET_DATA is set in the
# environment; otherwise everything goes into a single bucket.
if [ "X$BUCKET_DATA" != "X" ]; then
	BUCKETS=13
	RETURN_BUCKETS=13
else
	BUCKETS=1
	RETURN_BUCKETS=1
fi

# Trace every command when debugging.
if [ "X$DEBUG_SCRIPT" != "X" ]; then
	set -x
fi

# Sanity checking.
if [ X"$SCALE" = "X" ]; then
	usage
fi
if [ X"$DIR" = "X" ]; then
	DIR=/tmp/tpcds-generate
fi
# Scale factors of 1 and below are not supported.
if [ "$SCALE" -lt 2 ]; then
	echo "Scale factor must be greater than 1"
	exit 1
fi
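
# Example invocations (paths illustrative; the TPC-DS scale factor roughly
# equals the raw data size in GB):
#   ./tpcds-setup.sh 100                      # ~100 GB, staged in /tmp/tpcds-generate
#   ./tpcds-setup.sh 1000 /user/me/tpcds-tmp  # ~1 TB, staged in a custom HDFS dir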

# Do the actual data generation, skipping it if data for this scale factor
# already exists in HDFS.
hdfs dfs -mkdir -p ${DIR}
hdfs dfs -ls ${DIR}/${SCALE} > /dev/null 2>&1
if [ $? -ne 0 ]; then
	echo "Generating data at scale factor $SCALE."
	(cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
fi
hdfs dfs -ls ${DIR}/${SCALE} > /dev/null 2>&1
if [ $? -ne 0 ]; then
	echo "Data generation failed, exiting."
	exit 1
fi
echo "TPC-DS text data generation complete."

# Create the text/flat tables as external tables. These will later be
# converted to the target file format (ORC by default).
echo "Loading text data into external tables."
runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"

# Create the partitioned and bucketed tables.
if [ "X$FORMAT" = "X" ]; then
	FORMAT=orc
fi

LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
SILENCE="2> /dev/null 1> /dev/null"
if [ "X$DEBUG_SCRIPT" != "X" ]; then
	SILENCE=""
fi
- echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
- i=1
- total=24
- DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
- MAX_REDUCERS=2500 # maximum number of useful reducers for any scale
- REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
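
# For example, SCALE=100 gives REDUCERS=100, while SCALE=10000 is capped at
# REDUCERS=2500.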

# Populate the smaller dimension tables.
for t in ${DIMS}
do
	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
	    -d SCALE=${SCALE} \
	    -d REDUCERS=${REDUCERS} \
	    -d FILE=${FORMAT}"
	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
	i=$((i + 1))
done

# Populate the larger fact tables, which are also partitioned and bucketed.
for t in ${FACTS}
do
	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
	    -d SCALE=${SCALE} \
	    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
	    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d REDUCERS=${REDUCERS} -d FILE=${FORMAT}"
	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
	i=$((i + 1))
done
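
# Each generated rule looks roughly like this (illustrative, with flags
# elided; store_sales is table 18 of 24 since the 17 dimension tables come
# first):
#
#   store_sales:
#   	@hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/store_sales.sql ... 2> /dev/null 1> /dev/null && echo 'Optimizing table store_sales (18/24).'
#
# The leading @ keeps make from echoing the command itself.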

# Run the per-table loads, two at a time.
make -j 2 -f $LOAD_FILE

echo "Data loaded into database ${DATABASE}."