
Option to Generate only text data

Vimal Sharma, 7 years ago
commit 1b125549df
1 changed file with 91 additions and 84 deletions

tpcds-setup.sh (+91 −84)
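
With this change, tpcds-setup.sh takes an optional third positional argument, and passing the literal string "text" selects the new early-exit branch. A minimal invocation sketch (the scale factor and HDFS path below are placeholders, not taken from the commit):

    # Full run: generate the flat text data, then build the partitioned tables.
    ./tpcds-setup.sh 100 /tmp/tpcds-generate

    # New text-only mode: $3 matches "text", so the script exits early.
    ./tpcds-setup.sh 100 /tmp/tpcds-generate text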

@@ -30,89 +30,96 @@ FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_ret
 # Get the parameters.
 SCALE=$1
 DIR=$2
-if [ "X$BUCKET_DATA" != "X" ]; then
-	BUCKETS=13
-	RETURN_BUCKETS=13
-else
-	BUCKETS=1
-	RETURN_BUCKETS=1
-fi
-if [ "X$DEBUG_SCRIPT" != "X" ]; then
-	set -x
-fi
-
-# Sanity checking.
-if [ X"$SCALE" = "X" ]; then
-	usage
-fi
-if [ X"$DIR" = "X" ]; then
-	DIR=/tmp/tpcds-generate
-fi
-if [ $SCALE -eq 1 ]; then
-	echo "Scale factor must be greater than 1"
-	exit 1
-fi
+TEXT_ONLY=$3
 
-# Do the actual data load.
-hdfs dfs -mkdir -p ${DIR}
-hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
-if [ $? -ne 0 ]; then
-	echo "Generating data at scale factor $SCALE."
-	(cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
-fi
-hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
-if [ $? -ne 0 ]; then
-	echo "Data generation failed, exiting."
-	exit 1
-fi
-echo "TPC-DS text data generation complete."
-
-# Create the text/flat tables as external tables. These will later be converted to ORCFile.
-echo "Loading text data into external tables."
-runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
-
-# Create the partitioned and bucketed tables.
-if [ "X$FORMAT" = "X" ]; then
-	FORMAT=orc
-fi
-
-LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
-SILENCE="2> /dev/null 1> /dev/null" 
-if [ "X$DEBUG_SCRIPT" != "X" ]; then
-	SILENCE=""
+if [ "$TEXT_ONLY" = "text" ];then
+    echo "Text data has been generated. Exiting"
+    exit 0
+else
+    if [ "X$BUCKET_DATA" != "X" ]; then
+        BUCKETS=13
+        RETURN_BUCKETS=13
+    else
+        BUCKETS=1
+        RETURN_BUCKETS=1
+    fi
+    if [ "X$DEBUG_SCRIPT" != "X" ]; then
+        set -x
+    fi
+
+    # Sanity checking.
+    if [ X"$SCALE" = "X" ]; then
+        usage
+    fi
+    if [ X"$DIR" = "X" ]; then
+        DIR=/tmp/tpcds-generate
+    fi
+    if [ $SCALE -eq 1 ]; then
+        echo "Scale factor must be greater than 1"
+        exit 1
+    fi
+
+    # Do the actual data load.
+    hdfs dfs -mkdir -p ${DIR}
+    hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
+    if [ $? -ne 0 ]; then
+        echo "Generating data at scale factor $SCALE."
+        (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
+    fi
+    hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
+    if [ $? -ne 0 ]; then
+        echo "Data generation failed, exiting."
+        exit 1
+    fi
+    echo "TPC-DS text data generation complete."
+
+    # Create the text/flat tables as external tables. These will later be converted to ORCFile.
+    echo "Loading text data into external tables."
+    runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
+
+    # Create the partitioned and bucketed tables.
+    if [ "X$FORMAT" = "X" ]; then
+        FORMAT=orc
+    fi
+
+    LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
+    SILENCE="2> /dev/null 1> /dev/null"
+    if [ "X$DEBUG_SCRIPT" != "X" ]; then
+        SILENCE=""
+    fi
+
+    echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
+
+    i=1
+    total=24
+    DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
+    MAX_REDUCERS=2500 # maximum number of useful reducers for any scale
+    REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
+
+    # Populate the smaller tables.
+    for t in ${DIMS}
+    do
+        COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
+            -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
+                -d SCALE=${SCALE} \
+            -d REDUCERS=${REDUCERS} \
+            -d FILE=${FORMAT}"
+        echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
+        i=`expr $i + 1`
+    done
+
+    for t in ${FACTS}
+    do
+        COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
+            -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
+                -d SCALE=${SCALE} \
+            -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
+            -d RETURN_BUCKETS=${RETURN_BUCKETS} -d REDUCERS=${REDUCERS} -d FILE=${FORMAT}"
+        echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
+        i=`expr $i + 1`
+    done
+
+    make -j 2 -f $LOAD_FILE
+
+    echo "Data loaded into database ${DATABASE}."
 fi
-
-echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
-
-i=1
-total=24
-DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
-MAX_REDUCERS=2500 # maximum number of useful reducers for any scale 
-REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
-
-# Populate the smaller tables.
-for t in ${DIMS}
-do
-	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
-            -d SCALE=${SCALE} \
-	    -d REDUCERS=${REDUCERS} \
-	    -d FILE=${FORMAT}"
-	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
-	i=`expr $i + 1`
-done
-
-for t in ${FACTS}
-do
-	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
-            -d SCALE=${SCALE} \
-	    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
-	    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d REDUCERS=${REDUCERS} -d FILE=${FORMAT}"
-	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
-	i=`expr $i + 1`
-done
-
-make -j 2 -f $LOAD_FILE
-
-echo "Data loaded into database ${DATABASE}."