Bladeren bron

Option to generate only text data

Vimal Sharma 7 jaren geleden
bovenliggende
commit
6d6167d905
1 gewijzigd bestand met 65 toevoegingen en 58 verwijderingen
  1. 65 58
      tpcds-setup.sh

+ 65 - 58
tpcds-setup.sh

@@ -30,40 +30,42 @@ FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_ret
 # Get the parameters.
 SCALE=$1
 DIR=$2
+TEXT_ONLY=$3
+
 if [ "X$BUCKET_DATA" != "X" ]; then
-	BUCKETS=13
-	RETURN_BUCKETS=13
+    BUCKETS=13
+    RETURN_BUCKETS=13
 else
-	BUCKETS=1
-	RETURN_BUCKETS=1
+    BUCKETS=1
+    RETURN_BUCKETS=1
 fi
 if [ "X$DEBUG_SCRIPT" != "X" ]; then
-	set -x
+    set -x
 fi
 
 # Sanity checking.
 if [ X"$SCALE" = "X" ]; then
-	usage
+    usage
 fi
 if [ X"$DIR" = "X" ]; then
-	DIR=/tmp/tpcds-generate
+    DIR=/tmp/tpcds-generate
 fi
 if [ $SCALE -eq 1 ]; then
-	echo "Scale factor must be greater than 1"
-	exit 1
+    echo "Scale factor must be greater than 1"
+    exit 1
 fi
 
 # Do the actual data load.
 hdfs dfs -mkdir -p ${DIR}
 hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
 if [ $? -ne 0 ]; then
-	echo "Generating data at scale factor $SCALE."
-	(cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
+    echo "Generating data at scale factor $SCALE."
+    (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
 fi
 hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
 if [ $? -ne 0 ]; then
-	echo "Data generation failed, exiting."
-	exit 1
+    echo "Data generation failed, exiting."
+    exit 1
 fi
 echo "TPC-DS text data generation complete."
 
@@ -71,48 +73,53 @@ echo "TPC-DS text data generation complete."
 echo "Loading text data into external tables."
 runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
 
-# Create the partitioned and bucketed tables.
-if [ "X$FORMAT" = "X" ]; then
-	FORMAT=orc
-fi
-
-LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
-SILENCE="2> /dev/null 1> /dev/null" 
-if [ "X$DEBUG_SCRIPT" != "X" ]; then
-	SILENCE=""
-fi
-
-echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
-
-i=1
-total=24
-DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
-MAX_REDUCERS=2500 # maximum number of useful reducers for any scale 
-REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
-
-# Populate the smaller tables.
-for t in ${DIMS}
-do
-	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
-            -d SCALE=${SCALE} \
-	    -d REDUCERS=${REDUCERS} \
-	    -d FILE=${FORMAT}"
-	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
-	i=`expr $i + 1`
-done
-
-for t in ${FACTS}
-do
-	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
-	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
-            -d SCALE=${SCALE} \
-	    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
-	    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d REDUCERS=${REDUCERS} -d FILE=${FORMAT}"
-	echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
-	i=`expr $i + 1`
-done
-
-make -j 2 -f $LOAD_FILE
-
-echo "Data loaded into database ${DATABASE}."
+if [ "$TEXT_ONLY" = "text" ];then
+    echo "Text data has been generated. Exiting"
+    exit 0
+else
+    # Create the partitioned and bucketed tables.
+    if [ "X$FORMAT" = "X" ]; then
+        FORMAT=orc
+    fi
+
+    LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
+    SILENCE="2> /dev/null 1> /dev/null"
+    if [ "X$DEBUG_SCRIPT" != "X" ]; then
+        SILENCE=""
+    fi
+
+    echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
+
+    i=1
+    total=24
+    DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
+    MAX_REDUCERS=2500 # maximum number of useful reducers for any scale
+    REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
+
+    # Populate the smaller tables.
+    for t in ${DIMS}
+    do
+        COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
+            -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
+                -d SCALE=${SCALE} \
+            -d REDUCERS=${REDUCERS} \
+            -d FILE=${FORMAT}"
+        echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
+        i=`expr $i + 1`
+    done
+
+    for t in ${FACTS}
+    do
+        COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
+            -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
+                -d SCALE=${SCALE} \
+            -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
+            -d RETURN_BUCKETS=${RETURN_BUCKETS} -d REDUCERS=${REDUCERS} -d FILE=${FORMAT}"
+        echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
+        i=`expr $i + 1`
+    done
+
+    make -j 2 -f $LOAD_FILE
+
+    echo "Data loaded into database ${DATABASE}."
+fi