преди 12 години · 640bd0c5e6
--- a/build.sh
+++ b/build.sh
@@ -1,7 +1,7 @@
 
				 #!/bin/sh
			
 
				 
			
 
				 # Check for all the stuff I need to function.
			
 
				-for f in gcc mvn; do
			
 
				+for f in gcc; do
			
 
				 	which $f > /dev/null 2>&1
			
 
				 	if [ $? -ne 0 ]; then
			
 
				 		echo "Required program $f is missing. Please install it and try again."
			
@@ -9,5 +9,16 @@ for f in gcc mvn; do
 
				 	fi
			
 
				 done
			
 
				 
			
 
				+# Make sure Maven is available: when mvn is not on PATH, download Maven 3.0.5, unpack it in the current directory and add its bin directory to PATH.
			
 
				+which mvn > /dev/null 2>&1
			
 
				+if [ $? -ne 0 ]; then
			
 
				+	echo "Maven not found, installing it."
			
 
				+	wget -c http://www.us.apache.org/dist/maven/maven-3/3.0.5/binaries/apache-maven-3.0.5-bin.tar.gz || { echo "Failed to download Maven." >&2; exit 1; }  # abort now instead of carrying on without the archive
			
 
				+	tar -zxf apache-maven-3.0.5-bin.tar.gz || { echo "Failed to unpack Maven." >&2; exit 1; }  # a truncated or missing archive must not go unnoticed
			
 
				+	CWD=$(pwd)
			
 
				+	export MAVEN_HOME="$CWD/apache-maven-3.0.5"
			
 
				+	export PATH="$PATH:$MAVEN_HOME/bin"  # quoted: some /bin/sh implementations word-split unquoted export arguments, which breaks on directories containing spaces
			
 
				+fi
			
 
				+
			
 
				 echo "Building Data Generator"
			
 
				 (cd tpcds-gen; make)
			
--- a/tpcds-setup-sandbox.sh
+++ b/tpcds-setup-sandbox.sh
@@ -1,48 +1,3 @@
 
				 #!/bin/bash
			
 
				 
			
 
				-if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
			
 
				-	echo "Build the data generator with build.sh first"
			
 
				-	exit 1
			
 
				-fi
			
 
				-which hive > /dev/null 2>&1
			
 
				-if [ $? -ne 0 ]; then
			
 
				-	echo "Script must be run where Hive is installed"
			
 
				-	exit 1
			
 
				-fi
			
 
				-
			
 
				-set -x
			
 
				-set -e
			
 
				-
			
 
				-# Tables in the TPC-DS schema.
			
 
				-LIST="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page inventory store_sales store_returns web_sales web_returns web_site catalog_sales catalog_returns"
			
 
				-
			
 
				-SCALE=$1
			
 
				-DIR=$2
			
 
				-BUCKETS=13
			
 
				-RETURN_BUCKETS=1
			
 
				-SPLIT=16
			
 
				-STORE_CLAUSES=( "orc" )
			
 
				-FILE_FORMATS=( "orc" )
			
 
				-SERDES=( "org.apache.hadoop.hive.ql.io.orc.OrcSerde" )
			
 
				-
			
 
				-hadoop dfs -ls ${DIR}/${SCALE} || (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
			
 
				-hadoop dfs -ls ${DIR}/${SCALE}
			
 
				-
			
 
				-# Generate the text/flat tables. These will be later be converted to ORCFile.
			
 
				-if true; then
			
 
				-for t in ${LIST}
			
 
				-do
			
 
				-    hive -i settings/load.sql -f ddl/text/${t}.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}/${t}
			
 
				-done
			
 
				-fi
			
 
				-
			
 
				-# Generate a flat (unpartitioned) schema in ORCFile format.
			
 
				-i=0
			
 
				-for file in "${STORE_CLAUSES[@]}"
			
 
				-do
			
 
				-    for t in ${LIST}
			
 
				-    do
			
 
				-        hive -i settings/load.sql -f ddl/bin_flat/${t}.sql -d DB=tpcds_bin_flat_${FILE_FORMATS[$i]}_${SCALE} -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} -d FILE="${file}" -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
			
 
				-    done
			
 
				-    i=$((i+1))
			
 
				-done
			
 
				+./tpcds-setup.sh "$1" "$2" unpartitioned
			
--- a/tpcds-setup.sh
+++ b/tpcds-setup.sh
@@ -1,5 +1,10 @@
 
				 #!/bin/bash
			
 
				 
			
 
				+function usage {  # Print the command-line synopsis, then terminate the script with status 1.
			
 
				+	echo "Usage: tpcds-setup.sh scale directory [partitioned|unpartitioned]"  # the third argument is optional; the script falls back to "partitioned" when it is omitted
			
 
				+	exit 1
			
 
				+}
			
 
				+
			
 
				 if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
			
 
				 	echo "Build the data generator with build.sh first"
			
 
				 	exit 1
			
@@ -10,14 +15,31 @@ if [ $? -ne 0 ]; then
 
				 	exit 1
			
 
				 fi
			
 
				 
			
 
				-set -x
			
 
				-set -e
			
 
				-
			
 
				 # Tables in the TPC-DS schema.
			
 
				 LIST="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page inventory store_sales store_returns web_sales web_returns web_site catalog_sales catalog_returns"
			
 
				 
			
 
				+# Get the parameters.
			
 
				 SCALE=$1
			
 
				 DIR=$2
			
 
				+MODE=$3
			
 
				+
			
 
				+# Ensure arguments exist.
			
 
				+if [ X"$SCALE" = "X" ]; then
			
 
				+	usage
			
 
				+fi
			
 
				+if [ X"$DIR" = "X" ]; then
			
 
				+	usage
			
 
				+fi
			
 
				+if [ X"$MODE" = "X" ]; then
			
 
				+	MODE=partitioned
			
 
				+fi
			
 
				+
			
 
				+# Sanity checking: refuse any scale that is not an integer greater than 1.
			
 
				+if ! [ "$SCALE" -gt 1 ] 2>/dev/null; then  # 0, negatives and 1 fail the comparison; a non-numeric value makes the test itself error out, which is rejected the same way
			
 
				+	echo "Scale factor must be greater than 1"
			
 
				+	exit 1
			
 
				+fi
			
 
				+
			
 
				 BUCKETS=13
			
 
				 RETURN_BUCKETS=1
			
 
				 SPLIT=16
			
@@ -25,25 +47,44 @@ STORE_CLAUSES=( "orc" )
 
				 FILE_FORMATS=( "orc" )
			
 
				 SERDES=( "org.apache.hadoop.hive.ql.io.orc.OrcSerde" )
			
 
				 
			
 
				+set -x
			
 
				+set -e
			
 
				+
			
 
				 hadoop dfs -ls ${DIR}/${SCALE} || (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
			
 
				 hadoop dfs -ls ${DIR}/${SCALE}
			
 
				 
			
 
				 # Generate the text/flat tables. These will be later be converted to ORCFile.
			
 
				-if true; then
			
 
				 for t in ${LIST}
			
 
				 do
			
 
				-    hive -i settings/load.sql -f ddl/text/${t}.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}/${t}
			
 
				+	hive -i settings/load.sql -f ddl/text/${t}.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}/${t}
			
 
				 done
			
 
				-fi
			
 
				-
			
 
				-# Generate the partitioned schema in ORCFile format.
			
 
				-i=0
			
 
				-for file in "${STORE_CLAUSES[@]}"
			
 
				-do
			
 
				-    for t in ${LIST}
			
 
				-    do
			
 
				-	hive -i settings/load.sql -f ddl/bin_partitioned/${t}.sql -d DB=tpcds_bin_partitioned_${FILE_FORMATS[$i]}_${SCALE} -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE="${file}" -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
			
 
				-    done
			
 
				 
			
 
				-    i=$((i+1))
			
 
				-done
			
 
				+# Generate the binary forms of the data: the partitioned DDL when MODE is "partitioned", the flat DDL for any other MODE value.
			
 
				+if [ "$MODE" = "partitioned" ]; then  # quoted so a MODE containing whitespace or glob characters cannot break the test
			
 
				+	i=0  # index into FILE_FORMATS / SERDES, advanced in step with STORE_CLAUSES
			
 
				+	for file in "${STORE_CLAUSES[@]}"
			
 
				+	do
			
 
				+		for t in ${LIST}  # intentionally unquoted: LIST is a space-separated string of table names
			
 
				+		do
			
 
				+			hive -i settings/load.sql -f ddl/bin_partitioned/${t}.sql \
			
 
				+			    -d DB=tpcds_bin_partitioned_${FILE_FORMATS[$i]}_${SCALE} \
			
 
				+			    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
			
 
				+			    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE="${file}" \
			
 
				+			    -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
			
 
				+		done
			
 
				+		i=$((i+1))
			
 
				+	done
			
 
				+else  # flat (unpartitioned) schema: same loop, but bin_flat DDL and no RETURN_BUCKETS definition
			
 
				+	i=0  # index into FILE_FORMATS / SERDES, advanced in step with STORE_CLAUSES
			
 
				+	for file in "${STORE_CLAUSES[@]}"
			
 
				+	do
			
 
				+		for t in ${LIST}  # intentionally unquoted: LIST is a space-separated string of table names
			
 
				+		do
			
 
				+			hive -i settings/load.sql -f ddl/bin_flat/${t}.sql \
			
 
				+			    -d DB=tpcds_bin_flat_${FILE_FORMATS[$i]}_${SCALE} \
			
 
				+			    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
			
 
				+			    -d FILE="${file}" -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
			
 
				+		done
			
 
				+		i=$((i+1))
			
 
				+	done
			
 
				+fi