Преглед на файлове

Consolidate setup scripts.
Automatically suck in Maven if it's missing.

cartershanklin преди 12 години
родител
ревизия
640bd0c5e6
променени са 3 файла, в които са добавени 71 реда и са изтрити 64 реда
  1. 12 1
      build.sh
  2. 1 46
      tpcds-setup-sandbox.sh
  3. 58 17
      tpcds-setup.sh

+ 12 - 1
build.sh

@@ -1,7 +1,7 @@
 #!/bin/sh
 
 # Check for all the stuff I need to function.
-for f in gcc mvn; do
+for f in gcc; do
 	which $f > /dev/null 2>&1
 	if [ $? -ne 0 ]; then
 		echo "Required program $f is missing. Please install it and try again."
@@ -9,5 +9,16 @@ for f in gcc mvn; do
 	fi
 done
 
+# Check if Maven is installed and install it if not.
+which mvn > /dev/null 2>&1
+if [ $? -ne 0 ]; then
+	echo "Maven not found, installing it."
+	wget -c http://www.us.apache.org/dist/maven/maven-3/3.0.5/binaries/apache-maven-3.0.5-bin.tar.gz
+	tar -zxf apache-maven-3.0.5-bin.tar.gz
+	CWD=$(pwd)
+	export MAVEN_HOME="$CWD/apache-maven-3.0.5"
+	export PATH=$PATH:$MAVEN_HOME/bin
+fi
+
 echo "Building Data Generator"
 (cd tpcds-gen; make)

+ 1 - 46
tpcds-setup-sandbox.sh

@@ -1,48 +1,3 @@
 #!/bin/bash
 
-if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
-	echo "Build the data generator with build.sh first"
-	exit 1
-fi
-which hive > /dev/null 2>&1
-if [ $? -ne 0 ]; then
-	echo "Script must be run where Hive is installed"
-	exit 1
-fi
-
-set -x
-set -e
-
-# Tables in the TPC-DS schema.
-LIST="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page inventory store_sales store_returns web_sales web_returns web_site catalog_sales catalog_returns"
-
-SCALE=$1
-DIR=$2
-BUCKETS=13
-RETURN_BUCKETS=1
-SPLIT=16
-STORE_CLAUSES=( "orc" )
-FILE_FORMATS=( "orc" )
-SERDES=( "org.apache.hadoop.hive.ql.io.orc.OrcSerde" )
-
-hadoop dfs -ls ${DIR}/${SCALE} || (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
-hadoop dfs -ls ${DIR}/${SCALE}
-
-# Generate the text/flat tables. These will later be converted to ORCFile.
-if true; then
-for t in ${LIST}
-do
-    hive -i settings/load.sql -f ddl/text/${t}.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}/${t}
-done
-fi
-
-# Generate a flat (unpartitioned) schema in ORCFile format.
-i=0
-for file in "${STORE_CLAUSES[@]}"
-do
-    for t in ${LIST}
-    do
-        hive -i settings/load.sql -f ddl/bin_flat/${t}.sql -d DB=tpcds_bin_flat_${FILE_FORMATS[$i]}_${SCALE} -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} -d FILE="${file}" -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
-    done
-    i=$((i+1))
-done
+./tpcds-setup.sh "$1" "$2" unpartitioned

+ 58 - 17
tpcds-setup.sh

@@ -1,5 +1,10 @@
 #!/bin/bash
 
+function usage {
+	echo "Usage: tpcds-setup.sh scale directory"
+	exit 1
+}
+
 if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
 	echo "Build the data generator with build.sh first"
 	exit 1
@@ -10,14 +15,31 @@ if [ $? -ne 0 ]; then
 	exit 1
 fi
 
-set -x
-set -e
-
 # Tables in the TPC-DS schema.
 LIST="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page inventory store_sales store_returns web_sales web_returns web_site catalog_sales catalog_returns"
 
+# Get the parameters.
 SCALE=$1
 DIR=$2
+MODE=$3
+
+# Ensure arguments exist.
+if [ X"$SCALE" = "X" ]; then
+	usage
+fi
+if [ X"$DIR" = "X" ]; then
+	usage
+fi
+if [ X"$MODE" = "X" ]; then
+	MODE=partitioned
+fi
+
+# Sanity checking.
+if [ $SCALE -eq 1 ]; then
+	echo "Scale factor must be greater than 1"
+	exit 1
+fi
+
 BUCKETS=13
 RETURN_BUCKETS=1
 SPLIT=16
@@ -25,25 +47,44 @@ STORE_CLAUSES=( "orc" )
 FILE_FORMATS=( "orc" )
 SERDES=( "org.apache.hadoop.hive.ql.io.orc.OrcSerde" )
 
+set -x
+set -e
+
 hadoop dfs -ls ${DIR}/${SCALE} || (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
 hadoop dfs -ls ${DIR}/${SCALE}
 
 # Generate the text/flat tables. These will later be converted to ORCFile.
-if true; then
 for t in ${LIST}
 do
-    hive -i settings/load.sql -f ddl/text/${t}.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}/${t}
+	hive -i settings/load.sql -f ddl/text/${t}.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}/${t}
 done
-fi
-
-# Generate the partitioned schema in ORCFile format.
-i=0
-for file in "${STORE_CLAUSES[@]}"
-do
-    for t in ${LIST}
-    do
-	hive -i settings/load.sql -f ddl/bin_partitioned/${t}.sql -d DB=tpcds_bin_partitioned_${FILE_FORMATS[$i]}_${SCALE} -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE="${file}" -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
-    done
 
-    i=$((i+1))
-done
+# Generate the binary forms of the data.
+if [ $MODE = "partitioned" ]; then
+	i=0
+	for file in "${STORE_CLAUSES[@]}"
+	do
+		for t in ${LIST}
+		do
+			hive -i settings/load.sql -f ddl/bin_partitioned/${t}.sql \
+			    -d DB=tpcds_bin_partitioned_${FILE_FORMATS[$i]}_${SCALE} \
+			    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
+			    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE="${file}" \
+			    -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
+		done
+		i=$((i+1))
+	done
+else
+	i=0
+	for file in "${STORE_CLAUSES[@]}"
+	do
+		for t in ${LIST}
+		do
+			hive -i settings/load.sql -f ddl/bin_flat/${t}.sql \
+			    -d DB=tpcds_bin_flat_${FILE_FORMATS[$i]}_${SCALE} \
+			    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
+			    -d FILE="${file}" -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
+		done
+	i=$((i+1))
+	done
+fi