瀏覽代碼

Add Scale into the total # of reducers

Gopal V 10 年之前
父節點
當前提交
4d79cb267f
共有 4 個文件被更改,包括 26 次插入和 10 次刪除
  1. 5 3
      settings/load-flat.sql
  2. 4 3
      settings/load-partitioned.sql
  3. 2 0
      tpcds-setup.sh
  4. 15 4
      tpch-setup.sh

+ 5 - 3
settings/load-flat.sql

@@ -1,5 +1,5 @@
-set hive.enforce.bucketing=true;
-set hive.enforce.sorting=true;
+--set hive.enforce.bucketing=true;
+--set hive.enforce.sorting=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
 set hive.exec.max.dynamic.partitions.pernode=1000000;
 set hive.exec.max.dynamic.partitions=1000000;
@@ -9,5 +9,7 @@ set mapreduce.input.fileinputformat.split.minsize=240000000;
 set mapreduce.input.fileinputformat.split.maxsize=240000000;
 set mapreduce.input.fileinputformat.split.minsize.per.node=240000000;
 set mapreduce.input.fileinputformat.split.minsize.per.rack=240000000;
-set hive.exec.parallel=true;
+--set hive.exec.parallel=true;
 set hive.stats.autogather=true;
+set hive.support.concurrency=false;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager;

+ 4 - 3
settings/load-partitioned.sql

@@ -1,11 +1,11 @@
-set hive.enforce.bucketing=true;
-set hive.enforce.sorting=true;
+-- set hive.enforce.bucketing=true;
+-- set hive.enforce.sorting=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
 set hive.exec.max.dynamic.partitions.pernode=100000;
 set hive.exec.max.dynamic.partitions=100000;
 set hive.exec.max.created.files=1000000;
 set hive.exec.parallel=true;
-set hive.exec.reducers.max=2000;
+set hive.exec.reducers.max=${SCALE};
 set hive.stats.autogather=true;
 set hive.optimize.sort.dynamic.partition=true;
 
@@ -14,6 +14,7 @@ set mapreduce.input.fileinputformat.split.minsizee=240000000;
 set mapreduce.input.fileinputformat.split.minsize.per.node=240000000;
 set mapreduce.input.fileinputformat.split.minsize.per.rack=240000000;
 set hive.optimize.sort.dynamic.partition=true;
+set hive.tez.java.opts=-XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps -XX:+UseNUMA -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/;
 
 -- set mapred.map.child.java.opts=-server -Xmx2800m -Djava.net.preferIPv4Stack=true;
 -- set mapred.reduce.child.java.opts=-server -Xms1024m -Xmx3800m -Djava.net.preferIPv4Stack=true;

+ 2 - 0
tpcds-setup.sh

@@ -83,6 +83,7 @@ do
 	echo "Optimizing table $t ($i/$total)."
 	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
 	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
+            -d SCALE=${SCALE} \
 	    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
 	    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE=${FORMAT}"
 	runcommand "$COMMAND"
@@ -99,6 +100,7 @@ do
 	echo "Optimizing table $t ($i/$total)."
 	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
 	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
+            -d SCALE=${SCALE} \
 	    -d FILE=${FORMAT}"
 	runcommand "$COMMAND"
 	if [ $? -ne 0 ]; then

+ 15 - 4
tpch-setup.sh

@@ -48,12 +48,12 @@ fi
 
 # Do the actual data load.
 hdfs dfs -mkdir -p ${DIR}
-hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
+hdfs dfs -ls ${DIR}/${SCALE}/lineitem > /dev/null
 if [ $? -ne 0 ]; then
 	echo "Generating data at scale factor $SCALE."
 	(cd tpch-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
 fi
-hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
+hdfs dfs -ls ${DIR}/${SCALE}/lineitem > /dev/null
 if [ $? -ne 0 ]; then
 	echo "Data generation failed, exiting."
 	exit 1
@@ -67,13 +67,22 @@ runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d
 # Create the optimized tables.
 i=1
 total=8
-DATABASE=tpch_flat_orc_${SCALE}
+
+if test $SCALE -le 1000; then 
+	SCHEMA_TYPE=flat
+else
+	SCHEMA_TYPE=partitioned
+fi
+
+DATABASE=tpch_${SCHEMA_TYPE}_orc_${SCALE}
+
 for t in ${TABLES}
 do
 	echo "Optimizing table $t ($i/$total)."
-	COMMAND="hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/${t}.sql \
+	COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
 	    -d DB=${DATABASE} \
 	    -d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS} \
+            -d SCALE=${SCALE} \
 	    -d FILE=orc"
 	runcommand "$COMMAND"
 	if [ $? -ne 0 ]; then
@@ -83,4 +92,6 @@ do
 	i=`expr $i + 1`
 done
 
+hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --database ${DATABASE}; 
+
 echo "Data loaded into database ${DATABASE}."