瀏覽代碼

Add Scale into the total # of reducers

Gopal V 10 年之前
父節點
當前提交
4d79cb267f
共有 4 個文件被更改,包括 26 次插入和 10 次刪除
  1. 5 3
      settings/load-flat.sql
  2. 4 3
      settings/load-partitioned.sql
  3. 2 0
      tpcds-setup.sh
  4. 15 4
      tpch-setup.sh

+ 5 - 3
settings/load-flat.sql

@@ -1,5 +1,5 @@
-set hive.enforce.bucketing=true;
-set hive.enforce.sorting=true;
+--set hive.enforce.bucketing=true;
+--set hive.enforce.sorting=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
 set hive.exec.max.dynamic.partitions.pernode=1000000;
 set hive.exec.max.dynamic.partitions=1000000;
@@ -9,5 +9,7 @@ set mapreduce.input.fileinputformat.split.minsize=240000000;
 set mapreduce.input.fileinputformat.split.maxsize=240000000;
 set mapreduce.input.fileinputformat.split.minsize.per.node=240000000;
 set mapreduce.input.fileinputformat.split.minsize.per.rack=240000000;
-set hive.exec.parallel=true;
+--set hive.exec.parallel=true;
 set hive.stats.autogather=true;
+set hive.support.concurrency=false;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager;

+ 4 - 3
settings/load-partitioned.sql

@@ -1,11 +1,11 @@
-set hive.enforce.bucketing=true;
-set hive.enforce.sorting=true;
+-- set hive.enforce.bucketing=true;
+-- set hive.enforce.sorting=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
 set hive.exec.max.dynamic.partitions.pernode=100000;
 set hive.exec.max.dynamic.partitions=100000;
 set hive.exec.max.created.files=1000000;
 set hive.exec.parallel=true;
-set hive.exec.reducers.max=2000;
+set hive.exec.reducers.max=${SCALE};
 set hive.stats.autogather=true;
 set hive.optimize.sort.dynamic.partition=true;
 
@@ -14,6 +14,7 @@ set mapreduce.input.fileinputformat.split.minsizee=240000000;
 set mapreduce.input.fileinputformat.split.minsize.per.node=240000000;
 set mapreduce.input.fileinputformat.split.minsize.per.rack=240000000;
 set hive.optimize.sort.dynamic.partition=true;
+set hive.tez.java.opts=-XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps -XX:+UseNUMA -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/;
 
 -- set mapred.map.child.java.opts=-server -Xmx2800m -Djava.net.preferIPv4Stack=true;
 -- set mapred.reduce.child.java.opts=-server -Xms1024m -Xmx3800m -Djava.net.preferIPv4Stack=true;

+ 2 - 0
tpcds-setup.sh

@@ -83,6 +83,7 @@ do
 	echo "Optimizing table $t ($i/$total)."
 	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
 	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
+            -d SCALE=${SCALE} \
 	    -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
 	    -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE=${FORMAT}"
 	runcommand "$COMMAND"
@@ -99,6 +100,7 @@ do
 	echo "Optimizing table $t ($i/$total)."
 	COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
 	    -d DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} -d SOURCE=tpcds_text_${SCALE} \
+            -d SCALE=${SCALE} \
 	    -d FILE=${FORMAT}"
 	runcommand "$COMMAND"
 	if [ $? -ne 0 ]; then

+ 15 - 4
tpch-setup.sh

@@ -48,12 +48,12 @@ fi
 
 # Do the actual data load.
 hdfs dfs -mkdir -p ${DIR}
-hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
+hdfs dfs -ls ${DIR}/${SCALE}/lineitem > /dev/null
 if [ $? -ne 0 ]; then
 	echo "Generating data at scale factor $SCALE."
 	(cd tpch-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
 fi
-hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
+hdfs dfs -ls ${DIR}/${SCALE}/lineitem > /dev/null
 if [ $? -ne 0 ]; then
 	echo "Data generation failed, exiting."
 	exit 1
@@ -67,13 +67,22 @@ runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d
 # Create the optimized tables.
 i=1
 total=8
-DATABASE=tpch_flat_orc_${SCALE}
+
+if test $SCALE -le 1000; then 
+	SCHEMA_TYPE=flat
+else
+	SCHEMA_TYPE=partitioned
+fi
+
+DATABASE=tpch_${SCHEMA_TYPE}_orc_${SCALE}
+
 for t in ${TABLES}
 do
 	echo "Optimizing table $t ($i/$total)."
-	COMMAND="hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/${t}.sql \
+	COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
 	    -d DB=${DATABASE} \
 	    -d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS} \
+            -d SCALE=${SCALE} \
 	    -d FILE=orc"
 	runcommand "$COMMAND"
 	if [ $? -ne 0 ]; then
@@ -83,4 +92,6 @@ do
 	i=`expr $i + 1`
 done
 
+hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --database ${DATABASE}; 
+
 echo "Data loaded into database ${DATABASE}."