#!/bin/bash
# tpch-setup.sh
  #######################################
  # Print the command-line synopsis and terminate the script.
  # Arguments: none
  # Outputs:   the usage line, written to stderr
  # Returns:   never returns; exits the shell with status 1
  #######################################
  usage() {
    echo "Usage: tpch-setup.sh scale_factor [temp_directory]" >&2
    exit 1
  }
  #######################################
  # Run a command given as a single string.
  # Globals:   DEBUG_SCRIPT (read) - when set and non-empty, the command's
  #            stderr is left untouched; otherwise it is sent to /dev/null.
  # Arguments: $1 - the full command line as one string
  # Returns:   the exit status of the command that was run
  #######################################
  runcommand() {
    # $1 is expanded unquoted on purpose: callers pass a whole command line as
    # one string and rely on word-splitting to separate the program from its
    # arguments. Quotes embedded in the string are NOT re-parsed by the shell.
    if [[ -n "${DEBUG_SCRIPT:-}" ]]; then
      # shellcheck disable=SC2086
      $1
    else
      # shellcheck disable=SC2086
      $1 2>/dev/null
    fi
  }
  # ---------------------------------------------------------------------------
  # Main flow:
  #   1. check prerequisites (generator jar built, hive on PATH)
  #   2. read and validate the arguments
  #   3. generate the TPC-H text data into HDFS unless it is already there
  #   4. expose the text data through external Hive tables
  #   5. load every table into the ORC database
  #   6. run analyze.sql against the ORC database
  # Relies on the usage and runcommand functions defined earlier in this file.
  # ---------------------------------------------------------------------------

  # Jar produced by ./tpch-build.sh; used for both the existence check and the
  # hadoop invocation so the two can never disagree.
  readonly GENERATOR_JAR=tpch-gen-1.0-SNAPSHOT.jar

  if [[ ! -f "tpch-gen/target/${GENERATOR_JAR}" ]]; then
    echo "Please build the data generator with ./tpch-build.sh first" >&2
    exit 1
  fi

  if ! command -v hive > /dev/null 2>&1; then
    echo "Script must be run where Hive is installed" >&2
    exit 1
  fi

  # Tables in the TPC-H schema.
  TABLES=(part partsupp supplier customer orders lineitem nation region)

  # Get the parameters.
  SCALE=${1:-}
  DIR=${2:-}
  BUCKETS=13
  if [[ -n "${DEBUG_SCRIPT:-}" ]]; then
    set -x
  fi

  # Sanity checking.
  if [[ -z "$SCALE" ]]; then
    usage
  fi
  if [[ -z "$DIR" ]]; then
    DIR=/tmp/tpch-generate
  fi
  # The scale factor must be a plain integer strictly greater than 1. The
  # numeric comparison uses [ ] so a value such as 08 is read as decimal.
  if [[ ! "$SCALE" =~ ^[0-9]+$ ]] || [ "$SCALE" -le 1 ]; then
    echo "Scale factor must be greater than 1" >&2
    exit 1
  fi

  # Do the actual data load.
  if ! hdfs dfs -mkdir -p "${DIR}"; then
    echo "Could not create ${DIR} in HDFS, exiting." >&2
    exit 1
  fi

  # The lineitem directory is used as the marker that data for this scale
  # factor already exists; generation is skipped when it is present.
  if ! hdfs dfs -ls "${DIR}/${SCALE}/lineitem" > /dev/null; then
    echo "Generating data at scale factor $SCALE."
    # Subshell keeps the cd local; && stops hadoop from running in the wrong
    # directory if the cd fails.
    (cd tpch-gen && hadoop jar "target/${GENERATOR_JAR}" -d "${DIR}/${SCALE}/" -s "${SCALE}")
  fi

  # Probe again: this is the actual success check for the generation step.
  if ! hdfs dfs -ls "${DIR}/${SCALE}/lineitem" > /dev/null; then
    echo "Data generation failed, exiting." >&2
    exit 1
  fi
  echo "TPC-H text data generation complete."

  # Create the text/flat tables as external tables. These will later be
  # converted to ORCFile.
  echo "Loading text data into external tables."
  # NOTE: runcommand word-splits its argument, so DIR must not contain spaces.
  if ! runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d DB=tpch_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"; then
    echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running" >&2
    exit 1
  fi

  # Create the optimized tables.
  # Scale factors up to 1000 use the flat DDL/settings, larger ones the
  # partitioned set.
  if [ "$SCALE" -le 1000 ]; then
    SCHEMA_TYPE=flat
  else
    SCHEMA_TYPE=partitioned
  fi

  DATABASE=tpch_${SCHEMA_TYPE}_orc_${SCALE}
  MAX_REDUCERS=2600 # ~7 years of data

  # REDUCERS = min(SCALE, MAX_REDUCERS)
  if [ "$SCALE" -gt "$MAX_REDUCERS" ]; then
    REDUCERS=${MAX_REDUCERS}
  else
    REDUCERS=${SCALE}
  fi

  i=1
  total=${#TABLES[@]}
  for t in "${TABLES[@]}"; do
    echo "Optimizing table $t ($i/$total)."
    COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql"
    COMMAND+=" -d DB=${DATABASE}"
    COMMAND+=" -d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS}"
    COMMAND+=" -d SCALE=${SCALE} -d REDUCERS=${REDUCERS}"
    COMMAND+=" -d FILE=orc"
    if ! runcommand "$COMMAND"; then
      echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running" >&2
      exit 1
    fi
    i=$((i + 1))
  done

  # Run analyze.sql against the new database. A failure here is reported but
  # does not abort: the tables themselves have already been loaded.
  if ! hive -i "settings/load-${SCHEMA_TYPE}.sql" -f "ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql" --database "${DATABASE}"; then
    echo "Warning: analyze.sql failed for database ${DATABASE}." >&2
  fi

  echo "Data loaded into database ${DATABASE}."