#!/bin/bash
#
# tpch-setup.sh
#
# Usage: tpch-setup.sh scale_factor [temp_directory]
  #######################################
  # Print the command-line synopsis and abort the script.
  # Arguments: none
  # Outputs:   the usage line, written to stderr
  # Returns:   does not return; exits the shell with status 1
  #######################################
  usage() {
    echo "Usage: tpch-setup.sh scale_factor [temp_directory]" >&2
    exit 1
  }
  #######################################
  # Run a command that is supplied as one string.
  # Globals:   DEBUG_SCRIPT (read) - when non-empty, the command's stderr
  #            is left visible; otherwise it is discarded.
  # Arguments: $1 - the command line. It is expanded unquoted on purpose so
  #            the shell splits it into words; quote characters embedded in
  #            the string are NOT interpreted, and glob characters are
  #            subject to pathname expansion.
  # Returns:   the exit status of the command that was run
  #######################################
  runcommand() {
    if [[ -n "${DEBUG_SCRIPT:-}" ]]; then
      # shellcheck disable=SC2086 -- word splitting of $1 is intentional
      $1
    else
      # shellcheck disable=SC2086 -- word splitting of $1 is intentional
      $1 2>/dev/null
    fi
  }
  # Terminal attributes used to highlight progress messages. They are left
  # empty when tput cannot produce them (for example when TERM is unset).
  BOLD=$(tput bold 2>/dev/null || true)
  NORMAL=$(tput sgr0 2>/dev/null || true)

  # Prerequisites: the data generator jar must have been built, and the hive
  # command must be on PATH.
  if [[ ! -f tpch-gen/target/tpch-gen-1.0-SNAPSHOT.jar ]]; then
    echo "Please build the data generator with ./build-tpch.sh first" >&2
    exit 1
  fi
  if ! command -v hive > /dev/null 2>&1; then
    echo "Script must be run where Hive is installed" >&2
    exit 1
  fi

  # Tables in the TPC-H schema.
  TABLES=(part partsupp supplier customer orders lineitem nation region)

  # Get the parameters.
  SCALE=${1:-}
  DIR=${2:-/tmp/tpch-generate}
  BUCKETS=13

  # Trace every command when debugging was requested.
  if [[ -n "${DEBUG_SCRIPT:-}" ]]; then
    set -x
  fi

  # Sanity checking.
  if [[ -z "$SCALE" ]]; then
    usage
  fi
  # The scale factor has to be a whole number above 1. 10# forces base 10 so
  # that a value with leading zeros is not rejected as invalid octal.
  if ! [[ "$SCALE" =~ ^[0-9]+$ ]] || (( 10#$SCALE <= 1 )); then
    echo "Scale factor must be greater than 1" >&2
    exit 1
  fi

  # Do the actual data load. Generation is skipped when the target directory
  # for this scale factor can already be listed in HDFS.
  hdfs dfs -mkdir -p "${DIR}"
  if ! hdfs dfs -ls "${DIR}/${SCALE}" > /dev/null; then
    echo "${BOLD}Generating data at scale factor $SCALE.${NORMAL}"
    # && keeps hadoop from running in the wrong directory if the cd fails.
    (cd tpch-gen && hadoop jar target/*.jar -d "${DIR}/${SCALE}/" -s "${SCALE}")
  fi
  # Listing the directory again is how generation success is detected.
  if ! hdfs dfs -ls "${DIR}/${SCALE}" > /dev/null; then
    echo "${BOLD}Data generation failed, exiting.${NORMAL}" >&2
    exit 1
  fi
  echo "${BOLD}TPC-H text data generation complete.${NORMAL}"

  # Create the text/flat tables as external tables. These will later be
  # converted to ORC by the per-table scripts below.
  echo "${BOLD}Loading text data into external tables.${NORMAL}"
  if ! runcommand "hive -i settings/load-flat.sql -f ddl-tpch/text/alltables.sql -d DB=tpch_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"; then
    echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running" >&2
    exit 1
  fi

  # Create the partitioned and bucketed tables, one hive invocation per table.
  i=1
  total=${#TABLES[@]}
  for t in "${TABLES[@]}"; do
    echo "${BOLD}Optimizing table $t ($i/$total).${NORMAL}"
    # The command is built as a single string because runcommand word-splits
    # its argument.
    COMMAND="hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/${t}.sql"
    COMMAND+=" -d DB=tpch_bin_partitioned_orc_${SCALE}"
    COMMAND+=" -d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS}"
    COMMAND+=" -d FILE=orc"
    if ! runcommand "$COMMAND"; then
      echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running" >&2
      exit 1
    fi
    i=$((i + 1))
  done