tpcds-setup.sh

#!/bin/bash
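
# Generate TPC-DS text data at the requested scale factor, load it into
# external text tables in Hive, then convert it into partitioned (and
# optionally bucketed) ORC tables.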

function usage {
    echo "Usage: tpcds-setup.sh scale_factor [temp_directory]"
    exit 1
}
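
# Example: generate 100GB of data, staging it under /tmp/tpcds-generate
# in HDFS (the TPC-DS scale factor roughly corresponds to the raw data
# volume in GB):
#   ./tpcds-setup.sh 100 /tmp/tpcds-generate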

function runcommand {
    if [ "X$DEBUG_SCRIPT" != "X" ]; then
        $1
    else
        $1 2>/dev/null
    fi
}
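
# runcommand silences stderr by default; exporting DEBUG_SCRIPT with any
# non-empty value (e.g. "export DEBUG_SCRIPT=ON") shows full command
# output and turns on shell tracing further below.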

BOLD=`tput bold`
NORMAL=`tput sgr0`

if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
    echo "Please build the data generator with ./tpcds-build.sh first"
    exit 1
fi

which hive > /dev/null 2>&1
if [ $? -ne 0 ]; then
    echo "Script must be run where Hive is installed"
    exit 1
fi

# Tables in the TPC-DS schema.
DIMS="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page web_site"
FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_returns inventory"
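# (17 dimension tables plus 7 fact tables: the 24 load steps counted
# by $total below.)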

# Get the parameters.
SCALE=$1
DIR=$2
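
# If BUCKET_DATA is set in the environment, the fact tables are written
# with 13 buckets apiece (RETURN_BUCKETS covers the *_returns tables);
# otherwise everything gets a single bucket.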
if [ "X$BUCKET_DATA" != "X" ]; then
    BUCKETS=13
    RETURN_BUCKETS=13
else
    BUCKETS=1
    RETURN_BUCKETS=1
fi

if [ "X$DEBUG_SCRIPT" != "X" ]; then
    set -x
fi

# Sanity checking.
if [ X"$SCALE" = "X" ]; then
    usage
fi
if [ X"$DIR" = "X" ]; then
    DIR=/tmp/tpcds-generate
fi
if [ $SCALE -lt 2 ]; then
    echo "Scale factor must be greater than 1"
    exit 1
fi

# Generate the raw text data (skipped if ${DIR}/${SCALE} already exists).
hdfs dfs -mkdir -p ${DIR}
hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
if [ $? -ne 0 ]; then
    echo "${BOLD}Generating data at scale factor $SCALE.${NORMAL}"
    (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
fi
hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
if [ $? -ne 0 ]; then
    echo "${BOLD}Data generation failed, exiting.${NORMAL}"
    exit 1
fi
echo "${BOLD}TPC-DS text data generation complete.${NORMAL}"
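
# Because generation is skipped when ${DIR}/${SCALE} already lists
# successfully, the script can be re-run to resume after a failed table
# load without regenerating the raw data.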

# Create the text/flat tables as external tables. These will later be
# converted to ORC.
echo "${BOLD}Loading text data into external tables.${NORMAL}"
runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
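# (The -d flags define Hive substitution variables consumed by the DDL
# scripts; the same mechanism parameterizes the partitioned loads below.)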

# Create the partitioned and bucketed tables.
i=1
total=24
DATABASE=tpcds_bin_partitioned_orc_${SCALE}
for t in ${FACTS}
do
    echo "${BOLD}Optimizing table $t ($i/$total).${NORMAL}"
    COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
        -d DB=${DATABASE} \
        -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
        -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE=orc"
    runcommand "$COMMAND"
    if [ $? -ne 0 ]; then
        echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running"
        exit 1
    fi
    i=`expr $i + 1`
done

# Populate the smaller dimension tables.
for t in ${DIMS}
do
    echo "${BOLD}Optimizing table $t ($i/$total).${NORMAL}"
    COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
        -d DB=${DATABASE} -d SOURCE=tpcds_text_${SCALE} \
        -d FILE=orc"
    runcommand "$COMMAND"
    if [ $? -ne 0 ]; then
        echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running"
        exit 1
    fi
    i=`expr $i + 1`
done

echo "${BOLD}Data loaded into database ${DATABASE}.${NORMAL}"
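
# A quick smoke test once the load finishes (database name as created
# above; output is the full table list):
#   hive -e "use tpcds_bin_partitioned_orc_${SCALE}; show tables;"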