tpcds-setup.sh
#!/bin/bash

function usage {
    echo "Usage: tpcds-setup.sh scale_factor [temp_directory]"
    exit 1
}
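
# Example invocation (illustrative values): ./tpcds-setup.sh 100 /tmp/tpcds-generate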

# Run a command; unless DEBUG_SCRIPT is set in the environment, its stderr is suppressed.
function runcommand {
    if [ "X$DEBUG_SCRIPT" != "X" ]; then
        $1
    else
        $1 2>/dev/null
    fi
}

if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
    echo "Please build the data generator with ./tpcds-build.sh first"
    exit 1
fi

which hive > /dev/null 2>&1
if [ $? -ne 0 ]; then
    echo "Script must be run where Hive is installed"
    exit 1
fi

# Tables in the TPC-DS schema.
DIMS="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page web_site"
FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_returns inventory"
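# 17 dimension tables plus 7 fact tables, 24 tables in total (matches $total below).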

# Get the parameters.
SCALE=$1
DIR=$2
if [ "X$BUCKET_DATA" != "X" ]; then
    BUCKETS=13
    RETURN_BUCKETS=13
else
    BUCKETS=1
    RETURN_BUCKETS=1
fi
if [ "X$DEBUG_SCRIPT" != "X" ]; then
    set -x
fi

# Sanity checking.
if [ X"$SCALE" = "X" ]; then
    usage
fi
if [ X"$DIR" = "X" ]; then
    DIR=/tmp/tpcds-generate
fi
if [ $SCALE -lt 2 ]; then
    echo "Scale factor must be greater than 1"
    exit 1
fi

# Do the actual data load. Generation is skipped if ${DIR}/${SCALE} already
# exists, so re-runs reuse previously generated data.
hdfs dfs -mkdir -p ${DIR}
hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
if [ $? -ne 0 ]; then
    echo "Generating data at scale factor $SCALE."
    (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
fi
hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
if [ $? -ne 0 ]; then
    echo "Data generation failed, exiting."
    exit 1
fi
echo "TPC-DS text data generation complete."

# Create the text/flat tables as external tables. These will later be
# converted to ORC (or whatever $FORMAT is set to below).
echo "Loading text data into external tables."
runcommand "hive -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"

# Create the partitioned and bucketed tables.
if [ "X$FORMAT" = "X" ]; then
    FORMAT=orc
fi
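# FORMAT can be overridden in the environment (e.g. export FORMAT=parquet),
# assuming Hive and the bin_partitioned DDL support that storage format.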

i=1
total=24
DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}

# Populate the partitioned (and optionally bucketed) fact tables.
for t in ${FACTS}
do
    echo "Optimizing table $t ($i/$total)."
    COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
        -d DB=${DATABASE} \
        -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
        -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE=${FORMAT}"
    runcommand "$COMMAND"
    if [ $? -ne 0 ]; then
        echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-run"
        exit 1
    fi
    i=$((i + 1))
done

# Populate the smaller dimension tables.
for t in ${DIMS}
do
    echo "Optimizing table $t ($i/$total)."
    COMMAND="hive -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
        -d DB=${DATABASE} -d SOURCE=tpcds_text_${SCALE} \
        -d FILE=${FORMAT}"
    runcommand "$COMMAND"
    if [ $? -ne 0 ]; then
        echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-run"
        exit 1
    fi
    i=$((i + 1))
done

echo "Data loaded into database ${DATABASE}."
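
# Typical end-to-end flow (illustrative; the query step assumes the default ORC format):
#   ./tpcds-build.sh                                  # build the data generator first
#   ./tpcds-setup.sh 100                              # generate and load scale factor 100
#   hive --database tpcds_bin_partitioned_orc_100     # run queries against the loaded tables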