  1. #!/bin/bash
  2. function usage {
  3. echo "Usage: tpcds-setup.sh scale_factor [temp_directory]"
  4. exit 1
  5. }
  6. function runcommand {
  7. if [ "X$DEBUG_SCRIPT" != "X" ]; then
  8. $1
  9. else
  10. $1 2>/dev/null
  11. fi
  12. }
  13. if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
  14. echo "Please build the data generator with ./tpcds-build.sh first"
  15. exit 1
  16. fi
  17. which hive > /dev/null 2>&1
  18. if [ $? -ne 0 ]; then
  19. echo "Script must be run where Hive is installed"
  20. exit 1
  21. fi
  22. # Tables in the TPC-DS schema.
  23. DIMS="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page web_site"
  24. FACTS="store_sales store_returns web_sales web_returns catalog_sales catalog_returns inventory"
  25. # Get the parameters.
  26. SCALE=$1
  27. DIR=$2
  28. TEXT_ONLY=$3
  29. if [ "X$BUCKET_DATA" != "X" ]; then
  30. BUCKETS=13
  31. RETURN_BUCKETS=13
  32. else
  33. BUCKETS=1
  34. RETURN_BUCKETS=1
  35. fi
  36. if [ "X$DEBUG_SCRIPT" != "X" ]; then
  37. set -x
  38. fi
  39. # Sanity checking.
  40. if [ X"$SCALE" = "X" ]; then
  41. usage
  42. fi
  43. if [ X"$DIR" = "X" ]; then
  44. DIR=/tmp/tpcds-generate
  45. fi
  46. if [ $SCALE -eq 1 ]; then
  47. echo "Scale factor must be greater than 1"
  48. exit 1
  49. fi
  50. # Do the actual data load.
  51. hdfs dfs -mkdir -p ${DIR}
  52. hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
  53. if [ $? -ne 0 ]; then
  54. echo "Generating data at scale factor $SCALE."
  55. (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
  56. fi
  57. hdfs dfs -ls ${DIR}/${SCALE} > /dev/null
  58. if [ $? -ne 0 ]; then
  59. echo "Data generation failed, exiting."
  60. exit 1
  61. fi
  62. hadoop fs -chmod -R 777 /${DIR}/${SCALE}
  63. echo "TPC-DS text data generation complete."
  64. HIVE="beeline -n hive -u 'jdbc:hive2://localhost:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2?tez.queue.name=default' "
  65. # Create the text/flat tables as external tables. These will be later be converted to ORCFile.
  66. echo "Loading text data into external tables."
  67. runcommand "$HIVE -i settings/load-flat.sql -f ddl-tpcds/text/alltables.sql --hivevar DB=tpcds_text_${SCALE} --hivevar LOCATION=${DIR}/${SCALE}"
  68. if [ "$TEXT_ONLY" = "text" ];then
  69. echo "Text data has been generated. Exiting"
  70. exit 0
  71. else
  72. # Create the partitioned and bucketed tables.
  73. if [ "X$FORMAT" = "X" ]; then
  74. FORMAT=orc
  75. fi
  76. LOAD_FILE="load_${FORMAT}_${SCALE}.mk"
  77. SILENCE="2> /dev/null 1> /dev/null"
  78. if [ "X$DEBUG_SCRIPT" != "X" ]; then
  79. SILENCE=""
  80. fi
  81. echo -e "all: ${DIMS} ${FACTS}" > $LOAD_FILE
  82. i=1
  83. total=24
  84. DATABASE=tpcds_bin_partitioned_${FORMAT}_${SCALE}
  85. MAX_REDUCERS=2500 # maximum number of useful reducers for any scale
  86. REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
  87. # Populate the smaller tables.
  88. for t in ${DIMS}
  89. do
  90. COMMAND="$HIVE -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
  91. --hivevar DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} --hivevar SOURCE=tpcds_text_${SCALE} \
  92. --hivevar SCALE=${SCALE} \
  93. --hivevar REDUCERS=${REDUCERS} \
  94. --hivevar FILE=${FORMAT}"
  95. echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
  96. i=`expr $i + 1`
  97. done
  98. for t in ${FACTS}
  99. do
  100. COMMAND="$HIVE -i settings/load-partitioned.sql -f ddl-tpcds/bin_partitioned/${t}.sql \
  101. --hivevar DB=tpcds_bin_partitioned_${FORMAT}_${SCALE} \
  102. --hivevar SCALE=${SCALE} \
  103. --hivevar SOURCE=tpcds_text_${SCALE} --hivevar BUCKETS=${BUCKETS} \
  104. --hivevar RETURN_BUCKETS=${RETURN_BUCKETS} --hivevar REDUCERS=${REDUCERS} --hivevar FILE=${FORMAT}"
  105. echo -e "${t}:\n\t@$COMMAND $SILENCE && echo 'Optimizing table $t ($i/$total).'" >> $LOAD_FILE
  106. i=`expr $i + 1`
  107. done
  108. make -j 1 -f $LOAD_FILE
  109. echo "Data loaded into database ${DATABASE}."
  110. fi