  1. #!/bin/bash
  2. function usage {
  3. echo "Usage: tpcds-setup.sh scale [temp directory] [partitioned|unpartitioned]"
  4. exit 1
  5. }
  6. if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
  7. echo "Build the data generator with build.sh first"
  8. exit 1
  9. fi
  10. which hive > /dev/null 2>&1
  11. if [ $? -ne 0 ]; then
  12. echo "Script must be run where Hive is installed"
  13. exit 1
  14. fi
  15. # Tables in the TPC-DS schema.
  16. LIST="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page web_site"
  17. FACTS="web_returns store_sales store_returns web_sales catalog_sales catalog_returns inventory"
  18. # Get the parameters.
  19. SCALE=$1
  20. DIR=$2
  21. # Ensure arguments exist.
  22. if [ X"$SCALE" = "X" ]; then
  23. usage
  24. fi
  25. if [ X"$DIR" = "X" ]; then
  26. DIR=/tmp/tpcds-generate
  27. fi
  28. # Sanity checking.
  29. if [ $SCALE -eq 1 ]; then
  30. echo "Scale factor must be greater than 1"
  31. exit 1
  32. fi
  33. BUCKETS=13
  34. RETURN_BUCKETS=1
  35. SPLIT=16
  36. set -x
  37. set -e
  38. hadoop dfs -mkdir -p ${DIR}
  39. hadoop dfs -ls ${DIR}/${SCALE} || (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
  40. hadoop dfs -ls ${DIR}/${SCALE}
  41. # Generate the text/flat tables. These will be later be converted to ORCFile.
  42. # hive -i settings/load-flat.sql -f ddl/text/alltables.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}
  43. # Create the partitioned tables.
  44. for t in ${FACTS}
  45. do
  46. hive -i settings/load-partitioned.sql -f ddl/bin_partitioned/${t}.sql \
  47. -d DB=tpcds_bin_partitioned_orc_${SCALE} \
  48. -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
  49. -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE="${file}" \
  50. -d SPLIT=${SPLIT}
  51. done
  52. # Populate the smaller tables.
  53. for t in ${LIST}
  54. do
  55. hive -i settings/load-partitioned.sql -f ddl/bin_partitioned/${t}.sql \
  56. -d DB=tpcds_bin_partitioned_orc_${SCALE} \
  57. -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} \
  58. -d RETURN_BUCKETS=${RETURN_BUCKETS} -d FILE="${file}" \
  59. -d SPLIT=${SPLIT}
  60. done