tpcds-setup-sandbox.sh 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. #!/bin/bash
  2. if [ ! -f tpcds-gen/target/tpcds-gen-1.0-SNAPSHOT.jar ]; then
  3. echo "Build the data generator with build.sh first"
  4. exit 1
  5. fi
  6. set -x
  7. set -e
  8. # Tables in the TPC-DS schema.
  9. LIST="date_dim time_dim item customer customer_demographics household_demographics customer_address store promotion warehouse ship_mode reason income_band call_center web_page catalog_page inventory store_sales store_returns web_sales web_returns web_site catalog_sales catalog_returns"
  10. SCALE=$1
  11. DIR=$2
  12. BUCKETS=13
  13. RETURN_BUCKETS=1
  14. SPLIT=16
  15. STORE_CLAUSES=( "orc" )
  16. FILE_FORMATS=( "orc" )
  17. SERDES=( "org.apache.hadoop.hive.ql.io.orc.OrcSerde" )
  18. hadoop dfs -ls ${DIR}/${SCALE} || (cd tpcds-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE})
  19. hadoop dfs -ls ${DIR}/${SCALE}
  20. # Generate the text/flat tables. These will be later be converted to ORCFile.
  21. if true; then
  22. for t in ${LIST}
  23. do
  24. hive -i settings/load.sql -f ddl/text/${t}.sql -d DB=tpcds_text_${SCALE} -d LOCATION=${DIR}/${SCALE}/${t}
  25. done
  26. fi
  27. # Generate a flat (unpartitioned) schema in ORCFile format.
  28. i=0
  29. for file in "${STORE_CLAUSES[@]}"
  30. do
  31. for t in ${LIST}
  32. do
  33. hive -i settings/load.sql -f ddl/bin_flat/${t}.sql -d DB=tpcds_bin_flat_${FILE_FORMATS[$i]}_${SCALE} -d SOURCE=tpcds_text_${SCALE} -d BUCKETS=${BUCKETS} -d FILE="${file}" -d SERDE=${SERDES[$i]} -d SPLIT=${SPLIT}
  34. done
  35. i=$((i+1))
  36. done