ソースを参照

Merge pull request #13 from t3rmin4t0r/master

Update DDLs for TPC-H for scale & fix nation/region table gen
Carter Shanklin 11 年 前
コミット
8c9847b940

+ 1 - 0
settings/load-partitioned.sql

@@ -7,6 +7,7 @@ set hive.exec.max.created.files=1000000;
 set hive.exec.parallel=true;
 set hive.exec.reducers.max=2000;
 set hive.stats.autogather=true;
+set hive.optimize.sort.dynamic.partition=true;
 
 set mapred.job.reduce.input.buffer.percent=0.0;
 set mapreduce.input.fileinputformat.split.minsize=240000000;

+ 97 - 0
tpch-gen/ddl/orc.sql

@@ -0,0 +1,97 @@
+set hive.stats.autogather=true;
+set hive.stats.dbclass=fs;
+
+create table if not exists lineitem 
+(L_ORDERKEY BIGINT,
+ L_PARTKEY BIGINT,
+ L_SUPPKEY BIGINT,
+ L_LINENUMBER INT,
+ L_QUANTITY DOUBLE,
+ L_EXTENDEDPRICE DOUBLE,
+ L_DISCOUNT DOUBLE,
+ L_TAX DOUBLE,
+ L_RETURNFLAG STRING,
+ L_LINESTATUS STRING,
+ L_SHIPDATE STRING,
+ L_COMMITDATE STRING,
+ L_RECEIPTDATE STRING,
+ L_SHIPINSTRUCT STRING,
+ L_SHIPMODE STRING,
+ L_COMMENT STRING)
+STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY")
+;
+
+create table if not exists part (P_PARTKEY BIGINT,
+ P_NAME STRING,
+ P_MFGR STRING,
+ P_BRAND STRING,
+ P_TYPE STRING,
+ P_SIZE INT,
+ P_CONTAINER STRING,
+ P_RETAILPRICE DOUBLE,
+ P_COMMENT STRING) 
+STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY")
+;
+
+create table if not exists supplier (S_SUPPKEY BIGINT,
+ S_NAME STRING,
+ S_ADDRESS STRING,
+ S_NATIONKEY INT,
+ S_PHONE STRING,
+ S_ACCTBAL DOUBLE,
+ S_COMMENT STRING) 
+STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY")
+;
+
+create table if not exists partsupp (PS_PARTKEY BIGINT,
+ PS_SUPPKEY BIGINT,
+ PS_AVAILQTY INT,
+ PS_SUPPLYCOST DOUBLE,
+ PS_COMMENT STRING)
+STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY")
+;
+
+create table if not exists nation (N_NATIONKEY INT,
+ N_NAME STRING,
+ N_REGIONKEY INT,
+ N_COMMENT STRING)
+STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY")
+;
+
+create table if not exists region (R_REGIONKEY INT,
+ R_NAME STRING,
+ R_COMMENT STRING)
+STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY")
+;
+
+create table if not exists customer (C_CUSTKEY BIGINT,
+ C_NAME STRING,
+ C_ADDRESS STRING,
+ C_NATIONKEY INT,
+ C_PHONE STRING,
+ C_ACCTBAL DOUBLE,
+ C_MKTSEGMENT STRING,
+ C_COMMENT STRING)
+STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY")
+;
+
+create table if not exists orders (O_ORDERKEY BIGINT,
+ O_CUSTKEY BIGINT,
+ O_ORDERSTATUS STRING,
+ O_TOTALPRICE DOUBLE,
+ O_ORDERDATE STRING,
+ O_ORDERPRIORITY STRING,
+ O_CLERK STRING,
+ O_SHIPPRIORITY INT,
+ O_COMMENT STRING)
+STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY")
+;
+
+insert overwrite table nation select * from ${SOURCE}.nation;
+insert overwrite table region select * from ${SOURCE}.region;
+insert overwrite table part select * from ${SOURCE}.part;
+insert overwrite table supplier select * from ${SOURCE}.supplier;
+insert overwrite table partsupp select * from ${SOURCE}.partsupp;
+insert overwrite table customer select * from ${SOURCE}.customer;
+insert overwrite table lineitem select * from ${SOURCE}.lineitem;
+insert overwrite table orders select * from ${SOURCE}.orders;

+ 10 - 10
tpch-gen/ddl/text.sql

@@ -1,7 +1,7 @@
 create external table lineitem 
-(L_ORDERKEY INT,
- L_PARTKEY INT,
- L_SUPPKEY INT,
+(L_ORDERKEY BIGINT,
+ L_PARTKEY BIGINT,
+ L_SUPPKEY BIGINT,
  L_LINENUMBER INT,
  L_QUANTITY DOUBLE,
  L_EXTENDEDPRICE DOUBLE,
@@ -18,7 +18,7 @@ create external table lineitem
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE 
 LOCATION '${LOCATION}/lineitem';
 
-create external table part (P_PARTKEY INT,
+create external table part (P_PARTKEY BIGINT,
  P_NAME STRING,
  P_MFGR STRING,
  P_BRAND STRING,
@@ -30,7 +30,7 @@ create external table part (P_PARTKEY INT,
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE 
 LOCATION '${LOCATION}/part/';
 
-create external table supplier (S_SUPPKEY INT,
+create external table supplier (S_SUPPKEY BIGINT,
  S_NAME STRING,
  S_ADDRESS STRING,
  S_NATIONKEY INT,
@@ -40,8 +40,8 @@ create external table supplier (S_SUPPKEY INT,
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE 
 LOCATION '${LOCATION}/supplier/';
 
-create external table partsupp (PS_PARTKEY INT,
- PS_SUPPKEY INT,
+create external table partsupp (PS_PARTKEY BIGINT,
+ PS_SUPPKEY BIGINT,
  PS_AVAILQTY INT,
  PS_SUPPLYCOST DOUBLE,
  PS_COMMENT STRING)
@@ -61,7 +61,7 @@ create external table region (R_REGIONKEY INT,
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE
 LOCATION '${LOCATION}/region';
 
-create external table customer (C_CUSTKEY INT,
+create external table customer (C_CUSTKEY BIGINT,
  C_NAME STRING,
  C_ADDRESS STRING,
  C_NATIONKEY INT,
@@ -72,8 +72,8 @@ create external table customer (C_CUSTKEY INT,
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE
 LOCATION '${LOCATION}/customer';
 
-create external table orders (O_ORDERKEY INT,
- O_CUSTKEY INT,
+create external table orders (O_ORDERKEY BIGINT,
+ O_CUSTKEY BIGINT,
  O_ORDERSTATUS STRING,
  O_TOTALPRICE DOUBLE,
  O_ORDERDATE STRING,

+ 1 - 1
tpch-gen/pom.xml

@@ -19,7 +19,7 @@
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-client</artifactId>
-      <version>2.2.0</version>
+      <version>2.4.0</version>
       <scope>compile</scope>
     </dependency>
     <dependency>

+ 15 - 2
tpch-gen/src/main/java/org/notmysock/tpch/GenTable.java

@@ -4,13 +4,14 @@ import org.apache.hadoop.conf.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.hdfs.*;
 import org.apache.hadoop.io.*;
+import org.apache.hadoop.io.compress.DefaultCodec;
+import org.apache.hadoop.io.compress.SnappyCodec;
 import org.apache.hadoop.util.*;
 import org.apache.hadoop.filecache.*;
 import org.apache.hadoop.mapreduce.*;
 import org.apache.hadoop.mapreduce.lib.input.*;
 import org.apache.hadoop.mapreduce.lib.output.*;
 import org.apache.hadoop.mapreduce.lib.reduce.*;
-
 import org.apache.commons.cli.*;
 import org.apache.commons.*;
 
@@ -65,6 +66,8 @@ public class GenTable extends Configured implements Tool {
         options.addOption("t","table", true, "table");
         options.addOption("d","dir", true, "dir");
         options.addOption("p", "parallel", true, "parallel");
+        options.addOption("text", "text", false, "text");
+        options.addOption("snappy", "snappy", false, "snappy");
         CommandLine line = parser.parse(options, remainingArgs);
 
         if(!(line.hasOption("scale") && line.hasOption("dir"))) {
@@ -122,6 +125,15 @@ public class GenTable extends Configured implements Tool {
         MultipleOutputs.addNamedOutput(job, "text", 
           TextOutputFormat.class, LongWritable.class, Text.class);
 
+        if (line.hasOption("snappy") || (line.hasOption("text") == false)) {
+          TextOutputFormat.setCompressOutput(job, true);
+          if (line.hasOption("snappy")) {
+             TextOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
+          } else {
+             TextOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
+          }
+        }
+
         boolean success = job.waitForCompletion(true);
 
         // cleanup
@@ -219,10 +231,11 @@ public class GenTable extends Configured implements Tool {
 
         File cwd = new File(".");
         final String suffix = String.format(".tbl.%s", child);
+        final boolean firstMapper = child.equals("1");
 
         FilenameFilter tables = new FilenameFilter() {
           public boolean accept(File dir, String name) {
-            return name.endsWith(suffix) || name.endsWith(".tbl");
+            return name.endsWith(suffix) || (name.endsWith(".tbl") && firstMapper);
           }
         };