Stop Thinking, Just Do!

Sung-Soo Kim's Blog



8 December 2015

Article Source

TPC-H Benchmark

TPC-H is an ad-hoc and decision support benchmark. Some of queries are available in the current Tajo. You can download the TPC-H data generator from here.

DDL for TPC-H datasets

TPC-H benchmark provides 8 table datasets. The below DDL statements is for them.

    create external table supplier (S_SUPPKEY bigint, S_NAME text, S_ADDRESS text, S_NATIONKEY bigint, S_PHONE text, S_ACCTBAL double, S_COMMENT text) using csv with ('csvfile.delimiter'='|') location 'hdfs://x/y'

    create external table lineitem (L_ORDERKEY bigint, L_PARTKEY bigint, L_SUPPKEY bigint, L_LINENUMBER bigint, L_QUANTITY double, L_EXTENDEDPRICE double, L_DISCOUNT double, L_TAX double, L_RETURNFLAG text, L_LINESTATUS text, L_SHIPDATE text, L_COMMITDATE text, L_RECEIPTDATE text, L_SHIPINSTRUCT text, L_SHIPMODE text, L_COMMENT text) using csv with ('csvfile.delimiter'='|') location 'hdfs://x/y'

    create external table part (P_PARTKEY bigint, P_NAME text, P_MFGR text, P_BRAND text, P_TYPE text, P_SIZE integer, P_CONTAINER text, P_RETAILPRICE double, P_COMMENT text) using csv with ('csvfile.delimiter'='|') location 'hdfs://x/y'

    create external table partsupp (PS_PARTKEY bigint, PS_SUPPKEY bigint, PS_AVAILQTY int, PS_SUPPLYCOST double, PS_COMMENT text) using csv with ('csvfile.delimiter'='|') location 'hdfs://x/y'

    create external table customer (C_CUSTKEY bigint, C_NAME text, C_ADDRESS text, C_NATIONKEY bigint, C_PHONE text, C_ACCTBAL double, C_MKTSEGMENT text, C_COMMENT text) using csv with ('csvfile.delimiter'='|') location 'hdfs://x/y'

    create external table orders (O_ORDERKEY bigint, O_CUSTKEY bigint, O_ORDERSTATUS text, O_TOTALPRICE double, O_ORDERDATE text, O_ORDERPRIORITY text, O_CLERK text, O_SHIPPRIORITY int, O_COMMENT text) using csv with ('csvfile.delimiter'='|') location 'hdfs://x/y'

    create external table nation (N_NATIONKEY bigint, N_NAME text, N_REGIONKEY bigint, N_COMMENT text) using csv with ('csvfile.delimiter'='|') location 'hdfs://x/y' 

    create external table region (R_REGIONKEY bigint, R_NAME text, R_COMMENT text) using csv with ('csvfile.delimiter'='|') location 'hdfs://x/y'

TPC-H Queries


    select l_returnflag, l_linestatus, sum(l_quantity) as sum_qty, sum(l_extendedprice) as sum_base_price, sum(l_extendedprice*(1-l_discount)) as sum_disc_price, sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge, avg(l_quantity) as avg_qty, avg(l_extendedprice) as avg_price, avg(l_discount) as avg_disc, count(*) as count_order from lineitem where l_shipdate <= '1998-09-01' group by l_returnflag, l_linestatus order by l_returnflag, l_linestatus


Tajo does not support sclar subquery yet. So, you should use multiple queries as follows:

    create table nation_region as select n_regionkey, r_regionkey, n_nationkey, n_name, r_name from region join nation on n_regionkey = r_regionkey where r_name = 'EUROPE'

    create table r2_1 as select s_acctbal, s_name, n_name, p_partkey, p_mfgr, s_address, s_phone, s_comment, ps_supplycost from nation_region join supplier on s_nationkey = n_nationkey join partsupp on s_suppkey = ps_suppkey join part on p_partkey = ps_partkey  where p_size = 15 and p_type like '%BRASS'

    create table r2_2 as select p_partkey, min(ps_supplycost) as min_ps_supplycost from r2_1 group by p_partkey

    select s_acctbal, s_name, n_name, r2_1.p_partkey, p_mfgr, s_address, s_phone, s_comment from r2_1 join r2_2 on r2_1.p_partkey = r2_2.p_partkey where ps_supplycost = min_ps_supplycost order by s_acctbal, n_name, s_name, r2_1.p_partkey


    select l_orderkey,  sum(l_extendedprice*(1-l_discount)) as revenue, o_orderdate, o_shippriority from customer as c join orders as o on c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey join lineitem as l on l.l_orderkey = o.o_orderkey where o_orderdate < '1995-03-15' and l_shipdate > '1995-03-15' group by l_orderkey, o_orderdate, o_shippriority order by revenue desc, o_orderdate


    select sum(l_extendedprice*l_discount) as revenue from lineitem where l_shipdate >= '1994-01-01' and l_shipdate < '1995-01-01' and l_discount >= 0.05 and l_discount <= 0.07 and l_quantity < 24;


    select c_custkey, c_name, sum(l_extendedprice * (1 - l_discount)) as revenue, c_acctbal, n_name, c_address, c_phone, c_comment from customer_rc as c join nation_rc as n on c.c_nationkey = n.n_nationkey join orders as o on c.c_custkey = o.o_custkey and o.o_orderdate >= '1993-10-01' and o.o_orderdate < '1994-01-01' join lineitem_rc as l on l.l_orderkey = o.o_orderkey and l.l_returnflag = 'R' group by c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment order by revenue desc


    select l_shipmode, sum(case when o_orderpriority ='1-URGENT' or o_orderpriority ='2-HIGH' then 1 else 0 end) as high_line_count, sum(case when o_orderpriority <> '1-URGENT' and o_orderpriority <> '2-HIGH' then 1 else 0 end) as low_line_count from orders, lineitem where o_orderkey = l_orderkey and (l_shipmode = 'MAIL' or l_shipmode = 'SHIP') and l_commitdate < l_receiptdate and l_shipdate < l_commitdate and l_receiptdate >= '1994-01-01' and l_receiptdate < '1995-01-01' group by l_shipmode order by l_shipmode


    select 100.00 * sum(case when p_type like 'PROMO%' then l_extendedprice*(1-l_discount) else 0 end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue from lineitem, part where l_partkey = p_partkey and l_shipdate >= '1995-09-01' and l_shipdate < '1995-10-01'

comments powered by Disqus