Created
February 26, 2018 11:59
-
-
Save hannes/fbf26247d5879cffc9de2b31460c3d31 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pkg.add("CSV") | |
Pkg.update() | |
using DataFrames, CSV | |
wr = true | |
ne = false | |
region = CSV.read("region.tbl", delim='|', header=["r_regionkey", "r_name", "r_comment"], weakrefstrings=wr, nullable=ne) | |
nation = CSV.read("nation.tbl", delim='|', header=["n_nationkey", "n_name", "n_regionkey", "n_comment"], weakrefstrings=wr, nullable=ne) | |
supplier = CSV.read("supplier.tbl", delim='|', header=["s_suppkey","s_name","s_address","s_nationkey","s_phone","s_acctbal","s_comment"], weakrefstrings=wr, nullable=ne) | |
customer = CSV.read("customer.tbl", delim='|', header=["c_custkey","c_name","c_address","c_nationkey","c_phone","c_acctbal","c_mktsegment","c_comment"], weakrefstrings=wr, nullable=ne) | |
part = CSV.read("part.tbl", delim='|', header=["p_partkey","p_name","p_mfgr","p_brand","p_type","p_size","p_container","p_retailprice","p_comment"], weakrefstrings=wr, nullable=ne) | |
partsupp = CSV.read("partsupp.tbl", delim='|', header=["ps_partkey","ps_suppkey","ps_availqty","ps_supplycost","ps_comment"], weakrefstrings=wr, nullable=ne) | |
orders = CSV.read("orders.tbl", delim='|', header=["o_orderkey","o_custkey","o_orderstatus","o_totalprice","o_orderdate","o_orderpriority","o_clerk","o_shippriority","o_comment"], weakrefstrings=wr, nullable=ne) | |
lineitem = CSV.read("lineitem.tbl", delim='|', header=["l_orderkey","l_partkey","l_suppkey","l_linenumber","l_quantity","l_extendedprice","l_discount","l_tax","l_returnflag","l_linestatus","l_shipdate","l_commitdate","l_receiptdate","l_shipinstruct","l_shipmode","l_comment"], weakrefstrings=wr, nullable=ne) | |
function q1() | |
gc_enable(false) | |
res = sort!(by(lineitem[lineitem[:l_shipdate] .<= Date("1998-09-01"), [:l_returnflag, :l_linestatus, :l_quantity, :l_extendedprice, :l_discount, :l_tax]], [:l_returnflag, :l_linestatus], df -> DataFrame( | |
sum_qty = sum(df[:l_quantity]), | |
sum_base_price = sum(df[:l_extendedprice]), | |
sum_disc_price = sum(df[:l_extendedprice] .* (1 - df[:l_discount])), | |
sum_charge = sum(df[:l_extendedprice] .* (1 - df[:l_discount]) .* (1 + df[:l_tax])), | |
avg_qty = mean(df[:l_quantity]), | |
avg_price = mean(df[:l_extendedprice]), | |
avg_disc = mean(df[:l_discount]), | |
count_order = nrow(df) | |
)), cols=[:l_returnflag, :l_linestatus]) | |
gc_enable(true) | |
res | |
end | |
function q2() | |
gc_enable(false) | |
ps = partsupp[[:ps_partkey, :ps_suppkey, :ps_supplycost]] | |
p = part[part[:p_size] .== 15 .& map(x->ismatch(r".*BRASS$", x), part[:p_type]), [:p_partkey, :p_mfgr]] | |
psp = join(ps, p, on = :ps_partkey => :p_partkey) | |
sp = supplier[[:s_suppkey, :s_nationkey, :s_acctbal, :s_name, :s_address, :s_phone, :s_comment]] | |
psps = join(psp, sp, on = :ps_suppkey => :s_suppkey) | |
nr = join(nation, region, on = :n_regionkey => :r_regionkey)[[:n_nationkey, :n_name]] | |
pspsnr = join(psps, nr, on = :s_nationkey => :n_nationkey)[[:ps_partkey, :ps_supplycost, :p_mfgr, :n_name, :s_acctbal, :s_name, :s_address, :s_phone, :s_comment]] | |
aggr = by(pspsnr, :ps_partkey, df -> DataFrame(ps_supplycost = minimum(df[:ps_supplycost]))) | |
sj = join(pspsnr, aggr, on = [:ps_partkey, :ps_supplycost]) | |
res = head(sort(sj[[:s_acctbal, :s_name, :n_name, :ps_partkey, :p_mfgr, :s_address, :s_phone, :s_comment]], | |
cols=[order(:s_acctbal, rev=true), :n_name, :s_name, :ps_partkey]), 100) | |
gc_enable(true) | |
res | |
end | |
function q3() | |
gc_enable(false) | |
o = orders[orders[:o_orderdate] .<= Date("1995-03-15"), [:o_orderkey, :o_custkey, :o_orderdate, :o_shippriority]] | |
c = customer[customer[:c_mktsegment] .== "BUILDING", [:c_custkey, :c_mktsegment]] | |
oc = join(o, c, on = :o_custkey => :c_custkey)[[:o_orderkey, :o_orderdate, :o_shippriority]] | |
l = lineitem[lineitem[:l_shipdate] .> Date("1995-03-15"), [:l_orderkey, :l_extendedprice, :l_discount]] | |
loc = join(oc, l, on = :o_orderkey => :l_orderkey) | |
aggr = by(loc, [:o_orderkey, :o_orderdate, :o_shippriority], df -> DataFrame( | |
revenue = sum(df[:l_extendedprice] .* (1 - df[:l_discount])) | |
)) | |
res = head(sort(aggr[[:o_orderkey, :revenue, :o_orderdate, :o_shippriority]], | |
cols=[order(:revenue, rev=true), :o_orderkey]), 10) | |
gc_enable(true) | |
res | |
end | |
function q4() | |
gc_enable(false) | |
l = lineitem[lineitem[:l_commitdate] .< lineitem[:l_receiptdate], [:l_orderkey]] | |
o = orders[(orders[:o_orderdate] .>= Date("1993-07-01")) .& | |
(orders[:o_orderdate] .< Date("1993-10-01")) , [:o_orderkey, :o_orderpriority]] | |
lo = unique(join(o, l, on = :o_orderkey => :l_orderkey))[[:o_orderpriority]] | |
res = sort(by(lo, :o_orderpriority, df -> DataFrame(order_count=nrow(df))), cols=[:o_orderpriority]) | |
gc_enable(true) | |
res | |
end | |
function q5() | |
gc_enable(false) | |
nr = join(nation, region[region[:r_name] .== "ASIA", :], | |
on = :n_regionkey => :r_regionkey)[[:n_nationkey, :n_name]] | |
snr = join(supplier[[:s_suppkey, :s_nationkey]], nr, on = :s_nationkey => :n_nationkey)[[:s_suppkey, :s_nationkey, :n_name]] | |
lsnr = join(snr, lineitem[[:l_suppkey, :l_orderkey, :l_extendedprice, :l_discount]], on = :s_suppkey => :l_suppkey)[[:l_orderkey, :l_extendedprice, :l_discount, :n_name, :s_nationkey]] | |
o = orders[(orders[:o_orderdate] .>= Date("1994-01-01")) .& | |
(orders[:o_orderdate] .< Date("1995-01-01")), [:o_orderkey, :o_custkey]] | |
oc = join(o, customer[[:c_custkey, "c_nationkey"]], on = :o_custkey => :c_custkey)[[:o_orderkey, :c_nationkey]] | |
lsnroc = join(oc, lsnr, on = [:o_orderkey => :l_orderkey, :c_nationkey => :s_nationkey])[[:l_extendedprice, :l_discount, :n_name]] | |
res = sort(by(lsnroc, :n_name, df -> DataFrame( | |
revenue = sum(df[:l_extendedprice] .* (1 - df[:l_discount])) | |
)), cols=[order(:revenue, rev=true)]) | |
gc_enable(true) | |
res | |
end | |
function q6() | |
gc_enable(false) | |
l = lineitem[(lineitem[:l_shipdate] .>= Date("1994-01-01")) .& | |
(lineitem[:l_shipdate] .< Date("1995-01-01")) .& | |
(lineitem[:l_discount] .>= 0.05) .& | |
(lineitem[:l_discount] .<= 0.07) .& | |
(lineitem[:l_quantity] .< 24), [:l_extendedprice, :l_discount]] | |
res = sum(l[:l_extendedprice] .* l[:l_discount]) | |
gc_enable(true) | |
res | |
end | |
function q7() | |
gc_enable(false) | |
n = nation[(nation[:n_name] .== "GERMANY") .| (nation[:n_name] .== "FRANCE"), [:n_nationkey, :n_name]] | |
sn = rename(join(supplier[[:s_nationkey, :s_suppkey]], n, on = :s_nationkey => :n_nationkey), :n_name => :n1_name)[[:s_suppkey, :n1_name]] | |
cn = rename(join(customer[[:c_custkey, :c_nationkey]], n, | |
on = :c_nationkey => :n_nationkey), :n_name => :n2_name) | |
cno = join(orders, cn, on = :o_custkey => :c_custkey)[[:o_orderkey, :n2_name]] | |
l = lineitem[(lineitem[:l_shipdate] .>= Date("1995-01-01")) .& | |
(lineitem[:l_shipdate] .<= Date("1995-12-31")), [:l_orderkey, :l_suppkey, :l_shipdate, :l_extendedprice, :l_discount]] | |
cnol = join(cno, l, on=:o_orderkey => :l_orderkey)[[:l_suppkey, :l_shipdate, :l_extendedprice, :l_discount, :n2_name]] | |
cnolsn = join(cnol, sn, on=:l_suppkey => :s_suppkey) | |
cnolsnf = cnolsn[((cnolsn[:n1_name] .== "FRANCE") .& | |
(cnolsn[:n2_name] .== "GERMANY")) .| | |
((cnolsn[:n1_name] .== "GERMANY") .& | |
(cnolsn[:n2_name] .== "FRANCE")), :] | |
cnolsnf[:l_year] = map(x -> Dates.year(x), cnolsnf[:l_shipdate]) | |
res = sort(by(cnolsnf, [:n1_name, :n2_name, :l_year], df -> DataFrame( | |
revenue = sum(df[:l_extendedprice] .* (1 - df[:l_discount])) | |
)), cols=[:n1_name, :n2_name, :l_year]) | |
gc_enable(true) | |
res | |
end | |
function q8() | |
gc_enable(false) | |
nr = join(nation, region[region[:r_name] .== "AMERICA", :], on = :n_regionkey => :r_regionkey)[[:n_nationkey]] | |
cnr = join(customer[[:c_custkey, :c_nationkey]], nr, on=:c_nationkey => :n_nationkey)[[:c_custkey]] | |
o = orders[(orders[:o_orderdate] .>= Date("1995-01-01")) .& | |
(orders[:o_orderdate] .<= Date("1996-12-31")) ,[:o_orderkey, :o_orderdate, :o_custkey]] | |
ocnr = join(o, cnr, on=:o_custkey => :c_custkey)[[:o_orderkey, :o_orderdate]] | |
locnr = join(lineitem[[:l_orderkey, :l_partkey, :l_suppkey, :l_extendedprice, :l_discount]], ocnr, | |
on = :l_orderkey => :o_orderkey)[[:l_partkey, :l_suppkey, :l_extendedprice, :l_discount, :o_orderdate]] | |
p = part[part[:p_type] .== "ECONOMY ANODIZED STEEL", [:p_partkey]] | |
locnrp = join(locnr, p, on=:l_partkey => :p_partkey)[[:l_suppkey, :l_extendedprice, :l_discount, :o_orderdate]] | |
locnrps = join(supplier[[:s_suppkey, :s_nationkey]], locnrp, | |
on=:s_suppkey => :l_suppkey)[[:l_extendedprice, :l_discount, :o_orderdate, :s_nationkey]] | |
locnrpsn = join(locnrps, nation[[:n_nationkey, :n_name]], on=:s_nationkey => :n_nationkey) | |
locnrpsn[:o_year] = map(x -> Dates.year(x), locnrpsn[:o_orderdate]) | |
locnrpsn[:volume] = locnrpsn[:l_extendedprice] .* (1 - locnrpsn[:l_discount]) | |
res = sort(by(locnrpsn, :o_year, df -> DataFrame( | |
mkt_share = sum(ifelse.(df[:n_name] .== "BRAZIL", df[:volume], 0)) / | |
sum(df[:volume]) | |
)), cols=:o_year) | |
gc_enable(true) | |
res | |
end | |
function q9() | |
gc_enable(false) | |
p = part[map(x->ismatch(r".*green.*", x), part[:p_name]), [:p_partkey]] | |
psp = join(partsupp[[:ps_suppkey, :ps_partkey, :ps_supplycost]], p, on=:ps_partkey => :p_partkey) | |
sn = join(supplier[[:s_suppkey, :s_nationkey]], nation[[:n_nationkey, :n_name]], on=:s_nationkey => :n_nationkey)[[:s_suppkey, :n_name]] | |
pspsn = join(psp, sn, on=:ps_suppkey => :s_suppkey) | |
lpspsn = join(lineitem[[:l_suppkey, :l_partkey, :l_orderkey, :l_extendedprice, :l_discount, :l_quantity]], pspsn, | |
on=[:l_suppkey => :ps_suppkey, :l_partkey => :ps_partkey])[[:l_orderkey, :l_extendedprice, :l_discount, :l_quantity, :ps_supplycost, :n_name]] | |
lpspsno = join(lpspsn, orders[[:o_orderkey, :o_orderdate]], | |
on=:l_orderkey => :o_orderkey)[[:l_extendedprice, :l_discount, :l_quantity, :ps_supplycost, :n_name, :o_orderdate]] | |
lpspsno[:o_year] = map(x -> Dates.year(x), lpspsno[:o_orderdate]) | |
lpspsno[:amount] = lpspsno[:l_extendedprice] .* (1 .- lpspsno[:l_discount]) .- | |
lpspsno[:ps_supplycost] .* lpspsno[:l_quantity] | |
res = sort(by(lpspsno[[:n_name, :o_year, :amount]], [:n_name, :o_year], df -> DataFrame( | |
sum_profit = sum(df[:amount]) | |
)), cols = [:n_name, order(:o_year, rev = true)]) | |
gc_enable(true) | |
res | |
end | |
function q10() | |
gc_enable(false) | |
l = lineitem[lineitem[:l_returnflag] .== "R", [:l_orderkey, :l_extendedprice, :l_discount]] | |
o = orders[(orders[:o_orderdate] .>= Date("1993-10-01")) .& | |
(orders[:o_orderdate] .< Date("1994-01-01")) ,[:o_orderkey, :o_custkey]] | |
lo = join(o, l, on = :o_orderkey => :l_orderkey) | |
lo[:volume] = lo[:l_extendedprice] .* (1 - lo[:l_discount]) | |
lo_aggr = by(lo, :o_custkey, df -> DataFrame(revenue = sum(df[:volume]))) | |
c = customer[[:c_custkey, :c_nationkey, :c_name, :c_acctbal, :c_phone, :c_address, :c_comment]] | |
loc = join(lo_aggr, c, on = :o_custkey => :c_custkey) | |
locn = join(loc, nation[[:n_nationkey, :n_name]], on = :c_nationkey => :n_nationkey) | |
res = head(sort(locn[[:o_custkey, :c_name, :revenue, :c_acctbal, :n_name, :c_address, :c_phone, :c_comment]], cols = order(:revenue, rev = true)), 20) | |
gc_enable(true) | |
res | |
end | |
function bench(f, n) | |
tic() | |
f() | |
r1 = toc() | |
tic() | |
f() | |
r2 = toc() | |
tic() | |
f() | |
r3 = toc() | |
tic() | |
f() | |
r4 = toc() | |
tic() | |
f() | |
r5 = toc() | |
CSV.write("julia.csv", DataFrame(exp="Julia", query=n, time_median_ms=median([r1, r2, r3, r4, r5])*1000); append=true) | |
end | |
bench(q1, 1) | |
bench(q2, 2) | |
bench(q3, 3) | |
bench(q4, 4) | |
bench(q5, 5) | |
bench(q6, 6) | |
bench(q7, 7) | |
bench(q8, 8) | |
bench(q9, 9) | |
bench(q10, 10) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import timeit, csv | |
region = pd.read_csv("region.tbl", sep='|', names=["r_regionkey", "r_name", "r_comment"]) | |
nation = pd.read_csv("nation.tbl", sep='|', names=["n_nationkey", "n_name", "n_regionkey", "n_comment"]) | |
supplier = pd.read_csv("supplier.tbl", sep='|', names=["s_suppkey","s_name","s_address","s_nationkey","s_phone","s_acctbal","s_comment"]) | |
customer = pd.read_csv("customer.tbl", sep='|', names=["c_custkey","c_name","c_address","c_nationkey","c_phone","c_acctbal","c_mktsegment","c_comment"], dtype={'c_mktsegment' : 'category'}) | |
part = pd.read_csv("part.tbl", sep='|', names=["p_partkey","p_name","p_mfgr","p_brand","p_type","p_size","p_container","p_retailprice","p_comment"], dtype={'p_container' : 'category'}) | |
partsupp = pd.read_csv("partsupp.tbl", sep='|', names=["ps_partkey","ps_suppkey","ps_availqty","ps_supplycost","ps_comment"]) | |
orders = pd.read_csv("orders.tbl", sep='|', names=["o_orderkey","o_custkey","o_orderstatus","o_totalprice","o_orderdate","o_orderpriority","o_clerk","o_shippriority","o_comment"], dtype={'o_orderstatus' : 'category', 'o_orderpriority' : 'category'}, parse_dates=['o_orderdate']) | |
lineitem = pd.read_csv("lineitem.tbl", sep='|', names=["l_orderkey","l_partkey","l_suppkey","l_linenumber","l_quantity","l_extendedprice","l_discount","l_tax","l_returnflag","l_linestatus","l_shipdate","l_commitdate","l_receiptdate","l_shipinstruct","l_shipmode","l_comment"], dtype={'l_returnflag': 'category', 'l_linestatus': 'category'}, parse_dates=['l_shipdate', 'l_commitdate', 'l_receiptdate']) | |
def udf_disc_price(extended, discount): | |
return np.multiply(extended, np.subtract(1, discount)) | |
def udf_charge(extended, discount, tax): | |
return np.multiply(extended, np.multiply(np.subtract(1, discount), np.add(1, tax))) | |
def q1(): | |
df = lineitem[["l_shipdate", "l_returnflag", "l_linestatus", "l_quantity", "l_extendedprice", "l_discount", "l_tax"]][(lineitem['l_shipdate'] <= '1998-09-01')] | |
df['disc_price'] = udf_disc_price(df['l_extendedprice'], df['l_discount']) | |
df['charge'] = udf_charge(df['l_extendedprice'], df['l_discount'], df['l_tax']) | |
return df.groupby(['l_returnflag', 'l_linestatus'])\ | |
.agg({'l_quantity': 'sum', 'l_extendedprice': 'sum', 'disc_price': 'sum', 'charge': 'sum', | |
'l_quantity': 'mean', 'l_extendedprice': 'mean', 'l_discount': 'mean', 'l_shipdate': 'count'}) | |
def q2(): | |
ps = partsupp[["ps_partkey", "ps_suppkey", "ps_supplycost"]] | |
p = part[["p_partkey", "p_mfgr", "p_size", "p_type"]][(part.p_size == 15) & (part.p_type.str.match(".*BRASS$"))][["p_partkey", "p_mfgr"]] | |
psp = ps.merge(p, left_on="ps_partkey", right_on="p_partkey") | |
s = supplier[["s_suppkey", "s_nationkey", "s_acctbal", "s_name", "s_address", "s_phone", "s_comment"]] | |
psps = psp.merge(s, left_on="ps_suppkey", right_on="s_suppkey")[["ps_partkey", "ps_supplycost", "p_mfgr", "s_nationkey", "s_acctbal", "s_name", "s_address", "s_phone", "s_comment"]] | |
nr = nation.merge(region[region.r_name == "EUROPE"], left_on="n_regionkey", right_on="r_regionkey")[["n_nationkey", "n_name"]] | |
pspsnr = psps.merge(nr, left_on="s_nationkey", right_on="n_nationkey")[["ps_partkey", "ps_supplycost", "p_mfgr", "n_name", "s_acctbal", "s_name", "s_address", "s_phone", "s_comment"]] | |
aggr = pspsnr.groupby("ps_partkey").agg({'ps_supplycost' : min}).reset_index() | |
sj = pspsnr.merge(aggr, left_on=["ps_partkey", "ps_supplycost"], right_on=["ps_partkey", "ps_supplycost"]) | |
res = sj[["s_acctbal", "s_name", "n_name", "ps_partkey", "p_mfgr", "s_address", "s_phone", "s_comment"]].sort_values(["s_acctbal", "n_name", "s_name", "ps_partkey"], ascending=[False, True, True, True]).head(100) | |
return res | |
def q3(): | |
o = orders[["o_orderkey", "o_custkey", "o_orderdate", "o_shippriority"]][orders.o_orderdate < "1995-03-15"][["o_orderkey", "o_custkey", "o_orderdate", "o_shippriority"]] | |
c = customer[["c_custkey", "c_mktsegment"]][customer.c_mktsegment == "BUILDING"][["c_custkey", "c_mktsegment"]] | |
oc = o.merge(c, left_on="o_custkey", right_on="c_custkey")[["o_orderkey", "o_orderdate", "o_shippriority"]] | |
l = lineitem[["l_orderkey", "l_extendedprice", "l_discount", "l_shipdate"]][lineitem.l_shipdate > "1995-03-15"][["l_orderkey", "l_extendedprice", "l_discount"]] | |
loc = l.merge(oc, left_on="l_orderkey", right_on="o_orderkey") | |
loc["volume"] = loc.l_extendedprice * (1 - loc.l_discount) | |
res = loc.groupby(["l_orderkey", "o_orderdate", "o_shippriority"]).agg({'volume' : sum}).reset_index()[["l_orderkey", "volume", "o_orderdate", "o_shippriority"]].sort_values(["volume", "o_orderdate"], ascending=[False, True]).head(10) | |
return res | |
def q4(): | |
l = lineitem[["l_orderkey", "l_commitdate"]][lineitem.l_commitdate < lineitem.l_receiptdate][["l_orderkey"]] | |
o = orders[["o_orderkey", "o_orderpriority", "o_orderdate"]][(orders.o_orderdate >= "1993-07-01") & (orders.o_orderdate < "1993-10-01")][["o_orderkey", "o_orderpriority"]] | |
lo = l.merge(o, left_on="l_orderkey", right_on="o_orderkey").drop_duplicates()[["o_orderpriority"]] | |
res = lo.groupby("o_orderpriority").size().reset_index(name='counts').sort_values('o_orderpriority') | |
return res | |
def q5(): | |
nr = nation.merge(region[region.r_name == "ASIA"], left_on="n_regionkey", right_on="r_regionkey")[["n_nationkey", "n_name"]] | |
snr = supplier[["s_suppkey", "s_nationkey"]].merge(nr, left_on="s_nationkey", right_on="n_nationkey")[["s_suppkey", "s_nationkey", "n_name"]] | |
lsnr = lineitem[["l_suppkey", "l_orderkey", "l_extendedprice", "l_discount"]].merge(snr, left_on="l_suppkey", right_on="s_suppkey") | |
o = orders[["o_orderkey", "o_custkey", "o_orderdate"]][(orders.o_orderdate >= "1994-01-01") & (orders.o_orderdate < "1995-01-01")][["o_orderkey", "o_custkey"]] | |
oc = o.merge(customer[["c_custkey", "c_nationkey"]], left_on="o_custkey", right_on="c_custkey")[["o_orderkey", "c_nationkey"]] | |
lsnroc = lsnr.merge(oc, left_on=["l_orderkey", "s_nationkey"], right_on=["o_orderkey", "c_nationkey"])[["l_extendedprice", "l_discount", "n_name"]] | |
lsnroc["volume"] = lsnroc.l_extendedprice * (1 - lsnroc.l_discount) | |
res = lsnroc.groupby("n_name").agg({'volume' : sum}).reset_index().sort_values("volume", ascending=False) | |
return res | |
def q6(): | |
l = lineitem[["l_extendedprice", "l_discount", "l_shipdate", "l_quantity"]][ | |
(lineitem.l_shipdate >= "1994-01-01") & | |
(lineitem.l_shipdate < "1995-01-01") & | |
(lineitem.l_discount >= 0.05) & | |
(lineitem.l_discount <= 0.07) & | |
(lineitem.l_quantity < 24)][["l_extendedprice", "l_discount"]] | |
res = (l.l_extendedprice * l.l_discount).sum() | |
return res | |
def q7(): | |
sn = supplier[["s_nationkey", "s_suppkey"]].merge(nation[["n_nationkey", "n_name"]][(nation.n_name == "FRANCE") | (nation.n_name == "GERMANY")], left_on="s_nationkey", right_on="n_nationkey")[["s_suppkey", "n_name"]] | |
sn.columns = ["s_suppkey", "n1_name"] | |
cn = customer[["c_custkey", "c_nationkey"]].merge(nation[["n_nationkey", "n_name"]][(nation.n_name == "FRANCE") | (nation.n_name == "GERMANY")], left_on="c_nationkey", right_on="n_nationkey")[["c_custkey", "n_name"]] | |
cn.columns = ["c_custkey", "n2_name"] | |
cno = orders[["o_custkey", "o_orderkey"]].merge(cn, left_on="o_custkey", right_on="c_custkey")[["o_orderkey", "n2_name"]] | |
cnol = lineitem[["l_orderkey", "l_suppkey", "l_shipdate", "l_extendedprice", "l_discount"]][(lineitem.l_shipdate >= "1995-01-01") & (lineitem.l_shipdate <= "1996-12-31")][["l_orderkey", "l_suppkey", "l_shipdate", "l_extendedprice", "l_discount"]].merge(cno, left_on="l_orderkey", right_on="o_orderkey")[["l_suppkey", "l_shipdate", "l_extendedprice", "l_discount", "n2_name"]] | |
cnolsn = cnol.merge(sn, left_on="l_suppkey", right_on="s_suppkey") | |
cnolsn["volume"] = cnolsn.l_extendedprice * (1 - cnolsn.l_discount) | |
cnolsn["l_year"] = cnolsn.l_shipdate.dt.year | |
cnolsnf = cnolsn[((cnolsn.n1_name == "FRANCE") & (cnolsn.n2_name == "GERMANY")) | ((cnolsn.n1_name == "GERMANY") & (cnolsn.n2_name == "FRANCE"))] | |
res = cnolsnf[["n1_name", "n2_name", "l_year", "volume"]].groupby(["n1_name", "n2_name", "l_year"]).agg({'volume' : sum}).reset_index().sort_values(["n1_name", "n2_name", "l_year"]) | |
return res | |
def q8(): | |
nr = nation.merge(region[region.r_name == "AMERICA"], left_on="n_regionkey", right_on="r_regionkey")[["n_nationkey"]] | |
cnr = customer[["c_custkey", "c_nationkey"]].merge(nr, left_on="c_nationkey", right_on="n_nationkey")[["c_custkey"]] | |
ocnr = orders[["o_orderkey", "o_custkey", "o_orderdate"]][(orders.o_orderdate >= "1995-01-01") & (orders.o_orderdate <= "1996-12-31")].merge(cnr, left_on="o_custkey", right_on="c_custkey")[["o_orderkey", "o_orderdate"]] | |
locnr = lineitem[["l_orderkey", "l_partkey", "l_suppkey", "l_extendedprice", "l_discount"]].merge(ocnr, left_on="l_orderkey", right_on="o_orderkey")[["l_partkey", "l_suppkey", "l_extendedprice", "l_discount", "o_orderdate"]] | |
p = part[["p_partkey", "p_type"]][part.p_type == "ECONOMY ANODIZED STEEL"][["p_partkey"]] | |
locnrp = locnr.merge(p, left_on="l_partkey", right_on="p_partkey")[["l_suppkey", "l_extendedprice", "l_discount", "o_orderdate"]] | |
locnrps = locnrp.merge(supplier[["s_suppkey", "s_nationkey"]], left_on="l_suppkey", right_on="s_suppkey")[["l_extendedprice", "l_discount", "o_orderdate", "s_nationkey"]] | |
locnrpsn = locnrps.merge(nation[["n_nationkey", "n_name"]], left_on="s_nationkey", right_on="n_nationkey")[["l_extendedprice", "l_discount", "o_orderdate", "n_name"]] | |
locnrpsn["volume"] = locnrpsn.l_extendedprice * (1 - locnrpsn.l_discount) | |
locnrpsn["o_year"] = locnrpsn.o_orderdate.dt.year | |
res = locnrpsn[["o_year", "volume", "n_name"]].groupby("o_year").apply(lambda df : pd.DataFrame({ | |
'mkt_share' : np.where(df.n_name == "BRAZIL", df.volume, 0).sum() | |
}, index=[0])).reset_index().sort_values("o_year") | |
return res | |
def q9(): | |
p = part[["p_partkey", "p_name"]][part.p_name.str.match(".*green.*")][["p_partkey"]] | |
psp = partsupp[["ps_suppkey", "ps_partkey", "ps_supplycost"]].merge(p, left_on="ps_partkey", right_on="p_partkey") | |
sn = supplier[["s_suppkey", "s_nationkey"]].merge(nation[["n_nationkey", "n_name"]], left_on="s_nationkey", right_on="n_nationkey")[["s_suppkey", "n_name"]] | |
pspsn = psp.merge(sn, left_on="ps_suppkey", right_on="s_suppkey") | |
lpspsn = lineitem[["l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"]].merge(pspsn, left_on=["l_suppkey", "l_partkey"], right_on=["ps_suppkey", "ps_partkey"])[["l_orderkey", "l_extendedprice", "l_discount", "l_quantity", "ps_supplycost", "n_name"]] | |
olpspsn = orders[["o_orderkey", "o_orderdate"]].merge(lpspsn, left_on="o_orderkey", right_on="l_orderkey")[["l_extendedprice", "l_discount", "l_quantity", "ps_supplycost", "n_name", "o_orderdate"]] | |
olpspsn["amount"] = olpspsn.l_extendedprice * (1 - olpspsn.l_discount) - olpspsn.ps_supplycost * olpspsn.l_quantity | |
olpspsn["o_year"] = olpspsn.o_orderdate.dt.year | |
res = olpspsn[["n_name", "o_year", "amount"]].groupby(["n_name", "o_year"]).agg({'amount' : sum}).reset_index().sort_values(["n_name", "o_year" ], ascending=[True, False]) | |
return res | |
def q10(): | |
l = lineitem[["l_orderkey", "l_extendedprice", "l_discount", "l_returnflag"]][lineitem.l_returnflag == "R"][["l_orderkey", "l_extendedprice", "l_discount"]] | |
o = orders[["o_orderkey", "o_custkey", "o_orderdate"]][(orders.o_orderdate >= "1993-10-01") & (orders.o_orderdate < "1994-01-01")][["o_orderkey", "o_custkey"]] | |
lo = l.merge(o, left_on="l_orderkey", right_on="o_orderkey")[["l_extendedprice", "l_discount", "o_custkey"]] | |
lo["volume"] = lo.l_extendedprice * (1 - lo.l_discount) | |
lo_aggr = lo.groupby("o_custkey").agg({'volume' : sum}).reset_index() | |
c = customer[["c_custkey", "c_nationkey", "c_name", "c_acctbal", "c_phone", "c_address", "c_comment"]] | |
loc = lo_aggr.merge(c, left_on="o_custkey", right_on="c_custkey") | |
locn = loc.merge(nation[["n_nationkey", "n_name"]], left_on="c_nationkey", right_on="n_nationkey") | |
res = locn[["o_custkey", "c_name", "volume", "c_acctbal", "n_name", "c_address", "c_phone", "c_comment"]].sort_values("volume", ascending=False).head(20) | |
return res | |
##### | |
n = 5 | |
f = open("pandas.csv", 'w') | |
writer = csv.writer(f) | |
def bench(q): | |
res = np.median(timeit.repeat("q%d()" % q, setup="from __main__ import q%d" % q, number=1, repeat=n)) | |
print(res) | |
writer.writerow(["pandas", "%d" % q, "%f" % res]) | |
f.flush() | |
bench(1) | |
bench(2) | |
bench(3) | |
bench(4) | |
bench(5) | |
bench(6) | |
bench(7) | |
bench(8) | |
bench(9) | |
bench(10) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment