Last active
November 30, 2022 20:10
-
-
Save samukweku/c2a83fb37cd8d43e3f0e7742db8d653b to your computer and use it in GitHub Desktop.
Code for pydata global conditional_join talk
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data is from DuckDB's github repo | |
# https://github.com/duckdb/duckdb/tree/master/benchmark/micro/join | |
import pandas as pd; import numpy as np; import janitor as jn; import duckdb | |
query = """SELECT SETSEED(0.8675309); | |
CREATE TABLE events AS ( | |
SELECT *, | |
"start" + INTERVAL (CASE WHEN random() < 0.1 THEN 120 ELSE (5 + round(random() * 50, 0)::BIGINT) END) MINUTE | |
AS "end" | |
FROM ( | |
SELECT id, | |
'Event ' || id::VARCHAR as "name", | |
(5 + round(random() * 5000, 0)::BIGINT) AS audience, | |
'1992-01-01'::TIMESTAMP | |
+ INTERVAL (round(random() * 40 * 365, 0)::BIGINT) DAY | |
+ INTERVAL (round(random() * 23, 0)::BIGINT) HOUR | |
AS "start", | |
'Sponsor ' || (1 + round(random() * 10, 0)::BIGINT) AS sponsor | |
FROM range(1, 30000) tbl(id) | |
) q | |
);""" | |
con = duckdb.connect() | |
con.execute(query) # create table | |
# duckdb computation | |
event_count = """SELECT COUNT(*) FROM ( | |
SELECT r.id, s.id | |
FROM events r, events s | |
WHERE r.start <= s.end AND r.end >= s.start | |
AND r.id <> s.id | |
) q2;""" | |
%timeit con.execute(event_count) | |
# create Pandas dataframe and run conditional_join | |
events = con.execute("select * from events").df() | |
%%timeit | |
(events | |
.conditional_join( | |
events, | |
("start", "end", "<="), | |
("end", "start", ">="), | |
("id", "id", "!="), | |
use_numba = False) | |
) | |
%%timeit | |
(events | |
.conditional_join( | |
events, | |
("start", "end", "<="), | |
("end", "start", ">="), | |
("id", "id", "!="), | |
use_numba = True) | |
) | |
##### Tax audits | |
query = """CREATE TYPE surname_t AS ENUM ( | |
'Smith', | |
'Johnson', | |
'Williams', | |
'Jones', | |
'Brown', | |
'Davis', | |
'Miller', | |
'Wilson', | |
'Moore', | |
'Taylor', | |
'Anderson', | |
'Thomas', | |
'Jackson', | |
'White', | |
'Harris', | |
'Martin', | |
'Thompson', | |
'Garcia', | |
'Martinez', | |
'Robinson' | |
); | |
SELECT SETSEED(0.8675309); | |
CREATE TABLE employees AS | |
SELECT | |
facts.id AS id, | |
surname AS "name", | |
dept, | |
salary, | |
(salary / 10 - CASE WHEN random() <= 0.01 THEN (10 + 1) ELSE 0 END)::INTEGER AS tax | |
FROM ( | |
SELECT | |
id, | |
enum_range(NULL::surname_t)[(round(random() * 19))::INTEGER] AS surname, | |
round(random() * 5)::INTEGER AS dept, | |
100 * id AS salary | |
FROM (SELECT UNNEST(range(1, 10000000))) tbl(id) | |
) facts | |
;""" | |
con = duckdb.connect() | |
con.execute(query) | |
counts = """SELECT COUNT(*) FROM ( | |
SELECT r.id, s.id | |
FROM employees r, employees s | |
WHERE r.salary < s.salary AND r.tax > s.tax | |
) q1;""" | |
%timeit con.execute(counts) | |
# creata Pandas dataframe | |
employees = con.execute("select * from employees").df() | |
%%timeit | |
(employees | |
.conditional_join( | |
employees, | |
('salary', 'salary', '<'), | |
('tax', 'tax', '>'), | |
use_numba = True) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment