-
-
Save macdice/10001471 to your computer and use it in GitHub Desktop.
-- Midnight graph hacking with your host Thomas Munro <[email protected]> | |
create table data_source ( | |
id serial primary key, | |
name text not null | |
); | |
create table currency ( | |
id text primary key | |
); | |
create table quote_stream ( | |
id serial primary key, | |
data_source integer not null references data_source(id), | |
base_currency text not null references currency(id), | |
quote_currency text not null references currency(id) | |
); | |
create table quote ( | |
quote_stream integer not null references quote_stream(id), | |
time timestamptz not null, | |
bid decimal not null, | |
ask decimal not null | |
); | |
create index quote_time_idx on quote(quote_stream, time); | |
create or replace function last_quote_at_time(quote_stream integer, t timestamptz) | |
returns table (quote_stream integer, t timestamptz, bid decimal, ask decimal) | |
as | |
$$ | |
select quote_stream, time, bid, ask | |
from quote | |
where quote_stream = $1 | |
and time <= $2 | |
order by time desc | |
limit 1 | |
$$ | |
language sql; | |
create or replace function last_quotes_at_time(t timestamptz) | |
returns table (quote_stream integer, t2 timestamptz, bid decimal, ask decimal) | |
as | |
$$ | |
declare | |
quote_stream integer; | |
record record; | |
begin | |
for quote_stream in select id from quote_stream | |
loop | |
return query select * from last_quote_at_time(quote_stream, t); | |
end loop; | |
return; | |
end; | |
$$ | |
language plpgsql; | |
create or replace function two_way_rates_at_time(t timestamptz) | |
returns table (quote_stream integer, side text, from_currency text, to_currency text, rate decimal) | |
as | |
$$ | |
-- each quote is expanded to a pair of simple X->Y rates | |
with quote_at_time as (select q.quote_stream, | |
i.base_currency, | |
i.quote_currency, | |
q.bid, | |
q.ask, | |
q.t2 | |
from last_quotes_at_time($1) as q | |
join quote_stream i on q.quote_stream = i.id) | |
select quote_stream, 'BUY', quote_currency, base_currency, ask | |
from quote_at_time | |
union all | |
select quote_stream, 'SELL', base_currency, quote_currency, 1 / bid | |
from quote_at_time | |
$$ | |
language sql; | |
create type edge_descriptor as ( | |
quote_stream integer, | |
side text, | |
from_currency text, | |
to_currency text, | |
rate decimal | |
); | |
-- From http://stackoverflow.com/questions/8798055/finding-the-position-of-a-value-in-postgresql-arrays | |
create or replace function array_search(needle anyelement, haystack anyarray) | |
returns int as $$ | |
select i | |
from generate_subscripts($2, 1) as i | |
where $2[i] = $1 | |
order by i | |
$$ language sql stable; | |
-- The Bellman-Ford algorithm hacked up from pseudo-code at: | |
-- http://en.wikipedia.org/wiki/Bellman%E2%80%93Ford_algorithm | |
-- Adjusted for products rather than sums, using natural log, as described at: | |
-- http://math.stackexchange.com/questions/271809/does-dijkstras-algorithm-work-when-i-multiply-weights-of-successive-nodes | |
-- Also hacked to track the 'route' through the graph via edges, | |
-- unlike the Wikipedia pseudo-code which only tracks the predecessor, | |
-- since we might have more than one edge between two given vertices | |
-- (I mean we might have two brokers quoting EUR/USD with different | |
-- rates and we need to find the best one, so there are TWO edges | |
-- between EUR and USD). | |
create or replace function cheapest_path_at_time(t timestamptz, from_currency text, to_currency text) | |
returns table (quote_stream integer, side text, rate decimal) | |
as | |
$$ | |
declare | |
vertices text[]; -- the distinct vertices | |
edges edge_descriptor[]; -- the edges | |
edge_from integer[]; -- the edge sources as index to vertices | |
edge_to integer[]; -- the edge targets as index to vertices | |
edge_weight double precision[]; -- the edge weights, log(rate) | |
distance double precision[]; -- computed distances | |
predecessor integer[]; -- computed path as index to vertices | |
route integer[]; -- arrival edge index indexed by vertex | |
edge integer; -- temp variable | |
u integer; -- ditto | |
v integer; -- ditto | |
w double precision; -- ditto | |
to_currency_index integer; -- to currency as index to vertices | |
path integer[]; -- where we build the result | |
begin | |
edges := array(select two_way_rates_at_time(t)); | |
vertices := array(select distinct e.from_currency from unnest(edges) e); | |
edge_from := array(select array_search(e.from_currency, vertices) from unnest(edges) e); | |
edge_to := array(select array_search(e.to_currency, vertices) from unnest(edges) e); | |
edge_weight := array(select ln(e.rate) from unnest(edges) e); | |
--raise notice 'edge_from = %', edge_from; | |
--raise notice 'edge_to = %', edge_to; | |
--raise notice 'edges = %', edges; | |
--raise notice 'vertices = %', vertices; | |
--raise notice 'edge_weight = %', edge_weight; | |
to_currency_index := array_search(to_currency, vertices); | |
if to_currency_index is null then | |
raise exception 'Currency % is unknown or unreachable', to_currency; | |
-- or just return no rows? | |
end if; | |
-- Step 1: initialize graph | |
for v in 1..array_length(vertices, 1) loop | |
if vertices[v] = from_currency then | |
distance[v] := 0; | |
else | |
distance[v] := 'Infinity'; | |
end if; | |
predecessor[v] := null; | |
route[v] := null; | |
end loop; | |
--raise notice 'distance = %', distance; | |
--raise notice 'predecessor = %', predecessor; | |
--raise notice 'route = %', route; | |
-- Step 2: relax edges repeatedly | |
for i in 1..array_length(vertices, 1) - 1 loop | |
--raise notice 'i = %', i; | |
for edge in 1..array_length(edges, 1) loop | |
--raise notice 'edge = %', edge; | |
u := edge_from[edge]; | |
v := edge_to[edge]; | |
w := edge_weight[edge]; | |
if distance[u] + w < distance[v] then | |
distance[v] := distance[u] + w; | |
predecessor[v] := u; | |
route[v] := edge; | |
end if; | |
end loop; | |
end loop; | |
--raise notice 'distance = %', distance; | |
--raise notice 'predecessor = %', predecessor; | |
--raise notice 'route = %', route; | |
-- Step 3: check for negative-weight cycles | |
for edge in 1..array_length(edges, 1) loop | |
u := edge_from[edge]; | |
v := edge_to[edge]; | |
w := edge_weight[edge]; | |
if distance[u] + w < distance[v] then | |
raise exception 'Graph contains a negative-weight cycle'; | |
end if; | |
end loop; | |
-- decode the results to find the path to to_currency, | |
-- building path in the right order | |
edge := route[to_currency_index]; | |
while edge is not null loop | |
path := edge || path; | |
edge := route[edge_from[edge]]; | |
end loop; | |
-- generate the result rows | |
foreach edge in array path loop | |
quote_stream := edges[edge].quote_stream; | |
side := edges[edge].side; | |
rate := edges[edge].rate; | |
return next; | |
end loop; | |
end; | |
$$ | |
language plpgsql; | |
insert into data_source values (1, 'Banque de Foo'), (2, 'Banco de Bar'); | |
insert into currency values ('AUD'), ('USD'), ('EUR'), ('GBP'), ('BTC'); | |
insert into quote_stream values (1, 1, 'BTC', 'USD'), | |
(2, 1, 'AUD', 'USD'), | |
(3, 1, 'EUR', 'USD'), | |
(4, 1, 'EUR', 'GBP'), | |
(5, 2, 'EUR', 'USD'); | |
insert into quote values (1, '2014-01-01 12:00:00+00'::timestamptz, 440, 450), | |
(1, '2014-01-01 12:05:00+00'::timestamptz, 450, 460), | |
(2, '2014-01-01 12:01:00+00'::timestamptz, 0.9242, 0.9244), | |
(3, '2014-01-01 12:01:00+00'::timestamptz, 1.3790, 1.3792), | |
(4, '2014-01-01 12:01:00+00'::timestamptz, 0.8294, 0.8296), | |
(5, '2014-01-01 12:01:00+00'::timestamptz, 1.3790, 1.3791); | |
-- Ok check dis: | |
-- Here are our quote streams: | |
hack=> select * from quote_stream; | |
id | data_source | base_currency | quote_currency | |
----+-------------+---------------+---------------- | |
1 | 1 | BTC | USD | |
2 | 1 | AUD | USD | |
3 | 1 | EUR | USD | |
4 | 1 | EUR | GBP | |
5 | 2 | EUR | USD | |
(5 rows) | |
-- Here are the active (ie most recent) quotes at time 'now()': | |
hack=> select * from last_quotes_at_time(now()); | |
quote_stream | t2 | bid | ask | |
--------------+------------------------+--------+-------- | |
1 | 2014-01-01 12:05:00+00 | 450 | 460 | |
2 | 2014-01-01 12:01:00+00 | 0.9242 | 0.9244 | |
3 | 2014-01-01 12:01:00+00 | 1.3790 | 1.3792 | |
4 | 2014-01-01 12:01:00+00 | 0.8294 | 0.8296 | |
5 | 2014-01-01 12:01:00+00 | 1.3790 | 1.3791 | |
(5 rows) | |
-- Here is the same information exploded into individual edges: | |
hack=> select * from two_way_rates_at_time(now()); | |
quote_stream | side | from_currency | to_currency | rate | |
--------------+------+---------------+-------------+------------------------ | |
1 | BUY | USD | BTC | 460 | |
2 | BUY | USD | AUD | 0.9244 | |
3 | BUY | USD | EUR | 1.3792 | |
4 | BUY | GBP | EUR | 0.8296 | |
5 | BUY | USD | EUR | 1.3791 | |
1 | SELL | BTC | USD | 0.00222222222222222222 | |
2 | SELL | AUD | USD | 1.0820168794633196 | |
3 | SELL | EUR | USD | 0.72516316171138506164 | |
4 | SELL | EUR | GBP | 1.2056908608632747 | |
5 | SELL | EUR | USD | 0.72516316171138506164 | |
(10 rows) | |
-- Given those the rates, and the desire to get from BTC to GBP, | |
-- find the cheapest path: | |
hack=> select * from cheapest_path_at_time(now(), 'BTC', 'GBP'); | |
quote_stream | side | rate | |
--------------+------+------------------------ | |
1 | SELL | 0.00222222222222222222 | |
5 | BUY | 1.3791 | |
4 | SELL | 1.2056908608632747 | |
(3 rows) | |
In English, this says you should: | |
sell BTC/USD to go BTC->USD with quote stream 1 (data source 1) | |
buy EUR/USD to go USD->EUR with quote stream 5 (data source 2) | |
sell EUR/GBP to go EUR->GBP with quote stream 4 (data source 1) | |
Note that in the second step it correctly picked stream 5 over stream 3 | |
because it's one pip cheaper to buy (ie the ask quote). | |
The total effective rate is the product of the rates shown, so | |
0.003695040591555365, or 1/270.63... which is about right for | |
the BTC/GBP rate. | |
Hardly tested but it seems to work. | |
Two problems I imagine: | |
* there may occasionally be negative weight cycles, that is, a path | |
you can go around repeatedly to get free money, but these would | |
most likely be mirages caused by stale data or some other data | |
quality issue. These cycles do occasionally exist in real | |
life currency markets -- see http://scenic.princeton.edu/network20q/wiki/index.php?title=Bellman_Ford_Application:_Currency_Arbitrage | |
that may sound like a fun prospect but it breaks this code because | |
it means there is no 'cheapest' path, it could always go around | |
once more to get cheaper. | |
* this doesn't account for stale quotes, like, some feed you set | |
up that has a most recent quote from last week sitting there | |
which is actually bogus -- probably need to introduce the concept | |
of quote expiry time, and ignore quotes older than that |
I could fix that by explicitly reversing the results... The result of BF is the shortest path from your chosen source node to every other node, and for each node it tells you which one is the 'predecessor' in the path from the source (unlike the Wikipedia article's pseudocode I actually record not just the predecessor node but also the edge used, as 'route', since you could have multiple edges between two nodes eg quotes from several banks for the same rate, whereas the pseudocode given would just tell you the order of the nodes to visit) . So in that last bit at line 194 you can see I have to find the destination node, and then hop back edge by edge to the source, to get the full path, feeding the result rows out as I go. Backwards!
Fixed, the result of cheapest_path_at_time is now in forwards order.
Luvvley! How come the cheapest_path_at_time() results are in reverse order?