Created
March 24, 2012 00:25
-
-
Save ericbmerritt/2176689 to your computer and use it in GitHub Desktop.
myrmas path compression
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%% Myrmas Path Compression | |
%%% ======================= | |
%%% * author: Eric Merritt | |
%%% * copyright: Afiniate, Inc | |
%%% | |
%%% License | |
%%% ------- | |
%%% Licensed under the Apache License, Version 2.0 you may not use | |
%%% this file except in compliance with the License. You may obtain a | |
%%% copy of the License at http://www.apache.org/licenses/LICENSE-2.0 | |
%%% | |
%%% Overview | |
%%% -------- | |
%%% | |
%%% Path compression is about taking a tree (which all erlang terms | |
%%% are) and compressing that tree to a set of id/value pairs. In this | |
%%% case the ide is a representation of that leaf's unique position in | |
%%% the tree. This allows us to flatten the tree and produce a set of | |
%%% values that represent it in a consistant way. The downside of this | |
%%% is, of course, greater space usage. However the upside is much | |
%%% simpler manipulation and distribution. | |
%%% | |
%%% | |
%%% Path Compression | |
%%% ---------------- | |
%%% | |
%%% Path compression is down using 16 bit values to represent every | |
%%% level in the tree (every vertice) and the edge. The first two bits | |
%%% is used to uniquely identify the type of the edge, while the | |
%%% remaining 13bits is used to uniquely represent the position in the | |
%%% at that level. | |
%%% | |
%%% To illustrate. Lets take the simple value foo. Is an atom (tag 6) | |
%%% that has a position of zero so we could encode its value as | |
%%% <<?ATOM_TAG:3, 0:13>>. | |
%%% | |
%%% Lets take another example. Something more complex. A list of two values. | |
%%% | |
%%% [1, 2] | |
%%% | |
%%% We will end up with a list of tuples, each identified by a 32 bit | |
%%% attribute. 16 bits to identify the list and 16 bits to identify | |
%%% the integer. so the first value in the attribute will be | |
%%% | |
%%% <<?LIST_TAG:3, 0:13>> | |
%%% | |
%%% This will be followed for each value by the id for the leaf. So in | |
%%% the case of '1' the identifier will look like. | |
%%% | |
%%% << <<?INT_TAG:3, 0:13>>, <<?LIST_TAG:3, 0:13>> >> | |
%%% | |
%%% We are using a bit of psuedo code here to show the value. The | |
%%% attribute for 2 will look as folows. | |
%%% | |
%%% << <<?INT_TAG:3, 1:13>>, <<?LIST_TAG:3, 0:13>> >> | |
%%% | |
%%% You may notice that the attribute set is in reverse order. we do | |
%%% that simply because thats how we build up the binary. | |
%%% | |
%%% Because we expect to interact with native code that has certain | |
%%% limitations we are going to limit the size of ints to those that | |
%%% can be represented by a 64 bit integer. | |
-module(myr_path). | |
-export([compress/1]). | |
-define(MAX_SIZE_REPR, 8191). | |
-define(MAX_INTEGER, 9223372036854775807). | |
-define(MIN_INTEGER, -9223372036854775808). | |
-define(TUPLE_TAG, 1). | |
-define(LIST_TAG, 2). | |
-define(INT_TAG, 3). | |
-define(UNSIGNED_TAG, 4). | |
-define(FLOAT_TAG, 5). | |
-define(BIN_TAG, 6). | |
-define(ATOM_TAG, 7). | |
-define(TUPLE_VAL, 16#EFFAB1E). | |
-define(LIST_VAL, 1157). | |
%%==================================================================== | |
%% API | |
%%==================================================================== | |
compress(Term) -> | |
%% We use md5 here because it has a reasonably low chance of | |
%% collision while being 128 bit, which will allow us to do a | |
%% single instruction compare on the native side. | |
Id = create_id(Term), | |
case flatten(Id, Term, 0, <<>>) of | |
Result when is_list(Result) -> | |
lists:flatten(Result); | |
Else -> | |
Else | |
end. | |
flatten(Id, Term, Count, Attr) when is_tuple(Term) -> | |
StageRep = type(Term, Count), | |
NewAttr = <<StageRep/binary, Attr/binary>>, | |
{Vals, _} = | |
lists:foldl(fun(El, {DataAcc, NewCount}) -> | |
{[flatten(Id, El, NewCount, NewAttr) | DataAcc], | |
NewCount + 1} | |
end, {[], 0}, erlang:tuple_to_list(Term)), | |
[{Id, StageRep, ?TUPLE_VAL} | Vals]; | |
flatten(Id, Term, Count, Attr) when is_list(Term) -> | |
StageRep = type(Term, Count), | |
NewAttr = <<StageRep/binary, Attr/binary>>, | |
{Vals, _} = | |
lists:foldl(fun(El, {DataAcc, NewCount}) -> | |
{[flatten(Id, El, NewCount, NewAttr) | DataAcc], | |
NewCount + 1} | |
end, {[], 0}, Term), | |
[{Id, StageRep, ?LIST_VAL} | Vals]; | |
flatten(Id, Term, Count, Attr) -> | |
StageRep = type(Term, Count), | |
NewAttr = <<StageRep/binary, Attr/binary>>, | |
{Id, NewAttr, Term}. | |
type(Term, Count) when is_tuple(Term), | |
Count =< ?MAX_SIZE_REPR -> | |
<<?TUPLE_TAG:3, Count:13>>; | |
type(Term, Count) when is_list(Term), | |
Count =< ?MAX_SIZE_REPR -> | |
<<?LIST_TAG:3, Count:13>>; | |
type(Term, Count) when is_integer(Term), | |
Count =< ?MAX_SIZE_REPR, | |
Term >= ?MIN_INTEGER, | |
Term =< ?MAX_INTEGER -> | |
<<?INT_TAG:3, Count:13>>; | |
type(Term, Count) when is_float(Term), | |
Count =< ?MAX_SIZE_REPR -> | |
<<?FLOAT_TAG:3, Count:13>>; | |
type(Term, Count) when is_binary(Term), | |
Count =< ?MAX_SIZE_REPR -> | |
<<?BIN_TAG:3, Count:13>>; | |
type(Term, Count) when is_atom(Term), | |
Count =< ?MAX_SIZE_REPR -> | |
<<?ATOM_TAG:3, Count:13>>; | |
type(Term, _Count) -> | |
erlang:throw({unencodable_term, Term}). | |
create_id(Term) -> | |
crypto:md5(erlang:term_to_binary(Term)). | |
%%==================================================================== | |
%% Tests | |
%%==================================================================== | |
-ifndef(NOTEST). | |
-include_lib("eunit/include/eunit.hrl"). | |
basic_test() -> | |
AtomId = create_id(foo), | |
AtomAttr = <<?ATOM_TAG:3, 0:13>>, | |
?assertMatch({AtomId, AtomAttr, foo}, | |
compress(foo)), | |
TupleId = create_id({foo}), | |
Tuple0Attr = <<?TUPLE_TAG:3, 0:13>>, | |
Atom1Attr = <<?ATOM_TAG:3, 0:13>>, | |
Tuple1Attr = <<Atom1Attr/binary, Tuple0Attr/binary>>, | |
?assertMatch([{TupleId, | |
<<32,0>>, | |
251636510}, | |
{TupleId, Tuple1Attr, foo}], | |
compress({foo})), | |
ListId = create_id([foo]), | |
List0Attr = <<?LIST_TAG:3, 0:13>>, | |
List1Attr = <<Atom1Attr/binary, List0Attr/binary>>, | |
?assertMatch([{ListId, <<64,0>>, 1157}, | |
{ListId, List1Attr, foo}], | |
compress([foo])), | |
IntegerId = create_id(-1), | |
IntegerAttr = <<?INT_TAG:3, 0:13>>, | |
?assertMatch({IntegerId, IntegerAttr, -1}, | |
compress(-1)), | |
FloatId = create_id(7.1e+50), | |
FloatAttr = <<?FLOAT_TAG:3, 0:13>>, | |
?assertMatch({FloatId, FloatAttr, 7.1e+50}, | |
compress(7.1e+50)), | |
BinaryId = create_id(<<"foo">>), | |
BinaryAttr = <<?BIN_TAG:3, 0:13>>, | |
?assertMatch({BinaryId, BinaryAttr, <<"foo">>}, | |
compress(<<"foo">>)). | |
-endif. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment