Skip to content

Instantly share code, notes, and snippets.

@ericbmerritt
Created March 24, 2012 00:25
Show Gist options
  • Save ericbmerritt/2176689 to your computer and use it in GitHub Desktop.
Save ericbmerritt/2176689 to your computer and use it in GitHub Desktop.
myrmas path compression
%%% Myrmas Path Compression
%%% =======================
%%% * author: Eric Merritt
%%% * copyright: Afiniate, Inc
%%%
%%% License
%%% -------
%%% Licensed under the Apache License, Version 2.0 you may not use
%%% this file except in compliance with the License. You may obtain a
%%% copy of the License at http://www.apache.org/licenses/LICENSE-2.0
%%%
%%% Overview
%%% --------
%%%
%%% Path compression is about taking a tree (which all erlang terms
%%% are) and compressing that tree to a set of id/value pairs. In this
%%% case the ide is a representation of that leaf's unique position in
%%% the tree. This allows us to flatten the tree and produce a set of
%%% values that represent it in a consistant way. The downside of this
%%% is, of course, greater space usage. However the upside is much
%%% simpler manipulation and distribution.
%%%
%%%
%%% Path Compression
%%% ----------------
%%%
%%% Path compression is down using 16 bit values to represent every
%%% level in the tree (every vertice) and the edge. The first two bits
%%% is used to uniquely identify the type of the edge, while the
%%% remaining 13bits is used to uniquely represent the position in the
%%% at that level.
%%%
%%% To illustrate. Lets take the simple value foo. Is an atom (tag 6)
%%% that has a position of zero so we could encode its value as
%%% <<?ATOM_TAG:3, 0:13>>.
%%%
%%% Lets take another example. Something more complex. A list of two values.
%%%
%%% [1, 2]
%%%
%%% We will end up with a list of tuples, each identified by a 32 bit
%%% attribute. 16 bits to identify the list and 16 bits to identify
%%% the integer. so the first value in the attribute will be
%%%
%%% <<?LIST_TAG:3, 0:13>>
%%%
%%% This will be followed for each value by the id for the leaf. So in
%%% the case of '1' the identifier will look like.
%%%
%%% << <<?INT_TAG:3, 0:13>>, <<?LIST_TAG:3, 0:13>> >>
%%%
%%% We are using a bit of psuedo code here to show the value. The
%%% attribute for 2 will look as folows.
%%%
%%% << <<?INT_TAG:3, 1:13>>, <<?LIST_TAG:3, 0:13>> >>
%%%
%%% You may notice that the attribute set is in reverse order. we do
%%% that simply because thats how we build up the binary.
%%%
%%% Because we expect to interact with native code that has certain
%%% limitations we are going to limit the size of ints to those that
%%% can be represented by a 64 bit integer.
-module(myr_path).
-export([compress/1]).
-define(MAX_SIZE_REPR, 8191).
-define(MAX_INTEGER, 9223372036854775807).
-define(MIN_INTEGER, -9223372036854775808).
-define(TUPLE_TAG, 1).
-define(LIST_TAG, 2).
-define(INT_TAG, 3).
-define(UNSIGNED_TAG, 4).
-define(FLOAT_TAG, 5).
-define(BIN_TAG, 6).
-define(ATOM_TAG, 7).
-define(TUPLE_VAL, 16#EFFAB1E).
-define(LIST_VAL, 1157).
%%====================================================================
%% API
%%====================================================================
compress(Term) ->
%% We use md5 here because it has a reasonably low chance of
%% collision while being 128 bit, which will allow us to do a
%% single instruction compare on the native side.
Id = create_id(Term),
case flatten(Id, Term, 0, <<>>) of
Result when is_list(Result) ->
lists:flatten(Result);
Else ->
Else
end.
flatten(Id, Term, Count, Attr) when is_tuple(Term) ->
StageRep = type(Term, Count),
NewAttr = <<StageRep/binary, Attr/binary>>,
{Vals, _} =
lists:foldl(fun(El, {DataAcc, NewCount}) ->
{[flatten(Id, El, NewCount, NewAttr) | DataAcc],
NewCount + 1}
end, {[], 0}, erlang:tuple_to_list(Term)),
[{Id, StageRep, ?TUPLE_VAL} | Vals];
flatten(Id, Term, Count, Attr) when is_list(Term) ->
StageRep = type(Term, Count),
NewAttr = <<StageRep/binary, Attr/binary>>,
{Vals, _} =
lists:foldl(fun(El, {DataAcc, NewCount}) ->
{[flatten(Id, El, NewCount, NewAttr) | DataAcc],
NewCount + 1}
end, {[], 0}, Term),
[{Id, StageRep, ?LIST_VAL} | Vals];
flatten(Id, Term, Count, Attr) ->
StageRep = type(Term, Count),
NewAttr = <<StageRep/binary, Attr/binary>>,
{Id, NewAttr, Term}.
type(Term, Count) when is_tuple(Term),
Count =< ?MAX_SIZE_REPR ->
<<?TUPLE_TAG:3, Count:13>>;
type(Term, Count) when is_list(Term),
Count =< ?MAX_SIZE_REPR ->
<<?LIST_TAG:3, Count:13>>;
type(Term, Count) when is_integer(Term),
Count =< ?MAX_SIZE_REPR,
Term >= ?MIN_INTEGER,
Term =< ?MAX_INTEGER ->
<<?INT_TAG:3, Count:13>>;
type(Term, Count) when is_float(Term),
Count =< ?MAX_SIZE_REPR ->
<<?FLOAT_TAG:3, Count:13>>;
type(Term, Count) when is_binary(Term),
Count =< ?MAX_SIZE_REPR ->
<<?BIN_TAG:3, Count:13>>;
type(Term, Count) when is_atom(Term),
Count =< ?MAX_SIZE_REPR ->
<<?ATOM_TAG:3, Count:13>>;
type(Term, _Count) ->
erlang:throw({unencodable_term, Term}).
create_id(Term) ->
crypto:md5(erlang:term_to_binary(Term)).
%%====================================================================
%% Tests
%%====================================================================
-ifndef(NOTEST).
-include_lib("eunit/include/eunit.hrl").
basic_test() ->
AtomId = create_id(foo),
AtomAttr = <<?ATOM_TAG:3, 0:13>>,
?assertMatch({AtomId, AtomAttr, foo},
compress(foo)),
TupleId = create_id({foo}),
Tuple0Attr = <<?TUPLE_TAG:3, 0:13>>,
Atom1Attr = <<?ATOM_TAG:3, 0:13>>,
Tuple1Attr = <<Atom1Attr/binary, Tuple0Attr/binary>>,
?assertMatch([{TupleId,
<<32,0>>,
251636510},
{TupleId, Tuple1Attr, foo}],
compress({foo})),
ListId = create_id([foo]),
List0Attr = <<?LIST_TAG:3, 0:13>>,
List1Attr = <<Atom1Attr/binary, List0Attr/binary>>,
?assertMatch([{ListId, <<64,0>>, 1157},
{ListId, List1Attr, foo}],
compress([foo])),
IntegerId = create_id(-1),
IntegerAttr = <<?INT_TAG:3, 0:13>>,
?assertMatch({IntegerId, IntegerAttr, -1},
compress(-1)),
FloatId = create_id(7.1e+50),
FloatAttr = <<?FLOAT_TAG:3, 0:13>>,
?assertMatch({FloatId, FloatAttr, 7.1e+50},
compress(7.1e+50)),
BinaryId = create_id(<<"foo">>),
BinaryAttr = <<?BIN_TAG:3, 0:13>>,
?assertMatch({BinaryId, BinaryAttr, <<"foo">>},
compress(<<"foo">>)).
-endif.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment