ericbmerritt · March 24, 2012 00:25
diff --git a/gistfile1.erl b/gistfile1.erl
 %%% Myrmas Path Compression
 %%% =======================
 %%% * author: Eric Merritt
 %%% * copyright: Afiniate, Inc
 %%%
 %%% License
 %%% -------
 %%% Licensed under the Apache License, Version 2.0 you may not use
 %%% this file except in compliance with the License.  You may obtain a
 %%% copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 %%%
 %%% Overview
 %%% --------
 %%%
 %%% Path compression is about taking a tree (which all erlang terms
 %%% are) and compressing that tree to a set of id/value pairs. In this
 %%% case the ide is a representation of that leaf's unique position in
 %%% the tree. This allows us to flatten the tree and produce a set of
 %%% values that represent it in a consistant way. The downside of this
 %%% is, of course, greater space usage. However the upside is much
 %%% simpler manipulation and distribution.
 %%%
 %%%
 %%% Path Compression
 %%% ----------------
 %%%
 %%% Path compression is down using 16 bit values to represent every
 %%% level in the tree (every vertice) and the edge. The first two bits
 %%% is used to uniquely identify the type of the edge, while the
 %%% remaining 13bits is used to uniquely represent the position in the
 %%% at that level.
 %%%
 %%% To illustrate. Lets take the simple value foo. Is an atom (tag 6)
 %%% that has a position of zero so we could encode its value as
 %%%     <<?ATOM_TAG:3, 0:13>>.
 %%%
 %%% Lets take another example. Something more complex. A list of two values.
 %%%
 %%%     [1, 2]
 %%%
 %%% We will end up with a list of tuples, each identified by a 32 bit
 %%% attribute. 16 bits to identify the list and 16 bits to identify
 %%% the integer. so the first value in the attribute will be
 %%%
 %%%     <<?LIST_TAG:3, 0:13>>
 %%%
 %%% This will be followed for each value by the id for the leaf. So in
 %%% the case of '1' the identifier will look like.
 %%%
 %%%    << <<?INT_TAG:3, 0:13>>, <<?LIST_TAG:3, 0:13>> >>
 %%%
 %%% We are using a bit of psuedo code here to show the value. The
 %%% attribute for 2 will look as folows.
 %%%
 %%%   << <<?INT_TAG:3, 1:13>>, <<?LIST_TAG:3, 0:13>> >>
 %%%
 %%% You may notice that the attribute set is in reverse order. we do
 %%% that simply because thats how we build up the binary.
 %%%
 %%% Because we expect to interact with native code that has certain
 %%% limitations we are going to limit the size of ints to those that
 %%% can be represented by a 64 bit integer.
 -module(myr_path).

 -export([compress/1]).

 -define(MAX_SIZE_REPR, 8191).
 -define(MAX_INTEGER, 9223372036854775807).
 -define(MIN_INTEGER, -9223372036854775808).
 -define(TUPLE_TAG, 1).
 -define(LIST_TAG, 2).
 -define(INT_TAG, 3).
 -define(UNSIGNED_TAG, 4).
 -define(FLOAT_TAG, 5).
 -define(BIN_TAG, 6).
 -define(ATOM_TAG, 7).

 -define(TUPLE_VAL, 16#EFFAB1E).
 -define(LIST_VAL, 1157).

 %%====================================================================
 %% API
 %%====================================================================

 compress(Term) ->
    %% We use md5 here because it has a reasonably low chance of
    %% collision while being 128 bit, which will allow us to do a
    %% single instruction compare on the native side.
    Id = create_id(Term),
    case flatten(Id, Term, 0, <<>>) of
       Result when is_list(Result) ->
           lists:flatten(Result);
       Else ->
           Else
   end.

 flatten(Id, Term, Count, Attr) when is_tuple(Term) ->
    StageRep = type(Term, Count),
    NewAttr = <<StageRep/binary, Attr/binary>>,
    {Vals, _} =
        lists:foldl(fun(El, {DataAcc, NewCount}) ->
                            {[flatten(Id, El, NewCount, NewAttr) | DataAcc],
                             NewCount + 1}
                    end, {[], 0}, erlang:tuple_to_list(Term)),
    [{Id, StageRep, ?TUPLE_VAL} | Vals];
 flatten(Id, Term, Count, Attr) when is_list(Term) ->
    StageRep = type(Term, Count),
    NewAttr = <<StageRep/binary, Attr/binary>>,
    {Vals, _} =
        lists:foldl(fun(El, {DataAcc, NewCount}) ->
                            {[flatten(Id, El, NewCount, NewAttr) | DataAcc],
                             NewCount + 1}
                    end, {[], 0}, Term),
    [{Id, StageRep, ?LIST_VAL} | Vals];
 flatten(Id, Term, Count, Attr) ->
    StageRep = type(Term, Count),
    NewAttr = <<StageRep/binary, Attr/binary>>,
    {Id, NewAttr, Term}.

 type(Term, Count) when is_tuple(Term),
       Count =< ?MAX_SIZE_REPR ->
    <<?TUPLE_TAG:3, Count:13>>;
 type(Term, Count) when is_list(Term),
                       Count =< ?MAX_SIZE_REPR ->
    <<?LIST_TAG:3, Count:13>>;
 type(Term, Count) when is_integer(Term),
                       Count =< ?MAX_SIZE_REPR,
                       Term >= ?MIN_INTEGER,
                       Term =< ?MAX_INTEGER ->
    <<?INT_TAG:3, Count:13>>;
 type(Term, Count) when is_float(Term),
                       Count =< ?MAX_SIZE_REPR ->
    <<?FLOAT_TAG:3, Count:13>>;
 type(Term, Count) when is_binary(Term),
                       Count =< ?MAX_SIZE_REPR ->
    <<?BIN_TAG:3, Count:13>>;
 type(Term, Count) when is_atom(Term),
                       Count =< ?MAX_SIZE_REPR ->
    <<?ATOM_TAG:3, Count:13>>;
 type(Term, _Count) ->
    erlang:throw({unencodable_term, Term}).

 create_id(Term) ->
    crypto:md5(erlang:term_to_binary(Term)).

 %%====================================================================
 %% Tests
 %%====================================================================
 -ifndef(NOTEST).
 -include_lib("eunit/include/eunit.hrl").

 basic_test() ->
    AtomId = create_id(foo),
    AtomAttr = <<?ATOM_TAG:3, 0:13>>,
    ?assertMatch({AtomId, AtomAttr, foo},
                 compress(foo)),

    TupleId = create_id({foo}),
    Tuple0Attr = <<?TUPLE_TAG:3, 0:13>>,
    Atom1Attr = <<?ATOM_TAG:3, 0:13>>,
    Tuple1Attr = <<Atom1Attr/binary, Tuple0Attr/binary>>,
    ?assertMatch([{TupleId,
                   <<32,0>>,
                   251636510},
                  {TupleId, Tuple1Attr, foo}],
                 compress({foo})),

    ListId = create_id([foo]),
    List0Attr = <<?LIST_TAG:3, 0:13>>,
    List1Attr = <<Atom1Attr/binary, List0Attr/binary>>,
    ?assertMatch([{ListId, <<64,0>>, 1157},
                  {ListId, List1Attr, foo}],
                 compress([foo])),

    IntegerId = create_id(-1),
    IntegerAttr = <<?INT_TAG:3, 0:13>>,
    ?assertMatch({IntegerId, IntegerAttr, -1},
                 compress(-1)),

    FloatId = create_id(7.1e+50),
    FloatAttr = <<?FLOAT_TAG:3, 0:13>>,
    ?assertMatch({FloatId, FloatAttr, 7.1e+50},
                 compress(7.1e+50)),

    BinaryId = create_id(<<"foo">>),
    BinaryAttr = <<?BIN_TAG:3, 0:13>>,
    ?assertMatch({BinaryId, BinaryAttr, <<"foo">>},
                 compress(<<"foo">>)).

 -endif.
	%%% Myrmas Path Compression
	%%% =======================
	%%% * author: Eric Merritt
	%%% * copyright: Afiniate, Inc
	%%%
	%%% License
	%%% -------
	%%% Licensed under the Apache License, Version 2.0 you may not use
	%%% this file except in compliance with the License. You may obtain a
	%%% copy of the License at http://www.apache.org/licenses/LICENSE-2.0
	%%%
	%%% Overview
	%%% --------
	%%%
	%%% Path compression is about taking a tree (which all erlang terms
	%%% are) and compressing that tree to a set of id/value pairs. In this
	%%% case the ide is a representation of that leaf's unique position in
	%%% the tree. This allows us to flatten the tree and produce a set of
	%%% values that represent it in a consistant way. The downside of this
	%%% is, of course, greater space usage. However the upside is much
	%%% simpler manipulation and distribution.
	%%%
	%%%
	%%% Path Compression
	%%% ----------------
	%%%
	%%% Path compression is down using 16 bit values to represent every
	%%% level in the tree (every vertice) and the edge. The first two bits
	%%% is used to uniquely identify the type of the edge, while the
	%%% remaining 13bits is used to uniquely represent the position in the
	%%% at that level.
	%%%
	%%% To illustrate. Lets take the simple value foo. Is an atom (tag 6)
	%%% that has a position of zero so we could encode its value as
	%%% <<?ATOM_TAG:3, 0:13>>.
	%%%
	%%% Lets take another example. Something more complex. A list of two values.
	%%%
	%%% [1, 2]
	%%%
	%%% We will end up with a list of tuples, each identified by a 32 bit
	%%% attribute. 16 bits to identify the list and 16 bits to identify
	%%% the integer. so the first value in the attribute will be
	%%%
	%%% <<?LIST_TAG:3, 0:13>>
	%%%
	%%% This will be followed for each value by the id for the leaf. So in
	%%% the case of '1' the identifier will look like.
	%%%
	%%% << <<?INT_TAG:3, 0:13>>, <<?LIST_TAG:3, 0:13>> >>
	%%%
	%%% We are using a bit of psuedo code here to show the value. The
	%%% attribute for 2 will look as folows.
	%%%
	%%% << <<?INT_TAG:3, 1:13>>, <<?LIST_TAG:3, 0:13>> >>
	%%%
	%%% You may notice that the attribute set is in reverse order. we do
	%%% that simply because thats how we build up the binary.
	%%%
	%%% Because we expect to interact with native code that has certain
	%%% limitations we are going to limit the size of ints to those that
	%%% can be represented by a 64 bit integer.
	-module(myr_path).

	-export([compress/1]).

	-define(MAX_SIZE_REPR, 8191).
	-define(MAX_INTEGER, 9223372036854775807).
	-define(MIN_INTEGER, -9223372036854775808).
	-define(TUPLE_TAG, 1).
	-define(LIST_TAG, 2).
	-define(INT_TAG, 3).
	-define(UNSIGNED_TAG, 4).
	-define(FLOAT_TAG, 5).
	-define(BIN_TAG, 6).
	-define(ATOM_TAG, 7).

	-define(TUPLE_VAL, 16#EFFAB1E).
	-define(LIST_VAL, 1157).

	%%====================================================================
	%% API
	%%====================================================================

	compress(Term) ->
	%% We use md5 here because it has a reasonably low chance of
	%% collision while being 128 bit, which will allow us to do a
	%% single instruction compare on the native side.
	Id = create_id(Term),
	case flatten(Id, Term, 0, <<>>) of
	Result when is_list(Result) ->
	lists:flatten(Result);
	Else ->
	Else
	end.

	flatten(Id, Term, Count, Attr) when is_tuple(Term) ->
	StageRep = type(Term, Count),
	NewAttr = <<StageRep/binary, Attr/binary>>,
	{Vals, _} =
	lists:foldl(fun(El, {DataAcc, NewCount}) ->
	{[flatten(Id, El, NewCount, NewAttr) \| DataAcc],
	NewCount + 1}
	end, {[], 0}, erlang:tuple_to_list(Term)),
	[{Id, StageRep, ?TUPLE_VAL} \| Vals];
	flatten(Id, Term, Count, Attr) when is_list(Term) ->
	StageRep = type(Term, Count),
	NewAttr = <<StageRep/binary, Attr/binary>>,
	{Vals, _} =
	lists:foldl(fun(El, {DataAcc, NewCount}) ->
	{[flatten(Id, El, NewCount, NewAttr) \| DataAcc],
	NewCount + 1}
	end, {[], 0}, Term),
	[{Id, StageRep, ?LIST_VAL} \| Vals];
	flatten(Id, Term, Count, Attr) ->
	StageRep = type(Term, Count),
	NewAttr = <<StageRep/binary, Attr/binary>>,
	{Id, NewAttr, Term}.

	type(Term, Count) when is_tuple(Term),
	Count =< ?MAX_SIZE_REPR ->
	<<?TUPLE_TAG:3, Count:13>>;
	type(Term, Count) when is_list(Term),
	Count =< ?MAX_SIZE_REPR ->
	<<?LIST_TAG:3, Count:13>>;
	type(Term, Count) when is_integer(Term),
	Count =< ?MAX_SIZE_REPR,
	Term >= ?MIN_INTEGER,
	Term =< ?MAX_INTEGER ->
	<<?INT_TAG:3, Count:13>>;
	type(Term, Count) when is_float(Term),
	Count =< ?MAX_SIZE_REPR ->
	<<?FLOAT_TAG:3, Count:13>>;
	type(Term, Count) when is_binary(Term),
	Count =< ?MAX_SIZE_REPR ->
	<<?BIN_TAG:3, Count:13>>;
	type(Term, Count) when is_atom(Term),
	Count =< ?MAX_SIZE_REPR ->
	<<?ATOM_TAG:3, Count:13>>;
	type(Term, _Count) ->
	erlang:throw({unencodable_term, Term}).

	create_id(Term) ->
	crypto:md5(erlang:term_to_binary(Term)).

	%%====================================================================
	%% Tests
	%%====================================================================
	-ifndef(NOTEST).
	-include_lib("eunit/include/eunit.hrl").

	basic_test() ->
	AtomId = create_id(foo),
	AtomAttr = <<?ATOM_TAG:3, 0:13>>,
	?assertMatch({AtomId, AtomAttr, foo},
	compress(foo)),

	TupleId = create_id({foo}),
	Tuple0Attr = <<?TUPLE_TAG:3, 0:13>>,
	Atom1Attr = <<?ATOM_TAG:3, 0:13>>,
	Tuple1Attr = <<Atom1Attr/binary, Tuple0Attr/binary>>,
	?assertMatch([{TupleId,
	<<32,0>>,
	251636510},
	{TupleId, Tuple1Attr, foo}],
	compress({foo})),

	ListId = create_id([foo]),
	List0Attr = <<?LIST_TAG:3, 0:13>>,
	List1Attr = <<Atom1Attr/binary, List0Attr/binary>>,
	?assertMatch([{ListId, <<64,0>>, 1157},
	{ListId, List1Attr, foo}],
	compress([foo])),

	IntegerId = create_id(-1),
	IntegerAttr = <<?INT_TAG:3, 0:13>>,
	?assertMatch({IntegerId, IntegerAttr, -1},
	compress(-1)),

	FloatId = create_id(7.1e+50),
	FloatAttr = <<?FLOAT_TAG:3, 0:13>>,
	?assertMatch({FloatId, FloatAttr, 7.1e+50},
	compress(7.1e+50)),

	BinaryId = create_id(<<"foo">>),
	BinaryAttr = <<?BIN_TAG:3, 0:13>>,
	?assertMatch({BinaryId, BinaryAttr, <<"foo">>},
	compress(<<"foo">>)).

	-endif.