colinbankier · March 8, 2017 12:45
diff --git a/index.erl b/index.erl
 -module(index).
 -export([get_index/1]).

 % Used to read a file into a list of lines.
 % Example files available in:
 %   gettysburg-address.txt (short)
 %   dickens-christmas.txt  (long)


 % Get the contents of a text file into a list of lines.
 % Each line has its trailing newline removed.

 get_file_contents(Name) ->
    {ok,File} = file:open(Name,[read]),
    Rev = get_all_lines(File,[]),
 lists:reverse(Rev).

 % Auxiliary function for get_file_contents.
 % Not exported.

 get_all_lines(File,Partial) ->
    case io:get_line(File,"") of
        eof -> file:close(File),
               Partial;
        Line -> {Strip,_} = lists:split(length(Line)-1,Line),
                get_all_lines(File,[Strip|Partial])
    end.

 % The main entry to the module, returns the index for the given file name.
 get_index(FileName) ->
    Contents = get_file_contents(FileName),
    build_index(Contents, 1, #{}).

 % Builds the index recursively, accumulating entries and associated line
 % numbers in the Index parameter.
 % I used a Map data structure here for easier lookup and adding to as lines are
 % processed.
 % In the base case, when in the index is complete, the list of line numbers is
 % grouped into contiguous ranges.
 % The final index is then sorted by word.
 % I decided to utilise a few functions from the Erlang std lib,
 % e.g. lists:sort, string:to_lower, string:tokens
 % given that we have implemented our own sort and similar functions in previous exercises,
 % and using these % allowed more time to focus on other aspects of the challenge.
 build_index([], _LineNum, Index) ->
    GroupedIndex = group_values_by_range(maps:to_list(Index)),
    lists:sort(fun(A, B) -> A =< B end, GroupedIndex);
 build_index([H|T], LineNum, Index) ->
    LineWords = nub(tokenize(string:to_lower(H))),
    FilteredWords = filter_short_words(LineWords),
    NewIndex = index_words(FilteredWords, LineNum, Index),
    build_index(T, LineNum + 1, NewIndex).

 % Indexes words from a single line, adding an entry to Index for LineNum.
 index_words([], _LineNum, Index) ->
    Index;
 index_words([H|T], LineNum, Index) ->
    Lines = maps:get(H, Index, []),
    NewIndex = maps:put(H, [LineNum|Lines], Index),
    index_words(T, LineNum, NewIndex).

 % Splits a line into individual words.
 % Std lib function used.
 tokenize(Line) ->
    string:tokens(Line, " ,.;?!`[]()\\\"").

 % Removes short words from being indexed.
 % Could be extended to filter common words such as "and".
 filter_short_words(Words) when is_list(Words) ->
    filter_short_words(Words, []).
 filter_short_words([], Acc) ->
    Acc;
 filter_short_words([H|T], Acc) ->
    case length(H) < 3 of
        true ->
            filter_short_words(T, Acc);
        false ->
            filter_short_words(T, [H|Acc])
    end.

 % Groups the values by range for an index of Key,Value pairs.
 % The values are assumed to be a list of numbers which are
 % group into contiguous ranges.
 group_values_by_range(Pairs) when is_list(Pairs) ->
    group_values_by_range(Pairs, []).
 group_values_by_range([{Key, Value}|T], Acc) ->
    group_values_by_range(T, [{Key, group_by_range(lists:sort(Value))}|Acc]);
 group_values_by_range([], Acc) ->
    Acc.

 % Groups a list of numbers into contiguous ranges.
 group_by_range([]) ->
    [];
 group_by_range([H|T]) ->
    group_by_range(T, {H,H}, []).
 group_by_range([], Range, Acc) ->
    lists:reverse([Range|Acc]);
 group_by_range([H|T], {S,E}, Acc) when E+1 == H ->
    group_by_range(T, {S,H}, Acc);
 group_by_range([H|T], Range={_S,_E}, Acc) ->
    group_by_range(T, {H,H},[Range|Acc]).

 % Returns a list with duplicate elements removed.
 nub(List) ->
    lists:reverse(nub(List, [])).
 nub([], Acc) ->
    Acc;
 nub([H|T], Acc) ->
    NextAcc = case lists:member(H, Acc) of
                  true -> Acc;
                  false -> [H | Acc]
              end,
    nub(T, NextAcc).
	-module(index).
	-export([get_index/1]).

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	% Get the contents of a text file into a list of lines.
	% Each line has its trailing newline removed.

	get_file_contents(Name) ->
	{ok,File} = file:open(Name,[read]),
	Rev = get_all_lines(File,[]),
	lists:reverse(Rev).

	% Auxiliary function for get_file_contents.
	% Not exported.

	get_all_lines(File,Partial) ->
	case io:get_line(File,"") of
	eof -> file:close(File),
	Partial;
	Line -> {Strip,_} = lists:split(length(Line)-1,Line),
	get_all_lines(File,[Strip\|Partial])
	end.

	% The main entry to the module, returns the index for the given file name.
	get_index(FileName) ->
	Contents = get_file_contents(FileName),
	build_index(Contents, 1, #{}).

	% Builds the index recursively, accumulating entries and associated line
	% numbers in the Index parameter.
	% I used a Map data structure here for easier lookup and adding to as lines are
	% processed.
	% In the base case, when in the index is complete, the list of line numbers is
	% grouped into contiguous ranges.
	% The final index is then sorted by word.
	% I decided to utilise a few functions from the Erlang std lib,
	% e.g. lists:sort, string:to_lower, string:tokens
	% given that we have implemented our own sort and similar functions in previous exercises,
	% and using these % allowed more time to focus on other aspects of the challenge.
	build_index([], _LineNum, Index) ->
	GroupedIndex = group_values_by_range(maps:to_list(Index)),
	lists:sort(fun(A, B) -> A =< B end, GroupedIndex);
	build_index([H\|T], LineNum, Index) ->
	LineWords = nub(tokenize(string:to_lower(H))),
	FilteredWords = filter_short_words(LineWords),
	NewIndex = index_words(FilteredWords, LineNum, Index),
	build_index(T, LineNum + 1, NewIndex).

	% Indexes words from a single line, adding an entry to Index for LineNum.
	index_words([], _LineNum, Index) ->
	Index;
	index_words([H\|T], LineNum, Index) ->
	Lines = maps:get(H, Index, []),
	NewIndex = maps:put(H, [LineNum\|Lines], Index),
	index_words(T, LineNum, NewIndex).

	% Splits a line into individual words.
	% Std lib function used.
	tokenize(Line) ->
	string:tokens(Line, " ,.;?!`[]()\\\"").

	% Removes short words from being indexed.
	% Could be extended to filter common words such as "and".
	filter_short_words(Words) when is_list(Words) ->
	filter_short_words(Words, []).
	filter_short_words([], Acc) ->
	Acc;
	filter_short_words([H\|T], Acc) ->
	case length(H) < 3 of
	true ->
	filter_short_words(T, Acc);
	false ->
	filter_short_words(T, [H\|Acc])
	end.

	% Groups the values by range for an index of Key,Value pairs.
	% The values are assumed to be a list of numbers which are
	% group into contiguous ranges.
	group_values_by_range(Pairs) when is_list(Pairs) ->
	group_values_by_range(Pairs, []).
	group_values_by_range([{Key, Value}\|T], Acc) ->
	group_values_by_range(T, [{Key, group_by_range(lists:sort(Value))}\|Acc]);
	group_values_by_range([], Acc) ->
	Acc.

	% Groups a list of numbers into contiguous ranges.
	group_by_range([]) ->
	[];
	group_by_range([H\|T]) ->
	group_by_range(T, {H,H}, []).
	group_by_range([], Range, Acc) ->
	lists:reverse([Range\|Acc]);
	group_by_range([H\|T], {S,E}, Acc) when E+1 == H ->
	group_by_range(T, {S,H}, Acc);
	group_by_range([H\|T], Range={_S,_E}, Acc) ->
	group_by_range(T, {H,H},[Range\|Acc]).

	% Returns a list with duplicate elements removed.
	nub(List) ->
	lists:reverse(nub(List, [])).
	nub([], Acc) ->
	Acc;
	nub([H\|T], Acc) ->
	NextAcc = case lists:member(H, Acc) of
	true -> Acc;
	false -> [H \| Acc]
	end,
	nub(T, NextAcc).