shakerlxxv · March 19, 2017 17:21
diff --git a/index.erl b/index.erl
 %%==============================================================================
 %% Univ. Kent Functional Programing with Erlang
 %%   Week 2: Part 20 - indexing a file
 %%
 %% Description:
 %%    Use the main function main/1 to create your output. It will produce
 %%    a list of data structures: { "foo" , [{3,5},{7,7},{11,13}] } meaning
 %%    the word "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file.
 %%
 %% Enhancements:
 %% -------------
 %%   1. Remove short words:
 %%         I've added to the to_words/1 method the logic to remove words
 %%         with length <= 3. NOTE: this is not entirely a good idea.
 %%
 %%   2. Sorting lexicographically:
 %%         This was part of my original data representation, so not necessary
 %%         The "compression" from a list of Words {"abc",9} to the index format
 %%         is based heavily on the list of words being sorted.
 %%
 %%   3. Normalising words "Foo" and "foo" are the same:
 %%         This is also included in my original solution using string:to_lower
 %%         from the standard modules.
 %%
 %%   4. Normalising for common endings, plurals etc.
 %%         TODO: This is not done or attempted.
 %%
 %%   5. Make the data representation more efficient.
 %%         TODO: My thinking here is about using this structure for lookup, so
 %%         you'd want to quickly say:  "Give me the index entry for word `foo`",
 %%         and right now you can't use standard member/2 approach. So one idea
 %%         would be to store the words and pages as two separate lists
 %%         NOTE: It looks though like lists:keyfind/3 would be useful with they
 %%         the current structure, so I'd do some testing for the actual need
 %%         before beginning structure change.
 %%
 %% Author: Brian L. Shaver
 %% Date  : 2017-03-19
 %%==============================================================================
 -module(index).
 -include_lib("eunit/include/eunit.hrl").
 -export([main/1,get_file_contents/1,show_file_contents/1]).

 %-------------------------------------------------------------------------------
 % this is the main procedure which will take the Filename through a pipeline
 % of transformation from:
 %
 %  0. list of lines
 %  1. lines of words
 %  2. list of words
 %  3. sorted list of words
 %  4. final index format
 %
 -spec main(string()) -> list().
 main(Filename) ->
    % 0. Read the file into a list of lines.
    Lines = get_file_contents(Filename),
    %
    % 1. convert lines to lines of words with structure:
    %    [{1, ["abc","def"]},{2, ["fgh","mno"]}...
    Lines_of_Words = to_lines(1,Lines),
    %
    % 2. Now flatten into discreet word list {word, line}
    %    [{"abc",1},{"def",1},{"fgh",2},...]
    Words = flat_line(Lines_of_Words),
    %
    % 3. Sort the results so that they are ordered lexicographically by word,
    %    and then numerically by number. As it turns out, this is exactly how
    %    lists:sort treats the structures {"foo",9}.
    Sorted_Words = lists:sort(Words),
    %
    % 4. Now compress the ordered results into the expression requested for the
    %    assignment.
    %       [{"abc",1},{"abc",2},{"abc",5},{"def",2}]
    %         ==>   [{"abc",[{1,2},{5,5}]},{"def",[{2,2}]}]
    to_index([],Sorted_Words).

 %+++++++++++++++++++++++++++++++++++++++
 % unit tests
 %   this is just verifying the results
 %   of lists:sort/1 on [{"def",3},{"abc",4}...]
 %+++++++++++++++++++++++++++++++++++++++
 sorted_words_test_() ->
    [
     ?_assertEqual([{"abc",3},{"abc",5},{"bcd",4},{"def",3}],
          lists:sort([{"def",3},{"abc",3},{"bcd",4},{"abc",5}]))
    ].

 %-------------------------------------------------------------------------------
 % aggegate a sorted list of words down to the desired output format.
 %  since the list of words is sorted, there are basically 3 "match" cases to
 %  handle: (a) when we are matching the same word on a line not consecutive with
 %  previous match (b) when the match is on the same or next line (c) when we're
 %  start with a new word.
 %
 %  (d) we also have to handle the last word slightly differently since it's
 %  never going to hit pattern (c)
 %
 %  And finally (e) we need to get the recursion started when we have an empty
 %  Index and a non-empty Words list.
 -spec to_index(list(),list()) -> list().

 % (d') this is only necessary as an alternate match for scenario (d)
 to_index([],[]) ->
    [];

 % (d) special case of getting the 
 to_index([{Word,Lines}|Idx], []) ->
    lists:reverse([{Word,lists:reverse(Lines)}|Idx]);

 % (e) to get the recursion started for first word
 to_index([],[{Word,Line}|Words]) ->
    to_index([{Word,[{Line,Line}]}],Words);

 % (a) matching same word after a skip of more than 1 line
 to_index([{Word,[{_X,Line}|_Lines]=Wl}|Idx],[{Word,NewLine}|Words])
  when NewLine > Line + 1 ->
    to_index([{Word,[{NewLine,NewLine}|Wl]}|Idx],Words);

 % (b) this covers case where Line == NewLine and Line + 1 == NewLine
 to_index([{Word,[{X,_Line}|Lines]}|Idx],[{Word,NewLine}|Words]) ->
    to_index([{Word,[{X,NewLine}|Lines]}|Idx],Words);

 % (c) matching a new word
 to_index([{Word,Lines}|Idx],[{NewWord,Line}|Words]) ->
    to_index([{NewWord,[{Line,Line}]},{Word,lists:reverse(Lines)}|Idx],Words).

 %+++++++++++++++++++++++++++++++++++++++
 % unit tests
 %+++++++++++++++++++++++++++++++++++++++
 to_index_test_() ->
    [
     ?_assertEqual([],to_index([],[])),
     ?_assertEqual([{"abc",[{1,3},{6,6}]}],
                  to_index([],
                     [{"abc",1},{"abc",2},{"abc",2},{"abc",3},{"abc",6}])),
     ?_assertEqual([{"abc",[{1,3},{6,6}]}],
                   to_index([{"abc",[{1,2}]}],
                            [{"abc",2},{"abc",3},{"abc",6}]))
    ].

 %-------------------------------------------------------------------------------
 % converts the file (a list of strings) into a list of lines. A line object 
 % will be in the format:  {9,["abc","def","mno"]}, a line number followed by
 % a list of words.
 -spec to_lines(integer(),list()) -> list().
 to_lines(_N,[]) ->
    [];
 to_lines(N,[Line|Lines]) ->
    Words = to_words(Line),
    case Words of
        [] ->
            to_lines(N+1,Lines); % to skip blank lines
        _  ->
            [{N,to_words(Line)}|to_lines(N+1,Lines)]
    end.

 %+++++++++++++++++++++++++++++++++++++++
 % unit tests
 %+++++++++++++++++++++++++++++++++++++++
 to_lines_test_() ->
    [?_assertEqual([],to_lines(13,[])),
     ?_assertEqual([{1,["abcd"]},{2,["abcd"]}],
                   to_lines(1,["abcd","abcd"])),
     ?_assertEqual([{9,["abcd"]},{10,["abcd"]}],
                   to_lines(9,["abcd","abcd"])),
     ?_assertEqual([{1,["abcd"]},{3,["abcd"]}],
                   to_lines(1,["abcd"," ","abcd"]))
    ].

 %-------------------------------------------------------------------------------
 % converts a string into a list of words.
 %   all words will be handled in lower case.
 %   remove any leading or trailing punctuation
 %   filter out any words of length <= 3
 %     See filter_words/1 for adjustments of the selected words or word formats.
 %
 -spec to_words(string()) -> list().
 to_words(L) ->
    filter_words(string:tokens(L," \n\r\t")).

 %+++++++++++++++++++++++++++++++++++++++
 % unit tests
 %+++++++++++++++++++++++++++++++++++++++
 to_words_test_() ->
    [?_assertEqual([],to_words("")),
     ?_assertEqual(["once"],to_words("Once")),
     ?_assertEqual(["once","twice","three"],to_words("Once twice	thrEE"))].

 %-------------------------------------------------------------------------------
 % filter / normalize the list of words.
 % NOTE: We ignore any word of 3 or less characters, this will include some 
 %       words which potentially should be indexed ... depends on the use case
 %
 % TODO: Some words would be better to index with capital letters e.g. Names
 %       or abbreviations. This will not be addressed here. One good approach
 %       would be to only lowercase if the word is init-capped, but would
 %       not handle names (e.g. Abraham), so how this should be implemented
 %       really would dependend on the intended use of the index results.
 filter_words([]) ->
    [];
 filter_words([Word|Words]) ->
    % this regex is going to remove leading an trailing punctuation
    %  and spaces, but should leave "internal" punctuation in place
    NewWord = re:replace(Word,
                         "(?|^[^[:alpha:]]*)(.*?)(?|[^[:alpha:]]*$)","\\1",
                         [{return,list}]),
    L = length(NewWord),
    case L of
        L when L > 3 ->
            % for words longer than 3 characters, we'll keep the lower case
            [string:to_lower(NewWord)|filter_words(Words)];
        _ ->
            % ignore any words with 3 or less characters
            filter_words(Words)
    end.

 %+++++++++++++++++++++++++++++++++++++++
 % unit tests
 %+++++++++++++++++++++++++++++++++++++++
 filter_words_test_() ->
    [
     ?_assertEqual([],filter_words([])),
     ?_assertEqual(["hello"],filter_words(["hello,","the","a","----"])),
     ?_assertEqual(["hello","goodbye"],filter_words([".hello,","  goodbye  "])),
     ?_assertEqual(["don't","pre-load"],filter_words(["don't","pre-load"]))
    ].

 %-------------------------------------------------------------------------------
 % take a list of line objects and convert them into a list of word objects.
 % line: {9, ["abc","def"]}
 % word: {"abc",9}
 %
 % TODO: This is concatenating two lists at each step. A more efficient
 %       approach would be to flatten the compound list at the end, or
 %       perhaps find a suitable tail recursive approach and reverse the
 %       the resulting list at the last. reversing could be skipped with
 %       the apriori knowledge the next step is going to lexicographically
 %       sort these results.
 -spec flat_line(list()) -> list().
 flat_line([]) ->
    [];
 flat_line([L|Ls]) ->
    flattener(L) ++ flat_line(Ls).

 %+++++++++++++++++++++++++++++++++++++++
 % unit tests
 %+++++++++++++++++++++++++++++++++++++++
 flat_line_test_() ->
    [
     ?_assertEqual([],flat_line([])),
     ?_assertEqual([{"abc",1},{"def",2}],
                   flat_line([{1,["abc"]},{2,["def"]}]))
    ].

 %-------------------------------------------------------------------------------
 % this is just a step within the flat_line/1 process, its taking a line object
 % with the format: {9, ["abc","def"]}, and pivoting to the word list format
 %   => [{"abc",9},{"def",9}]
 -spec flattener({integer(),list()}) -> list().
 flattener({_N,[]}) ->
    [];
 flattener({N,[W|Ws]}) ->
    [{W,N}|flattener({N,Ws})].

 %+++++++++++++++++++++++++++++++++++++++
 % unit tests
 %+++++++++++++++++++++++++++++++++++++++
 flattenner_test_() ->
    [
     ?_assertEqual([],flattener({9,""})),
     ?_assertEqual([],flattener({3,[]})),
     ?_assertEqual([{"abc",9},{"def",9}],flattener({9,["abc","def"]}))
    ].

 %-------------------------------------------------------------------------------
 % Used to read a file into a list of lines.
 % Example files available in
 %   gettysburg-address.txt (short)
 %   dickens-christmas.txt  (long)


 % Get the contents of a text file into a list of lines.
 % Each line has its trailing newline removed.
 -spec get_file_contents(string()) -> list().
 get_file_contents(Name) ->
    {ok,File} = file:open(Name,[read]),
    Rev = get_all_lines(File,[]),
 lists:reverse(Rev).

 %-------------------------------------------------------------------------------
 % Auxiliary function for get_file_contents.
 % Not exported.
 %
 -spec get_all_lines(io:device(),list()) -> list().
 get_all_lines(File,Partial) ->
    case io:get_line(File,"") of
        eof -> file:close(File),
               Partial;
        Line -> {Strip,_} = lists:split(length(Line)-1,Line),
                get_all_lines(File,[Strip|Partial])
    end.

 %-------------------------------------------------------------------------------
 % Show the contents of a list of strings.
 % Can be used to check the results of calling get_file_contents.
 %
 -spec show_file_contents(list()) -> ok.
 show_file_contents([L|Ls]) ->
    io:format("~s~n",[L]),
    show_file_contents(Ls);
 show_file_contents([]) ->
    ok.
	%%==============================================================================
	%% Univ. Kent Functional Programing with Erlang
	%% Week 2: Part 20 - indexing a file
	%%
	%% Description:
	%% Use the main function main/1 to create your output. It will produce
	%% a list of data structures: { "foo" , [{3,5},{7,7},{11,13}] } meaning
	%% the word "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file.
	%%
	%% Enhancements:
	%% -------------
	%% 1. Remove short words:
	%% I've added to the to_words/1 method the logic to remove words
	%% with length <= 3. NOTE: this is not entirely a good idea.
	%%
	%% 2. Sorting lexicographically:
	%% This was part of my original data representation, so not necessary
	%% The "compression" from a list of Words {"abc",9} to the index format
	%% is based heavily on the list of words being sorted.
	%%
	%% 3. Normalising words "Foo" and "foo" are the same:
	%% This is also included in my original solution using string:to_lower
	%% from the standard modules.
	%%
	%% 4. Normalising for common endings, plurals etc.
	%% TODO: This is not done or attempted.
	%%
	%% 5. Make the data representation more efficient.
	%% TODO: My thinking here is about using this structure for lookup, so
	%% you'd want to quickly say: "Give me the index entry for word `foo`",
	%% and right now you can't use standard member/2 approach. So one idea
	%% would be to store the words and pages as two separate lists
	%% NOTE: It looks though like lists:keyfind/3 would be useful with they
	%% the current structure, so I'd do some testing for the actual need
	%% before beginning structure change.
	%%
	%% Author: Brian L. Shaver
	%% Date : 2017-03-19
	%%==============================================================================
	-module(index).
	-include_lib("eunit/include/eunit.hrl").
	-export([main/1,get_file_contents/1,show_file_contents/1]).

	%-------------------------------------------------------------------------------
	% this is the main procedure which will take the Filename through a pipeline
	% of transformation from:
	%
	% 0. list of lines
	% 1. lines of words
	% 2. list of words
	% 3. sorted list of words
	% 4. final index format
	%
	-spec main(string()) -> list().
	main(Filename) ->
	% 0. Read the file into a list of lines.
	Lines = get_file_contents(Filename),
	%
	% 1. convert lines to lines of words with structure:
	% [{1, ["abc","def"]},{2, ["fgh","mno"]}...
	Lines_of_Words = to_lines(1,Lines),
	%
	% 2. Now flatten into discreet word list {word, line}
	% [{"abc",1},{"def",1},{"fgh",2},...]
	Words = flat_line(Lines_of_Words),
	%
	% 3. Sort the results so that they are ordered lexicographically by word,
	% and then numerically by number. As it turns out, this is exactly how
	% lists:sort treats the structures {"foo",9}.
	Sorted_Words = lists:sort(Words),
	%
	% 4. Now compress the ordered results into the expression requested for the
	% assignment.
	% [{"abc",1},{"abc",2},{"abc",5},{"def",2}]
	% ==> [{"abc",[{1,2},{5,5}]},{"def",[{2,2}]}]
	to_index([],Sorted_Words).

	%+++++++++++++++++++++++++++++++++++++++
	% unit tests
	% this is just verifying the results
	% of lists:sort/1 on [{"def",3},{"abc",4}...]
	%+++++++++++++++++++++++++++++++++++++++
	sorted_words_test_() ->
	[
	?_assertEqual([{"abc",3},{"abc",5},{"bcd",4},{"def",3}],
	lists:sort([{"def",3},{"abc",3},{"bcd",4},{"abc",5}]))
	].

	%-------------------------------------------------------------------------------
	% aggegate a sorted list of words down to the desired output format.
	% since the list of words is sorted, there are basically 3 "match" cases to
	% handle: (a) when we are matching the same word on a line not consecutive with
	% previous match (b) when the match is on the same or next line (c) when we're
	% start with a new word.
	%
	% (d) we also have to handle the last word slightly differently since it's
	% never going to hit pattern (c)
	%
	% And finally (e) we need to get the recursion started when we have an empty
	% Index and a non-empty Words list.
	-spec to_index(list(),list()) -> list().

	% (d') this is only necessary as an alternate match for scenario (d)
	to_index([],[]) ->
	[];

	% (d) special case of getting the
	to_index([{Word,Lines}\|Idx], []) ->
	lists:reverse([{Word,lists:reverse(Lines)}\|Idx]);

	% (e) to get the recursion started for first word
	to_index([],[{Word,Line}\|Words]) ->
	to_index([{Word,[{Line,Line}]}],Words);

	% (a) matching same word after a skip of more than 1 line
	to_index([{Word,[{_X,Line}\|_Lines]=Wl}\|Idx],[{Word,NewLine}\|Words])
	when NewLine > Line + 1 ->
	to_index([{Word,[{NewLine,NewLine}\|Wl]}\|Idx],Words);

	% (b) this covers case where Line == NewLine and Line + 1 == NewLine
	to_index([{Word,[{X,_Line}\|Lines]}\|Idx],[{Word,NewLine}\|Words]) ->
	to_index([{Word,[{X,NewLine}\|Lines]}\|Idx],Words);

	% (c) matching a new word
	to_index([{Word,Lines}\|Idx],[{NewWord,Line}\|Words]) ->
	to_index([{NewWord,[{Line,Line}]},{Word,lists:reverse(Lines)}\|Idx],Words).

	%+++++++++++++++++++++++++++++++++++++++
	% unit tests
	%+++++++++++++++++++++++++++++++++++++++
	to_index_test_() ->
	[
	?_assertEqual([],to_index([],[])),
	?_assertEqual([{"abc",[{1,3},{6,6}]}],
	to_index([],
	[{"abc",1},{"abc",2},{"abc",2},{"abc",3},{"abc",6}])),
	?_assertEqual([{"abc",[{1,3},{6,6}]}],
	to_index([{"abc",[{1,2}]}],
	[{"abc",2},{"abc",3},{"abc",6}]))
	].

	%-------------------------------------------------------------------------------
	% converts the file (a list of strings) into a list of lines. A line object
	% will be in the format: {9,["abc","def","mno"]}, a line number followed by
	% a list of words.
	-spec to_lines(integer(),list()) -> list().
	to_lines(_N,[]) ->
	[];
	to_lines(N,[Line\|Lines]) ->
	Words = to_words(Line),
	case Words of
	[] ->
	to_lines(N+1,Lines); % to skip blank lines
	_ ->
	[{N,to_words(Line)}\|to_lines(N+1,Lines)]
	end.

	%+++++++++++++++++++++++++++++++++++++++
	% unit tests
	%+++++++++++++++++++++++++++++++++++++++
	to_lines_test_() ->
	[?_assertEqual([],to_lines(13,[])),
	?_assertEqual([{1,["abcd"]},{2,["abcd"]}],
	to_lines(1,["abcd","abcd"])),
	?_assertEqual([{9,["abcd"]},{10,["abcd"]}],
	to_lines(9,["abcd","abcd"])),
	?_assertEqual([{1,["abcd"]},{3,["abcd"]}],
	to_lines(1,["abcd"," ","abcd"]))
	].

	%-------------------------------------------------------------------------------
	% converts a string into a list of words.
	% all words will be handled in lower case.
	% remove any leading or trailing punctuation
	% filter out any words of length <= 3
	% See filter_words/1 for adjustments of the selected words or word formats.
	%
	-spec to_words(string()) -> list().
	to_words(L) ->
	filter_words(string:tokens(L," \n\r\t")).

	%+++++++++++++++++++++++++++++++++++++++
	% unit tests
	%+++++++++++++++++++++++++++++++++++++++
	to_words_test_() ->
	[?_assertEqual([],to_words("")),
	?_assertEqual(["once"],to_words("Once")),
	?_assertEqual(["once","twice","three"],to_words("Once twice thrEE"))].

	%-------------------------------------------------------------------------------
	% filter / normalize the list of words.
	% NOTE: We ignore any word of 3 or less characters, this will include some
	% words which potentially should be indexed ... depends on the use case
	%
	% TODO: Some words would be better to index with capital letters e.g. Names
	% or abbreviations. This will not be addressed here. One good approach
	% would be to only lowercase if the word is init-capped, but would
	% not handle names (e.g. Abraham), so how this should be implemented
	% really would dependend on the intended use of the index results.
	filter_words([]) ->
	[];
	filter_words([Word\|Words]) ->
	% this regex is going to remove leading an trailing punctuation
	% and spaces, but should leave "internal" punctuation in place
	NewWord = re:replace(Word,
	"(?\|^[^[:alpha:]])(.?)(?\|[^[:alpha:]]*$)","\\1",
	[{return,list}]),
	L = length(NewWord),
	case L of
	L when L > 3 ->
	% for words longer than 3 characters, we'll keep the lower case
	[string:to_lower(NewWord)\|filter_words(Words)];
	_ ->
	% ignore any words with 3 or less characters
	filter_words(Words)
	end.

	%+++++++++++++++++++++++++++++++++++++++
	% unit tests
	%+++++++++++++++++++++++++++++++++++++++
	filter_words_test_() ->
	[
	?_assertEqual([],filter_words([])),
	?_assertEqual(["hello"],filter_words(["hello,","the","a","----"])),
	?_assertEqual(["hello","goodbye"],filter_words([".hello,"," goodbye "])),
	?_assertEqual(["don't","pre-load"],filter_words(["don't","pre-load"]))
	].

	%-------------------------------------------------------------------------------
	% take a list of line objects and convert them into a list of word objects.
	% line: {9, ["abc","def"]}
	% word: {"abc",9}
	%
	% TODO: This is concatenating two lists at each step. A more efficient
	% approach would be to flatten the compound list at the end, or
	% perhaps find a suitable tail recursive approach and reverse the
	% the resulting list at the last. reversing could be skipped with
	% the apriori knowledge the next step is going to lexicographically
	% sort these results.
	-spec flat_line(list()) -> list().
	flat_line([]) ->
	[];
	flat_line([L\|Ls]) ->
	flattener(L) ++ flat_line(Ls).

	%+++++++++++++++++++++++++++++++++++++++
	% unit tests
	%+++++++++++++++++++++++++++++++++++++++
	flat_line_test_() ->
	[
	?_assertEqual([],flat_line([])),
	?_assertEqual([{"abc",1},{"def",2}],
	flat_line([{1,["abc"]},{2,["def"]}]))
	].

	%-------------------------------------------------------------------------------
	% this is just a step within the flat_line/1 process, its taking a line object
	% with the format: {9, ["abc","def"]}, and pivoting to the word list format
	% => [{"abc",9},{"def",9}]
	-spec flattener({integer(),list()}) -> list().
	flattener({_N,[]}) ->
	[];
	flattener({N,[W\|Ws]}) ->
	[{W,N}\|flattener({N,Ws})].

	%+++++++++++++++++++++++++++++++++++++++
	% unit tests
	%+++++++++++++++++++++++++++++++++++++++
	flattenner_test_() ->
	[
	?_assertEqual([],flattener({9,""})),
	?_assertEqual([],flattener({3,[]})),
	?_assertEqual([{"abc",9},{"def",9}],flattener({9,["abc","def"]}))
	].

	%-------------------------------------------------------------------------------
	% Used to read a file into a list of lines.
	% Example files available in
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	% Get the contents of a text file into a list of lines.
	% Each line has its trailing newline removed.
	-spec get_file_contents(string()) -> list().
	get_file_contents(Name) ->
	{ok,File} = file:open(Name,[read]),
	Rev = get_all_lines(File,[]),
	lists:reverse(Rev).

	%-------------------------------------------------------------------------------
	% Auxiliary function for get_file_contents.
	% Not exported.
	%
	-spec get_all_lines(io:device(),list()) -> list().
	get_all_lines(File,Partial) ->
	case io:get_line(File,"") of
	eof -> file:close(File),
	Partial;
	Line -> {Strip,_} = lists:split(length(Line)-1,Line),
	get_all_lines(File,[Strip\|Partial])
	end.

	%-------------------------------------------------------------------------------
	% Show the contents of a list of strings.
	% Can be used to check the results of calling get_file_contents.
	%
	-spec show_file_contents(list()) -> ok.
	show_file_contents([L\|Ls]) ->
	io:format("~s~n",[L]),
	show_file_contents(Ls);
	show_file_contents([]) ->
	ok.