Skip to content

Instantly share code, notes, and snippets.

@shakerlxxv
Created March 19, 2017 17:21
Show Gist options
  • Save shakerlxxv/a00119f7776295b4105351521c3d0a20 to your computer and use it in GitHub Desktop.
Save shakerlxxv/a00119f7776295b4105351521c3d0a20 to your computer and use it in GitHub Desktop.
%%==============================================================================
%% Univ. Kent Functional Programming with Erlang
%% Week 2: Part 20 - indexing a file
%%
%% Description:
%% Use the main function main/1 to create your output. It will produce
%% a list of data structures: { "foo" , [{3,5},{7,7},{11,13}] } meaning
%% the word "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file.
%%
%% Enhancements:
%% -------------
%% 1. Remove short words:
%% I've added to the to_words/1 method the logic to remove words
%% with length <= 3. NOTE: this is not entirely a good idea.
%%
%% 2. Sorting lexicographically:
%% This was part of my original data representation, so not necessary
%% The "compression" from a list of Words {"abc",9} to the index format
%% is based heavily on the list of words being sorted.
%%
%% 3. Normalising words "Foo" and "foo" are the same:
%% This is also included in my original solution using string:to_lower
%% from the standard modules.
%%
%% 4. Normalising for common endings, plurals etc.
%% TODO: This is not done or attempted.
%%
%% 5. Make the data representation more efficient.
%% TODO: My thinking here is about using this structure for lookup, so
%% you'd want to quickly say: "Give me the index entry for word `foo`",
%% and right now you can't use standard member/2 approach. So one idea
%% would be to store the words and pages as two separate lists
%% NOTE: It looks, though, like lists:keyfind/3 would be useful with
%% the current structure, so I'd do some testing for the actual need
%% before beginning structure change.
%%
%% Author: Brian L. Shaver
%% Date : 2017-03-19
%%==============================================================================
-module(index).
-include_lib("eunit/include/eunit.hrl").
-export([main/1,get_file_contents/1,show_file_contents/1]).
%-------------------------------------------------------------------------------
% this is the main procedure which will take the Filename through a pipeline
% of transformation from:
%
% 0. list of lines
% 1. lines of words
% 2. list of words
% 3. sorted list of words
% 4. final index format
%
-spec main(string()) -> list().
main(Filename) ->
    %% 0. Read the file into a list of lines (trailing newlines removed).
    Lines = get_file_contents(Filename),
    %% 1. Pair each non-blank line with its line number:
    %%    [{1, ["abc","def"]}, {2, ["fgh","mno"]}, ...]
    NumberedLines = to_lines(1, Lines),
    %% 2. Flatten into one discrete {Word, LineNo} pair per occurrence:
    %%    [{"abc",1}, {"def",1}, {"fgh",2}, ...]
    Occurrences = flat_line(NumberedLines),
    %% 3. lists:sort/1 on {Word, LineNo} tuples happens to order exactly as
    %%    needed: lexicographically by word, then numerically by line.
    SortedOccurrences = lists:sort(Occurrences),
    %% 4. Compress runs of consecutive line numbers into {First, Last} ranges:
    %%    [{"abc",1},{"abc",2},{"abc",5},{"def",2}]
    %%    ==> [{"abc",[{1,2},{5,5}]},{"def",[{2,2}]}]
    to_index([], SortedOccurrences).
%+++++++++++++++++++++++++++++++++++++++
% unit tests
% this is just verifying the results
% of lists:sort/1 on [{"def",3},{"abc",4}...]
%+++++++++++++++++++++++++++++++++++++++
%% Verifies the sort order lists:sort/1 gives {Word, Line} tuples:
%% lexicographic on the word first, then ascending on the line number.
sorted_words_test_() ->
    Input = [{"def",3},{"abc",3},{"bcd",4},{"abc",5}],
    Expected = [{"abc",3},{"abc",5},{"bcd",4},{"def",3}],
    [?_assertEqual(Expected, lists:sort(Input))].
%-------------------------------------------------------------------------------
% aggregate a sorted list of words down to the desired output format.
% since the list of words is sorted, there are basically 3 "match" cases to
% handle: (a) when we are matching the same word on a line not consecutive with
% the previous match (b) when the match is on the same or next line (c) when we
% start with a new word.
%
% (d) we also have to handle the last word slightly differently since it's
% never going to hit pattern (c)
%
% And finally (e) we need to get the recursion started when we have an empty
% Index and a non-empty Words list.
-spec to_index(list(),list()) -> list().
% (d') base case: accumulator and word list both empty. Only reachable when
% the initial word list is empty (e.g. an empty file), since (e) otherwise
% seeds the accumulator before (d) can fire.
to_index([],[]) ->
[];
% (d) all words consumed: line ranges of the entry under construction (and
% the accumulator itself) were built head-first, so reverse both to restore
% ascending / lexicographic order before returning.
to_index([{Word,Lines}|Idx], []) ->
lists:reverse([{Word,lists:reverse(Lines)}|Idx]);
% (e) to get the recursion started for the first word: seed the accumulator
% with a single-line range {Line,Line}.
to_index([],[{Word,Line}|Words]) ->
to_index([{Word,[{Line,Line}]}],Words);
% (a) matching the same word after a skip of more than 1 line: open a fresh
% {NewLine,NewLine} range in front of the existing (reversed) range list.
to_index([{Word,[{_X,Line}|_Lines]=Wl}|Idx],[{Word,NewLine}|Words])
when NewLine > Line + 1 ->
to_index([{Word,[{NewLine,NewLine}|Wl]}|Idx],Words);
% (b) this covers case where Line == NewLine and Line + 1 == NewLine:
% extend the currently open range {X,_} so it now ends at NewLine.
to_index([{Word,[{X,_Line}|Lines]}|Idx],[{Word,NewLine}|Words]) ->
to_index([{Word,[{X,NewLine}|Lines]}|Idx],Words);
% (c) matching a new word: close out the previous word's entry (reversing
% its ranges back into ascending order) and start a new single-line range.
to_index([{Word,Lines}|Idx],[{NewWord,Line}|Words]) ->
to_index([{NewWord,[{Line,Line}]},{Word,lists:reverse(Lines)}|Idx],Words).
%+++++++++++++++++++++++++++++++++++++++
% unit tests
%+++++++++++++++++++++++++++++++++++++++
%% Exercises the empty case, compression of duplicates/consecutive lines
%% into ranges, and resuming from a partially-built index entry.
to_index_test_() ->
    Words = [{"abc",1},{"abc",2},{"abc",2},{"abc",3},{"abc",6}],
    [?_assertEqual([], to_index([], [])),
     ?_assertEqual([{"abc",[{1,3},{6,6}]}], to_index([], Words)),
     ?_assertEqual([{"abc",[{1,3},{6,6}]}],
                   to_index([{"abc",[{1,2}]}],
                            [{"abc",2},{"abc",3},{"abc",6}]))].
%-------------------------------------------------------------------------------
% converts the file (a list of strings) into a list of lines. A line object
% will be in the format: {9,["abc","def","mno"]}, a line number followed by
% a list of words.
-spec to_lines(integer(),list()) -> list().
%% Tag each line with its number N, tokenizing it into filtered words.
%% Blank (or fully filtered-out) lines are dropped from the result but
%% still advance the line counter, preserving original line numbering.
%% Fix: the previous version called to_words(Line) twice per line — once
%% to bind Words and again when building the result tuple.
to_lines(_N, []) ->
    [];
to_lines(N, [Line | Lines]) ->
    case to_words(Line) of
        [] ->
            % skip blank lines (counter still advances)
            to_lines(N + 1, Lines);
        Words ->
            [{N, Words} | to_lines(N + 1, Lines)]
    end.
%+++++++++++++++++++++++++++++++++++++++
% unit tests
%+++++++++++++++++++++++++++++++++++++++
%% Checks numbering starts from the given seed, and that blank lines are
%% dropped while still advancing the line counter.
to_lines_test_() ->
    [?_assertEqual([], to_lines(13, [])),
     ?_assertEqual([{1,["abcd"]},{2,["abcd"]}], to_lines(1, ["abcd","abcd"])),
     ?_assertEqual([{9,["abcd"]},{10,["abcd"]}], to_lines(9, ["abcd","abcd"])),
     ?_assertEqual([{1,["abcd"]},{3,["abcd"]}],
                   to_lines(1, ["abcd"," ","abcd"]))].
%-------------------------------------------------------------------------------
% converts a string into a list of words.
% all words will be handled in lower case.
% remove any leading or trailing punctuation
% filter out any words of length <= 3
% See filter_words/1 for adjustments of the selected words or word formats.
%
-spec to_words(string()) -> list().
to_words(Line) ->
    %% Split on whitespace, then delegate punctuation stripping, the
    %% length =< 3 filter, and lowercasing to filter_words/1.
    Tokens = string:tokens(Line, " \n\r\t"),
    filter_words(Tokens).
%+++++++++++++++++++++++++++++++++++++++
% unit tests
%+++++++++++++++++++++++++++++++++++++++
%% Lowercasing is applied, and words of length =< 3 are filtered out
%% (none appear in the inputs below, so all survive).
to_words_test_() ->
    [?_assertEqual([], to_words("")),
     ?_assertEqual(["once"], to_words("Once")),
     ?_assertEqual(["once","twice","three"], to_words("Once twice thrEE"))].
%-------------------------------------------------------------------------------
% filter / normalize the list of words.
% NOTE: We ignore any word of 3 or less characters, this will include some
% words which potentially should be indexed ... depends on the use case
%
% TODO: Some words would be better to index with capital letters e.g. Names
% or abbreviations. This will not be addressed here. One good approach
% would be to only lowercase if the word is init-capped, but would
% not handle names (e.g. Abraham), so how this should be implemented
% really would depend on the intended use of the index results.
%% Strip leading/trailing punctuation from each word, drop any word whose
%% stripped form is 3 characters or shorter, and lowercase the survivors.
%% Fix: the old `case L of L when L > 3` rebound L to itself inside its own
%% case clause — a confusing self-shadowing pattern; a plain boolean case
%% expresses the same length test directly.
filter_words([]) ->
    [];
filter_words([Word | Words]) ->
    % the branch-reset regex removes leading and trailing non-alphabetic
    % characters (punctuation, spaces) but leaves "internal" punctuation
    % such as don't / pre-load in place
    Stripped = re:replace(Word,
        "(?|^[^[:alpha:]]*)(.*?)(?|[^[:alpha:]]*$)","\\1",
        [{return,list}]),
    case length(Stripped) > 3 of
        true ->
            % keep words longer than 3 characters, lowercased
            [string:to_lower(Stripped) | filter_words(Words)];
        false ->
            % ignore any word of 3 or fewer characters
            filter_words(Words)
    end.
%+++++++++++++++++++++++++++++++++++++++
% unit tests
%+++++++++++++++++++++++++++++++++++++++
%% Punctuation/space stripping, the length =< 3 filter, and preservation
%% of internal punctuation (apostrophes, hyphens).
filter_words_test_() ->
    [?_assertEqual([], filter_words([])),
     ?_assertEqual(["hello"], filter_words(["hello,","the","a","----"])),
     ?_assertEqual(["hello","goodbye"],
                   filter_words([".hello,"," goodbye "])),
     ?_assertEqual(["don't","pre-load"],
                   filter_words(["don't","pre-load"]))].
%-------------------------------------------------------------------------------
% take a list of line objects and convert them into a list of word objects.
% line: {9, ["abc","def"]}
% word: {"abc",9}
%
% take a list of line objects and convert them into a list of word objects.
% line: {9, ["abc","def"]}
% word: {"abc",9}
%
% Implemented as a single list comprehension, which builds the result in
% one left-to-right pass. (The previous version concatenated a sublist per
% line with ++ at every step — the O(n^2) pattern its own TODO flagged.)
-spec flat_line(list()) -> list().
flat_line(Lines) ->
    [{Word, N} || {N, Words} <- Lines, Word <- Words].
%+++++++++++++++++++++++++++++++++++++++
% unit tests
%+++++++++++++++++++++++++++++++++++++++
%% Word/line pivot across multiple line objects, plus the empty case.
flat_line_test_() ->
    [?_assertEqual([], flat_line([])),
     ?_assertEqual([{"abc",1},{"def",2}],
                   flat_line([{1,["abc"]},{2,["def"]}]))].
%-------------------------------------------------------------------------------
% this is just a step within the flat_line/1 process, its taking a line object
% with the format: {9, ["abc","def"]}, and pivoting to the word list format
% => [{"abc",9},{"def",9}]
-spec flattener({integer(),list()}) -> list().
%% Pivot one line object {9, ["abc","def"]} into word objects
%% [{"abc",9},{"def",9}]. An empty word list yields [].
flattener({LineNo, Words}) ->
    [{Word, LineNo} || Word <- Words].
%+++++++++++++++++++++++++++++++++++++++
% unit tests
%+++++++++++++++++++++++++++++++++++++++
%% Covers "" (which IS the empty list in Erlang), [], and the normal pivot.
flattenner_test_() ->
    [?_assertEqual([], flattener({9, ""})),
     ?_assertEqual([], flattener({3, []})),
     ?_assertEqual([{"abc",9},{"def",9}],
                   flattener({9, ["abc","def"]}))].
%-------------------------------------------------------------------------------
% Used to read a file into a list of lines.
% Example files available in
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
-spec get_file_contents(string()) -> list().
get_file_contents(Name) ->
    %% Crashes with badmatch if the file cannot be opened ("let it crash").
    {ok, Device} = file:open(Name, [read]),
    %% get_all_lines/2 accumulates head-first, so reverse to file order.
    lists:reverse(get_all_lines(Device, [])).
%-------------------------------------------------------------------------------
% Auxiliary function for get_file_contents.
% Not exported.
%
-spec get_all_lines(io:device(),list()) -> list().
%% Read every line from File, stripping the trailing newline, and
%% accumulate them head-first in Partial (caller reverses). Closes the
%% device on eof.
%% Fix: the old lists:split(length(Line)-1, Line) unconditionally chopped
%% the last character of every line, which corrupted a final line that
%% does not end with a newline. Strip $\n only when it is present.
get_all_lines(File,Partial) ->
    case io:get_line(File,"") of
        eof ->
            file:close(File),
            Partial;
        Line ->
            Stripped = case lists:reverse(Line) of
                [$\n | Rest] -> lists:reverse(Rest);
                _ -> Line   % last line of file with no trailing newline
            end,
            get_all_lines(File,[Stripped|Partial])
    end.
%-------------------------------------------------------------------------------
% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.
%
-spec show_file_contents(list()) -> ok.
%% Print each string in the list on its own line; returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n",[Line]) end, Lines),
    ok.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment