Created
March 19, 2017 17:21
-
-
Save shakerlxxv/a00119f7776295b4105351521c3d0a20 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%============================================================================== | |
%% Univ. Kent Functional Programing with Erlang | |
%% Week 2: Part 20 - indexing a file | |
%% | |
%% Description: | |
%% Use the main function main/1 to create your output. It will produce | |
%% a list of data structures: { "foo" , [{3,5},{7,7},{11,13}] } meaning | |
%% the word "foo" occurs on lines 3, 4, 5, 7, 11, 12 and 13 in the file. | |
%% | |
%% Enhancements: | |
%% ------------- | |
%% 1. Remove short words: | |
%% I've added to the to_words/1 method the logic to remove words | |
%% with length <= 3. NOTE: this is not entirely a good idea. | |
%% | |
%% 2. Sorting lexicographically: | |
%% This was part of my original data representation, so not necessary | |
%% The "compression" from a list of Words {"abc",9} to the index format | |
%% is based heavily on the list of words being sorted. | |
%% | |
%% 3. Normalising words "Foo" and "foo" are the same: | |
%% This is also included in my original solution using string:to_lower | |
%% from the standard modules. | |
%% | |
%% 4. Normalising for common endings, plurals etc. | |
%% TODO: This is not done or attempted. | |
%% | |
%% 5. Make the data representation more efficient. | |
%% TODO: My thinking here is about using this structure for lookup, so | |
%% you'd want to quickly say: "Give me the index entry for word `foo`", | |
%% and right now you can't use standard member/2 approach. So one idea | |
%% would be to store the words and pages as two separate lists | |
%% NOTE: It looks though like lists:keyfind/3 would be useful with they | |
%% the current structure, so I'd do some testing for the actual need | |
%% before beginning structure change. | |
%% | |
%% Author: Brian L. Shaver | |
%% Date : 2017-03-19 | |
%%============================================================================== | |
-module(index). | |
-include_lib("eunit/include/eunit.hrl"). | |
-export([main/1,get_file_contents/1,show_file_contents/1]). | |
%------------------------------------------------------------------------------- | |
% this is the main procedure which will take the Filename through a pipeline | |
% of transformation from: | |
% | |
% 0. list of lines | |
% 1. lines of words | |
% 2. list of words | |
% 3. sorted list of words | |
% 4. final index format | |
% | |
-spec main(string()) -> list(). | |
main(Filename) -> | |
% 0. Read the file into a list of lines. | |
Lines = get_file_contents(Filename), | |
% | |
% 1. convert lines to lines of words with structure: | |
% [{1, ["abc","def"]},{2, ["fgh","mno"]}... | |
Lines_of_Words = to_lines(1,Lines), | |
% | |
% 2. Now flatten into discreet word list {word, line} | |
% [{"abc",1},{"def",1},{"fgh",2},...] | |
Words = flat_line(Lines_of_Words), | |
% | |
% 3. Sort the results so that they are ordered lexicographically by word, | |
% and then numerically by number. As it turns out, this is exactly how | |
% lists:sort treats the structures {"foo",9}. | |
Sorted_Words = lists:sort(Words), | |
% | |
% 4. Now compress the ordered results into the expression requested for the | |
% assignment. | |
% [{"abc",1},{"abc",2},{"abc",5},{"def",2}] | |
% ==> [{"abc",[{1,2},{5,5}]},{"def",[{2,2}]}] | |
to_index([],Sorted_Words). | |
%+++++++++++++++++++++++++++++++++++++++ | |
% unit tests | |
% this is just verifying the results | |
% of lists:sort/1 on [{"def",3},{"abc",4}...] | |
%+++++++++++++++++++++++++++++++++++++++ | |
sorted_words_test_() -> | |
[ | |
?_assertEqual([{"abc",3},{"abc",5},{"bcd",4},{"def",3}], | |
lists:sort([{"def",3},{"abc",3},{"bcd",4},{"abc",5}])) | |
]. | |
%------------------------------------------------------------------------------- | |
% aggegate a sorted list of words down to the desired output format. | |
% since the list of words is sorted, there are basically 3 "match" cases to | |
% handle: (a) when we are matching the same word on a line not consecutive with | |
% previous match (b) when the match is on the same or next line (c) when we're | |
% start with a new word. | |
% | |
% (d) we also have to handle the last word slightly differently since it's | |
% never going to hit pattern (c) | |
% | |
% And finally (e) we need to get the recursion started when we have an empty | |
% Index and a non-empty Words list. | |
-spec to_index(list(),list()) -> list(). | |
% (d') this is only necessary as an alternate match for scenario (d) | |
to_index([],[]) -> | |
[]; | |
% (d) special case of getting the | |
to_index([{Word,Lines}|Idx], []) -> | |
lists:reverse([{Word,lists:reverse(Lines)}|Idx]); | |
% (e) to get the recursion started for first word | |
to_index([],[{Word,Line}|Words]) -> | |
to_index([{Word,[{Line,Line}]}],Words); | |
% (a) matching same word after a skip of more than 1 line | |
to_index([{Word,[{_X,Line}|_Lines]=Wl}|Idx],[{Word,NewLine}|Words]) | |
when NewLine > Line + 1 -> | |
to_index([{Word,[{NewLine,NewLine}|Wl]}|Idx],Words); | |
% (b) this covers case where Line == NewLine and Line + 1 == NewLine | |
to_index([{Word,[{X,_Line}|Lines]}|Idx],[{Word,NewLine}|Words]) -> | |
to_index([{Word,[{X,NewLine}|Lines]}|Idx],Words); | |
% (c) matching a new word | |
to_index([{Word,Lines}|Idx],[{NewWord,Line}|Words]) -> | |
to_index([{NewWord,[{Line,Line}]},{Word,lists:reverse(Lines)}|Idx],Words). | |
%+++++++++++++++++++++++++++++++++++++++ | |
% unit tests | |
%+++++++++++++++++++++++++++++++++++++++ | |
to_index_test_() -> | |
[ | |
?_assertEqual([],to_index([],[])), | |
?_assertEqual([{"abc",[{1,3},{6,6}]}], | |
to_index([], | |
[{"abc",1},{"abc",2},{"abc",2},{"abc",3},{"abc",6}])), | |
?_assertEqual([{"abc",[{1,3},{6,6}]}], | |
to_index([{"abc",[{1,2}]}], | |
[{"abc",2},{"abc",3},{"abc",6}])) | |
]. | |
%------------------------------------------------------------------------------- | |
% converts the file (a list of strings) into a list of lines. A line object | |
% will be in the format: {9,["abc","def","mno"]}, a line number followed by | |
% a list of words. | |
-spec to_lines(integer(),list()) -> list(). | |
to_lines(_N,[]) -> | |
[]; | |
to_lines(N,[Line|Lines]) -> | |
Words = to_words(Line), | |
case Words of | |
[] -> | |
to_lines(N+1,Lines); % to skip blank lines | |
_ -> | |
[{N,to_words(Line)}|to_lines(N+1,Lines)] | |
end. | |
%+++++++++++++++++++++++++++++++++++++++ | |
% unit tests | |
%+++++++++++++++++++++++++++++++++++++++ | |
to_lines_test_() -> | |
[?_assertEqual([],to_lines(13,[])), | |
?_assertEqual([{1,["abcd"]},{2,["abcd"]}], | |
to_lines(1,["abcd","abcd"])), | |
?_assertEqual([{9,["abcd"]},{10,["abcd"]}], | |
to_lines(9,["abcd","abcd"])), | |
?_assertEqual([{1,["abcd"]},{3,["abcd"]}], | |
to_lines(1,["abcd"," ","abcd"])) | |
]. | |
%------------------------------------------------------------------------------- | |
% converts a string into a list of words. | |
% all words will be handled in lower case. | |
% remove any leading or trailing punctuation | |
% filter out any words of length <= 3 | |
% See filter_words/1 for adjustments of the selected words or word formats. | |
% | |
-spec to_words(string()) -> list(). | |
to_words(L) -> | |
filter_words(string:tokens(L," \n\r\t")). | |
%+++++++++++++++++++++++++++++++++++++++ | |
% unit tests | |
%+++++++++++++++++++++++++++++++++++++++ | |
to_words_test_() -> | |
[?_assertEqual([],to_words("")), | |
?_assertEqual(["once"],to_words("Once")), | |
?_assertEqual(["once","twice","three"],to_words("Once twice thrEE"))]. | |
%------------------------------------------------------------------------------- | |
% filter / normalize the list of words. | |
% NOTE: We ignore any word of 3 or less characters, this will include some | |
% words which potentially should be indexed ... depends on the use case | |
% | |
% TODO: Some words would be better to index with capital letters e.g. Names | |
% or abbreviations. This will not be addressed here. One good approach | |
% would be to only lowercase if the word is init-capped, but would | |
% not handle names (e.g. Abraham), so how this should be implemented | |
% really would dependend on the intended use of the index results. | |
filter_words([]) -> | |
[]; | |
filter_words([Word|Words]) -> | |
% this regex is going to remove leading an trailing punctuation | |
% and spaces, but should leave "internal" punctuation in place | |
NewWord = re:replace(Word, | |
"(?|^[^[:alpha:]]*)(.*?)(?|[^[:alpha:]]*$)","\\1", | |
[{return,list}]), | |
L = length(NewWord), | |
case L of | |
L when L > 3 -> | |
% for words longer than 3 characters, we'll keep the lower case | |
[string:to_lower(NewWord)|filter_words(Words)]; | |
_ -> | |
% ignore any words with 3 or less characters | |
filter_words(Words) | |
end. | |
%+++++++++++++++++++++++++++++++++++++++ | |
% unit tests | |
%+++++++++++++++++++++++++++++++++++++++ | |
filter_words_test_() -> | |
[ | |
?_assertEqual([],filter_words([])), | |
?_assertEqual(["hello"],filter_words(["hello,","the","a","----"])), | |
?_assertEqual(["hello","goodbye"],filter_words([".hello,"," goodbye "])), | |
?_assertEqual(["don't","pre-load"],filter_words(["don't","pre-load"])) | |
]. | |
%------------------------------------------------------------------------------- | |
% take a list of line objects and convert them into a list of word objects. | |
% line: {9, ["abc","def"]} | |
% word: {"abc",9} | |
% | |
% TODO: This is concatenating two lists at each step. A more efficient | |
% approach would be to flatten the compound list at the end, or | |
% perhaps find a suitable tail recursive approach and reverse the | |
% the resulting list at the last. reversing could be skipped with | |
% the apriori knowledge the next step is going to lexicographically | |
% sort these results. | |
-spec flat_line(list()) -> list(). | |
flat_line([]) -> | |
[]; | |
flat_line([L|Ls]) -> | |
flattener(L) ++ flat_line(Ls). | |
%+++++++++++++++++++++++++++++++++++++++ | |
% unit tests | |
%+++++++++++++++++++++++++++++++++++++++ | |
flat_line_test_() -> | |
[ | |
?_assertEqual([],flat_line([])), | |
?_assertEqual([{"abc",1},{"def",2}], | |
flat_line([{1,["abc"]},{2,["def"]}])) | |
]. | |
%------------------------------------------------------------------------------- | |
% this is just a step within the flat_line/1 process, its taking a line object | |
% with the format: {9, ["abc","def"]}, and pivoting to the word list format | |
% => [{"abc",9},{"def",9}] | |
-spec flattener({integer(),list()}) -> list(). | |
flattener({_N,[]}) -> | |
[]; | |
flattener({N,[W|Ws]}) -> | |
[{W,N}|flattener({N,Ws})]. | |
%+++++++++++++++++++++++++++++++++++++++ | |
% unit tests | |
%+++++++++++++++++++++++++++++++++++++++ | |
flattenner_test_() -> | |
[ | |
?_assertEqual([],flattener({9,""})), | |
?_assertEqual([],flattener({3,[]})), | |
?_assertEqual([{"abc",9},{"def",9}],flattener({9,["abc","def"]})) | |
]. | |
%------------------------------------------------------------------------------- | |
% Used to read a file into a list of lines. | |
% Example files available in | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Get the contents of a text file into a list of lines. | |
% Each line has its trailing newline removed. | |
-spec get_file_contents(string()) -> list(). | |
get_file_contents(Name) -> | |
{ok,File} = file:open(Name,[read]), | |
Rev = get_all_lines(File,[]), | |
lists:reverse(Rev). | |
%------------------------------------------------------------------------------- | |
% Auxiliary function for get_file_contents. | |
% Not exported. | |
% | |
-spec get_all_lines(io:device(),list()) -> list(). | |
get_all_lines(File,Partial) -> | |
case io:get_line(File,"") of | |
eof -> file:close(File), | |
Partial; | |
Line -> {Strip,_} = lists:split(length(Line)-1,Line), | |
get_all_lines(File,[Strip|Partial]) | |
end. | |
%------------------------------------------------------------------------------- | |
% Show the contents of a list of strings. | |
% Can be used to check the results of calling get_file_contents. | |
% | |
-spec show_file_contents(list()) -> ok. | |
show_file_contents([L|Ls]) -> | |
io:format("~s~n",[L]), | |
show_file_contents(Ls); | |
show_file_contents([]) -> | |
ok. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment