Last active
May 19, 2020 17:25
-
-
Save gorkaio/bd7fbf814f266dc8efd11e199abcd5c7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-export([get_file_contents/1,show_file_contents/1,index/1,index/2]). | |
-export([split_words_test/0,filter_words_test/0,normalise_words_test/0,words_test/0,index_line_test/0,index_test/0,add_words_test/0,range_test/0]). | |
-define(MIN_WORD_LENGTH, 2). | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
%% Read the text file Name and return its lines as a list of strings,
%% in the order they appear in the file.
%% Line terminators are stripped by get_all_lines/2, which also closes
%% the file. Fails with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Handle} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Handle, [])).
%% Auxiliary function for get_file_contents/1. Not exported.
%% Reads lines from the open device File until end of file, pushing each
%% one onto Partial, so the returned list is in reverse file order.
%% The device is closed before returning, and also before a read error
%% is raised as error({read_failed, Reason}).
%% Only a trailing "\n" is removed from each line: a final line that is
%% not newline-terminated is kept whole rather than losing its last
%% character.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            %% Release the handle before surfacing the failure.
            file:close(File),
            error({read_failed, Reason});
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

%% Drop a single trailing newline from Line, if there is one.
strip_newline(Line) ->
    case lists:suffix("\n", Line) of
        true -> lists:droplast(Line);
        false -> Line
    end.
%% Print every string of the given list on its own line of standard
%% output. Handy for checking the result of get_file_contents/1.
%% Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
%% Assignment %%% | |
%% Build a word index from a list of text lines.
%% index/1 indexes every word; index/2 leaves out the words listed in
%% ExcludeWords. Lines are numbered starting at 1.
%% The result is a list of {Word, Ranges} tuples sorted by Word, where
%% Ranges is what range/1 produces from the line numbers of that word.
index(L) ->
    index(L, []).

index(L, ExcludeWords) ->
    index(L, ExcludeWords, 1, []).

%% Worker: LineCount is the number given to the first line of the list and
%% Ac is the {Word, LineNumbers} index gathered so far.
index(Lines, ExcludeWords, LineCount, Ac) ->
    Step = fun(Line, {LineNo, Index}) ->
        Entries = index_line(Line, ExcludeWords, LineNo),
        {LineNo + 1, add_words(Entries, Index)}
    end,
    {_NextLineNo, Collected} = lists:foldl(Step, {LineCount, Ac}, Lines),
    %% Sorting on the word first is safe: the mapping below leaves keys as is.
    Sorted = lists:keysort(1, Collected),
    lists:map(fun({Word, LineNos}) -> {Word, range(LineNos)} end, Sorted).
%% Self-check for index/1 and index/2.
%% Returns passed, or fails with a badmatch on the first wrong result.
index_test() ->
    Checks = [
        {[], index([])},
        {[], index([""])},
        {[], index(["doh!"], ["doh"])},
        {[{"hello", [{1,1}]}], index(["hello world!"], ["world"])},
        {[{"hello",[{1,1}]},{"world",[{1,1}]}], index(["hello world!"])},
        {[{"hello",[{1,2}]},{"universe",[{2,2}]},{"world",[{1,1}]}],
         index(["hello world", "hello universe"])},
        {[{"baby", [{1,1},{3,3}]},{"hello",[{1,3}]},{"universe",[{3,3}]}],
         index(["hello baby", "hello world", "hello baby universe"], ["world"])},
        {[{"universe",[{2,2}]}, {"world",[{1,1}]}],
         index(["hello world", "hello universe"], ["hello"])}
    ],
    %% Both variables are bound, so `=` acts as an equality assertion.
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Checks),
    passed.
%% Index a single line: one {Word, [LineCount]} entry for every word that
%% words/2 returns for the line S.
index_line(S, ExcludeWords, LineCount) ->
    [{Word, [LineCount]} || Word <- words(S, ExcludeWords)].
%% Self-check for index_line/3.
%% Returns passed, or fails with a badmatch on the first wrong result.
index_line_test() ->
    Cases = [
        {[], "", [], 2},
        {[{"foo", [2]}, {"nice", [2]}, {"word", [2]}], "foo is a nice word! foo! foo!", [], 2}
    ],
    lists:foreach(
        fun({Expected, Line, Exclude, LineNo}) ->
            Expected = index_line(Line, Exclude, LineNo)
        end,
        Cases
    ),
    passed.
%% Extract the indexable words of the string S: split it, lower-case the
%% pieces, then filter them with filter_words/2 (which also sorts and
%% removes duplicates).
%% words/1 excludes nothing; words/2 additionally drops the words listed
%% in ExcludeWords.
words(S) ->
    words(S, []).

words(S, ExcludeWords) ->
    Normalised = normalise_words(split_words(S)),
    %% The candidates have been lower-cased, so the exclusions must be too;
    %% otherwise an exclusion such as "The" could never match anything.
    filter_words(Normalised, normalise_words(ExcludeWords)).
%% Self-check for words/1 and words/2.
%% Returns passed, or fails with a badmatch on the first wrong result.
words_test() ->
    Sentence = "Marley was dead: to begin with. There is no doubt whatever about that.",
    Checks = [
        {[], words("")},
        {[], words(" ")},
        {[], words("me and I", ["and"])},
        {["about","begin","dead","doubt","marley","that","there","was","whatever","with"],
         words(Sentence)},
        {["about","begin","dead","doubt","marley","there","was","whatever"],
         words(Sentence, ["that","with"])}
    ],
    %% Both variables are bound, so `=` acts as an equality assertion.
    lists:foreach(fun({Expected, Actual}) -> Expected = Actual end, Checks),
    passed.
%% Split the string S into a list of words.
%% Every single character outside the POSIX alnum class is a separator;
%% the empty pieces left between adjacent separators are discarded.
split_words(S) ->
    Pieces = re:split(S, "[[:^alnum:]]", [{return, list}]),
    [Piece || Piece <- Pieces, Piece =/= []].
%% Self-check for split_words/1.
%% Returns passed, or fails with a badmatch on the first wrong result.
split_words_test() ->
    Cases = [
        {[], ""},
        {[], " "},
        {["hello", "world"], "hello world"},
        {["hello", "world"], "hello 'world'"},
        {["December", "1843", "Stave", "1", "Marley", "s", "Ghost"],
         "December, 1843. Stave 1: Marley's Ghost"}
    ],
    lists:foreach(fun({Expected, Input}) -> Expected = split_words(Input) end, Cases),
    passed.
%% Normalise a list of words by converting each one to lower case.
normalise_words(L) ->
    [string:lowercase(Word) || Word <- L].
%% Self-check for normalise_words/1.
%% Returns passed, or fails with a badmatch on the first wrong result.
normalise_words_test() ->
    Cases = [
        {[], []},
        {["december", "1843", "stave", "1", "marley", "s", "ghost"],
         ["DeCeMber", "1843", "STAVE", "1", "MarleY", "s", "Ghost"]}
    ],
    lists:foreach(fun({Expected, Input}) -> Expected = normalise_words(Input) end, Cases),
    passed.
%% Filter the words of a list:
%% keep only the words that are longer than ?MIN_WORD_LENGTH characters
%% (words of exactly that length are dropped too) and, for filter_words/2,
%% that do not appear in Exclude.
%% The result is sorted and free of duplicates.
filter_words(L) ->
    filter_words(L, []).

filter_words(L, Exclude) ->
    %% A membership test is used instead of `--`, because `--` removes only
    %% the first occurrence of each excluded word and would let a repeated
    %% excluded word through.
    Keep = fun(Word) ->
        length(Word) > ?MIN_WORD_LENGTH andalso not lists:member(Word, Exclude)
    end,
    lists:usort(lists:filter(Keep, L)). % Sorts words and removes duplicates
%% Self-check for filter_words/1 and filter_words/2.
%% Returns passed, or fails with a badmatch on the first wrong result.
filter_words_test() ->
    Short = ["it", "is", "you", "my", "love"],
    Title = ["December", "1843", "Stave", "1", "Marley", "s", "Ghost"],
    WithoutExclusions = [
        {[], []},
        {["love", "you"], Short},
        {["1843","December","Ghost","Marley","Stave"], Title}
    ],
    lists:foreach(
        fun({Expected, Words}) -> Expected = filter_words(Words) end,
        WithoutExclusions
    ),
    WithExclusions = [
        {["love", "you"], Short, []},
        {["you"], Short, ["love"]},
        {["1843","December","Stave"], Title, ["Marley", "Ghost"]},
        {["1843","December","Stave"],
         ["December", "1843", "December", "Stave", "1", "Marley", "s", "Ghost"],
         ["Marley", "Ghost"]}
    ],
    lists:foreach(
        fun({Expected, Words, Exclude}) -> Expected = filter_words(Words, Exclude) end,
        WithExclusions
    ),
    passed.
%% Add a list of {Word, LineNumbers} entries to an index of the same shape.
%% An entry whose word is already indexed is merged with the existing one
%% (line numbers combined, sorted and de-duplicated) and moved to the front
%% of the index; any other entry is simply put at the front.
add_words(Entries, Index) ->
    Merge = fun({Word, WordLines} = Entry, Acc) ->
        case lists:keytake(Word, 1, Acc) of
            {value, {Word, Known}, Others} ->
                [{Word, lists:usort(WordLines ++ Known)} | Others];
            _ ->
                [Entry | Acc]
        end
    end,
    lists:foldl(Merge, Index, Entries).
%% Self-check for add_words/2.
%% Returns passed, or fails with a badmatch on the first wrong result.
add_words_test() ->
    Cases = [
        {[], [], []},
        {[{"hello", [1]}], [], [{"hello", [1]}]},
        {[{"hello",[1]},{"bye",[2]}], [{"hello", [1]}], [{"bye", [2]}]},
        {[{"hello",[1,2]}], [{"hello", [1]}], [{"hello", [2]}]},
        {[{"hello",[1,2]},{"bye", [3]}], [{"hello", [1]}], [{"bye", [3]}, {"hello", [2]}]},
        {[{"bye", [1,3]},{"hello",[1,2]}],
         [{"hello", [1]}, {"bye", [1]}], [{"bye", [3]}, {"hello", [2]}]},
        {[{"bye", [1]},{"hello",[1,2]}],
         [{"hello", [1]}, {"bye", [1]}], [{"bye", [1]}, {"hello", [2]}]}
    ],
    lists:foreach(
        fun({Expected, Entries, Index}) -> Expected = add_words(Entries, Index) end,
        Cases
    ),
    passed.
%% Turn a list of numbers into a list of {Start, Last} ranges.
%% An element that is exactly one more than the element before it extends
%% the current range; any other element closes that range and opens a new
%% one. An empty list gives an empty list.
range([]) ->
    [];
range([First | Rest]) ->
    range(Rest, First, First, []).

%% Worker: Start..Last is the range currently being grown and Ac holds the
%% ranges already closed, most recent first.
range([], Last, Start, Ac) ->
    lists:reverse([{Start, Last} | Ac]);
range([Next | Rest], Last, Start, Ac) ->
    if
        Last + 1 == Next -> range(Rest, Next, Start, Ac);
        true -> range(Rest, Next, Next, [{Start, Last} | Ac])
    end.
%% Self-check for range/1.
%% Returns passed, or fails with a badmatch on the first wrong result.
range_test() ->
    Cases = [
        {[], []},
        {[{1,1}], [1]},
        {[{1,2}], [1,2]},
        {[{1,1}, {4,4}], [1,4]},
        {[{1,4}], [1,2,3,4]},
        {[{1,1}, {3,5}, {7,7}, {9,10}], [1,3,4,5,7,9,10]}
    ],
    lists:foreach(fun({Expected, Input}) -> Expected = range(Input) end, Cases),
    passed.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
nice