Created
May 23, 2020 17:49
-
-
Save mareknowak/c8c8018b644e13c622a72fdbaee0a8ff to your computer and use it in GitHub Desktop.
Programming challenge: indexing a file - FP in Erlang 3.3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% FP in Erlang 3.3 | |
%% | |
%% Programming challenge: indexing a file | |
-module(index). | |
-export( | |
[ get_file_contents/1 | |
, show_file_contents/1 | |
, split_index/2 | |
, split_index_bin/2 | |
, write_list/2 | |
, traverse/3 | |
, main/2 % main function | |
, test/0 | |
]). | |
%% To generate index from a text file use main/2: | |
%% Example: | |
%% index:main("dickens-christmas.txt", "index.txt") | |
%% | |
%% Output of the main function contains also times of execution of reading from | |
%% file, generating index and writing to file. | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Get the contents of a text file into a list of lines. | |
% Each line has its trailing newline removed. | |
get_file_contents(Name) -> | |
{ok,File} = file:open(Name,[read]), | |
Rev = get_all_lines(File,[]), | |
lists:reverse(Rev). | |
% Auxiliary function for get_file_contents. | |
% Not exported. | |
get_all_lines(File,Partial) -> | |
case io:get_line(File,"") of | |
eof -> file:close(File), | |
Partial; | |
Line -> {Strip,_} = lists:split(length(Line)-1,Line), | |
get_all_lines(File,[Strip|Partial]) | |
end. | |
% Show the contents of a list of strings. | |
% Can be used to check the results of calling get_file_contents. | |
show_file_contents([L|Ls]) -> | |
io:format("~s~n",[L]), | |
show_file_contents(Ls); | |
show_file_contents([]) -> | |
ok. | |
%% Auxiliary functions | |
%% @doc Test whether first argument is a member of the second argument (list) | |
%% | |
member(_X, []) -> false; | |
member(X, [X | _Ys]) -> true; | |
member(X, [_Y | Ys]) -> member(X, Ys). | |
test_member() -> | |
true = member(2,[2,0,0,1]), | |
false = member(20,[2,0,0,1]), | |
ok. | |
%% @doc Is a word common? | |
is_common(Word) -> | |
Common = | |
[ "about" | |
, "after" | |
, "again" | |
, "all" | |
, "and" | |
, "any" | |
, "are" | |
, "been" | |
, "before" | |
, "but" | |
, "came" | |
, "can" | |
, "come" | |
, "could" | |
, "couldn't" | |
, "did" | |
, "didn't" | |
, "don't" | |
, "done" | |
, "every" | |
, "for" | |
, "from" | |
, "had" | |
, "has" | |
, "have" | |
, "having" | |
, "her" | |
, "him" | |
, "himself" | |
, "his" | |
, "i'd" | |
, "i'll" | |
, "i'm" | |
, "i've" | |
, "into" | |
, "isn't" | |
, "it's" | |
, "its" | |
, "knew" | |
, "know" | |
, "like" | |
, "man" | |
, "man's" | |
, "men" | |
, "more" | |
, "mrs" | |
, "much" | |
, "not" | |
, "now" | |
, "out" | |
, "said" | |
, "she" | |
, "should" | |
, "than" | |
, "that" | |
, "that's" | |
, "the" | |
, "their" | |
, "them" | |
, "then" | |
, "there" | |
, "there's" | |
, "these" | |
, "they" | |
, "they'd" | |
, "this" | |
, "those" | |
, "upon" | |
, "very" | |
, "was" | |
, "wasn't" | |
, "were" | |
, "what" | |
, "when" | |
, "will" | |
, "with" | |
, "would" | |
, "wouldn't" | |
, "you" | |
, "you'd" | |
, "you'll" | |
, "you're" | |
, "your" | |
], | |
member(Word, Common). | |
%% @doc Input: a list of words. | |
%% Output: list of words not shorter then 3 chars and not common | |
clean([], Cleaned) -> Cleaned; | |
clean([Word|Words], Cleaned) -> | |
case length(Word) < 3 of | |
true -> clean(Words, Cleaned); | |
false -> | |
%% drop common words | |
case is_common(Word) of | |
true -> clean(Words, Cleaned); | |
false -> clean(Words, [Word|Cleaned]) | |
end | |
end. | |
clean(Words) -> | |
clean(Words, []). | |
%% @doc Get words from a line | |
line_to_words(Line) -> | |
%% Let's be case-agnostic | |
CF = string:casefold(Line), | |
%% return strings separated by a char | |
Words = string:lexemes(CF, "\s\t.,:;-+()[]{}~*=%&#\"!-_\\`?"), | |
clean(Words). | |
%% Auxiliary functions that relate list of integers and list of ranges | |
%% | |
%% @doc Convert ranges to list of ordered integers | |
range_to_list(Range) -> | |
{From, To} = Range, | |
lists:seq(From, To). | |
test_range_to_list() -> | |
[1, 2, 3] = range_to_list({1, 3}), | |
[11, 12, 13] = range_to_list({11, 13}), | |
[8] = range_to_list({8, 8}), | |
ok. | |
ranges_to_list([], Out) -> Out; | |
ranges_to_list([R|Rs], Out) -> | |
L = range_to_list(R), | |
ranges_to_list(Rs, Out ++ L). | |
ranges_to_list(Ranges) -> | |
ranges_to_list(Ranges, []). | |
test_ranges_to_list() -> | |
[3, 4] = ranges_to_list([{3,4}]), | |
[3, 4, 5, 7, 11, 12, 13] = ranges_to_list([{3,5},{7,7},{11,13}]), | |
ok. | |
%% @doc Input: an integer and a list of integers | |
%% Output Range for given integer and the rest of list of integers | |
get_range_aux({From, To}, []) -> | |
{{From, To}, []}; | |
get_range_aux({From, To}, [N| Ints]) when N =< To-> | |
get_range_aux({From, To}, Ints); | |
get_range_aux({From, To}, [N| Ints]) -> | |
case N == To + 1 of | |
true -> get_range_aux({From, N}, Ints); | |
false -> {{From, To}, [N| Ints]} | |
end. | |
get_range(N, List) -> | |
get_range_aux({N, N}, List). | |
test_get_range() -> | |
{{1,3}, [5, 6]} = get_range(1, [1,2,3,5,6]), | |
{{4,6}, [8, 9]} = get_range(4, [1,2,3,4,5,6, 8, 9]), | |
ok. | |
%% @doc Convert list of integers to list of ranges | |
list_to_ranges(Ranges, []) -> lists:reverse(Ranges); | |
list_to_ranges(Ranges, [N| Ints]) -> | |
{Range, Rest} = get_range(N, Ints), | |
list_to_ranges([Range|Ranges], Rest). | |
list_to_ranges(List) -> | |
list_to_ranges([], List). | |
test_list_to_ranges() -> | |
[{3,5},{7,7},{11,13}] = list_to_ranges([3, 4, 5, 7, 11, 12, 13]), | |
[{3,5}] = list_to_ranges([3, 4, 4, 4, 5]), | |
ok. | |
%% @doc Return list with inserted element in the correct place | |
insert(Xs, [], Z) -> | |
lists:reverse(Xs) ++ [Z]; | |
insert(Xs, [Y| Ys], Z) when Z =< Y -> | |
lists:reverse(Xs) ++ [Z, Y| Ys]; | |
insert(Xs, [Y| Ys], Z) -> | |
insert([Y| Xs], Ys, Z). | |
insert(List, Z) -> | |
insert([], List, Z). | |
test_insert() -> | |
[1, 2, 3, 4] = insert([1, 2, 4], 3), | |
[1, 2, 2, 3, 4] = insert([1, 2, 3, 4], 2), | |
[1, 2, 3, 4] = insert([1, 2, 3], 4), | |
ok. | |
%% @doc Add N to ranges | |
update_ranges(Ranges, N) -> | |
L = ranges_to_list(Ranges), | |
NewL = insert(L, N), | |
list_to_ranges(NewL). | |
test_update_ranges() -> | |
[{3,5}] = update_ranges([{3,4}], 5), | |
[{3,4}, {7,7}, {19,23}] = update_ranges([{3,4}, {7,7}, {19,23}], 20), | |
ok. | |
%% @doc Split index (linear version) | |
%% Left contains elements with words less than Word | |
%% Right - not less than Word | |
split_index(Word, Index) -> | |
IsLess = | |
fun (Ind) -> | |
{W, _} = Ind, | |
W < Word | |
end, | |
lists:partition(IsLess, Index). | |
test_split_index() -> | |
Index = [{"a", [{1,1}]}, {"b", [{2,3}]}], | |
Left = [{"a", [{1,1}]}], | |
Right = [{"b", [{2,3}]}], | |
{Left, Right} = split_index("b", Index), | |
ok. | |
%% @doc Add Left list to Right | |
add([], Sum) -> Sum; | |
add([L| Left], Right) -> | |
add(Left, [L| Right]). | |
%% @doc Split index (binary version) | |
%% Left contains elements with words less than Word | |
%% Right - not less than Word | |
%% ToSplit - list to be splitted into Left and Right | |
%% | |
%% To be tail recursive I avoid using ++ so lists must be reversed back and | |
%% forth | |
%% | |
%% Binary split is faster than linear one. For "dickens-christmat.txt" it takes | |
%% about 1s (vs. 2s in linear case). | |
split_index_bin(Left, Right, _Word, []) -> {lists:reverse(Left), Right}; | |
split_index_bin(Left, Right, Word, [Element]) -> | |
{W, _} = Element, | |
case W < Word of | |
true -> split_index_bin([Element| Left], Right, Word, []); | |
false -> split_index_bin(Left, [Element| Right], Word, []) | |
end; | |
split_index_bin(Left, Right, Word, ToSplit) -> | |
N = length(ToSplit), | |
{L, R} = lists:split(N div 2, ToSplit), | |
{W, _} = hd(R), | |
case W < Word of | |
%% all words in L are less then Word so add L | |
%% to Left | |
true -> | |
split_index_bin(add(L, Left), Right, Word, R); | |
%% Word is not greater than a word in R so | |
%% R is joined with Right | |
false -> | |
split_index_bin(Left, add(lists:reverse(R), Right), Word, L) | |
end. | |
split_index_bin(_Word, []) -> {[], []}; | |
split_index_bin(Word, Index) -> | |
split_index_bin([], [], Word, Index). | |
test_split_index_bin() -> | |
Index = [{"a", [{1,1}]}, {"b", [{2,3}]}], | |
Left = [{"a", [{1,1}]}], | |
Right = [{"b", [{2,3}]}], | |
{Left, Right} = split_index_bin("b", Index), | |
{Index, []} = split_index_bin("c", Index), | |
{[], Index} = split_index_bin("a", Index), | |
ok. | |
update_index({Word, Line}, []) -> [{Word, [{Line, Line}]}]; | |
update_index({Word, Line}, Index) -> | |
%% Split Index | |
%% Left contains elements less than Word | |
%% | |
%% Binary split for "dickens-christmas.txt" is 2x faster than | |
%% linear split. Function main/2 measures those times. | |
%% | |
%% Uncomment to switch between different flavours of split index | |
%% | |
%% {Left, Right} = split_index(Word, Index), % linear split | |
{Left, Right} = split_index_bin(Word, Index), % binary split | |
case length(Right) == 0 of | |
%% Word is the largest element in the index | |
true -> Left ++ [{Word, [{Line, Line}]}]; | |
false -> | |
%% take an element from Right | |
%% and compare to Word | |
[{W, Ranges}| Rest] = Right, | |
case Word == W of | |
true -> | |
NewRanges = update_ranges(Ranges, Line), | |
Left ++ [{W, NewRanges}| Rest]; | |
false -> | |
%% Add element in front of Right | |
NewRight = [{Word, [{Line, Line}]}, {W, Ranges}| Rest], | |
Left ++ NewRight | |
end | |
end. | |
test_update_index() -> | |
[{"elephant", [{8, 8}]}] = update_index({"elephant", 8}, []), | |
[{"elephant", [{8, 10}]}, {"home", [{1, 2}]}] = | |
update_index( | |
%% new word | |
{"elephant", 8}, | |
%% index | |
[{"elephant", [{9,10}] }, {"home", [{1, 2}]}]), | |
ok. | |
%% @doc Write a list to file | |
write_list(_File, []) -> ok; | |
write_list(File, [Word| Words]) -> | |
file:write_file(File, io_lib:fwrite("~p\n", [Word]), [append]), | |
write_list(File, Words). | |
add_line_number(_N, [], Out) -> Out; | |
add_line_number(N, [L|Ls], Out) -> | |
add_line_number(N, Ls, [{L, N}|Out]). | |
words_to_index([], Index) -> Index; | |
words_to_index([L|Ls], Index) -> | |
NewIndex = update_index(L, Index), | |
words_to_index(Ls, NewIndex). | |
traverse(_N, [], Index) -> Index; | |
traverse(N, [Line| Lines], Index) -> | |
Words = line_to_words(Line), | |
Inds = add_line_number(N, Words, []), | |
NewIndex = words_to_index(Inds, Index), | |
traverse(N + 1, Lines, NewIndex). | |
%% @doc Main function | |
%% Input, Output - names of files | |
main(Input, Output) -> | |
%% Read file | |
%% Timer measures execution time of a function | |
{Time1, Lines} = timer:tc(index, get_file_contents, [Input]), | |
{Time2, Index} = timer:tc(index, traverse, [1, Lines, []]), | |
file:write_file(Output, io_lib:fwrite("Reading from file takes ~p ms.\n\n", [Time1])), | |
%% Generating index for "dickens-christmas.txt" takes about 1s for if | |
%% update_index uses binary split and 2s for linear split | |
file:write_file(Output, io_lib:fwrite("Generating index takes ~p ms.\n\n", [Time2]), [append]), | |
{Time3, _} = timer:tc(index, write_list, [Output, Index]), | |
%% Writing to file takes about 4s. | |
file:write_file(Output, io_lib:fwrite("\nWriting to file takes ~p ms.\n", [Time3]), [append]), | |
ok. | |
test() -> | |
ok = test_range_to_list(), | |
ok = test_ranges_to_list(), | |
ok = test_member(), | |
ok = test_get_range(), | |
ok = test_list_to_ranges(), | |
ok = test_insert(), | |
ok = test_update_ranges(), | |
ok = test_split_index(), | |
ok = test_split_index_bin(), | |
ok = test_update_index(), | |
ok. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment