Skip to content

Instantly share code, notes, and snippets.

@mareknowak
Created May 23, 2020 17:49
Show Gist options
  • Save mareknowak/c8c8018b644e13c622a72fdbaee0a8ff to your computer and use it in GitHub Desktop.
Save mareknowak/c8c8018b644e13c622a72fdbaee0a8ff to your computer and use it in GitHub Desktop.
Programming challenge: indexing a file - FP in Erlang 3.3
%% FP in Erlang 3.3
%%
%% Programming challenge: indexing a file
-module(index).
-export(
[ get_file_contents/1
, show_file_contents/1
, split_index/2
, split_index_bin/2
, write_list/2
, traverse/3
, main/2 % main function
, test/0
]).
%% To generate index from a text file use main/2:
%% Example:
%% index:main("dickens-christmas.txt", "index.txt")
%%
%% Output of the main function contains also times of execution of reading from
%% file, generating index and writing to file.
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
get_file_contents(Name) ->
{ok,File} = file:open(Name,[read]),
Rev = get_all_lines(File,[]),
lists:reverse(Rev).
% Auxiliary function for get_file_contents.
% Not exported.
get_all_lines(File,Partial) ->
case io:get_line(File,"") of
eof -> file:close(File),
Partial;
Line -> {Strip,_} = lists:split(length(Line)-1,Line),
get_all_lines(File,[Strip|Partial])
end.
% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.
show_file_contents([L|Ls]) ->
io:format("~s~n",[L]),
show_file_contents(Ls);
show_file_contents([]) ->
ok.
%% Auxiliary functions
%% @doc Test whether first argument is a member of the second argument (list)
%%
member(_X, []) -> false;
member(X, [X | _Ys]) -> true;
member(X, [_Y | Ys]) -> member(X, Ys).
test_member() ->
true = member(2,[2,0,0,1]),
false = member(20,[2,0,0,1]),
ok.
%% @doc Is a word common?
is_common(Word) ->
Common =
[ "about"
, "after"
, "again"
, "all"
, "and"
, "any"
, "are"
, "been"
, "before"
, "but"
, "came"
, "can"
, "come"
, "could"
, "couldn't"
, "did"
, "didn't"
, "don't"
, "done"
, "every"
, "for"
, "from"
, "had"
, "has"
, "have"
, "having"
, "her"
, "him"
, "himself"
, "his"
, "i'd"
, "i'll"
, "i'm"
, "i've"
, "into"
, "isn't"
, "it's"
, "its"
, "knew"
, "know"
, "like"
, "man"
, "man's"
, "men"
, "more"
, "mrs"
, "much"
, "not"
, "now"
, "out"
, "said"
, "she"
, "should"
, "than"
, "that"
, "that's"
, "the"
, "their"
, "them"
, "then"
, "there"
, "there's"
, "these"
, "they"
, "they'd"
, "this"
, "those"
, "upon"
, "very"
, "was"
, "wasn't"
, "were"
, "what"
, "when"
, "will"
, "with"
, "would"
, "wouldn't"
, "you"
, "you'd"
, "you'll"
, "you're"
, "your"
],
member(Word, Common).
%% @doc Input: a list of words.
%% Output: list of words not shorter then 3 chars and not common
clean([], Cleaned) -> Cleaned;
clean([Word|Words], Cleaned) ->
case length(Word) < 3 of
true -> clean(Words, Cleaned);
false ->
%% drop common words
case is_common(Word) of
true -> clean(Words, Cleaned);
false -> clean(Words, [Word|Cleaned])
end
end.
clean(Words) ->
clean(Words, []).
%% @doc Get words from a line
line_to_words(Line) ->
%% Let's be case-agnostic
CF = string:casefold(Line),
%% return strings separated by a char
Words = string:lexemes(CF, "\s\t.,:;-+()[]{}~*=%&#\"!-_\\`?"),
clean(Words).
%% Auxiliary functions that relate list of integers and list of ranges
%%
%% @doc Convert ranges to list of ordered integers
range_to_list(Range) ->
{From, To} = Range,
lists:seq(From, To).
test_range_to_list() ->
[1, 2, 3] = range_to_list({1, 3}),
[11, 12, 13] = range_to_list({11, 13}),
[8] = range_to_list({8, 8}),
ok.
ranges_to_list([], Out) -> Out;
ranges_to_list([R|Rs], Out) ->
L = range_to_list(R),
ranges_to_list(Rs, Out ++ L).
ranges_to_list(Ranges) ->
ranges_to_list(Ranges, []).
test_ranges_to_list() ->
[3, 4] = ranges_to_list([{3,4}]),
[3, 4, 5, 7, 11, 12, 13] = ranges_to_list([{3,5},{7,7},{11,13}]),
ok.
%% @doc Input: an integer and a list of integers
%% Output Range for given integer and the rest of list of integers
get_range_aux({From, To}, []) ->
{{From, To}, []};
get_range_aux({From, To}, [N| Ints]) when N =< To->
get_range_aux({From, To}, Ints);
get_range_aux({From, To}, [N| Ints]) ->
case N == To + 1 of
true -> get_range_aux({From, N}, Ints);
false -> {{From, To}, [N| Ints]}
end.
get_range(N, List) ->
get_range_aux({N, N}, List).
test_get_range() ->
{{1,3}, [5, 6]} = get_range(1, [1,2,3,5,6]),
{{4,6}, [8, 9]} = get_range(4, [1,2,3,4,5,6, 8, 9]),
ok.
%% @doc Convert list of integers to list of ranges
list_to_ranges(Ranges, []) -> lists:reverse(Ranges);
list_to_ranges(Ranges, [N| Ints]) ->
{Range, Rest} = get_range(N, Ints),
list_to_ranges([Range|Ranges], Rest).
list_to_ranges(List) ->
list_to_ranges([], List).
test_list_to_ranges() ->
[{3,5},{7,7},{11,13}] = list_to_ranges([3, 4, 5, 7, 11, 12, 13]),
[{3,5}] = list_to_ranges([3, 4, 4, 4, 5]),
ok.
%% @doc Return list with inserted element in the correct place
insert(Xs, [], Z) ->
lists:reverse(Xs) ++ [Z];
insert(Xs, [Y| Ys], Z) when Z =< Y ->
lists:reverse(Xs) ++ [Z, Y| Ys];
insert(Xs, [Y| Ys], Z) ->
insert([Y| Xs], Ys, Z).
insert(List, Z) ->
insert([], List, Z).
test_insert() ->
[1, 2, 3, 4] = insert([1, 2, 4], 3),
[1, 2, 2, 3, 4] = insert([1, 2, 3, 4], 2),
[1, 2, 3, 4] = insert([1, 2, 3], 4),
ok.
%% @doc Add N to ranges
update_ranges(Ranges, N) ->
L = ranges_to_list(Ranges),
NewL = insert(L, N),
list_to_ranges(NewL).
test_update_ranges() ->
[{3,5}] = update_ranges([{3,4}], 5),
[{3,4}, {7,7}, {19,23}] = update_ranges([{3,4}, {7,7}, {19,23}], 20),
ok.
%% @doc Split index (linear version)
%% Left contains elements with words less than Word
%% Right - not less than Word
split_index(Word, Index) ->
IsLess =
fun (Ind) ->
{W, _} = Ind,
W < Word
end,
lists:partition(IsLess, Index).
test_split_index() ->
Index = [{"a", [{1,1}]}, {"b", [{2,3}]}],
Left = [{"a", [{1,1}]}],
Right = [{"b", [{2,3}]}],
{Left, Right} = split_index("b", Index),
ok.
%% @doc Add Left list to Right
add([], Sum) -> Sum;
add([L| Left], Right) ->
add(Left, [L| Right]).
%% @doc Split index (binary version)
%% Left contains elements with words less than Word
%% Right - not less than Word
%% ToSplit - list to be splitted into Left and Right
%%
%% To be tail recursive I avoid using ++ so lists must be reversed back and
%% forth
%%
%% Binary split is faster than linear one. For "dickens-christmat.txt" it takes
%% about 1s (vs. 2s in linear case).
split_index_bin(Left, Right, _Word, []) -> {lists:reverse(Left), Right};
split_index_bin(Left, Right, Word, [Element]) ->
{W, _} = Element,
case W < Word of
true -> split_index_bin([Element| Left], Right, Word, []);
false -> split_index_bin(Left, [Element| Right], Word, [])
end;
split_index_bin(Left, Right, Word, ToSplit) ->
N = length(ToSplit),
{L, R} = lists:split(N div 2, ToSplit),
{W, _} = hd(R),
case W < Word of
%% all words in L are less then Word so add L
%% to Left
true ->
split_index_bin(add(L, Left), Right, Word, R);
%% Word is not greater than a word in R so
%% R is joined with Right
false ->
split_index_bin(Left, add(lists:reverse(R), Right), Word, L)
end.
split_index_bin(_Word, []) -> {[], []};
split_index_bin(Word, Index) ->
split_index_bin([], [], Word, Index).
test_split_index_bin() ->
Index = [{"a", [{1,1}]}, {"b", [{2,3}]}],
Left = [{"a", [{1,1}]}],
Right = [{"b", [{2,3}]}],
{Left, Right} = split_index_bin("b", Index),
{Index, []} = split_index_bin("c", Index),
{[], Index} = split_index_bin("a", Index),
ok.
update_index({Word, Line}, []) -> [{Word, [{Line, Line}]}];
update_index({Word, Line}, Index) ->
%% Split Index
%% Left contains elements less than Word
%%
%% Binary split for "dickens-christmas.txt" is 2x faster than
%% linear split. Function main/2 measures those times.
%%
%% Uncomment to switch between different flavours of split index
%%
%% {Left, Right} = split_index(Word, Index), % linear split
{Left, Right} = split_index_bin(Word, Index), % binary split
case length(Right) == 0 of
%% Word is the largest element in the index
true -> Left ++ [{Word, [{Line, Line}]}];
false ->
%% take an element from Right
%% and compare to Word
[{W, Ranges}| Rest] = Right,
case Word == W of
true ->
NewRanges = update_ranges(Ranges, Line),
Left ++ [{W, NewRanges}| Rest];
false ->
%% Add element in front of Right
NewRight = [{Word, [{Line, Line}]}, {W, Ranges}| Rest],
Left ++ NewRight
end
end.
test_update_index() ->
[{"elephant", [{8, 8}]}] = update_index({"elephant", 8}, []),
[{"elephant", [{8, 10}]}, {"home", [{1, 2}]}] =
update_index(
%% new word
{"elephant", 8},
%% index
[{"elephant", [{9,10}] }, {"home", [{1, 2}]}]),
ok.
%% @doc Write a list to file
write_list(_File, []) -> ok;
write_list(File, [Word| Words]) ->
file:write_file(File, io_lib:fwrite("~p\n", [Word]), [append]),
write_list(File, Words).
add_line_number(_N, [], Out) -> Out;
add_line_number(N, [L|Ls], Out) ->
add_line_number(N, Ls, [{L, N}|Out]).
words_to_index([], Index) -> Index;
words_to_index([L|Ls], Index) ->
NewIndex = update_index(L, Index),
words_to_index(Ls, NewIndex).
traverse(_N, [], Index) -> Index;
traverse(N, [Line| Lines], Index) ->
Words = line_to_words(Line),
Inds = add_line_number(N, Words, []),
NewIndex = words_to_index(Inds, Index),
traverse(N + 1, Lines, NewIndex).
%% @doc Main function
%% Input, Output - names of files
main(Input, Output) ->
%% Read file
%% Timer measures execution time of a function
{Time1, Lines} = timer:tc(index, get_file_contents, [Input]),
{Time2, Index} = timer:tc(index, traverse, [1, Lines, []]),
file:write_file(Output, io_lib:fwrite("Reading from file takes ~p ms.\n\n", [Time1])),
%% Generating index for "dickens-christmas.txt" takes about 1s for if
%% update_index uses binary split and 2s for linear split
file:write_file(Output, io_lib:fwrite("Generating index takes ~p ms.\n\n", [Time2]), [append]),
{Time3, _} = timer:tc(index, write_list, [Output, Index]),
%% Writing to file takes about 4s.
file:write_file(Output, io_lib:fwrite("\nWriting to file takes ~p ms.\n", [Time3]), [append]),
ok.
test() ->
ok = test_range_to_list(),
ok = test_ranges_to_list(),
ok = test_member(),
ok = test_get_range(),
ok = test_list_to_ranges(),
ok = test_insert(),
ok = test_update_ranges(),
ok = test_split_index(),
ok = test_split_index_bin(),
ok = test_update_index(),
ok.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment