Created
March 8, 2017 12:45
-
-
Save colinbankier/3273bdca9fda41662ef9189e07142b2b to your computer and use it in GitHub Desktop.
Indexing a file - Erlang
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-export([get_index/1]). | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Get the contents of a text file into a list of lines. | |
% Each line has its trailing newline removed. | |
get_file_contents(Name) -> | |
{ok,File} = file:open(Name,[read]), | |
Rev = get_all_lines(File,[]), | |
lists:reverse(Rev). | |
% Auxiliary function for get_file_contents. | |
% Not exported. | |
get_all_lines(File,Partial) -> | |
case io:get_line(File,"") of | |
eof -> file:close(File), | |
Partial; | |
Line -> {Strip,_} = lists:split(length(Line)-1,Line), | |
get_all_lines(File,[Strip|Partial]) | |
end. | |
% The main entry to the module, returns the index for the given file name. | |
get_index(FileName) -> | |
Contents = get_file_contents(FileName), | |
build_index(Contents, 1, #{}). | |
% Builds the index recursively, accumulating entries and associated line | |
% numbers in the Index parameter. | |
% I used a Map data structure here for easier lookup and adding to as lines are | |
% processed. | |
% In the base case, when in the index is complete, the list of line numbers is | |
% grouped into contiguous ranges. | |
% The final index is then sorted by word. | |
% I decided to utilise a few functions from the Erlang std lib, | |
% e.g. lists:sort, string:to_lower, string:tokens | |
% given that we have implemented our own sort and similar functions in previous exercises, | |
% and using these % allowed more time to focus on other aspects of the challenge. | |
build_index([], _LineNum, Index) -> | |
GroupedIndex = group_values_by_range(maps:to_list(Index)), | |
lists:sort(fun(A, B) -> A =< B end, GroupedIndex); | |
build_index([H|T], LineNum, Index) -> | |
LineWords = nub(tokenize(string:to_lower(H))), | |
FilteredWords = filter_short_words(LineWords), | |
NewIndex = index_words(FilteredWords, LineNum, Index), | |
build_index(T, LineNum + 1, NewIndex). | |
% Indexes words from a single line, adding an entry to Index for LineNum. | |
index_words([], _LineNum, Index) -> | |
Index; | |
index_words([H|T], LineNum, Index) -> | |
Lines = maps:get(H, Index, []), | |
NewIndex = maps:put(H, [LineNum|Lines], Index), | |
index_words(T, LineNum, NewIndex). | |
% Splits a line into individual words. | |
% Std lib function used. | |
tokenize(Line) -> | |
string:tokens(Line, " ,.;?!`[]()\\\""). | |
% Removes short words from being indexed. | |
% Could be extended to filter common words such as "and". | |
filter_short_words(Words) when is_list(Words) -> | |
filter_short_words(Words, []). | |
filter_short_words([], Acc) -> | |
Acc; | |
filter_short_words([H|T], Acc) -> | |
case length(H) < 3 of | |
true -> | |
filter_short_words(T, Acc); | |
false -> | |
filter_short_words(T, [H|Acc]) | |
end. | |
% Groups the values by range for an index of Key,Value pairs. | |
% The values are assumed to be a list of numbers which are | |
% group into contiguous ranges. | |
group_values_by_range(Pairs) when is_list(Pairs) -> | |
group_values_by_range(Pairs, []). | |
group_values_by_range([{Key, Value}|T], Acc) -> | |
group_values_by_range(T, [{Key, group_by_range(lists:sort(Value))}|Acc]); | |
group_values_by_range([], Acc) -> | |
Acc. | |
% Groups a list of numbers into contiguous ranges. | |
group_by_range([]) -> | |
[]; | |
group_by_range([H|T]) -> | |
group_by_range(T, {H,H}, []). | |
group_by_range([], Range, Acc) -> | |
lists:reverse([Range|Acc]); | |
group_by_range([H|T], {S,E}, Acc) when E+1 == H -> | |
group_by_range(T, {S,H}, Acc); | |
group_by_range([H|T], Range={_S,_E}, Acc) -> | |
group_by_range(T, {H,H},[Range|Acc]). | |
% Returns a list with duplicate elements removed. | |
nub(List) -> | |
lists:reverse(nub(List, [])). | |
nub([], Acc) -> | |
Acc; | |
nub([H|T], Acc) -> | |
NextAcc = case lists:member(H, Acc) of | |
true -> Acc; | |
false -> [H | Acc] | |
end, | |
nub(T, NextAcc). |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment