Last active
May 22, 2020 19:42
-
-
Save jameswalton/86926d7c4eda9e4e5a184dab56fcc71d to your computer and use it in GitHub Desktop.
Indexing a file - Functional Programming in Erlang assignment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Four score and seven years ago our fathers brought | |
forth on this continent, a new nation, conceived in Liberty, | |
and dedicated to the proposition that all men are created equal. | |
Now we are engaged in a great civil war, testing whether | |
that nation, or any nation so conceived and dedicated, | |
can long endure. We are met on a great battle-field of that war. | |
We have come to dedicate a portion of that field, as a final | |
resting place for those who here gave their lives that | |
that nation might live. It is altogether fitting and proper | |
that we should do this. | |
But, in a larger sense, we can not dedicate -- | |
we can not consecrate -- we can not hallow -- this ground. | |
The brave men, living and dead, who struggled here, have | |
\consecrated it, far above our poor power to add or detract. | |
The world will little note, nor long remember what we say here, | |
but it can never forget what they did here. It is for us the | |
living, rather, to be dedicated here to the unfinished work | |
which they who fought here have thus far so nobly advanced. | |
It is rather for us to be here dedicated to the great task | |
remaining before us -- that from these honored dead we take | |
increased devotion to that cause for which they gave the | |
last full measure of devotion -- that we here highly | |
resolve that these dead shall not have died in vain -- | |
that this nation, under God, shall have a new birth | |
of freedom -- and that government of the people, | |
by the people, for the people, shall not perish from the earth. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-export([get_file_contents/1, show_file_contents/1, index/1, test_all/0]). | |
%% Helper functions which can be exported for testing individually | |
-export([add_line_numbers/1, index_line/1, index_words/2, aggregate_words/1, remove_word/2, find_occurrences/2, partition_ranges/1]). | |
-define(MinWordLength, 3). | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Get the contents of a text file into a list of lines. | |
% Each line has its trailing newline removed. | |
get_file_contents(Name) -> | |
{ok, File} = file:open(Name, [read]), | |
Rev = get_all_lines(File, []), | |
lists:reverse(Rev). | |
% Auxiliary function for get_file_contents. | |
% Not exported. | |
get_all_lines(File, Partial) -> | |
case io:get_line(File, "") of | |
eof -> file:close(File), | |
Partial; | |
Line -> {Strip, _} = lists:split(length(Line) - 1, Line), | |
get_all_lines(File, [Strip | Partial]) | |
end. | |
% Show the contents of a list of strings. | |
% Can be used to check the results of calling get_file_contents. | |
show_file_contents([L | Ls]) -> | |
io:format("~s~n", [L]), | |
show_file_contents(Ls); | |
show_file_contents([]) -> | |
ok. | |
%% Main function to perform the indexing of a file given a FileName. | |
index(FileName) -> | |
aggregate_words(lists:flatten(index_line(add_line_numbers(get_file_contents(FileName))))). | |
%% Add line number to each line | |
%% [ {1, ["List", "of", "Words"]} ] | |
add_line_numbers(L) -> add_line_numbers(L, 1). | |
add_line_numbers([], _) -> []; | |
add_line_numbers([H|T], N) -> | |
[ {N, string:split(H, " ", all)} | add_line_numbers(T, N + 1) ]. | |
%% Higher level function to index the words | |
index_line([]) -> | |
[]; | |
index_line([{LineNumber, Line}|T]) -> | |
[ index_words(LineNumber, Line) | index_line(T) ]. | |
%% Build a list of { Word, LineNumber } tuples of words with a minimum length of MinWordLength | |
%% This function also normalizes (lowercases) the word and cleans any unwanted symbols | |
index_words(_, []) -> []; | |
index_words(LineNumber, [Word|Words]) when length(Word) >= ?MinWordLength -> | |
Normalized = string:to_lower(Word), | |
Cleaned = string:trim(Normalized, both, ".'\\`,;:!"), | |
[ {Cleaned, LineNumber} | index_words(LineNumber, Words)]; | |
index_words(LineNumber, [_Word|Words]) -> index_words(LineNumber, Words). | |
%% Find all occurrences | |
%% Create list of [1, 1, 2, 4, 6] | |
%% Partition list of lines into [{1, 1}, {3, 5}] | |
aggregate_words(L) -> aggregate_words(L, {}). | |
aggregate_words([], _Index) -> []; | |
aggregate_words([{Word, _LineNumber}|Words] = List, _Index) -> | |
[ {Word, partition_ranges(find_occurrences(Word, List))} | aggregate_words(remove_word(Word, Words)) ]. | |
%% Select all occurrences of a given word | |
find_occurrences(_Word, []) -> []; | |
find_occurrences(Word, [{Word, LineNumber} | Words]) -> | |
[ LineNumber | find_occurrences(Word, Words) ]; | |
find_occurrences(Word, [_|Words]) -> | |
find_occurrences(Word, Words). | |
%% Remove all occurrences of a given word | |
remove_word(_Word, []) -> []; | |
remove_word(Word, [{Word, _}|Words]) -> remove_word(Word, Words); | |
remove_word(Word, [{_NotWord, _} = Keep |T]) -> [ Keep | remove_word(Word, T) ]. | |
%% Higher level function to partition a list of lines into a list of tuple ranges | |
partition_ranges([]) -> []; | |
partition_ranges([X|Lines]) -> partition_ranges(Lines, {X, X}, []). | |
%% Base case, we've run out of lines so the last range is complete so | |
%% add it to the list of ranges | |
partition_ranges([], Range, Ranges) -> Ranges ++ [Range]; | |
%% With the tuple {1, 2} and the next line is 2 iterate again | |
partition_ranges([Y|Lines], {X, Y}, Ranges) -> | |
partition_ranges(Lines, {X, Y}, Ranges); | |
%% With the tuple {1, 2} and the next line is 3 (2 + 1) then set the | |
%% second element to of the tuple to 3 | |
partition_ranges([Z|Lines], {X, Y}, Ranges) when Y + 1 =:= Z -> | |
partition_ranges(Lines, {X, Z}, Ranges); | |
%% With the tuple {1, 2} and the next line is 4 then the range is | |
%% broken and we add the previous range to the list and initially | |
%% set the new range tuple to {4, 4} | |
partition_ranges([Z|Lines], {_X, _Y} = Range, Ranges) -> | |
partition_ranges(Lines, {Z, Z}, Ranges ++ [Range]). | |
%% Tests | |
test_all() -> | |
test_add_line_numbers(), | |
test_index_line(), | |
test_partition_ranges(), | |
test_index(), | |
ok. | |
test_add_line_numbers() -> | |
[] = add_line_numbers([]), | |
WithNumbers = | |
[{1, ["Four", "score", "and"]}, | |
{2, ["forth", "on", "this", "continent,"]}, | |
{3, ["and", "dedicated", "to."]}], | |
Input = ["Four score and", "forth on this continent,", "and dedicated to."], | |
WithNumbers = add_line_numbers(Input). | |
test_index_line() -> | |
[] = index_line([]), | |
NumberedWords = [[{"four",1},{"score",1},{"and",1}], | |
[{"forth",2},{"this",2},{"continent",2}], | |
[{"and",3},{"dedicated",3},{"to",3}]], | |
Input = [{1, ["Four", "score", "and"]}, | |
{2, ["forth", "on", "this", "continent,"]}, | |
{3, ["and", "dedicated", "to."]}], | |
NumberedWords = index_line(Input). | |
test_partition_ranges() -> | |
[] = partition_ranges([]), | |
[{3, 3}] = partition_ranges([3]), | |
[{1, 1}, {3, 6}, {9, 9}] = partition_ranges([1, 3, 4, 5, 6, 9, 9, 9]). | |
test_index() -> | |
[{"four",[{1,1}]}, | |
{"score",[{1,1}]}, | |
{"and",[{1,1},{3,3},{6,6},{10,10},{15,15},{27,27}]}, | |
{"seven",[{1,1}]}, | |
{"years",[{1,1}]}, | |
{"ago",[{1,1}]}, | |
{"our",[{1,1},{16,16}]}, | |
{"fathers",[{1,1}]}, | |
{"brought",[{1,1}]}, | |
{"forth",[{2,2}]}, | |
{"this",[{2,2},{11,11},{14,14},{26,26}]}, | |
{"continent",[{2,2}]}, | |
{"new",[{2,2},{26,26}]}, | |
{"nation",[{2,2},{6,6},{10,10},{26,26}]}, | |
{"conceived",[{2,2},{6,6}]}, | |
{"liberty",[{2,2}]}, | |
{"dedicated",[{3,3},{6,6},{19,19},{21,21}]}, | |
{"the",[{3,3},{15,15},{17,19},{21,21},{23,23},{27,28}]}, | |
{"proposition",[{3,3}]}, | |
{"that",[{3,3},{6,11},{22,27}]}, | |
{"all",[{3,3}]}, | |
{"men",[{3,3},{15,15}]}, | |
{"are",[{3,3},{5,5},{7,7}]}, | |
{"created",[{3,3}]}, | |
{"equal",[{3,3}]}, | |
{"now",[{5,5}]}, | |
{"engaged",[{5,5}]}, | |
{"great",[{5,5},{7,7},{21,21}]}, | |
{"civil",[{5,5}]}, | |
{"war",[{5,5},{7,7}]}, | |
{"testing",[{5,5}]}, | |
{"whether",[{5,5}]}, | |
{"any",[{6,6}]}, | |
{"can",[{7,7},{13,14},{18,18}]}, | |
{"long",[{7,7},{17,17}]}, | |
{"endure",[{7,7}]}, | |
{"met",[{7,7}]}, | |
{"battle-field",[{7,7}]}, | |
{"have",[{8,8},{15,15},{20,20},{25,26}]}, | |
{"come",[{8,8}]}, | |
{"dedicate",[{8,8},{13,13}]}, | |
{"portion",[{8,8}]}, | |
{"field",[{8,8}]}, | |
{"final",[{8,8}]}, | |
{"resting",[{9,9}]}, | |
{"place",[{9,9}]}, | |
{"for",[{9,9},{18,18},{21,21},{23,23},{28,28}]}, | |
{"those",[{9,9}]}, | |
{"who",[{9,9},{15,15},{20,20}]}, | |
{"here",[{9,9},{15,15},{17,21},{24,24}]}, | |
{"gave",[{9,9},{23,23}]}, | |
{"their",[{9,9}]}, | |
{"lives",[{9,9}]}, | |
{"might",[{10,10}]}, | |
{"live",[{10,10}]}, | |
{"altogether",[{10,10}]}, | |
{"fitting",[{10,10}]}, | |
{"proper",[{10,10}]}, | |
{"should",[{11,11}]}, | |
{"but",[{13,13},{18,18}]}, | |
{"larger",[{13,13}]}, | |
{"sense",[{13,13}]}, | |
{"not",[{13,14},{25,25},{28,28}]}, | |
{"consecrate",[{14,14}]}, | |
{"hallow",[{14,14}]}, | |
{"ground",[{14,14}]}, | |
{"brave",[{15,15}]}, | |
{"living",[{15,15},{19,19}]}, | |
{"dead",[{15,15},{22,22},{25,25}]}, | |
{"struggled",[{15,15}]}, | |
{"consecrated",[{16,16}]}, | |
{"it",[{16,16}]}, | |
{"far",[{16,16},{20,20}]}, | |
{"above",[{16,16}]}, | |
{"poor",[{16,16}]}, | |
{"power",[{16,16}]}, | |
{"add",[{16,16}]}, | |
{"detract",[{16,16}]}, | |
{"world",[{17,17}]}, | |
{"will",[{17,17}]}, | |
{"little",[{17,17}]}, | |
{"note",[{17,17}]}, | |
{"nor",[{17,17}]}, | |
{"remember",[{17,17}]}, | |
{"what",[{17,18}]}, | |
{"say",[{17,17}]}, | |
{"never",[{18,18}]}, | |
{"forget",[{18,18}]}, | |
{"they",[{18,18},{20,20},{23,23}]}, | |
{"did",[{18,18}]}, | |
{"rather",[{19,19},{21,21}]}, | |
{"unfinished",[{19,19}]}, | |
{"work",[{19,19}]}, | |
{"which",[{20,20},{23,23}]}, | |
{"fought",[{20,20}]}, | |
{"thus",[{20,20}]}, | |
{"nobly",[{20,20}]}, | |
{"advanced",[{20,20}]}, | |
{"task",[{21,21}]}, | |
{"remaining",[{22,22}]}, | |
{"before",[{22,22}]}, | |
{"from",[{22,22},{28,28}]}, | |
{"these",[{22,22},{25,25}]}, | |
{"honored",[{22,22}]}, | |
{"take",[{22,22}]}, | |
{"increased",[{23,23}]}, | |
{"devotion",[{23,24}]}, | |
{"cause",[{23,23}]}, | |
{"last",[{24,24}]}, | |
{"full",[{24,24}]}, | |
{"measure",[{24,24}]}, | |
{"highly",[{24,24}]}, | |
{"resolve",[{25,25}]}, | |
{"shall",[{25,26},{28,28}]}, | |
{"died",[{25,25}]}, | |
{"vain",[{25,25}]}, | |
{"under",[{26,26}]}, | |
{"god",[{26,26}]}, | |
{"birth",[{26,26}]}, | |
{"freedom",[{27,27}]}, | |
{"government",[{27,27}]}, | |
{"people",[{27,28}]}, | |
{"perish",[{28,28}]}, | |
{"earth",[{28,28}]}] = index("gettysburg-address.txt"). |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The code implements the basic indexing as suggested in the assignment, along with cleaning and normalising the words. | |
Here is how I approached the assignment and the individual functions that make up the overall indexer. | |
The indexer can be run with index:index(Filename) and the tests can be run with index:test_all/0. | |
Given the input: | |
["Four score and seven years ago our fathers brought", | |
"forth on this continent, a new nation, conceived in Liberty,", | |
"and dedicated to the proposition that all men are created equal."] | |
Give lines a number: | |
[{1,"Four score and seven years ago our fathers brought"}, | |
{2,"forth on this continent, a new nation, conceived in Liberty,"}, | |
{3,"and dedicated to the proposition that all men are created equal."}] | |
Split the line in the same function using string:split based on spaces | |
[{1,["Four", "score", "and", "seven", "years", "ago", "our", "fathers", "brought"]}] | |
Index each word in the line. Create a tuple for each word in the line in the form {Word, Line}. | |
This is where I decided to do the cleaning and normalising since I am already accessing each word here. | |
[{"four",1},{"score",1},{"forth",2},{"continent",2}] | |
Reduce the {Word,Number} entries by creating {Word, LineNumber} for each line the word occurs on. | |
Take the first entry then look in the remaining list for occurrences of the word. | |
Add these occurrences to the word entry as a list of lines that the word appears on and remove further occurrences | |
of the word from the remaining list. | |
Create ranges in the form [{X,X}, {X,Y}] to denote the inclusive start and end lines of occurrences | |
Note on performance | |
The word list is traversed several times but the same traversal is repeated to both find occurrences of a given word | |
and remove occurrences of a given word. Having studied the `lists` module docs I could have used the lists:partition | |
function to achieve this, no doubt more efficiently. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment