Created
March 5, 2017 16:24
-
-
Save ygrenzinger/7e70299299b60a7ccb4d52a477e65b40 to your computer and use it in GitHub Desktop.
Future Learn Erlang Week 2 exercise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(week2). | |
-export([get_file_contents/1,show_file_contents/1, processText/1]). | |
-include_lib("eunit/include/eunit.hrl"). | |
% Tells whether the character code X is an unaccented Latin letter,
% i.e. lies in the range A-Z or a-z.
isAlphabetical(X) when $A =< X, X =< $Z -> true;
isAlphabetical(X) when $a =< X, X =< $z -> true;
isAlphabetical(_) -> false.
% Checks isAlphabetical/1 on an upper-case letter, a lower-case letter and
% two punctuation characters.
isAlphabetical_test() ->
    Expectations = [{$G, true}, {$g, true}, {$., false}, {$,, false}],
    lists:foreach(
        fun({Char, Expected}) ->
            ?assertEqual(Expected, isAlphabetical(Char))
        end,
        Expectations
    ).
% Tells whether the character code C is the space character (code 32).
isBlank(C) when C == $\s -> true;
isBlank(_) -> false.
% Checks isBlank/1 on a letter, a punctuation character and a space.
isBlank_test() ->
    Expectations = [{$I, false}, {$., false}, {$\ , true}],
    lists:foreach(
        fun({Char, Expected}) ->
            ?assertEqual(Expected, isBlank(Char))
        end,
        Expectations
    ).
% Tells whether the character code C is the newline character.
isLineReturn(C) when C == $\n -> true;
isLineReturn(_) -> false.
% Checks isLineReturn/1 on a letter, a punctuation character and a newline.
isLineReturn_test() ->
    Expectations = [{$I, false}, {$., false}, {$\n, true}],
    lists:foreach(
        fun({Char, Expected}) ->
            ?assertEqual(Expected, isLineReturn(Char))
        end,
        Expectations
    ).
% Lower-cases a single character code: codes in the range A-Z are shifted
% by 32 (the distance between 'A' and 'a'); anything else is returned as is.
nocap(C) when $A =< C, C =< $Z -> C + 32;
nocap(C) -> C.
% Lower-cases every character of Text by applying nocap/1 to each of them.
removeCapFromText(Text) ->
    [nocap(Char) || Char <- Text].
% Checks that removeCapFromText/1 lower-cases letters and leaves the
% apostrophe and the spaces alone.
removeCapFromText_test() ->
    Expected = "madam i'm adam",
    Actual = removeCapFromText("Madam I'm Adam"),
    ?assertEqual(Expected, Actual).
% Decides what is kept of a character code in cleaned text: letters, blanks
% and line returns are returned unchanged; every other character
% (punctuation, digits, ...) is replaced by a blank (code 32).
replaceSpecialCharByBlank($\n) ->
    % Line returns are preserved, as replaceSpecialCharByBlank_test/0
    % requires; previously they were turned into blanks like punctuation.
    $\n;
replaceSpecialCharByBlank(C) ->
    case isBlank(C) orelse isAlphabetical(C) of
        true -> C;
        false -> $\s
    end.
% Checks replaceSpecialCharByBlank/1: punctuation becomes a blank, while a
% blank, a line return and a letter are expected to come back unchanged.
% NOTE(review): the line-return expectation only holds if
% replaceSpecialCharByBlank/1 preserves $\n - confirm against the function.
replaceSpecialCharByBlank_test() ->
    Expectations = [{$., 32}, {$\ , 32}, {$\n, $\n}, {$i, $i}],
    lists:foreach(
        fun({Char, Expected}) ->
            ?assertEqual(Expected, replaceSpecialCharByBlank(Char))
        end,
        Expectations
    ).
% Normalises Text character by character: each character first goes through
% replaceSpecialCharByBlank/1, then the result is lower-cased with nocap/1.
% The output has exactly as many characters as the input.
cleanText(Text) ->
    [nocap(replaceSpecialCharByBlank(Char)) || Char <- Text].
% Checks cleanText/1 on a sentence mixing capitals and punctuation.
% cleanText/1 works character by character, so every punctuation mark leaves
% its own blank behind: ", " becomes two blanks and "! %" becomes three.
cleanText_test() ->
    ?assertEqual(
        "hello  i m discovering erlang   hope it will be fine ",
        cleanText("Hello, I'm discovering Erlang! %Hope it will be fine.")
    ).
% Splits Line into its words, using the space character as the only
% separator. Runs of spaces yield no empty words.
splitLineByBlank(Line) ->
    Separators = [$\s],
    string:tokens(Line, Separators).
% Checks that splitLineByBlank/1 cuts a sentence on its spaces.
splitLineByBlank_test() ->
    Expected = ["Hello", "Madam", "Adam"],
    Actual = splitLineByBlank("Hello Madam Adam"),
    ?assertEqual(Expected, Actual).
% 25 common stopwords: http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html
% stopwords() -> ["a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", "to", "was", "were", "will", "with"].
% In practice there are many more stopwords: http://www.ranks.nl/stopwords
% To keep things simple, words that are too short are dropped instead.
%
% Keeps only the words of Words that are longer than 3 characters.
keepUseful(Words) ->
    [Word || Word <- Words, length(Word) > 3].
% The decomposing function: takes the lines of a text and, for each line,
% cleans it (cleanText/1), splits it on blanks (splitLineByBlank/1) and keeps
% the useful words (keepUseful/1). The result is a list of lists of words,
% one inner list per input line.
transformIntoLinesOfUsefulWord(Lines) ->
    [keepUseful(splitLineByBlank(cleanText(Line))) || Line <- Lines].
% Checks the whole per-line pipeline on three short sentences: punctuation
% and capitals disappear and words of 3 characters or fewer are dropped.
transformIntoLinesOfUsefulWord_test() ->
    Input = [
        "Hello!",
        "I'm trying to get through Erlang.",
        "Hope the solution is correct."
    ],
    Expected = [
        ["hello"],
        ["trying", "through", "erlang"],
        ["hope", "solution", "correct"]
    ],
    ?assertEqual(Expected, transformIntoLinesOfUsefulWord(Input)).
% Records in the map WordsByLines that Word appears on line Pos.
% A word seen for the first time is stored with [Pos]; otherwise Pos is
% appended to the word's existing list of line numbers.
% Returns the updated map.
addLinePosition(Pos, Word, WordsByLines) ->
    AppendOnce = fun(Lines) ->
        % A word occurring several times on the same line must list that
        % line only once, otherwise duplicate line numbers reach the index.
        case lists:member(Pos, Lines) of
            true -> Lines;
            false -> Lines ++ [Pos]
        end
    end,
    maps:update_with(Word, AppendOnce, [Pos], WordsByLines).
% Takes the lines of words, the number of the first of those lines and the
% map of words grouped by lines built so far.
% Every word of every line is registered with addLinePosition/3, the line
% number growing by one from one line to the next. Returns the final map.
wordsByLines(LinesOfWords, Pos, WordsByLines) ->
    IndexLine = fun(Words, {LineNumber, Index}) ->
        AddWord = fun(Word, Acc) -> addLinePosition(LineNumber, Word, Acc) end,
        {LineNumber + 1, lists:foldl(AddWord, Index, Words)}
    end,
    {_NextLineNumber, Result} = lists:foldl(IndexLine, {Pos, WordsByLines}, LinesOfWords),
    Result.
% Builds the word index of a list of lines of words: a list of
% {Word, LineNumbers} tuples sorted by word, lines being numbered from 1.
% The sort is explicit: maps:to_list/1 returns its pairs in arbitrary order
% (small maps merely happen to come out sorted), so it cannot be relied on.
wordsByLines(LinesOfWord) ->
    lists:keysort(1, maps:to_list(wordsByLines(LinesOfWord, 1, #{}))).
% Checks the index built from two lines sharing one word: each word is
% associated with the numbers of the lines it appears on.
wordsByLines_test() ->
    LinesOfWords = [["hello", "yannick"], ["thanks", "yannick"]],
    Expected = [{"hello", [1]}, {"thanks", [2]}, {"yannick", [1, 2]}],
    ?assertEqual(Expected, wordsByLines(LinesOfWords)).
% Last ingredient to complete the work: turn a list of line numbers into a
% list of ranges of consecutive lines.
%
% toRangeOfLinesT(Lines, Ranges) extends Ranges, a list of {First, Last}
% tuples in ascending order, with the line numbers of Lines:
%   - a line equal to Last + 1 of the final range stretches that range;
%   - a line equal to Last of the final range (a repeated line number) is
%     already covered and is ignored;
%   - any other line opens a new {Line, Line} range.
% Returns the extended list of ranges, still in ascending order.
toRangeOfLinesT(Lines, Ranges) ->
    % The ranges are kept newest-first while folding so the range being
    % extended is always the head of the list; this avoids walking the whole
    % list (lists:last/droplast/++) for every line number.
    Newest = lists:foldl(fun addLineToRanges/2, lists:reverse(Ranges), Lines),
    lists:reverse(Newest).

% Adds one line number to a newest-first list of ranges.
addLineToRanges(Line, [{First, Last} | Older]) when Line =:= Last + 1 ->
    [{First, Line} | Older];
addLineToRanges(Line, [{_First, Line} | _Older] = Ranges) ->
    Ranges;
addLineToRanges(Line, Ranges) ->
    [{Line, Line} | Ranges].
% Turns a non-empty list of line numbers into a list of {First, Last} ranges.
% The first line number seeds the initial range; toRangeOfLinesT/2 handles
% the remaining ones.
toRangeOfLines([First | Rest]) ->
    Seed = {First, First},
    toRangeOfLinesT(Rest, [Seed]).
% Checks toRangeOfLines/1 on a single line, on two consecutive lines and on
% a mix of consecutive runs and an isolated line.
toRangeOfLines_test() ->
    Expectations = [
        {[3], [{3, 3}]},
        {[3, 4], [{3, 4}]},
        {[3, 4, 5, 7, 11, 12, 13], [{3, 5}, {7, 7}, {11, 13}]}
    ],
    lists:foreach(
        fun({Lines, Expected}) ->
            ?assertEqual(Expected, toRangeOfLines(Lines))
        end,
        Expectations
    ).
% Converts one index entry {Word, LineNumbers} into {Word, Ranges}, the line
% numbers being grouped into ranges by toRangeOfLines/1.
toWordWithRangeOfLines({Word, LineNumbers}) ->
    Ranges = toRangeOfLines(LineNumbers),
    {Word, Ranges}.
% Used to read a file into a list of lines.
% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt (long)
%
% Opens the file called Name for reading and returns its lines in file
% order, each line as produced by get_all_lines/2.
% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents.
% Not exported.
%
% Reads File line by line until end of file, pushing each line, without its
% trailing newline, in front of Partial. The file is closed once eof is
% reached. Returns the accumulated lines, last line read first.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

% Removes the trailing newline of Line when there is one. The last line of a
% file may come without a newline, in which case no character may be dropped.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | Reversed] -> lists:reverse(Reversed);
        _ -> Line
    end.
% Shows the contents of a list of strings, one string per output line.
% Can be used to check the result of calling get_file_contents.
% Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
% Main function!
% Reads the file FileName and transforms its text into a list of
% {Word, Ranges} tuples: each useful word with the ranges of lines it
% appears on.
processText(FileName) ->
    LinesOfWords = transformIntoLinesOfUsefulWord(get_file_contents(FileName)),
    Index = wordsByLines(LinesOfWords),
    [toWordWithRangeOfLines(Entry) || Entry <- Index].
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment