Skip to content

Instantly share code, notes, and snippets.

@ygrenzinger
Created March 5, 2017 16:24
Show Gist options
  • Save ygrenzinger/3b0124b2db1e8919b508d1f9f3752297 to your computer and use it in GitHub Desktop.
Save ygrenzinger/3b0124b2db1e8919b508d1f9f3752297 to your computer and use it in GitHub Desktop.
FutureLearn Erlang Week 2 exercise
-module(week2).
-export([get_file_contents/1,show_file_contents/1, processText/1]).
-include_lib("eunit/include/eunit.hrl").
% Tells whether X is an ASCII letter.
% Returns true when X lies in the range A-Z or in the range a-z, false otherwise.
isAlphabetical(X) when $A =< X, X =< $Z -> true;
isAlphabetical(X) when $a =< X, X =< $z -> true;
isAlphabetical(_) -> false.
% EUnit test: letters of either case are alphabetical, punctuation is not.
isAlphabetical_test() ->
    ?assert(isAlphabetical($G)),
    ?assert(isAlphabetical($g)),
    ?assertNot(isAlphabetical($.)),
    ?assertNot(isAlphabetical($,)).
% Tells whether C is the space character (character code 32).
isBlank(C) when C == $\s -> true;
isBlank(_) -> false.
% EUnit test: only the space character counts as a blank.
isBlank_test() ->
    ?assertNot(isBlank($I)),
    ?assertNot(isBlank($.)),
    ?assert(isBlank($\s)).
% Tells whether C is the newline character.
isLineReturn(C) when C == $\n -> true;
isLineReturn(_) -> false.
% EUnit test: only the newline character counts as a line return.
isLineReturn_test() ->
    ?assertNot(isLineReturn($I)),
    ?assertNot(isLineReturn($.)),
    ?assert(isLineReturn($\n)).
% Lower-cases one character: a value in the range A-Z is shifted by the
% distance between $A and $a (32); anything else is returned unchanged.
nocap(C) when $A =< C, C =< $Z -> C + ($a - $A);
nocap(C) -> C.
% Lower-cases a whole text by applying nocap/1 to each of its characters.
removeCapFromText(Text) ->
    lists:map(fun nocap/1, Text).
% EUnit test: capitals are lowered, every other character is left alone.
removeCapFromText_test() ->
    Input = "Madam I'm Adam",
    Expected = "madam i'm adam",
    ?assertEqual(Expected, removeCapFromText(Input)).
% Replaces a special character by a blank.
% Blanks, newlines and ASCII letters are returned unchanged; every other
% character becomes a space (code 32).
% Newlines are kept because replaceSpecialCharByBlank_test/0 expects $\n to
% pass through untouched; previously they were turned into a blank.
replaceSpecialCharByBlank(C) ->
    case isBlank(C) orelse isLineReturn(C) orelse isAlphabetical(C) of
        true -> C;
        false -> $\s
    end.
% EUnit test: punctuation becomes a blank, while blanks, newlines and letters
% are expected to come back unchanged.
replaceSpecialCharByBlank_test() ->
    Blank = $\s,
    ?assertEqual(Blank, replaceSpecialCharByBlank($.)),
    ?assertEqual(Blank, replaceSpecialCharByBlank(Blank)),
    ?assertEqual($\n, replaceSpecialCharByBlank($\n)),
    ?assertEqual($i, replaceSpecialCharByBlank($i)).
% Normalises a text character by character: each character first goes through
% replaceSpecialCharByBlank/1 and the result is then lower-cased with nocap/1.
% The output has exactly as many characters as the input.
cleanText(Text) ->
    Normalise = fun(Char) ->
        Blanked = replaceSpecialCharByBlank(Char),
        nocap(Blanked)
    end,
    lists:map(Normalise, Text).
% EUnit test for cleanText/1.
% cleanText/1 maps characters one-to-one, so the expected text must be as long
% as the input: ", " becomes two blanks and "! %" becomes three blanks.
cleanText_test() ->
    Input = "Hello, I'm discovering Erlang! %Hope it will be fine.",
    Expected = "hello  i m discovering erlang   hope it will be fine ",
    ?assertEqual(length(Input), length(Expected)),
    ?assertEqual(Expected, cleanText(Input)).
% Splits a line into its words, using the space character as the only
% separator. Consecutive spaces do not produce empty words.
splitLineByBlank(Line) ->
    Separators = [$\s],
    string:tokens(Line, Separators).
% EUnit test: a line is cut into its space-separated words.
splitLineByBlank_test() ->
    Words = splitLineByBlank("Hello Madam Adam"),
    ?assertEqual(["Hello", "Madam", "Adam"], Words).
% 25 common stopwords : http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html
% stopwords() -> ["a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", "to", "was", "were", "will", "with"].
% There are normally many more stop words, see http://www.ranks.nl/stopwords
% To keep things simple, words that are too short are dropped instead.
% Keeps only the words longer than 3 characters, in their original order.
keepUseful(Words) ->
    IsLongEnough = fun(Word) -> length(Word) > 3 end,
    lists:filter(IsLongEnough, Words).
% The decomposing function: takes the lines of a text, cleans each of them and
% builds a list of lists of words (one list of useful words per line).
transformIntoLinesOfUsefulWord(Lines) ->
    ToUsefulWords = fun(Line) ->
        Cleaned = cleanText(Line),
        keepUseful(splitLineByBlank(Cleaned))
    end,
    lists:map(ToUsefulWords, Lines).
% EUnit test: every line is reduced to its lower-cased words of more than
% 3 characters.
transformIntoLinesOfUsefulWord_test() ->
    Lines = [
        "Hello!",
        "I'm trying to get through Erlang.",
        "Hope the solution is correct."
    ],
    Expected = [
        ["hello"],
        ["trying", "through", "erlang"],
        ["hope", "solution", "correct"]
    ],
    ?assertEqual(Expected, transformIntoLinesOfUsefulWord(Lines)).
% Records that Word appears on line Pos.
% WordsByLines is a map from word to the list of line numbers seen so far.
% A new word starts with [Pos]; for a known word Pos is appended, unless it is
% already present. Without that check a word occurring twice on the same line
% would list the line twice and later yield duplicate ranges.
% Returns the updated map.
addLinePosition(Pos, Word, WordsByLines) ->
    UpdatingFn = fun(Lines) ->
        case lists:member(Pos, Lines) of
            true -> Lines;
            false -> Lines ++ [Pos]
        end
    end,
    maps:update_with(Word, UpdatingFn, [Pos], WordsByLines).
% Takes as input the lines of words, the number of the first of those lines
% and the map of words grouped by lines built so far.
% Every word of every line is recorded with addLinePosition/3; the line number
% grows by one from one line to the next. Returns the resulting map.
wordsByLines(LinesOfWords, Pos, WordsByLines) ->
    IndexLine = fun(Words, {LineNo, Index}) ->
        NewIndex = lists:foldl(
            fun(Word, Acc) -> addLinePosition(LineNo, Word, Acc) end,
            Index,
            Words
        ),
        {LineNo + 1, NewIndex}
    end,
    {_NextLineNo, Result} = lists:foldl(IndexLine, {Pos, WordsByLines}, LinesOfWords),
    Result.
% Builds the list of {Word, LineNumbers} pairs for the given lines of words,
% numbering the lines from 1.
% The order of maps:to_list/1 is not guaranteed by the maps documentation, so
% the pairs are sorted explicitly by word instead of relying on the map.
wordsByLines(LinesOfWord) ->
    Index = wordsByLines(LinesOfWord, 1, #{}),
    lists:keysort(1, maps:to_list(Index)).
% EUnit test: words are listed in order with the lines they appear on.
wordsByLines_test() ->
    LinesOfWords = [["hello", "yannick"], ["thanks", "yannick"]],
    Expected = [{"hello", [1]}, {"thanks", [2]}, {"yannick", [1, 2]}],
    ?assertEqual(Expected, wordsByLines(LinesOfWords)).
% Last ingredient to complete the work: transform an ascending list of line
% numbers into a list of {First, Last} ranges of consecutive lines,
% e.g. [3,4,5,7] becomes [{3,5},{7,7}].

% Folds the remaining line numbers into Ranges (the ranges found so far, in
% ascending order) and returns the completed list of ranges.
toRangeOfLinesT(Lines, Ranges) ->
    % The ranges are handled in reverse so that the range being extended sits
    % at the head of the list instead of being looked up with lists:last/1.
    Reversed = lists:foldl(fun extendRanges/2, lists:reverse(Ranges), Lines),
    lists:reverse(Reversed).

% Adds line X to the most recent range (head of the reversed list).
% A line following the current range extends it.
extendRanges(X, [{A, B} | Older]) when X == B + 1 ->
    [{A, X} | Older];
% A line equal to the end of the current range is a repeat and adds nothing.
extendRanges(X, [{_, B} | _] = Ranges) when X == B ->
    Ranges;
% Any other line opens a new single-line range.
extendRanges(X, Ranges) ->
    [{X, X} | Ranges].

% Entry point: an empty list of lines has no range, otherwise the first line
% opens the first range.
toRangeOfLines([]) ->
    [];
toRangeOfLines([X | Xs]) ->
    toRangeOfLinesT(Xs, [{X, X}]).
% EUnit test: consecutive line numbers are merged into {First, Last} ranges.
toRangeOfLines_test() ->
    Cases = [
        {[{3, 3}], [3]},
        {[{3, 4}], [3, 4]},
        {[{3, 5}, {7, 7}, {11, 13}], [3, 4, 5, 7, 11, 12, 13]}
    ],
    lists:foreach(
        fun({Expected, Lines}) -> ?assertEqual(Expected, toRangeOfLines(Lines)) end,
        Cases
    ).
% Converts a {Word, LineNumbers} pair into {Word, Ranges}, where Ranges is the
% result of toRangeOfLines/1 applied to the line numbers.
toWordWithRangeOfLines({Word, Lines}) ->
    Ranges = toRangeOfLines(Lines),
    {Word, Ranges}.
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Opens the file called Name for reading (crashing with a badmatch if it
% cannot be opened) and returns its lines in file order, as collected by
% get_all_lines/2.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    % get_all_lines/2 accumulates the lines last-first, hence the reverse.
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents.
% Not exported.
% Reads File line by line until eof, then closes it. Each line is pushed onto
% Partial, so the result lists the lines last-first.
% Only a trailing newline is removed: a final line that does not end with a
% newline is kept whole instead of losing its last character.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        Line when is_list(Line) ->
            Strip =
                case lists:last(Line) of
                    $\n -> lists:droplast(Line);
                    _ -> Line
                end,
            get_all_lines(File, [Strip | Partial])
    end.
% Show the contents of a list of strings, one string per output line.
% Can be used to check the results of calling get_file_contents.
% Returns ok.
show_file_contents(Lines) ->
    PrintLine = fun(Line) -> io:format("~s~n", [Line]) end,
    lists:foreach(PrintLine, Lines).
% Main function!
% Transforms the text of the file FileName into a list of {Word, Ranges}
% pairs: each useful word is associated with the ranges of lines where it
% appears.
processText(FileName) ->
    LinesOfWords = transformIntoLinesOfUsefulWord(get_file_contents(FileName)),
    [toWordWithRangeOfLines(Entry) || Entry <- wordsByLines(LinesOfWords)].
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment