ygrenzinger · March 5, 2017 16:24
diff --git a/erlang-week2.erl b/erlang-week2.erl
 -module(week2).
 -export([get_file_contents/1,show_file_contents/1, processText/1]).
 -include_lib("eunit/include/eunit.hrl").

 % True when X is an ASCII letter (A-Z or a-z), false for anything else.
 isAlphabetical(X) when $A =< X, X =< $Z -> true;
 isAlphabetical(X) when $a =< X, X =< $z -> true;
 isAlphabetical(_) -> false.

 % Letters of either case are alphabetical; punctuation is not.
 isAlphabetical_test() ->
     ?assert(isAlphabetical($G)),
     ?assert(isAlphabetical($g)),
     ?assertNot(isAlphabetical($.)),
     ?assertNot(isAlphabetical($,)).

 % True when Char is the space character (code 32).
 isBlank(Char) ->
     Char == $\s.

 % Only the space character counts as blank.
 isBlank_test() ->
     ?assertNot(isBlank($I)),
     ?assertNot(isBlank($.)),
     ?assert(isBlank($\s)).

 % True when Char is the newline character.
 isLineReturn(Char) ->
     Char == $\n.

 % Only the newline character is a line return.
 isLineReturn_test() ->
     ?assertNot(isLineReturn($I)),
     ?assertNot(isLineReturn($.)),
     ?assert(isLineReturn($\n)).

 % Lower-cases an ASCII capital letter by adding 32; any other value is
 % returned unchanged.
 nocap(C) when $A =< C andalso C =< $Z ->
     C + 32;
 nocap(C) ->
     C.

 % Applies nocap/1 to every character of Text.
 removeCapFromText(Text) ->
     lists:map(fun nocap/1, Text).

 % Capitals are lowered; every other character is kept as is.
 removeCapFromText_test() ->
     Input = "Madam I'm Adam",
     Expected = "madam i'm adam",
     ?assertEqual(Expected, removeCapFromText(Input)).

 % Keeps C when it is a space or an ASCII letter; every other character
 % is replaced by a space (32).
 replaceSpecialCharByBlank(C) ->
     Keep = isBlank(C) orelse isAlphabetical(C),
     if
         Keep -> C;
         true -> 32
     end.

 % Spaces and letters pass through; everything else becomes a space.
 % A newline is neither a space nor a letter, so it is replaced as well
 % (the previous expectation of $\n contradicted the implementation).
 replaceSpecialCharByBlank_test() ->
     ?assertEqual(32, replaceSpecialCharByBlank($.)),
     ?assertEqual(32, replaceSpecialCharByBlank($\s)),
     ?assertEqual(32, replaceSpecialCharByBlank($\n)),
     ?assertEqual($i, replaceSpecialCharByBlank($i)).

 % Normalises Text character by character: non-letters become spaces,
 % then capitals are lowered.
 cleanText(Text) ->
     Normalise = fun(Char) ->
         Replaced = replaceSpecialCharByBlank(Char),
         nocap(Replaced)
     end,
     lists:map(Normalise, Text).

 % Each special character is replaced by exactly one space and runs of
 % spaces are not collapsed, so ", " yields two blanks and "! %" yields three.
 cleanText_test() ->
     Input = "Hello, I'm discovering Erlang! %Hope it will be fine.",
     Expected = "hello  i m discovering erlang   hope it will be fine ",
     ?assertEqual(Expected, cleanText(Input)).

 % Splits Line into its space-separated words; consecutive spaces produce
 % no empty words.
 splitLineByBlank(Line) ->
     Separators = [$\s],
     string:tokens(Line, Separators).

 % A line is split on spaces into its words, in order.
 splitLineByBlank_test() ->
     Words = splitLineByBlank("Hello Madam Adam"),
     ?assertEqual(["Hello", "Madam", "Adam"], Words).

 % 25 common stopwords : http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html
 % stopwords() -> ["a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", "to", "was", "were", "will", "with"].
 % But there are normally many more stopwords: http://www.ranks.nl/stopwords
 % To keep things simple, I will just remove words that are too short.

 % Keeps only the words that are at least four characters long.
 keepUseful(Words) ->
     IsLongEnough = fun(Word) -> length(Word) >= 4 end,
     lists:filter(IsLongEnough, Words).

 % The decomposing function: takes a list of lines, cleans each one and
 % builds a list of lists of words (the useful words of each line).
 transformIntoLinesOfUsefulWord(Lines) ->
     ToUsefulWords = fun(Line) ->
         Cleaned = cleanText(Line),
         Words = splitLineByBlank(Cleaned),
         keepUseful(Words)
     end,
     lists:map(ToUsefulWords, Lines).

 % Each line is cleaned, split and stripped of its short words.
 transformIntoLinesOfUsefulWord_test() ->
     Input = ["Hello!", "I'm trying to get through Erlang.", "Hope the solution is correct."],
     Expected = [
         ["hello"],
         ["trying", "through", "erlang"],
         ["hope", "solution", "correct"]
     ],
     ?assertEqual(Expected, transformIntoLinesOfUsefulWord(Input)).

 % Records that Word occurs on line Pos: appends Pos to the word's list of
 % lines, or starts a new list when the word has not been seen yet.
 addLinePosition(Pos, Word, WordsByLines) ->
     case maps:find(Word, WordsByLines) of
         {ok, Seen} -> WordsByLines#{Word := Seen ++ [Pos]};
         error -> WordsByLines#{Word => [Pos]}
     end.

 % Takes the lines of words, the number of the first line and the map of
 % words already grouped by line; returns the map extended with every word
 % of every line. Lines are numbered consecutively starting from Pos.
 wordsByLines(LinesOfWords, Pos, WordsByLines) ->
     IndexLine = fun(Words, {LineNo, Map}) ->
         AddWord = fun(Word, Acc) -> addLinePosition(LineNo, Word, Acc) end,
         {LineNo + 1, lists:foldl(AddWord, Map, Words)}
     end,
     {_NextPos, Result} = lists:foldl(IndexLine, {Pos, WordsByLines}, LinesOfWords),
     Result.

 % Builds the index as a list of {Word, LineNumbers} pairs sorted by word.
 % maps:to_list/1 gives no ordering guarantee (small maps merely happen to
 % come out sorted), so the result is sorted explicitly.
 wordsByLines(LinesOfWord) ->
     Index = wordsByLines(LinesOfWord, 1, #{}),
     lists:keysort(1, maps:to_list(Index)).

 % Words are listed alphabetically with the lines they appear on.
 wordsByLines_test() ->
     Lines = [["hello", "yannick"], ["thanks", "yannick"]],
     Expected = [{"hello", [1]}, {"thanks", [2]}, {"yannick", [1, 2]}],
     ?assertEqual(Expected, wordsByLines(Lines)).

 % Last ingredient to complete the work: transform an ascending list of line
 % numbers into a list of {First, Last} ranges of consecutive lines.

 % Extends Ranges (in ascending order) with the line numbers in Lines and
 % returns the ranges in ascending order. The ranges are accumulated in
 % reverse so that each step is constant time.
 toRangeOfLinesT(Lines, Ranges) ->
     lists:reverse(lists:foldl(fun extendRanges/2, lists:reverse(Ranges), Lines)).

 % Folds one line number into the reversed range list.
 extendRanges(X, [{A, B} | Rest]) when X == B + 1 ->
     % X directly follows the current range: grow it.
     [{A, X} | Rest];
 extendRanges(X, [{_, B} | _] = Ranges) when X == B ->
     % Same line seen again (word repeated on one line): nothing to add.
     Ranges;
 extendRanges(X, Ranges) ->
     % Gap (or no range yet): open a new single-line range.
     [{X, X} | Ranges].

 % Returns [] for an empty list of lines.
 toRangeOfLines(Lines) ->
     toRangeOfLinesT(Lines, []).

 % Consecutive line numbers collapse into a single {First, Last} range.
 toRangeOfLines_test() ->
     Single = toRangeOfLines([3]),
     Pair = toRangeOfLines([3, 4]),
     Mixed = toRangeOfLines([3, 4, 5, 7, 11, 12, 13]),
     ?assertEqual([{3, 3}], Single),
     ?assertEqual([{3, 4}], Pair),
     ?assertEqual([{3, 5}, {7, 7}, {11, 13}], Mixed).

 % Converts a {Word, LineNumbers} pair into {Word, Ranges}.
 toWordWithRangeOfLines({Word, Lines}) ->
     Ranges = toRangeOfLines(Lines),
     {Word, Ranges}.

 % Used to read a file into a list of lines.
 % Example files available in:
 %   gettysburg-address.txt (short)
 %   dickens-christmas.txt  (long)

 % Reads the text file Name and returns its lines in file order.
 % Trailing newlines are removed by get_all_lines/2.

 get_file_contents(Name) ->
     {ok, Handle} = file:open(Name, [read]),
     % get_all_lines/2 accumulates the lines last-first.
     lists:reverse(get_all_lines(Handle, [])).

 % Auxiliary function for get_file_contents.
 % Not exported.
 % Reads File line by line until eof, closes it, and returns the lines in
 % reverse order prepended to Partial. Each line has its trailing newline
 % removed.

 get_all_lines(File, Partial) ->
     case io:get_line(File, "") of
         eof ->
             file:close(File),
             Partial;
         Line ->
             get_all_lines(File, [strip_newline(Line) | Partial])
     end.

 % Removes one trailing newline if present. The last line of a file may
 % have none, in which case the line is returned untouched instead of
 % losing its final character.
 strip_newline(Line) ->
     case lists:reverse(Line) of
         [$\n | Rest] -> lists:reverse(Rest);
         _ -> Line
     end.

 % Prints each string of the list on its own line and returns ok.
 % Can be used to check the results of calling get_file_contents.

 show_file_contents(Lines) ->
     PrintLine = fun(Line) -> io:format("~s~n", [Line]) end,
     lists:foreach(PrintLine, Lines).

 % Main function!
 % Reads FileName and returns a list of {Word, Ranges} pairs, where Ranges
 % are the ranges of lines on which the word appears.
 processText(FileName) ->
     UsefulWords = transformIntoLinesOfUsefulWord(get_file_contents(FileName)),
     Index = wordsByLines(UsefulWords),
     lists:map(fun toWordWithRangeOfLines/1, Index).
	-module(week2).
	-export([get_file_contents/1,show_file_contents/1, processText/1]).
	-include_lib("eunit/include/eunit.hrl").

	% True when X is an ASCII letter (A-Z or a-z), false for anything else.
	isAlphabetical(X) when $A =< X, X =< $Z -> true;
	isAlphabetical(X) when $a =< X, X =< $z -> true;
	isAlphabetical(_) -> false.

	% Letters of either case are alphabetical; punctuation is not.
	isAlphabetical_test() ->
	    ?assert(isAlphabetical($G)),
	    ?assert(isAlphabetical($g)),
	    ?assertNot(isAlphabetical($.)),
	    ?assertNot(isAlphabetical($,)).

	% True when Char is the space character (code 32).
	isBlank(Char) ->
	    Char == $\s.

	% Only the space character counts as blank.
	isBlank_test() ->
	    ?assertNot(isBlank($I)),
	    ?assertNot(isBlank($.)),
	    ?assert(isBlank($\s)).

	% True when Char is the newline character.
	isLineReturn(Char) ->
	    Char == $\n.

	% Only the newline character is a line return.
	isLineReturn_test() ->
	    ?assertNot(isLineReturn($I)),
	    ?assertNot(isLineReturn($.)),
	    ?assert(isLineReturn($\n)).

	% Lower-cases an ASCII capital letter by adding 32; any other value is
	% returned unchanged.
	nocap(C) when $A =< C andalso C =< $Z ->
	    C + 32;
	nocap(C) ->
	    C.

	% Applies nocap/1 to every character of Text.
	removeCapFromText(Text) ->
	    lists:map(fun nocap/1, Text).

	% Capitals are lowered; every other character is kept as is.
	removeCapFromText_test() ->
	    Input = "Madam I'm Adam",
	    Expected = "madam i'm adam",
	    ?assertEqual(Expected, removeCapFromText(Input)).

	% Keeps C when it is a space or an ASCII letter; every other character
	% is replaced by a space (32).
	replaceSpecialCharByBlank(C) ->
	    Keep = isBlank(C) orelse isAlphabetical(C),
	    if
	        Keep -> C;
	        true -> 32
	    end.

	replaceSpecialCharByBlank_test() ->
	?assertEqual(32, replaceSpecialCharByBlank($.)),
	?assertEqual(32, replaceSpecialCharByBlank($\ )),
	?assertEqual($\n, replaceSpecialCharByBlank($\n)),
	?assertEqual($i, replaceSpecialCharByBlank($i)).

	% Normalises Text character by character: non-letters become spaces,
	% then capitals are lowered.
	cleanText(Text) ->
	    Normalise = fun(Char) ->
	        Replaced = replaceSpecialCharByBlank(Char),
	        nocap(Replaced)
	    end,
	    lists:map(Normalise, Text).

	% Each special character is replaced by exactly one space and runs of
	% spaces are not collapsed, so ", " yields two blanks and "! %" yields three.
	cleanText_test() ->
	    Input = "Hello, I'm discovering Erlang! %Hope it will be fine.",
	    Expected = "hello  i m discovering erlang   hope it will be fine ",
	    ?assertEqual(Expected, cleanText(Input)).

	% Splits Line into its space-separated words; consecutive spaces produce
	% no empty words.
	splitLineByBlank(Line) ->
	    Separators = [$\s],
	    string:tokens(Line, Separators).

	% A line is split on spaces into its words, in order.
	splitLineByBlank_test() ->
	    Words = splitLineByBlank("Hello Madam Adam"),
	    ?assertEqual(["Hello", "Madam", "Adam"], Words).

	% 25 common stopwords : http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html
	% stopwords() -> ["a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "has", "he", "in", "is", "it", "its", "of", "on", "that", "the", "to", "was", "were", "will", "with"].
	% But there are normally many more stopwords: http://www.ranks.nl/stopwords
	% To keep things simple, I will just remove words that are too short.

	% Keeps only the words that are at least four characters long.
	keepUseful(Words) ->
	    IsLongEnough = fun(Word) -> length(Word) >= 4 end,
	    lists:filter(IsLongEnough, Words).

	% The decomposing function: takes a list of lines, cleans each one and
	% builds a list of lists of words (the useful words of each line).
	transformIntoLinesOfUsefulWord(Lines) ->
	    ToUsefulWords = fun(Line) ->
	        Cleaned = cleanText(Line),
	        Words = splitLineByBlank(Cleaned),
	        keepUseful(Words)
	    end,
	    lists:map(ToUsefulWords, Lines).

	% Each line is cleaned, split and stripped of its short words.
	transformIntoLinesOfUsefulWord_test() ->
	    Input = ["Hello!", "I'm trying to get through Erlang.", "Hope the solution is correct."],
	    Expected = [
	        ["hello"],
	        ["trying", "through", "erlang"],
	        ["hope", "solution", "correct"]
	    ],
	    ?assertEqual(Expected, transformIntoLinesOfUsefulWord(Input)).

	% Records that Word occurs on line Pos: appends Pos to the word's list of
	% lines, or starts a new list when the word has not been seen yet.
	addLinePosition(Pos, Word, WordsByLines) ->
	    case maps:find(Word, WordsByLines) of
	        {ok, Seen} -> WordsByLines#{Word := Seen ++ [Pos]};
	        error -> WordsByLines#{Word => [Pos]}
	    end.

	% Takes the lines of words, the number of the first line and the map of
	% words already grouped by line; returns the map extended with every word
	% of every line. Lines are numbered consecutively starting from Pos.
	% (The escaped list pattern "[Words\|RemainingLines]" was not valid Erlang;
	% a fold over the lines replaces the explicit head/tail recursion.)
	wordsByLines(LinesOfWords, Pos, WordsByLines) ->
	    IndexLine = fun(Words, {LineNo, Map}) ->
	        AddWord = fun(Word, Acc) -> addLinePosition(LineNo, Word, Acc) end,
	        {LineNo + 1, lists:foldl(AddWord, Map, Words)}
	    end,
	    {_NextPos, Result} = lists:foldl(IndexLine, {Pos, WordsByLines}, LinesOfWords),
	    Result.

	% Builds the index as a list of {Word, LineNumbers} pairs sorted by word.
	% maps:to_list/1 gives no ordering guarantee (small maps merely happen to
	% come out sorted), so the result is sorted explicitly.
	wordsByLines(LinesOfWord) ->
	    Index = wordsByLines(LinesOfWord, 1, #{}),
	    lists:keysort(1, maps:to_list(Index)).

	% Words are listed alphabetically with the lines they appear on.
	wordsByLines_test() ->
	    Lines = [["hello", "yannick"], ["thanks", "yannick"]],
	    Expected = [{"hello", [1]}, {"thanks", [2]}, {"yannick", [1, 2]}],
	    ?assertEqual(Expected, wordsByLines(Lines)).

	% Last ingredient to complete the work: transform an ascending list of line
	% numbers into a list of {First, Last} ranges of consecutive lines.

	% Extends Ranges (in ascending order) with the line numbers in Lines and
	% returns the ranges in ascending order. The ranges are accumulated in
	% reverse so that each step is constant time.
	toRangeOfLinesT(Lines, Ranges) ->
	    lists:reverse(lists:foldl(fun extendRanges/2, lists:reverse(Ranges), Lines)).

	% Folds one line number into the reversed range list.
	extendRanges(X, [{A, B} | Rest]) when X == B + 1 ->
	    % X directly follows the current range: grow it.
	    [{A, X} | Rest];
	extendRanges(X, [{_, B} | _] = Ranges) when X == B ->
	    % Same line seen again (word repeated on one line): nothing to add.
	    Ranges;
	extendRanges(X, Ranges) ->
	    % Gap (or no range yet): open a new single-line range.
	    [{X, X} | Ranges].

	% Returns [] for an empty list of lines.
	toRangeOfLines(Lines) ->
	    toRangeOfLinesT(Lines, []).

	% Consecutive line numbers collapse into a single {First, Last} range.
	toRangeOfLines_test() ->
	    Single = toRangeOfLines([3]),
	    Pair = toRangeOfLines([3, 4]),
	    Mixed = toRangeOfLines([3, 4, 5, 7, 11, 12, 13]),
	    ?assertEqual([{3, 3}], Single),
	    ?assertEqual([{3, 4}], Pair),
	    ?assertEqual([{3, 5}, {7, 7}, {11, 13}], Mixed).

	% Converts a {Word, LineNumbers} pair into {Word, Ranges}.
	toWordWithRangeOfLines({Word, Lines}) ->
	    Ranges = toRangeOfLines(Lines),
	    {Word, Ranges}.

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)

	% Reads the text file Name and returns its lines in file order.
	% Trailing newlines are removed by get_all_lines/2.

	get_file_contents(Name) ->
	    {ok, Handle} = file:open(Name, [read]),
	    % get_all_lines/2 accumulates the lines last-first.
	    lists:reverse(get_all_lines(Handle, [])).

	% Auxiliary function for get_file_contents.
	% Not exported.
	% Reads File line by line until eof, closes it, and returns the lines in
	% reverse order prepended to Partial. Each line has its trailing newline
	% removed.

	get_all_lines(File, Partial) ->
	    case io:get_line(File, "") of
	        eof ->
	            file:close(File),
	            Partial;
	        Line ->
	            get_all_lines(File, [strip_newline(Line) | Partial])
	    end.

	% Removes one trailing newline if present. The last line of a file may
	% have none, in which case the line is returned untouched instead of
	% losing its final character.
	strip_newline(Line) ->
	    case lists:reverse(Line) of
	        [$\n | Rest] -> lists:reverse(Rest);
	        _ -> Line
	    end.

	% Prints each string of the list on its own line and returns ok.
	% Can be used to check the results of calling get_file_contents.
	% (The escaped list pattern "[L\|Ls]" was not valid Erlang; iterating with
	% lists:foreach/2 avoids the head/tail pattern altogether.)

	show_file_contents(Lines) ->
	    PrintLine = fun(Line) -> io:format("~s~n", [Line]) end,
	    lists:foreach(PrintLine, Lines).

	% Main function!
	% Reads FileName and returns a list of {Word, Ranges} pairs, where Ranges
	% are the ranges of lines on which the word appears.
	processText(FileName) ->
	    UsefulWords = transformIntoLinesOfUsefulWord(get_file_contents(FileName)),
	    Index = wordsByLines(UsefulWords),
	    lists:map(fun toWordWithRangeOfLines/1, Index).