chetkhatri · April 6, 2017 12:55
diff --git a/index1-mooc.erl b/index1-mooc.erl
 -module(index). 
 -export([ 
 lines_from_contents/1, 
 get_file_contents/1, 
 show_file_contents/1, 
 basic_index_from_lines/1, 
 intermediate_index/1, 
 final_index/1]).

 % C = index:get_file_contents("gettysburg-address.txt"). 
 % L = index:lines_from_contents(C). 
 % B = index:basic_index_from_lines(L). 
 % I = index:intermediate_index(B). 
 % F = index:final_index(I).

 % Used to read a file into a list of lines. 
 % Example files available in: 
 % gettysburg-address.txt (short) 
 % dickens-christmas.txt (long)

 % Read the text file called Name and return its lines in file order.
 % Opening is asserted with an {ok, Device} match, so a file that
 % cannot be opened crashes the caller with a badmatch.
 % result: [line1, line2, ...]
 % where each line is a string (a list of characters) as produced by
 % get_all_lines/2
 get_file_contents(Name) ->
     {ok, Device} = file:open(Name, [read]),
     % get_all_lines/2 accumulates newest-first, so flip the result
     lists:reverse(get_all_lines(Device, [])).

 % Auxiliary function for get_file_contents.
 % Not exported.
 % Reads File line by line with io:get_line/2, pushing each line onto
 % Partial, so the result is in reverse file order. The file is closed
 % when eof is reached.
 % Only a trailing newline is removed from each line: a final line that
 % is not newline-terminated keeps its last character.
 get_all_lines(File, Partial) ->
     case io:get_line(File, "") of
         eof ->
             file:close(File),
             Partial;
         Line ->
             Stripped =
                 case lists:suffix("\n", Line) of
                     true -> lists:droplast(Line);
                     false -> Line
                 end,
             get_all_lines(File, [Stripped | Partial])
     end.

 % Print every string in Lines on its own row of standard output.
 % Can be used to check the results of calling get_file_contents.
 % Evaluates to ok.
 show_file_contents(Lines) ->
     lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

 % Number the lines of Contents from 1 and split each one into words.
 % Words are separated by spaces, commas and full stops; the
 % separators themselves are discarded.
 % result: [{line#, [word1, word2, ...]}, {line#, [word1, ...]}, ...]
 lines_from_contents(Contents) ->
     LineNumbers = lists:seq(1, length(Contents)),
     [{Number, string:tokens(Text, ", .")}
      || {Number, Text} <- lists:zip(LineNumbers, Contents)].

 % Turn the per-line word lists into one flat list of word/line pairs.
 % The pairs are grouped by basic_index/2 (one nested list per line)
 % and then flattened into a single list.
 % result: [{word1, line#}, {word2, line#}, ...]
 basic_index_from_lines(Lines) ->
     PairsPerLine = basic_index(Lines, []),
     lists:flatten(PairsPerLine).

 % Push the word/line pairs of each line onto Acc, one nested list per
 % line. Because each line is prepended, the last input line ends up
 % first in the result.
 basic_index(Lines, Acc) ->
     lists:foldl(
         fun({Line, Words}, Pairs) ->
             [make_word_line_pairs(Line, Words) | Pairs]
         end,
         Acc,
         Lines).

 % Pair every word of one line with that line's number, word first.
 % result: [{word1, line#}, {word2, line#}, ...]
 make_word_line_pairs(Line, Words) ->
     [{Word, Line} || Word <- Words].

 % Group the {word, line_number} pairs of a basic index by word.
 % Works off the result of basic_index_from_lines/1; the grouping
 % itself is done by build_int_index/2 starting from an empty index.
 % result: [{word1, [line1, line2, ...]}, {word2, [line1, ...]}, ...]
 intermediate_index(BasicIndex) ->
     EmptyIndex = [],
     build_int_index(BasicIndex, EmptyIndex).

 % Fold the {word, line} pairs into Acc, a list of {word, [lines]}.
 % A word already present in Acc gets the new line prepended to its
 % list of lines (via update/3); a word seen for the first time is
 % added to the front of Acc with a one-element list.
 build_int_index([], Acc) ->
     Acc;
 build_int_index([{Word, Line} | Rest], Acc) ->
     NextAcc =
         case contains(Word, Acc) of
             false -> [{Word, [Line]} | Acc];
             {Word, _Lines} -> update(Acc, Word, Line)
         end,
     build_int_index(Rest, NextAcc).

 % Look Word up in a list of {word, [lines]} entries.
 % Evaluates to the first entry whose word is exactly Word, or to
 % false when there is none. Elements that are not two-element tuples
 % are skipped.
 contains(Word, Entries) ->
     IsOtherEntry =
         fun({Candidate, _}) -> Candidate =/= Word;
            (_) -> true
         end,
     % stop scanning at the first entry for Word
     case lists:dropwhile(IsOtherEntry, Entries) of
         [Found | _] -> Found;
         [] -> false
     end.

 % Prepend Line to the list of lines of the first entry for Word,
 % leaving every other entry untouched and in place.
 % The word must exist in the list: running off the end without a
 % match fails with function_clause.
 update([{Candidate, Lines} | Rest], Word, Line) when Candidate =:= Word ->
     [{Candidate, [Line | Lines]} | Rest];
 update([Entry | Rest], Word, Line) ->
     [Entry | update(Rest, Word, Line)].

 % Final step: replace each word's list of line numbers with a list of
 % {start_row_number, end_row_number} ranges.
 % Works off the result of intermediate_index/1. For every
 % {word, [line1, line2, ...]} entry the lines are first passed through
 % mylist:nub/1 (duplicate removal, defined outside this module) and
 % then through condense/1.
 % result: [{word1, [{start1,end1}, {start2, end2}, ...]}, ...]
 final_index(Intermediate) ->
     ToRanges =
         fun({Word, Lines}) ->
             {Word, condense(mylist:nub(Lines))}
         end,
     [ToRanges(Entry) || Entry <- Intermediate].

 % Compress a list of line numbers into {First, Last} ranges, one per
 % run of consecutive numbers.
 % Expects ascending line numbers without duplicate references.
 condense([]) ->
     [];
 condense([Only]) ->
     [{Only, Only}];
 condense([First | _] = Lines) ->
     IsNextLine = fun(This, Next) -> Next == This + 1 end,
     % dropwhile/2 stops on the last number of the leading run
     [Last | Remaining] = dropwhile(IsNextLine, Lines),
     [{First, Last} | condense(Remaining)].

 % Drop elements from the head of the list for as long as
 % Pred(Head, NextHead) holds for the head and the element after it.
 % The result starts at the first element whose successor fails Pred;
 % a list with fewer than two elements is returned unchanged.
 % tweak from github:otp/lib/stdlib/src/lists.erl
 dropwhile(Pred, [Hd | [NextHd | _Tail] = Others] = Rest) ->
     case Pred(Hd, NextHd) of
         true -> dropwhile(Pred, Others);
         false -> Rest
     end;
 dropwhile(_Pred, [_Hd] = Rest) ->
     Rest;
 % Pred is called with two arguments, so the guard checks arity 2; a
 % two-argument predicate never passed the arity-1 check carried over
 % from lists:dropwhile/2.
 dropwhile(Pred, []) when is_function(Pred, 2) ->
     [].
	-module(index).
	-export([
	lines_from_contents/1,
	get_file_contents/1,
	show_file_contents/1,
	basic_index_from_lines/1,
	intermediate_index/1,
	final_index/1]).

	% C = index:get_file_contents("gettysburg-address.txt").
	% L = index:lines_from_contents(C).
	% B = index:basic_index_from_lines(L).
	% I = index:intermediate_index(B).
	% F = index:final_index(I).

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)

	% Read the text file called Name and return its lines in file order.
	% Opening is asserted with an {ok, Device} match, so a file that
	% cannot be opened crashes the caller with a badmatch.
	% result: [line1, line2, ...]
	% where each line is a string (a list of characters) as produced by
	% get_all_lines/2
	get_file_contents(Name) ->
	    {ok, Device} = file:open(Name, [read]),
	    % get_all_lines/2 accumulates newest-first, so flip the result
	    lists:reverse(get_all_lines(Device, [])).

	% Auxiliary function for get_file_contents.
	% Not exported.
	% Reads File line by line with io:get_line/2, pushing each line onto
	% Partial, so the result is in reverse file order. The file is closed
	% when eof is reached.
	% Only a trailing newline is removed from each line: a final line that
	% is not newline-terminated keeps its last character.
	get_all_lines(File, Partial) ->
	    case io:get_line(File, "") of
	        eof ->
	            file:close(File),
	            Partial;
	        Line ->
	            Stripped =
	                case lists:suffix("\n", Line) of
	                    true -> lists:droplast(Line);
	                    false -> Line
	                end,
	            get_all_lines(File, [Stripped | Partial])
	    end.

	% Print every string in Lines on its own row of standard output.
	% Can be used to check the results of calling get_file_contents.
	% Evaluates to ok.
	show_file_contents(Lines) ->
	    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).

	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

	% Number the lines of Contents from 1 and split each one into words.
	% Words are separated by spaces, commas and full stops; the
	% separators themselves are discarded.
	% result: [{line#, [word1, word2, ...]}, {line#, [word1, ...]}, ...]
	lines_from_contents(Contents) ->
	    LineNumbers = lists:seq(1, length(Contents)),
	    [{Number, string:tokens(Text, ", .")}
	     || {Number, Text} <- lists:zip(LineNumbers, Contents)].

	% Turn the per-line word lists into one flat list of word/line pairs.
	% The pairs are grouped by basic_index/2 (one nested list per line)
	% and then flattened into a single list.
	% result: [{word1, line#}, {word2, line#}, ...]
	basic_index_from_lines(Lines) ->
	    PairsPerLine = basic_index(Lines, []),
	    lists:flatten(PairsPerLine).

	% Push the word/line pairs of each line onto Acc, one nested list per
	% line. Because each line is prepended, the last input line ends up
	% first in the result.
	basic_index(Lines, Acc) ->
	    lists:foldl(
	        fun({Line, Words}, Pairs) ->
	            [make_word_line_pairs(Line, Words) | Pairs]
	        end,
	        Acc,
	        Lines).

	% Pair every word of one line with that line's number, word first.
	% result: [{word1, line#}, {word2, line#}, ...]
	make_word_line_pairs(Line, Words) ->
	    [{Word, Line} || Word <- Words].

	% Group the {word, line_number} pairs of a basic index by word.
	% Works off the result of basic_index_from_lines/1; the grouping
	% itself is done by build_int_index/2 starting from an empty index.
	% result: [{word1, [line1, line2, ...]}, {word2, [line1, ...]}, ...]
	intermediate_index(BasicIndex) ->
	    EmptyIndex = [],
	    build_int_index(BasicIndex, EmptyIndex).

	% Fold the {word, line} pairs into Acc, a list of {word, [lines]}.
	% A word already present in Acc gets the new line prepended to its
	% list of lines (via update/3); a word seen for the first time is
	% added to the front of Acc with a one-element list.
	build_int_index([], Acc) ->
	    Acc;
	build_int_index([{Word, Line} | Rest], Acc) ->
	    NextAcc =
	        case contains(Word, Acc) of
	            false -> [{Word, [Line]} | Acc];
	            {Word, _Lines} -> update(Acc, Word, Line)
	        end,
	    build_int_index(Rest, NextAcc).

	% Look Word up in a list of {word, [lines]} entries.
	% Evaluates to the first entry whose word is exactly Word, or to
	% false when there is none. Elements that are not two-element tuples
	% are skipped.
	contains(_Word, []) ->
	    false;
	contains(Word, [{Word, Lines} | _Entries]) ->
	    {Word, Lines};
	contains(Word, [_Other | Entries]) ->
	    contains(Word, Entries).

	% Prepend Line to the list of lines of the first entry for Word,
	% leaving every other entry untouched and in place.
	% The word must exist in the list: running off the end without a
	% match fails with function_clause.
	update([{Word, Lines} | Others], Word, Line) ->
	    [{Word, [Line | Lines]} | Others];
	update([NoMatch | Others], Word, Line) ->
	    [NoMatch | update(Others, Word, Line)].

	% Final step: replace each word's list of line numbers with a list of
	% {start_row_number, end_row_number} ranges.
	% Works off the result of intermediate_index/1. For every
	% {word, [line1, line2, ...]} entry the lines are first passed through
	% mylist:nub/1 (duplicate removal, defined outside this module) and
	% then through condense/1.
	% result: [{word1, [{start1,end1}, {start2, end2}, ...]}, ...]
	final_index(Intermediate) ->
	    ToRanges =
	        fun({Word, Lines}) ->
	            {Word, condense(mylist:nub(Lines))}
	        end,
	    [ToRanges(Entry) || Entry <- Intermediate].

	% Compress a list of line numbers into {First, Last} ranges, one per
	% run of consecutive numbers.
	% Expects ascending line numbers without duplicate references.
	condense([]) ->
	    [];
	condense([Only]) ->
	    [{Only, Only}];
	condense([First | _] = Lines) ->
	    IsNextLine = fun(This, Next) -> Next == This + 1 end,
	    % dropwhile/2 stops on the last number of the leading run
	    [Last | Remaining] = dropwhile(IsNextLine, Lines),
	    [{First, Last} | condense(Remaining)].

	% Drop elements from the head of the list for as long as
	% Pred(Head, NextHead) holds for the head and the element after it.
	% The result starts at the first element whose successor fails Pred;
	% a list with fewer than two elements is returned unchanged.
	% tweak from github:otp/lib/stdlib/src/lists.erl
	dropwhile(Pred, [Hd | [NextHd | _Tail] = Others] = Rest) ->
	    case Pred(Hd, NextHd) of
	        true -> dropwhile(Pred, Others);
	        false -> Rest
	    end;
	dropwhile(_Pred, [_Hd] = Rest) ->
	    Rest;
	% Pred is called with two arguments, so the guard checks arity 2; a
	% two-argument predicate never passed the arity-1 check carried over
	% from lists:dropwhile/2.
	dropwhile(Pred, []) when is_function(Pred, 2) ->
	    [].
No results found