yuanjs · May 22, 2020 06:23
diff --git a/index.erl b/index.erl
 -module(index).
 -export([get_file_contents/1,show_file_contents/1,
 get_string_words_index/4,get_all_lines_index/1,build_range/1,
 build_words_occurs/1,sort_words_index/1, get_index/1]).

 % Used to read a file into a list of lines.
 % Example files available in:
 %   gettysburg-address.txt (short)
 %   dickens-christmas.txt  (long)


 % Read the text file Name and return its lines, first line first.
 % The file is opened read-only; a failed open crashes with a badmatch.
 % get_all_lines/2 collects the lines (minus their terminator) last-first,
 % which is why the result is reversed here.

 get_file_contents(Name) ->
     {ok, Handle} = file:open(Name, [read]),
     lists:reverse(get_all_lines(Handle, [])).

 % Auxiliary function for get_file_contents.
 % Not exported.
 %
 % Reads File one line at a time until eof, pushing each line onto
 % Partial, so the returned list is in reverse file order. The file is
 % closed when eof is reached.
 % Only a trailing "\n" is removed from a line: the last line of a file
 % that does not end with a newline keeps its final character.

 get_all_lines(File, Partial) ->
     case io:get_line(File, "") of
         eof ->
             file:close(File),
             Partial;
         Line ->
             get_all_lines(File, [strip_newline(Line) | Partial])
     end.

 % Drop one trailing newline character from Line, if there is one.
 strip_newline(Line) ->
     case lists:reverse(Line) of
         [$\n | RevRest] -> lists:reverse(RevRest);
         _NoNewline -> Line
     end.

 % Print every string of the list on its own line of standard output.
 % Handy for checking the result of get_file_contents/1. Returns ok.

 show_file_contents(Lines) ->
     lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).

 % My functions
 % 1. Call get_file_contents to read the file contents into a list of lines.
 % 2. Call get_all_lines_index to build a list of all words together with the line number each one occurs on.
 % 3. Call sort_words_index to sort that list by word and line number.
 % 4. Finally call build_words_occurs to collapse each word's line numbers into a list of range tuples.

 % Build the word index for the text file Filename.
 % Pipeline: read the lines, collect {Word, LineNumber} pairs, sort them,
 % group each word's line numbers into ranges, and finally reverse the
 % grouped list, because build_words_occurs/1 accumulates its entries by
 % prepending.
 get_index(Filename) ->
     Lines = get_file_contents(Filename),
     Sorted = sort_words_index(get_all_lines_index(Lines)),
     Grouped = build_words_occurs(Sorted),
     lists:reverse(Grouped).

 % Collapse a list of {Word, LineNumber} pairs into one {Word, Ranges}
 % entry per run of equal words, where Ranges is what build_range/1 makes
 % of that word's distinct consecutive line numbers.
 % The input is expected to be grouped by word already (get_index/1 sorts
 % it first). Entries are accumulated by prepending, so the result lists
 % the words in the reverse of their input order.
 % An empty input yields an empty list.
 build_words_occurs([]) ->
     [];
 build_words_occurs([{Word, LineNumber} | Xs]) ->
     % Seed the scan with the first pair instead of a dummy "" word, so no
     % placeholder entry can leak into the result.
     build_words_occurs(Xs, [], {Word, LineNumber}, [LineNumber]).

 % build_words_occurs(Pairs, Results, {PreWord, PreLineNumber}, CurrentIndexList)
 %   Results          - finished {Word, Ranges} entries, newest first
 %   PreWord          - the word whose line numbers are being collected
 %   PreLineNumber    - line number of the pair handled last
 %   CurrentIndexList - PreWord's line numbers so far, newest first
 build_words_occurs([], Results, {PreWord, _PreLineNumber}, CurrentIndexList) ->
     [{PreWord, build_range(CurrentIndexList)} | Results];
 build_words_occurs([{PreWord, PreLineNumber} | Xs], Results, {PreWord, PreLineNumber}, CurrentIndexList) ->
     % Same word on the same line again: nothing new to record.
     build_words_occurs(Xs, Results, {PreWord, PreLineNumber}, CurrentIndexList);
 build_words_occurs([{PreWord, LineNumber} | Xs], Results, {PreWord, _PreLineNumber}, CurrentIndexList) ->
     % Same word on another line: remember that line.
     build_words_occurs(Xs, Results, {PreWord, LineNumber}, [LineNumber | CurrentIndexList]);
 build_words_occurs([{NewWord, LineNumber} | Xs], Results, {PreWord, _PreLineNumber}, CurrentIndexList) ->
     % A different word: close the previous word's entry and start afresh.
     NewResults = [{PreWord, build_range(CurrentIndexList)} | Results],
     build_words_occurs(Xs, NewResults, {NewWord, LineNumber}, [LineNumber]).

 % Turn a list of line numbers into a list of {First, Last} ranges.
 % Xs is reversed before grouping: build_words_occurs collects the line
 % numbers newest-first, while group_by_range/1 extends a range only when
 % the next number is one more than the range's end.
 build_range(Xs) ->
     Reversed = lists:reverse(Xs),
     group_by_range(Reversed).

 % This was the trickiest part for me; the approach is adapted from a
 % solution found online.
 % Groups a list of numbers into contiguous {First, Last} ranges.
 % A number extends the current range only when it is exactly one more
 % than that range's end; any other number opens a new {N, N} range.
 % Ranges are returned in the order in which they were opened.
 group_by_range([]) ->
     [];
 group_by_range([First | Rest]) ->
     % The accumulator keeps the range being extended at its head.
     Step = fun(N, [{Lo, Hi} | Done]) when Hi + 1 == N -> [{Lo, N} | Done];
               (N, Ranges) -> [{N, N} | Ranges]
            end,
     lists:reverse(lists:foldl(Step, [{First, First}], Rest)).

 % Sort the {Word, LineNumber} pairs by word and, for equal words, by
 % line number.
 % The standard term order compares tuples of equal size element by
 % element, which is exactly this ordering. A hand-written comparison such
 % as `Word1 < Word2 orelse Line1 < Line2` is NOT a valid ordering: it
 % ranks {"banana", 1} before {"apple", 2}, so occurrences of one word can
 % end up separated in the sorted list.
 sort_words_index(Xs) ->
     lists:sort(Xs).
 % Collect the {Word, LineNumber} pairs of every line in Xs.
 % Lines are numbered from 1. The pairs of each line are put in front of
 % those of the lines before it, so the result is not in reading order;
 % get_index/1 sorts it afterwards.
 get_all_lines_index(Xs) ->
     Collect =
         fun(Line, {Found, LineNumber}) ->
             LineWords = get_string_words_index(Line, [], "", LineNumber),
             {LineWords ++ Found, LineNumber + 1}
         end,
     {WordsIndex, _NextLineNumber} = lists:foldl(Collect, {[], 1}, Xs),
     WordsIndex.

 % Scan one line of text and prepend a {Word, LineNumber} pair to
 % WordsIndex for every word longer than three letters.
 % A word is a maximal run of ASCII letters; upper-case letters are folded
 % to lower case and any other character ends the word in progress.
 % CurrentWord holds the letters of the word in progress in reverse
 % order; pass "" to start a fresh scan.
 get_string_words_index([], WordsIndex, CurrentWord, LineNumber) ->
     keep_long_word(CurrentWord, LineNumber, WordsIndex);
 get_string_words_index([X | Xs], WordsIndex, CurrentWord, LineNumber) when X >= $A, X =< $Z ->
     % $a - $A is the distance between the upper- and lower-case alphabets.
     get_string_words_index(Xs, WordsIndex, [X + ($a - $A) | CurrentWord], LineNumber);
 get_string_words_index([X | Xs], WordsIndex, CurrentWord, LineNumber) when X >= $a, X =< $z ->
     get_string_words_index(Xs, WordsIndex, [X | CurrentWord], LineNumber);
 get_string_words_index([_Separator | Xs], WordsIndex, CurrentWord, LineNumber) ->
     Flushed = keep_long_word(CurrentWord, LineNumber, WordsIndex),
     get_string_words_index(Xs, Flushed, "", LineNumber).

 % Prepend the finished word (given reversed) when it has more than three
 % letters; shorter words, including the empty one, are dropped.
 keep_long_word(RevWord, LineNumber, WordsIndex) when length(RevWord) > 3 ->
     [{lists:reverse(RevWord), LineNumber} | WordsIndex];
 keep_long_word(_ShortWord, _LineNumber, WordsIndex) ->
     WordsIndex.
	-module(index).
	-export([get_file_contents/1,show_file_contents/1,
	get_string_words_index/4,get_all_lines_index/1,build_range/1,
	build_words_occurs/1,sort_words_index/1, get_index/1]).

	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	% Read the text file Name and return its lines, first line first.
	% The file is opened read-only; a failed open crashes with a badmatch.
	% get_all_lines/2 collects the lines (minus their terminator) last-first,
	% which is why the result is reversed here.

	get_file_contents(Name) ->
	    {ok, Handle} = file:open(Name, [read]),
	    lists:reverse(get_all_lines(Handle, [])).

	% Auxiliary function for get_file_contents.
	% Not exported.
	%
	% Reads File one line at a time until eof, pushing each line onto
	% Partial, so the returned list is in reverse file order. The file is
	% closed when eof is reached.
	% Only a trailing "\n" is removed from a line: the last line of a file
	% that does not end with a newline keeps its final character.

	get_all_lines(File, Partial) ->
	    case io:get_line(File, "") of
	        eof ->
	            file:close(File),
	            Partial;
	        Line ->
	            get_all_lines(File, [strip_newline(Line) | Partial])
	    end.

	% Drop one trailing newline character from Line, if there is one.
	strip_newline(Line) ->
	    case lists:reverse(Line) of
	        [$\n | RevRest] -> lists:reverse(RevRest);
	        _NoNewline -> Line
	    end.

	% Print every string of the list on its own line of standard output.
	% Handy for checking the result of get_file_contents/1. Returns ok.

	show_file_contents(Lines) ->
	    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).

	% My functions
	% 1. Call get_file_contents to read the file contents into a list of lines.
	% 2. Call get_all_lines_index to build a list of all words together with the line number each one occurs on.
	% 3. Call sort_words_index to sort that list by word and line number.
	% 4. Finally call build_words_occurs to collapse each word's line numbers into a list of range tuples.

	% Build the word index for the text file Filename.
	% Pipeline: read the lines, collect {Word, LineNumber} pairs, sort them,
	% group each word's line numbers into ranges, and finally reverse the
	% grouped list, because build_words_occurs/1 accumulates its entries by
	% prepending.
	get_index(Filename) ->
	    Lines = get_file_contents(Filename),
	    Sorted = sort_words_index(get_all_lines_index(Lines)),
	    Grouped = build_words_occurs(Sorted),
	    lists:reverse(Grouped).

	% Collapse a list of {Word, LineNumber} pairs into one {Word, Ranges}
	% entry per run of equal words, where Ranges is what build_range/1 makes
	% of that word's distinct consecutive line numbers.
	% The input is expected to be grouped by word already (get_index/1 sorts
	% it first). Entries are accumulated by prepending, so the result lists
	% the words in the reverse of their input order.
	% An empty input yields an empty list.
	build_words_occurs([]) ->
	    [];
	build_words_occurs([{Word, LineNumber} | Xs]) ->
	    % Seed the scan with the first pair instead of a dummy "" word, so no
	    % placeholder entry can leak into the result.
	    build_words_occurs(Xs, [], {Word, LineNumber}, [LineNumber]).

	% build_words_occurs(Pairs, Results, {PreWord, PreLineNumber}, CurrentIndexList)
	%   Results          - finished {Word, Ranges} entries, newest first
	%   PreWord          - the word whose line numbers are being collected
	%   PreLineNumber    - line number of the pair handled last
	%   CurrentIndexList - PreWord's line numbers so far, newest first
	build_words_occurs([], Results, {PreWord, _PreLineNumber}, CurrentIndexList) ->
	    [{PreWord, build_range(CurrentIndexList)} | Results];
	build_words_occurs([{PreWord, PreLineNumber} | Xs], Results, {PreWord, PreLineNumber}, CurrentIndexList) ->
	    % Same word on the same line again: nothing new to record.
	    build_words_occurs(Xs, Results, {PreWord, PreLineNumber}, CurrentIndexList);
	build_words_occurs([{PreWord, LineNumber} | Xs], Results, {PreWord, _PreLineNumber}, CurrentIndexList) ->
	    % Same word on another line: remember that line.
	    build_words_occurs(Xs, Results, {PreWord, LineNumber}, [LineNumber | CurrentIndexList]);
	build_words_occurs([{NewWord, LineNumber} | Xs], Results, {PreWord, _PreLineNumber}, CurrentIndexList) ->
	    % A different word: close the previous word's entry and start afresh.
	    NewResults = [{PreWord, build_range(CurrentIndexList)} | Results],
	    build_words_occurs(Xs, NewResults, {NewWord, LineNumber}, [LineNumber]).

	% Turn a list of line numbers into a list of {First, Last} ranges.
	% Xs is reversed before grouping: build_words_occurs collects the line
	% numbers newest-first, while group_by_range/1 extends a range only when
	% the next number is one more than the range's end.
	build_range(Xs) ->
	    Reversed = lists:reverse(Xs),
	    group_by_range(Reversed).

	% This was the trickiest part for me; the approach is adapted from a
	% solution found online.
	% Groups a list of numbers into contiguous {First, Last} ranges.
	% A number extends the current range only when it is exactly one more
	% than that range's end; any other number opens a new {N, N} range.
	% Ranges are returned in the order in which they were opened.
	group_by_range([]) ->
	    [];
	group_by_range([First | Rest]) ->
	    % The accumulator keeps the range being extended at its head.
	    Step = fun(N, [{Lo, Hi} | Done]) when Hi + 1 == N -> [{Lo, N} | Done];
	              (N, Ranges) -> [{N, N} | Ranges]
	           end,
	    lists:reverse(lists:foldl(Step, [{First, First}], Rest)).

	% Sort the {Word, LineNumber} pairs by word and, for equal words, by
	% line number.
	% The standard term order compares tuples of equal size element by
	% element, which is exactly this ordering. A hand-written comparison such
	% as `Word1 < Word2 orelse Line1 < Line2` is NOT a valid ordering: it
	% ranks {"banana", 1} before {"apple", 2}, so occurrences of one word can
	% end up separated in the sorted list.
	sort_words_index(Xs) ->
	    lists:sort(Xs).
	% Collect the {Word, LineNumber} pairs of every line in Xs.
	% Lines are numbered from 1. The pairs of each line are put in front of
	% those of the lines before it, so the result is not in reading order;
	% get_index/1 sorts it afterwards.
	get_all_lines_index(Xs) ->
	    Collect =
	        fun(Line, {Found, LineNumber}) ->
	            LineWords = get_string_words_index(Line, [], "", LineNumber),
	            {LineWords ++ Found, LineNumber + 1}
	        end,
	    {WordsIndex, _NextLineNumber} = lists:foldl(Collect, {[], 1}, Xs),
	    WordsIndex.

	% Scan one line of text and prepend a {Word, LineNumber} pair to
	% WordsIndex for every word longer than three letters.
	% A word is a maximal run of ASCII letters; upper-case letters are folded
	% to lower case and any other character ends the word in progress.
	% CurrentWord holds the letters of the word in progress in reverse
	% order; pass "" to start a fresh scan.
	get_string_words_index([], WordsIndex, CurrentWord, LineNumber) ->
	    keep_long_word(CurrentWord, LineNumber, WordsIndex);
	get_string_words_index([X | Xs], WordsIndex, CurrentWord, LineNumber) when X >= $A, X =< $Z ->
	    % $a - $A is the distance between the upper- and lower-case alphabets.
	    get_string_words_index(Xs, WordsIndex, [X + ($a - $A) | CurrentWord], LineNumber);
	get_string_words_index([X | Xs], WordsIndex, CurrentWord, LineNumber) when X >= $a, X =< $z ->
	    get_string_words_index(Xs, WordsIndex, [X | CurrentWord], LineNumber);
	get_string_words_index([_Separator | Xs], WordsIndex, CurrentWord, LineNumber) ->
	    Flushed = keep_long_word(CurrentWord, LineNumber, WordsIndex),
	    get_string_words_index(Xs, Flushed, "", LineNumber).

	% Prepend the finished word (given reversed) when it has more than three
	% letters; shorter words, including the empty one, are dropped.
	keep_long_word(RevWord, LineNumber, WordsIndex) when length(RevWord) > 3 ->
	    [{lists:reverse(RevWord), LineNumber} | WordsIndex];
	keep_long_word(_ShortWord, _LineNumber, WordsIndex) ->
	    WordsIndex.