Last active
May 22, 2020 06:23
-
-
Save yuanjs/047dc205a29094fb39faf25a483221f1 to your computer and use it in GitHub Desktop.
erlang indexing a file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-export([get_file_contents/1,show_file_contents/1, | |
get_string_words_index/4,get_all_lines_index/1,build_range/1, | |
build_words_occurs/1,sort_words_index/1, get_index/1]). | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Read the text file Name and return its lines as a list of strings,
% first line first.
% Reading, line-terminator handling and closing the file are delegated to
% get_all_lines/2, which accumulates lines in reverse; hence the final
% lists:reverse/1. Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents/1. Not exported.
% Reads File line by line until eof, prepending each line (without its line
% terminator) to Partial, so the returned list is in reverse file order.
% The file is closed once eof is reached.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        Line when is_list(Line) ->
            % string:chomp/1 removes a trailing "\n" or "\r\n" only when one
            % is present, so a final line that lacks a newline keeps its
            % last character instead of being truncated.
            get_all_lines(File, [string:chomp(Line) | Partial])
    end.
% Print every string in the given list on its own line, in list order.
% Useful for checking the result of get_file_contents/1. Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
% My functions | |
% 1. Call get_file_contents to read the file into a list of lines.
% 2. Call get_all_lines_index to build a list of every word paired with the line number it occurs on.
% 3. Call sort_words_index to sort that list by word and then by line number.
% 4. Finally, call build_words_occurs to collapse each word's line numbers into a list of range tuples.
% Build the complete word index for the text file Filename.
% Pipeline: read the lines, collect {Word, LineNumber} pairs, sort them,
% then collapse them into {Word, Ranges} entries. build_words_occurs/1
% accumulates its entries by prepending, so the result is reversed once
% more to restore the sorted order.
get_index(Filename) ->
    Lines = get_file_contents(Filename),
    Sorted = sort_words_index(get_all_lines_index(Lines)),
    Entries = build_words_occurs(Sorted),
    lists:reverse(Entries).
% Collapse a list of {Word, LineNumber} pairs, sorted by word and then by
% line number, into one {Word, Ranges} entry per run of equal words.
% Ranges is whatever build_range/1 returns for that word's line numbers.
% A pair identical to the one just before it is ignored, so a word that
% occurs several times on one line contributes that line only once.
% Entries are accumulated by prepending, so the result lists the last word
% of Xs first. An empty input yields an empty index.
build_words_occurs([]) ->
    [];
build_words_occurs([{Word, Line} | Rest]) ->
    % Seed the traversal with the first pair instead of a {"", 0} sentinel,
    % so no placeholder entry can leak into the result.
    build_words_occurs(Rest, [], {Word, Line}, [Line]).

% build_words_occurs(Remaining, Results, {CurrentWord, LastLine}, Lines)
% Lines holds the line numbers seen so far for CurrentWord, newest first.
build_words_occurs([], Results, {Word, _LastLine}, Lines) ->
    % End of input: flush the word still being collected.
    [{Word, build_range(Lines)} | Results];
build_words_occurs([{Word, Line} | Rest], Results, {Word, Line} = Current, Lines) ->
    % Same word on the same line as the previous pair: skip the duplicate.
    build_words_occurs(Rest, Results, Current, Lines);
build_words_occurs([{Word, Line} | Rest], Results, {Word, _LastLine}, Lines) ->
    % Same word on a new line: remember the line.
    build_words_occurs(Rest, Results, {Word, Line}, [Line | Lines]);
build_words_occurs([{NewWord, Line} | Rest], Results, {Word, _LastLine}, Lines) ->
    % A different word starts: flush the finished word and start collecting anew.
    Flushed = [{Word, build_range(Lines)} | Results],
    build_words_occurs(Rest, Flushed, {NewWord, Line}, [Line]).
% Convert a list of line numbers into a list of {From, To} ranges.
% Xs is reversed before grouping, so a list collected newest-first
% (descending) is grouped in ascending order by group_by_range/1.
build_range(Xs) ->
    Ascending = lists:reverse(Xs),
    group_by_range(Ascending).
% This was the trickiest part for me; the approach is adapted from a
% solution found online.
% Group a list of numbers into contiguous ranges: each run in which every
% element is exactly one greater than its predecessor becomes a single
% {First, Last} tuple, and any other element starts a new range.
% Ranges are returned in the order their first elements appear in the input.
group_by_range([]) ->
    [];
group_by_range([First | Rest]) ->
    Step =
        fun
            (N, {{From, To}, Closed}) when To + 1 == N ->
                % N extends the range currently being built.
                {{From, N}, Closed};
            (N, {Open, Closed}) ->
                % Gap (or repeat): close the open range and start a new one at N.
                {{N, N}, [Open | Closed]}
        end,
    {LastOpen, ClosedRanges} = lists:foldl(Step, {{First, First}, []}, Rest),
    lists:reverse([LastOpen | ClosedRanges]).
% Sort a list of {Word, LineNumber} pairs by word, and by line number among
% pairs with the same word.
% The ordering fun handed to lists:sort/2 must return true exactly when its
% first argument is less than or equal to the second. Comparing the pairs as
% tuples does this lexicographically: words first, line numbers only as a
% tie-breaker. Comparing the line numbers regardless of the words would
% leave equal words scattered across the result.
sort_words_index(Xs) ->
    lists:sort(
        fun({Word1, LineNumber1}, {Word2, LineNumber2}) ->
            {Word1, LineNumber1} =< {Word2, LineNumber2}
        end,
        Xs
    ).
% Build the unsorted word index for a list of lines.
% Lines are numbered from 1. Each line is scanned by
% get_string_words_index/4 and its {Word, LineNumber} pairs are placed in
% front of those from earlier lines, so the result lists later lines first.
get_all_lines_index(Xs) ->
    Collect =
        fun(Line, {Index, LineNumber}) ->
            LineWords = get_string_words_index(Line, [], "", LineNumber),
            {LineWords ++ Index, LineNumber + 1}
        end,
    {WordsIndex, _NextLineNumber} = lists:foldl(Collect, {[], 1}, Xs),
    WordsIndex.
% Scan a string and collect the words it contains.
% get_string_words_index(Chars, WordsIndex, CurrentWord, LineNumber)
%   Chars       - the characters still to scan
%   WordsIndex  - accumulator of {Word, LineNumber} pairs, newest first
%   CurrentWord - letters of the word being read, in reverse order
%   LineNumber  - line number stored with every word found
% A word is a maximal run of ASCII letters; upper-case letters are folded to
% lower case, and any other character ends the current word. Only words
% longer than three letters are recorded.
get_string_words_index([], WordsIndex, CurrentWord, LineNumber) ->
    keep_long_word(CurrentWord, WordsIndex, LineNumber);
get_string_words_index([C | Rest], WordsIndex, CurrentWord, LineNumber) when C >= $A, C =< $Z ->
    % Upper-case letter: store its lower-case counterpart.
    get_string_words_index(Rest, WordsIndex, [C + 32 | CurrentWord], LineNumber);
get_string_words_index([C | Rest], WordsIndex, CurrentWord, LineNumber) when C >= $a, C =< $z ->
    get_string_words_index(Rest, WordsIndex, [C | CurrentWord], LineNumber);
get_string_words_index([_Separator | Rest], WordsIndex, CurrentWord, LineNumber) ->
    % Non-letter: the current word (if any) is complete.
    Updated = keep_long_word(CurrentWord, WordsIndex, LineNumber),
    get_string_words_index(Rest, Updated, "", LineNumber).

% Prepend the finished word to WordsIndex when it has more than three
% letters; otherwise return WordsIndex unchanged. ReversedWord holds the
% letters in reverse order.
keep_long_word([_, _, _, _ | _] = ReversedWord, WordsIndex, LineNumber) ->
    [{lists:reverse(ReversedWord), LineNumber} | WordsIndex];
keep_long_word(_TooShort, WordsIndex, _LineNumber) ->
    WordsIndex.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment