Created
April 6, 2017 12:55
-
-
Save chetkhatri/055ea4d97c89bd686f4ec2a050f3f27d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -module(index). | |
| -export([ | |
| lines_from_contents/1, | |
| get_file_contents/1, | |
| show_file_contents/1, | |
| basic_index_from_lines/1, | |
| intermediate_index/1, | |
| final_index/1]). | |
| % C = index:get_file_contents("gettysburg-address.txt"). | |
| % L = index:lines_from_contents(C). | |
| % B = index:basic_index_from_lines(L). | |
| % I = index:intermediate_index(B). | |
| % F = index:final_index(I). | |
| % Used to read a file into a list of lines. | |
| % Example files available in: | |
| % gettysburg-address.txt (short) | |
| % dickens-christmas.txt (long) | |
% Read the text file called Name and return its lines in file order.
% The open is asserted with a match, so a file that cannot be opened
% crashes with badmatch. Reading, line-end handling and closing the
% file are delegated to get_all_lines/2, which hands the lines back in
% reverse order; they are flipped here.
% result: [line1, line2, ...]
% where each line is a string, a list of characters
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents. Not exported.
% Reads File line by line until end of file, closes it, and returns the
% lines in reverse order (last line read first), prepended to Partial.
% Only a terminating newline is stripped from each line, so a final
% line that has no newline keeps its last character.
% A read error closes File and raises error({read_error, Reason}).
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            % close before raising so the open file is not leaked
            file:close(File),
            error({read_error, Reason});
        Line ->
            get_all_lines(File, [strip_trailing_newline(Line) | Partial])
    end.

% Remove one trailing newline character from Line, if there is one.
strip_trailing_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | ReversedRest] -> lists:reverse(ReversedRest);
        _ -> Line
    end.
% Print each string of the given list on its own line of standard
% output. Handy for checking the result of get_file_contents/1.
% Returns ok.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
% Number the lines of Contents from 1 and split each line into words.
% Words are the tokens found between spaces, commas and full stops;
% the separators themselves are dropped, so a blank line yields [].
% result: [{line#, [word1, word2, ...]}, {line#, [word1, ...]}, ...]
lines_from_contents(Contents) ->
    LineNumbers = lists:seq(1, length(Contents)),
    [{LineNo, string:tokens(Text, ", .")}
     || {LineNo, Text} <- lists:zip(LineNumbers, Contents)].
% Turn the per-line word lists produced by lines_from_contents/1 into
% one flat list of {word, line#} pairs.
% basic_index/2 collects the pairs of each line in front of the pairs
% of the lines before it, so the result lists the last line first;
% within a line the words keep their order. Words are taken as they
% are: no further punctuation is stripped here.
% result: [{word1, line#}, {word2, line#}, ...]
basic_index_from_lines(Lines) ->
    PairsPerLine = basic_index(Lines, []),
    lists:flatten(PairsPerLine).
% Build one list of {word, line#} pairs per line and push each onto
% Acc, so the pairs of the last line end up at the front.
basic_index(Lines, Acc) ->
    lists:foldl(
        fun({LineNo, Words}, Collected) ->
            [make_word_line_pairs(LineNo, Words) | Collected]
        end,
        Acc,
        Lines
    ).
% Pair every word of one line with that line's number, word first.
% result: [{word1, Line}, {word2, Line}, ...]
make_word_line_pairs(Line, Words) ->
    [{Word, Line} || Word <- Words].
% Group a flat list of {word, line#} pairs by word.
% Works off the result of basic_index_from_lines/1. Each new line
% number is put in front of the ones already stored for its word, so
% pairs that arrive last-line-first give ascending line lists.
% A word that occurs twice on a line keeps that line number twice.
% result: [{word1, [line1, line2, ...]}, {word2, [line1, ...]}, ...]
intermediate_index(BasicIndex) ->
    EmptyIndex = [],
    build_int_index(BasicIndex, EmptyIndex).
% Fold the {word, line#} pairs into Acc, a list of {word, [lines]}.
% A word already present gets the new line number put in front of its
% list; a word not seen before is added at the front of the index with
% a one-element list.
build_int_index(Pairs, Acc) ->
    AddPair =
        fun({Word, Line}, Index) ->
            case contains(Word, Index) of
                false -> [{Word, [Line]} | Index];
                {Word, _Lines} -> update(Index, Word, Line)
            end
        end,
    lists:foldl(AddPair, Acc, Pairs).
% Look Word up in a list of {word, [lines]} entries.
% Evaluates to the first matching {word, [lines]} entry,
% or to false when no entry matches.
contains(_Word, []) ->
    false;
contains(Word, [Entry | Rest]) ->
    case Entry of
        {Word, Lines} -> {Word, Lines};
        _ -> contains(Word, Rest)
    end.
% Find the first entry for Word in the index and put Line in front of
% that entry's list of lines; all other entries are kept as they are.
% The word must exist in the index: if it does not, no clause matches
% the exhausted list and the call fails.
update([Entry | Rest], Word, Line) ->
    case Entry of
        {Word, Lines} -> [{Word, [Line | Lines]} | Rest];
        _ -> [Entry | update(Rest, Word, Line)]
    end.
% Final step: condense each word's line numbers into ranges of
% {start_row_number, end_row_number}.
% Works off the result of intermediate_index/1, a list of
% {word, [line1, line2, ...]} entries. lists:usort/1 sorts each list
% of lines and removes duplicates, which gives condense/1 the
% ascending, duplicate-free input it expects without relying on a
% helper module outside this file.
% result: [{word1, [{start1, end1}, {start2, end2}, ...]}, ...]
final_index(Intermediate) ->
    lists:map(
        fun({Word, Lines}) -> {Word, condense(lists:usort(Lines))} end,
        Intermediate
    ).
% Compress a list of line numbers into {first, last} ranges, one per
% run of consecutive numbers; a lone number N becomes {N, N}.
% The line numbers are expected in ascending order, without
% duplicates.
condense([]) ->
    [];
condense([Start | _] = Lines) ->
    IsNextLine = fun(Current, Next) -> Next == Current + 1 end,
    % dropwhile/2 stops on the last number of the leading run
    [End | Remaining] = dropwhile(IsNextLine, Lines),
    [{Start, End} | condense(Remaining)].
% Drop elements from the head of the list for as long as
% Pred(Head, NextHead) holds, i.e. the predicate sees each element
% together with the one that follows it. The element at which the
% predicate first fails, or the last element of the list, stays at the
% head of the result. An empty list gives an empty list.
% Pred is a fun of two arguments.
% tweak from github:otp/lib/stdlib/src/lists.erl
dropwhile(Pred, [Hd | [NextHd | _Tail] = Others] = Rest) ->
    case Pred(Hd, NextHd) of
        true -> dropwhile(Pred, Others);
        false -> Rest
    end;
dropwhile(_Pred, [_Hd] = Rest) ->
    Rest;
% the guard checks for arity 2, the arity this function calls Pred with
dropwhile(Pred, []) when is_function(Pred, 2) ->
    [].
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment