Skip to content

Instantly share code, notes, and snippets.

@chetkhatri
Created April 6, 2017 12:55
Show Gist options
  • Select an option

  • Save chetkhatri/055ea4d97c89bd686f4ec2a050f3f27d to your computer and use it in GitHub Desktop.

Select an option

Save chetkhatri/055ea4d97c89bd686f4ec2a050f3f27d to your computer and use it in GitHub Desktop.
-module(index).
-export([
lines_from_contents/1,
get_file_contents/1,
show_file_contents/1,
basic_index_from_lines/1,
intermediate_index/1,
final_index/1]).
% C = index:get_file_contents("gettysburg-address.txt").
% L = index:lines_from_contents(C).
% B = index:basic_index_from_lines(L).
% I = index:intermediate_index(B).
% F = index:final_index(I).
% Used to read a file into a list of lines.
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Get the contents of a text file into a list of lines.
% Each line has its trailing line terminator ("\n" or "\r\n") removed;
% a final line that has no terminator is returned intact.
% Name: path of the file to read.
% result: [line1, line2, ...] in file order,
% where each line is a string, a list of characters
% Crashes with badmatch if the file cannot be opened and with
% {read_failed, Reason} if reading fails part-way through.
get_file_contents(Name) ->
    {ok, File} = file:open(Name, [read]),
    % Close the handle on every exit path so a failed read never leaks it.
    try
        lists:reverse(get_all_lines(File, []))
    after
        file:close(File)
    end.

% Auxiliary function for get_file_contents.
% Not exported.
% Pushes the remaining lines of File onto Partial, most recent line first.
% The caller reverses the result and is responsible for closing File.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            Partial;
        {error, Reason} ->
            error({read_failed, Reason});
        Line ->
            % chomp/1 removes only a trailing "\n" or "\r\n", so an
            % unterminated last line keeps its final character.
            get_all_lines(File, [string:chomp(Line) | Partial])
    end.
% Print every string of a list on its own line of standard output.
% Handy for eyeballing the result of get_file_contents.
% result: ok
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Number the lines of a text and split each line into its words.
% Contents: list of strings, one string per line.
% Lines are numbered from 1. Words are cut with string:tokens/2 using
% space, comma and full stop as separators, so a blank line yields [].
% result: [{line#, [word1, word2, ...]}, {line#, [word1, ...]}, ...]
lines_from_contents(Contents) ->
    LineNumbers = lists:seq(1, length(Contents)),
    [
        {LineNo, string:tokens(Text, ", .")}
     || {LineNo, Text} <- lists:zip(LineNumbers, Contents)
    ].
% Turn the per-line word lists into one flat list of word/line pairs.
% Lines: [{line#, [word1, word2, ...]}, ...]
% Words are used exactly as given; nothing is stripped from them here.
% basic_index/2 pushes each line's pairs onto the front of its
% accumulator, so the pairs of the last line come first in the result.
% result: [{word1, line#}, {word2, line#}, ...]
basic_index_from_lines(Lines) ->
    PairsPerLine = basic_index(Lines, []),
    % One level of nesting only: a list (per line) of pair lists.
    lists:append(PairsPerLine).
% Build one list of {word, line#} pairs per line.
% Each line's pair list is pushed onto the front of Acc, so the
% sub-lists end up in reverse line order.
% result: [[{word, line#}, ...], ...]
basic_index(Lines, Acc) ->
    lists:foldl(
        fun({Line, Words}, Collected) ->
            [make_word_line_pairs(Line, Words) | Collected]
        end,
        Acc,
        Lines
    ).
% Pair every word of one line with that line's number, flipping the
% {line#, words} layout into {word, line#} entries.
% result: [{word1, Line}, {word2, Line}, ...] in the order of Words
make_word_line_pairs(Line, Words) ->
    [{Word, Line} || Word <- Words].
% Group a list of {word, line_number} pairs by word.
% Works off the result of basic_index_from_lines: every distinct word
% gets a single entry holding all the line numbers it was paired with
% (repeats included).
% result: [{word1, [line1, line2, ...]}, {word2, [line1, ...]}, ...]
intermediate_index(BasicIndex) ->
    EmptyIndex = [],
    build_int_index(BasicIndex, EmptyIndex).
% Fold the {word, line} pairs into the accumulated index Acc.
% A word seen for the first time gets a fresh entry at the front of the
% index; a word already present has the new line put at the front of
% its existing list of lines.
build_int_index(Pairs, Acc) ->
    lists:foldl(fun add_occurrence/2, Acc, Pairs).

% Record a single {word, line} occurrence in Index.
add_occurrence({Word, Line}, Index) ->
    case contains(Word, Index) of
        false -> [{Word, [Line]} | Index];
        {Word, _Lines} -> update(Index, Word, Line)
    end.
% Look a word up in a list of {word, [lines]} entries.
% result: the first matching {word, [lines]} entry,
%         or false when no entry carries the word
contains(Word, [{Word, _Lines} = Entry | _Rest]) ->
    Entry;
contains(Word, [_Other | Rest]) ->
    contains(Word, Rest);
contains(_Word, []) ->
    false.
% Locate the first entry for Word in the index and put Line at the
% front of that entry's list of lines; all other entries keep their
% position and content.
% The word must exist in the index, otherwise the call crashes.
update(Index, Word, Line) ->
    IsOtherEntry = fun
        ({Candidate, _}) -> Candidate =/= Word;
        (_) -> true
    end,
    {Before, [{Word, Lines} | After]} = lists:splitwith(IsOtherEntry, Index),
    Before ++ [{Word, [Line | Lines]} | After].
% final step to condense list of line numbers into pairs or ranges
% of {start_row_number, end_row_number} associated with each Word
% works off results from intermediate_index, whose entries are pairs
% of {word, [line1, line2, ...]}.
% Each list of lines is first sorted and stripped of duplicates with
% lists:usort/1 (standard library, so no helper module is required),
% which is the ascending, duplicate-free input condense/1 expects.
% result: [{word1, [{start1,end1}, {start2, end2}, ...]}, ...]
final_index(Intermediate) ->
    lists:map(
        fun({Word, Lines}) ->
            {Word, condense(lists:usort(Lines))}
        end,
        Intermediate
    ).
% Compress a list of line numbers into {first, last} ranges, merging
% each run of adjacent line numbers into a single range.
% Expects the line numbers in ascending order without duplicates.
% result: [{start1, end1}, {start2, end2}, ...]
condense([]) ->
    [];
condense([Only]) ->
    [{Only, Only}];
condense([First | _] = Lines) ->
    IsNextLine = fun(Prev, Next) -> Next == Prev + 1 end,
    % dropwhile/2 stops on the last line of the leading run.
    [Last | Remaining] = dropwhile(IsNextLine, Lines),
    [{First, Last} | condense(Remaining)].
% Drop elements from the head of a list for as long as Pred holds
% between the head and the element that follows it.
% Pred is a fun of two arguments: (Head, NextHead) -> boolean().
% The result starts at the first element for which Pred fails, so a
% single-element list comes back unchanged and [] yields [].
% tweak from github:otp/lib/stdlib/src/lists.erl
dropwhile(Pred, [Hd | [NextHd | _Tail] = Others] = Rest) ->
    case Pred(Hd, NextHd) of
        true -> dropwhile(Pred, Others);
        false -> Rest
    end;
dropwhile(_Pred, [_Hd] = Rest) ->
    Rest;
% Pred takes two arguments here, so the sanity check is on arity 2.
dropwhile(Pred, []) when is_function(Pred, 2) ->
    [].
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment