Created
March 3, 2017 14:14
-
-
Save ekalinin/183ba30657f816295232edda33fdbe5c to your computer and use it in GitHub Desktop.
indexing a file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-compile([export_all]). | |
% -export([get_file_contents/1,show_file_contents/1]). | |
% Used to read a file into a list of lines. | |
% Example files available in: | |
% gettysburg-address.txt (short) | |
% dickens-christmas.txt (long) | |
% Read the text file Name and return its lines as a list of strings,
% in the same order as they appear in the file.
% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    % get_all_lines/2 returns the lines newest-first, so flip them back.
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents.
% Reads File line by line, pushing each line (without its trailing
% newline) onto Partial, and returns the accumulated list in reverse
% file order once eof is reached. The device is closed before returning.
%
% A read error closes the device and raises {read_failed, Reason}
% instead of failing later with an unrelated badarg.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            file:close(File),
            error({read_failed, Reason});
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

% Drop a single trailing newline if there is one. The last line of a
% file may have none, in which case the line is returned untouched.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | RevRest] -> lists:reverse(RevRest);
        _ -> Line
    end.
% Print every string of the given list on its own line and return ok.
% Handy for eyeballing the result of get_file_contents.
show_file_contents(Lines) ->
    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
%%% | |
%%% Solution | |
%%% | |
% Build the word index of the file Filename.
% Returns the list produced by index_lines/1: tuples of
%   * a word
%   * the list of row ranges in which the word occurs
index_file(Filename) ->
    Lines = get_file_contents(Filename),
    index_lines(Lines).
% Index a list of lines and return the index as a sorted list of
% {Word, Positions} tuples.
index_lines(Lines) ->
    % The fold state is {NextRowNumber, IndexDict}; rows are numbered from 1.
    Start = {1, dict:new()},
    Step = fun(Line, State) -> index_line(Line, State) end,
    {_NextRow, Index} = lists:foldl(Step, Start, Lines),
    Entries = dict:to_list(Index),
    lists:sort(Entries).
% Index a single line.
% Takes the line and the state {RowNumber, Index}, feeds every
% space-separated token through proccess_word/2 and returns the state
% for the following row: {RowNumber + 1, UpdatedIndex}.
index_line(Line, {RowNumber, _} = State) ->
    Tokens = string:tokens(Line, " "),
    {_, Updated} = lists:foldl(fun proccess_word/2, State, Tokens),
    {RowNumber + 1, Updated}.
% Normalise Word with clear_word/1 and add it to the index held in Acc
% when need_to_proccess/1 accepts it; otherwise Acc is returned as is.
proccess_word(Word, Acc) ->
    Cleaned = clear_word(Word),
    Wanted = need_to_proccess(Cleaned),
    case Wanted of
        false -> Acc;
        true -> index_word(Cleaned, Acc)
    end.
% Normalise a token before indexing: lower-case it and remove
% backslashes, quotes, brackets and sentence punctuation, so that
% "Dead," and "dead" end up under the same index key.
% Hyphens are kept, so "battle-field" stays a single word.
clear_word(Word) ->
    re:replace(string:to_lower(Word),
               "[\\\\'\",.;:!?()\\[\\]{}]", "", [global, {return, list}]).
% Words that are too common to be worth indexing (each listed once).
common_words() ->
    ["the", "not", "and", "for", "with", "you", "this", "but", "from", "are",
     "any"].
% A word is indexed only when it has at least three characters and is
% not one of common_words/0.
need_to_proccess(Word) ->
    LongEnough = length(Word) >= 3,
    LongEnough andalso not lists:member(Word, common_words()).
% Record an occurrence of Word on row RowNumber in Index.
% A word seen for the first time gets the single range
% {RowNumber, RowNumber}; for a known word the existing ranges are
% combined with that range by update_word_positions/2.
% Returns {RowNumber, UpdatedIndex}.
index_word(Word, {RowNumber, Index}) ->
    Here = {RowNumber, RowNumber},
    Merge = fun(Existing) -> update_word_positions(Existing, Here) end,
    {RowNumber, dict:update(Word, Merge, [Here], Index)}.
% | |
% Add the range CurrentPos = {From, To} to Positions, a list of
% {From, To} row ranges.
% If the last stored range ends on the row where CurrentPos starts, or
% on the row just before it, that last range is extended to CurrentPos's
% end; otherwise CurrentPos is appended as a new range.
% An empty Positions list yields [CurrentPos].
update_word_positions([], CurrentPos) ->
    [CurrentPos];
update_word_positions(Positions, {CurrPosFrom, CurrPosTo} = CurrentPos) ->
    % Reverse once to reach the last range, then rebuild the list with
    % lists:reverse/2 instead of walking it with length/split/++.
    [{LastPosFrom, LastPosTo} | RevPrev] = lists:reverse(Positions),
    Adjacent = LastPosTo =:= CurrPosFrom orelse LastPosTo =:= CurrPosFrom - 1,
    case Adjacent of
        true ->
            lists:reverse(RevPrev, [{LastPosFrom, CurrPosTo}]);
        false ->
            lists:reverse(RevPrev, [{LastPosFrom, LastPosTo}, CurrentPos])
    end.
%%% | |
%%% Tests | |
%%% | |
% Consecutive rows holding the same word (case-insensitively) collapse
% into one range; words shorter than three characters are left out.
index_lines_test() ->
    Expected = [{"aaa", [{1, 2}]}, {"bbb", [{3, 3}]}],
    Expected = index:index_lines(["aaa", "AAA", "bbb", "cc"]).
% Indexing one line records every word on that row and advances the
% row counter. The dict contents are compared through dict:to_list/1
% rather than by matching two dict terms, because dict is an opaque
% type whose internal layout is not part of its contract.
index_line_test() ->
    {2, Index} = index:index_line("aaa bbb", {1, dict:new()}),
    [{"aaa", [{1, 1}]}, {"bbb", [{1, 1}]}] = lists:sort(dict:to_list(Index)).
% A word seen for the first time gets a single one-row range.
% The dict is inspected via dict:to_list/1 so the test does not depend
% on the internal representation of the opaque dict type.
index_word_1_test() ->
    {1, Index} = index:index_word("aaa", {1, dict:new()}),
    [{"aaa", [{1, 1}]}] = dict:to_list(Index).
% A word already indexed on the previous row has its range extended.
% The dict is inspected via dict:to_list/1 so the test does not depend
% on the internal representation of the opaque dict type.
index_word_2_test() ->
    Before = dict:store("aaa", [{1, 1}], dict:new()),
    {2, After} = index:index_word("aaa", {2, Before}),
    [{"aaa", [{1, 2}]}] = dict:to_list(After).
% A row that is not adjacent to the last range starts a new range.
update_word_pos_1_test() ->
    Result = update_word_positions([{1, 1}], {3, 3}),
    [{1, 1}, {3, 3}] = Result.
% A row directly after the last range extends that range.
update_word_pos_2_test() ->
    Result = update_word_positions([{1, 1}], {2, 2}),
    [{1, 2}] = Result.
% A row already covered by the end of the last range changes nothing.
update_word_pos_3_test() ->
    Result = update_word_positions([{1, 2}], {2, 2}),
    [{1, 2}] = Result.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
➥ erl | |
Erlang/OTP 18 [erts-7.3.1.2] [source] [64-bit] [smp:4:4] [async-threads:10] [kernel-poll:false] | |
Eshell V7.3.1.2 (abort with ^G) | |
1> c(index). | |
{ok,index} | |
2> eunit:test(index). | |
All 7 tests passed. | |
ok | |
3> | |
3> index:index_file("gettysburg-address.txt"). | |
[{"above",[{16,16}]}, | |
{"add",[{16,16}]}, | |
{"advanced.",[{20,20}]}, | |
{"ago",[{1,1}]}, | |
{"all",[{3,3}]}, | |
{"altogether",[{10,10}]}, | |
{"battle-field",[{7,7}]}, | |
{"before",[{22,22}]}, | |
{"birth",[{26,26}]}, | |
{"brave",[{15,15}]}, | |
{"brought",[{1,1}]}, | |
{"but,",[{13,13}]}, | |
{"can",[{7,7},{13,14},{18,18}]}, | |
{"cause",[{23,23}]}, | |
{"civil",[{5,5}]}, | |
{"come",[{8,8}]}, | |
{"conceived",[{2,2},{6,6}]}, | |
{"consecrate",[{14,14}]}, | |
{"consecrated",[{16,16}]}, | |
{"continent,",[{2,2}]}, | |
{"created",[{3,3}]}, | |
{"dead",[{22,22},{25,25}]}, | |
{"dead,",[{15,15}]}, | |
{"dedicate",[{8,8},{13,...}]}, | |
{"dedicated",[{3,...},{...}|...]}, | |
{"dedicated,",[{...}]}, | |
{"detract.",[...]}, | |
{[...],...}, | |
{...}|...] | |
4> q(). | |
ok |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment