Last active
April 28, 2021 19:18
-
-
Save ceolinrenato/ff40b85aa60e9d560741f96034cae773 to your computer and use it in GitHub Desktop.
n_grams.exs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Renato Ceolin - 10/28/2020
# Built with Elixir 1.11
# To run the code, use the command `elixir n_grams.exs`
# Tests will run after the script finishes executing
defmodule NGrams do
  @moduledoc """
  N-grams are contiguous sequences of words in a sentence.
  """

  @doc """
  Returns a list containing all the n-grams that can be generated from `sentence`.

  Punctuation is deleted (not replaced by a space, so `"don't"` becomes
  `"dont"`) and the remaining text is split on whitespace. N-grams are returned
  from the longest to the shortest; n-grams of the same size keep the
  left-to-right order in which they appear in the sentence.

  A sentence with no words (empty, whitespace-only or punctuation-only) yields
  an empty list.

  ## Examples

      iex> NGrams.from_sentence("Show me the code.")
      ["Show me the code", "Show me the", "me the code", "Show me", "me the", "the code", "Show", "me", "the", "code"]

      iex> NGrams.from_sentence("¡Hola, mundo!")
      ["Hola mundo", "Hola", "mundo"]

      iex> NGrams.from_sentence("")
      []

  """
  @spec from_sentence(String.t()) :: [String.t()]
  def from_sentence(sentence) do
    # The `u` modifier makes `[[:punct:]]` match Unicode punctuation as well
    # (e.g. "¡", "…", "—"); without it only ASCII punctuation is stripped.
    ~r/[[:punct:]]/u
    |> Regex.replace(sentence, "")
    |> String.split()
    |> build_ngrams()
  end

  # Builds every n-gram of `words`, from the largest window size down to 1.
  defp build_ngrams([]), do: []

  defp build_ngrams(words) do
    # `1..length(words)` is always ascending here because the empty list is
    # handled by the clause above; reversing it yields the longest n-grams first.
    1..length(words)
    |> Enum.reverse()
    |> Enum.flat_map(&ngrams_of_size(words, &1))
  end

  # Slides a window of `size` words over `words` one word at a time and joins
  # each window with a single space. `:discard` drops trailing partial windows.
  defp ngrams_of_size(words, size) do
    words
    |> Enum.chunk_every(size, 1, :discard)
    |> Enum.map(&Enum.join(&1, " "))
  end
end
# Script entry point: prompt for a sentence and print its n-grams, one per line.
#
# `IO.gets/1` returns `:eof` or `{:error, reason}` instead of a binary when
# stdin is closed or unreadable. Matching on the result keeps the script from
# crashing in `String.trim/1` in that case, so the tests below still run.
case IO.gets("Enter a sentence: ") do
  sentence when is_binary(sentence) ->
    IO.puts("N-grams for the sentence `#{String.trim(sentence)}`:")

    sentence
    |> NGrams.from_sentence()
    |> Enum.each(&IO.puts/1)

  _eof_or_error ->
    IO.puts(:stderr, "No sentence could be read from standard input.")
end

# Start ExUnit; the test module defined below runs when the script exits.
ExUnit.start()
defmodule NGramsTest do
  @moduledoc false

  use ExUnit.Case, async: true

  describe "from_sentence/1" do
    # Punctuation must not leak into the generated n-grams.
    test "must remove punctuation" do
      expected = ["Hello World", "Hello", "World"]

      assert NGrams.from_sentence("Hello... World!") == expected
    end

    # Newlines, carriage returns and tabs count as word separators only.
    test "must ignores ascii control sequences" do
      expected = ["Hello World", "Hello", "World"]

      assert NGrams.from_sentence("Hello... \n World! \r\t\n") == expected
    end

    # A repeated word ("ipsum") must be kept in every position where it occurs.
    test "must work with 5 words sequence" do
      five_grams = ["Lorem ipsum dolor amet ipsum"]
      four_grams = ["Lorem ipsum dolor amet", "ipsum dolor amet ipsum"]
      three_grams = ["Lorem ipsum dolor", "ipsum dolor amet", "dolor amet ipsum"]
      two_grams = ["Lorem ipsum", "ipsum dolor", "dolor amet", "amet ipsum"]
      one_grams = ["Lorem", "ipsum", "dolor", "amet", "ipsum"]

      expected = Enum.concat([five_grams, four_grams, three_grams, two_grams, one_grams])

      assert NGrams.from_sentence("Lorem ipsum dolor amet ipsum") == expected
    end

    # Longest n-grams come first; equal sizes keep sentence order.
    test "must work with the challenge spec example" do
      expected = [
        "Show me the code",
        "Show me the",
        "me the code",
        "Show me",
        "me the",
        "the code",
        "Show",
        "me",
        "the",
        "code"
      ]

      assert NGrams.from_sentence("Show me the code.") == expected
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment