Created
September 30, 2016 13:48
-
-
Save seanbehan/7c058ab5610ecc048cc5fd9ab4ecf129 to your computer and use it in GitHub Desktop.
create ngrams in ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source: https://snippets.aktagon.com/snippets/584-generating-word-n-grams-with-ruby
require 'set' # Set is only loaded automatically from Ruby 3.2 onwards

# Word n-gram generation for string-like receivers: the host object must
# respond to +downcase+ and return something that supports +scan+.
module Ngram
  # Token pattern: runs of word characters. Ruby's +\w+ is ASCII-only
  # ([a-zA-Z0-9_]), so punctuation and non-ASCII letters act as separators.
  REGEX = /\w+/

  # Splits the receiver into lowercase word tokens.
  #
  # @return [Array<String>] the tokens, in order of appearance
  def ngram_tokenize
    downcase.scan(REGEX)
  end

  # Builds the set of distinct n-grams made of exactly +n+ consecutive words.
  #
  # @param n [Integer] number of words per gram
  # @return [Set<String>] grams with their words joined by a single space;
  #   empty when +n+ is below 1 or the receiver has fewer than +n+ tokens
  def ngram(n = 1)
    # A gram of fewer than one word is meaningless, and each_cons raises
    # ArgumentError for such sizes, so answer with an empty set instead.
    return Set.new if n < 1

    Set.new(ngram_tokenize.each_cons(n).map { |gram| gram.join(' ') })
  end

  # Builds the union of the n-gram sets for every size in +range+.
  #
  # @param range [Integer, #each] a single gram size, or a collection of
  #   sizes (a Range by default)
  # @return [Set<String>] all distinct grams of the requested sizes
  def ngrams(range = 1..3)
    # Integer replaces Fixnum, which no longer exists in current Rubies.
    return ngram(range) if range.is_a?(Integer)

    range.each_with_object(Set.new) { |n, grams| grams.merge(ngram(n)) }
  end
end
# Reopens the core String class so every string gains the Ngram methods
# (ngram_tokenize, ngram, ngrams).
class String
  include Ngram
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://snippets.aktagon.com/snippets/584-generating-word-n-grams-with-ruby