jalberto · August 29, 2015 14:06
diff --git a/ngram.rb b/ngram.rb
 #
 # Extract uni, bi, and trigrams.
 #
 # "one two three".ngram(1) # unigrams
 # "one two three".ngram(2) # bigrams
 # "one two three".ngram(3) # trigrams
 #
 module Ngram
   # Pattern for a single token: a run of word characters, apostrophes
   # (straight or typographic), hyphens, and the pound/euro signs.
   TOKENIZER_REGEX = /[[:word:]'’\-\£\€]+/

   # Split the receiver into ngrams of one fixed size.
   #
   # @param n [Integer] ngram size (number of consecutive tokens)
   # @param join [Boolean] when true each ngram is returned as a single
   #   space-separated String; when false, as an Array of tokens
   # @return [Array<String>, Array<Array<String>>] ngrams in text order;
   #   empty when the text has fewer than +n+ tokens
   def ngram(n = 1, join: true)
     grams = ngram_to_a(n)
     # A distinct block parameter name avoids shadowing the +n+ argument.
     join ? grams.map { |tokens| tokens.join(' ') } : grams
   end

   # Build every window of +n+ consecutive tokens.
   #
   # @param n [Integer] window size
   # @return [Array<Array<String>>] token windows in text order
   def ngram_to_a(n = 1)
     ngram_tokenizer.each_cons(n).to_a
   end

   # Call #ngram for every size in +range+ and aggregate the results.
   #
   # @param range [Range, Integer] ngram sizes to extract; a bare Integer
   #   is treated as a single size and delegated straight to #ngram
   # @param join [Boolean] join each ngram into a String (see #ngram)
   # @param flatten [Boolean] flatten the per-size arrays into one
   #   1-dimensional array (with +join: false+ this flattens down to the
   #   individual tokens)
   # @return [Array] ngrams array
   def ngrams(range = 1..3, join: true, flatten: true)
     # Integer instead of Fixnum: Fixnum was deprecated in Ruby 2.4 and
     # removed in 3.2, and Integer also matches on older Rubies.
     return ngram(range, join: join) if range.is_a?(Integer)

     res = range.map { |size| ngram(size, join: join) }

     # Non-bang flatten: flatten! returns nil when nothing was flattened
     # (e.g. an empty range), which would break the Array return contract.
     flatten ? res.flatten : res
   end

   # Downcase the receiver and split it into tokens.
   #
   # @return [Array<String>] lower-cased tokens matching TOKENIZER_REGEX
   def ngram_tokenizer
     downcase.scan(TOKENIZER_REGEX)
   end
 end

 # Mix Ngram into String so every string responds to #ngram and #ngrams.
 String.send(:include, Ngram)
	#
	# Extract uni, bi, and trigrams.
	#
	# "one two three".ngram(1) # unigrams
	# "one two three".ngram(2) # bigrams
	# "one two three".ngram(3) # trigrams
	#
	module Ngram
	  # Pattern for a single token: a run of word characters, apostrophes
	  # (straight or typographic), hyphens, and the pound/euro signs.
	  TOKENIZER_REGEX = /[[:word:]'’\-\£\€]+/

	  # Split the receiver into ngrams of one fixed size.
	  #
	  # @param n [Integer] ngram size (number of consecutive tokens)
	  # @param join [Boolean] when true each ngram is returned as a single
	  #   space-separated String; when false, as an Array of tokens
	  # @return [Array<String>, Array<Array<String>>] ngrams in text order;
	  #   empty when the text has fewer than +n+ tokens
	  def ngram(n = 1, join: true)
	    grams = ngram_to_a(n)
	    # A distinct block parameter name avoids shadowing the +n+ argument.
	    join ? grams.map { |tokens| tokens.join(' ') } : grams
	  end

	  # Build every window of +n+ consecutive tokens.
	  #
	  # @param n [Integer] window size
	  # @return [Array<Array<String>>] token windows in text order
	  def ngram_to_a(n = 1)
	    ngram_tokenizer.each_cons(n).to_a
	  end

	  # Call #ngram for every size in +range+ and aggregate the results.
	  #
	  # @param range [Range, Integer] ngram sizes to extract; a bare Integer
	  #   is treated as a single size and delegated straight to #ngram
	  # @param join [Boolean] join each ngram into a String (see #ngram)
	  # @param flatten [Boolean] flatten the per-size arrays into one
	  #   1-dimensional array (with +join: false+ this flattens down to the
	  #   individual tokens)
	  # @return [Array] ngrams array
	  def ngrams(range = 1..3, join: true, flatten: true)
	    # Integer instead of Fixnum: Fixnum was deprecated in Ruby 2.4 and
	    # removed in 3.2, and Integer also matches on older Rubies.
	    return ngram(range, join: join) if range.is_a?(Integer)

	    res = range.map { |size| ngram(size, join: join) }

	    # Non-bang flatten: flatten! returns nil when nothing was flattened
	    # (e.g. an empty range), which would break the Array return contract.
	    flatten ? res.flatten : res
	  end

	  # Downcase the receiver and split it into tokens.
	  #
	  # @return [Array<String>] lower-cased tokens matching TOKENIZER_REGEX
	  def ngram_tokenizer
	    downcase.scan(TOKENIZER_REGEX)
	  end
	end

	# Mix Ngram into String so every string responds to #ngram and #ngrams.
	String.send(:include, Ngram)