<?php

namespace NlpTools\Tokenizers;

/**
 * Whitespace phrase tokenizer.
 *
 * Splits the text on every run of characters matched by PATTERN (Unicode
 * separator and "other"/control categories) to obtain the single words, then
 * additionally builds every phrase of 2 up to $n consecutive words, with the
 * words of a phrase joined by a single space.
 */
class WhitespacePhraseTokenizer implements TokenizerInterface
{

    /**
     * Maximum phrase length in words.
     *
     * While unset, or set to a value below 2, tokenize() returns single
     * words only.
     */
    private $n;

    /** Delimiter pattern: one or more \pZ (separator) or \pC (other) characters. */
    const PATTERN = '/[\pZ\pC]+/u';

    /**
     * Set the maximum phrase length.
     *
     * @param int $n Largest number of words a generated phrase may contain
     */
    public function set_n( $n ) {
      $this->n = $n;
    }

    /**
     * Tokenize a string into single words followed by multi-word phrases.
     *
     * The result lists all single words in text order, then all 2-word
     * phrases in text order, then all 3-word phrases, and so on up to the
     * configured maximum length.
     *
     * @param  string $str The text to tokenize
     * @return array  Single words followed by the generated phrases
     * @throws \InvalidArgumentException If the text cannot be split, e.g.
     *                                   because it is not valid UTF-8
     */
    public function tokenize( $str )
    {
        // -1 is the documented "no limit" value; passing null for the limit
        // is deprecated as of PHP 8.1.
        $unigrams = preg_split(self::PATTERN, $str, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure (the /u modifier rejects
        // subjects that are not valid UTF-8). Fail with a clear message
        // rather than letting false flow into count() and array_merge().
        if ($unigrams === false) {
            throw new \InvalidArgumentException(
                'Unable to tokenize the given text; it may not be valid UTF-8.'
            );
        }

        $num_unigrams = count($unigrams);

        // Build the phrases, shortest first.
        $ngrams = array();
        for ($n = 2; $n <= $this->n; $n++) {
            // Slide a window of $n words over the text; the last valid start
            // position is $num_unigrams - $n.
            for ($i = 0; $i <= $num_unigrams - $n; $i++) {
                $ngrams[] = implode(' ', array_slice($unigrams, $i, $n));
            }
        }

        // Single words first, then the phrases.
        return array_merge($unigrams, $ngrams);
    }
}