Created
July 19, 2012 04:04
-
-
Save kourge/3140677 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env macruby | |
framework 'CoreFoundation' | |
class Range
  # Converts this Range of integer offsets into a CFRange.
  #
  # The CFRange location is the range's first value; its length counts the
  # offsets covered, honouring whether the end is excluded (2...5 covers 3
  # offsets, 2..5 covers 4).
  #
  # Returns a CFRange built with CFRange.new(location, length).
  def to_core_range
    location = self.begin
    stop = exclude_end? ? self.end : self.end.succ
    # An empty range such as 5..2 would otherwise give a negative length.
    length = [stop - location, 0].max
    CFRange.new(location, length)
  end
end
# Ruby-style wrapper around the CoreFoundation CFStringTokenizer functions.
#
# A StringTokenizer is created over a string (optionally restricted to a
# range); #next_token! and #jump_token! move the underlying tokenizer and
# return a Token describing the token it landed on.
class StringTokenizer
  # Symbols accepted in opts[:option], mapped to the integer passed to
  # CFStringTokenizerCreate.
  # NOTE(review): the values of all requested symbols are summed, so passing
  # more than one unit produces another unit's value (1 + 2 == 3).
  MODIFIERS = {
    :unit_word => 0,
    :unit_sentence => 1,
    :unit_paragraph => 2,
    :unit_line_break => 3,
    :unit_word_boundary => 4,
  }

  # The string currently being tokenized, and the CFRange within it.
  attr_reader :string, :range

  # Creates a tokenizer over +string+.
  #
  # string - the text to tokenize.
  # opts   - optional Hash:
  #          :option - a MODIFIERS symbol or an Array of them (unknown
  #                    symbols count as 0); defaults to none.
  #          :range  - Range, CFRange or NSRange to tokenize; anything else
  #                    (including nil) selects the whole string.
  #          :locale - passed through to CFStringTokenizerCreate.
  def initialize(string, opts={})
    @string = string
    # Read the option list without writing a default back into the caller's
    # hash, so a frozen or shared hash can be passed safely.
    units = Array(opts[:option])
    modifiers = units.map { |unit| MODIFIERS[unit] || 0 }.inject(0, &:+)
    @range = self.class.to_cf_range(opts[:range], @string.size)
    @tokenizer = CFStringTokenizerCreate(
      nil, @string, @range, modifiers, opts[:locale]
    )
  end

  # Replaces the string being tokenized, keeping the stored range.
  #
  # NOTE(review): the stored range is reused unchanged; if the new string has
  # a different length the caller presumably needs to assign #range= as well.
  def string=(string)
    @string = string
    CFStringTokenizerSetString(@tokenizer, @string, @range)
  end

  # Replaces the range being tokenized.
  #
  # range - Range, CFRange or NSRange; anything else (including nil) selects
  #         the whole current string. The value is normalised to a CFRange
  #         before being stored and handed to CoreFoundation.
  def range=(range)
    @range = self.class.to_cf_range(range, @string.size)
    CFStringTokenizerSetString(@tokenizer, @string, @range)
  end

  # Advances to the next token and returns a Token for it.
  def next_token!
    token_type = CFStringTokenizerAdvanceToNextToken(@tokenizer)
    make_token(token_type)
  end

  # Moves to the token at +index+ and returns a Token for it.
  def jump_token!(index)
    token_type = CFStringTokenizerGoToTokenAtIndex(@tokenizer, index)
    make_token(token_type)
  end

  # Builds a Token for the tokenizer's current position.
  #
  # token_type - the integer type mask returned by the CF advance/go-to call.
  def make_token(token_type)
    Token.new(
      @tokenizer,
      CFStringTokenizerGetCurrentTokenRange(@tokenizer),
      Token.type_mask_to_array(token_type)
    )
  end

  # Normalises +range+ to a CFRange.
  #
  # range - a Range (converted with Range#to_core_range), a CFRange (returned
  #         as is) or an NSRange (copied field by field).
  # size  - length used for the fallback range when +range+ is anything else.
  #
  # Dispatch is by class name, so the CFRange/NSRange constants are only
  # touched when a value of that type is actually passed in.
  def self.to_cf_range(range, size=0)
    case range.class.name
    when "Range" then range.to_core_range
    when "CFRange" then range
    when "NSRange" then CFRange.new(range.location, range.length)
    else CFRange.new(0, size)
    end
  end

  # Returns the result of CFStringTokenizerCopyBestStringLanguage for +string+.
  #
  # range - optional Range/CFRange/NSRange; when omitted the whole string is
  #         used, so the CF call always receives a CFRange rather than nil.
  def self.guess_language(string, range=nil)
    cf_range = to_cf_range(range, string.size)
    CFStringTokenizerCopyBestStringLanguage(string, cf_range)
  end

  # A token reported by a StringTokenizer: its range plus decoded type flags.
  class Token
    # Bit in the token type mask => symbol reported in #types.
    TYPES = {
      # 0 => :none,
      1 => :normal,
      (1 << 1) => :has_sub_tokens,
      (1 << 2) => :has_derived_sub_tokens,
      (1 << 3) => :has_numbers,
      (1 << 4) => :has_non_letters,
      (1 << 5) => :is_cj_word
    }

    # Attribute selectors passed to CFStringTokenizerCopyCurrentTokenAttribute.
    ATTRIBUTES = {
      :latin_transcription => 1 << 16,
      :language => 1 << 17
    }

    # Decodes an integer type mask into the Array of TYPES symbols whose bit
    # is set, in TYPES order. A mask of 0 yields an empty Array.
    def self.type_mask_to_array(type_mask)
      TYPES.map { |mask, type| type if (type_mask & mask) == mask }.compact
    end

    attr_reader :range, :types

    # tokenizer - the CF tokenizer the token came from.
    # range     - the token's range as reported by the tokenizer.
    # types     - Array of TYPES symbols.
    #
    # NOTE(review): both tokenizer and range are dup'ed, as the original code
    # did; confirm that dup on a CF tokenizer reference behaves as intended.
    def initialize(tokenizer, range, types)
      @tokenizer, @range, @types = tokenizer.dup, range.dup, types
    end

    # Returns the :language attribute of the tokenizer's current token.
    def language
      CFStringTokenizerCopyCurrentTokenAttribute(
        @tokenizer, ATTRIBUTES[:language]
      )
    end

    # Returns the :latin_transcription attribute of the tokenizer's current
    # token.
    def latin_transcription
      CFStringTokenizerCopyCurrentTokenAttribute(
        @tokenizer, ATTRIBUTES[:latin_transcription]
      )
    end

    # Not implemented yet; always raises NotImplementedError.
    def sub_tokens
      raise NotImplementedError
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment