Skip to content

Instantly share code, notes, and snippets.

@kourge
Created July 19, 2012 04:04
Show Gist options
  • Save kourge/3140677 to your computer and use it in GitHub Desktop.
Save kourge/3140677 to your computer and use it in GitHub Desktop.
#!/usr/bin/env macruby
framework 'CoreFoundation'
class Range
  # Converts this range into a CFRange (location + length).
  #
  # The location is the range's first endpoint. The length counts up to and
  # including the last endpoint, unless the range excludes its end.
  def to_core_range
    start = self.begin
    stop = self.end
    # An inclusive range covers one more element than `end - begin`.
    stop = stop.succ unless exclude_end?
    CFRange.new(start, stop - start)
  end
end
# Object-oriented wrapper around the CFStringTokenizer* functions.
#
# An instance owns a tokenizer created with CFStringTokenizerCreate over a
# string and a CFRange, and hands out Token objects for the token the
# tokenizer is currently positioned on.
class StringTokenizer
  # Option names accepted in opts[:option], mapped to the integer values that
  # are combined into the options argument of CFStringTokenizerCreate.
  MODIFIERS = {
    :unit_word => 0,
    :unit_sentence => 1,
    :unit_paragraph => 2,
    :unit_line_break => 3,
    :unit_word_boundary => 4,
  }.freeze

  # The string being tokenized and the CFRange of it handed to the tokenizer.
  attr_reader :string, :range

  # Creates a tokenizer for +string+.
  #
  # opts may contain:
  #   :option - a name or array of names from MODIFIERS (unknown names count as 0)
  #   :range  - a Range, CFRange or NSRange; anything else selects the whole string
  #   :locale - passed to CFStringTokenizerCreate unchanged
  def initialize(string, opts={})
    @string = string
    # Array() supplies the "no options" default without writing it back into
    # the caller's hash (which may be frozen or reused).
    names = Array(opts[:option])
    # OR the values together so that a repeated name does not change the result.
    modifiers = names.inject(0) { |mask, name| mask | (MODIFIERS[name] || 0) }
    @range = self.class.to_cf_range(opts[:range], @string.size)
    @tokenizer = CFStringTokenizerCreate(
      nil, @string, @range, modifiers, opts[:locale]
    )
  end

  # Replaces the string being tokenized; the current range is kept as is.
  def string=(string)
    @string = string
    CFStringTokenizerSetString(@tokenizer, @string, @range)
  end

  # Replaces the tokenized range. Accepts the same kinds of value as the
  # :range option of #initialize.
  def range=(range)
    @range = self.class.to_cf_range(range, @string.size)
    CFStringTokenizerSetString(@tokenizer, @string, @range)
  end

  # Advances the tokenizer to the next token and returns it as a Token.
  def next_token!
    token_type = CFStringTokenizerAdvanceToNextToken(@tokenizer)
    make_token(token_type)
  end

  # Moves the tokenizer with CFStringTokenizerGoToTokenAtIndex(index) and
  # returns the resulting token as a Token.
  def jump_token!(index)
    token_type = CFStringTokenizerGoToTokenAtIndex(@tokenizer, index)
    make_token(token_type)
  end

  # Builds a Token for the tokenizer's current position from the token-type
  # mask returned by the advance/go-to call.
  def make_token(token_type)
    Token.new(
      @tokenizer,
      CFStringTokenizerGetCurrentTokenRange(@tokenizer),
      Token.type_mask_to_array(token_type)
    )
  end

  # Normalises +range+ to a CFRange.
  #
  # Dispatches on the class *name* of +range+: a Range is converted with
  # Range#to_core_range, a CFRange is returned unchanged, an NSRange is
  # copied field by field, and anything else (including nil) becomes the
  # range (0, size).
  def self.to_cf_range(range, size=0)
    case range.class.name
    when "Range" then range.to_core_range
    when "CFRange" then range
    when "NSRange" then CFRange.new(range.location, range.length)
    else CFRange.new(0, size)
    end
  end

  # Returns the result of CFStringTokenizerCopyBestStringLanguage for
  # +string+, restricted to +range+ when one is given.
  #
  # NOTE(review): a nil range is handed to the CF call unconverted; confirm
  # the bridge accepts nil for this argument.
  def self.guess_language(string, range=nil)
    range = to_cf_range(range, string.size) unless range.nil?
    CFStringTokenizerCopyBestStringLanguage(string, range)
  end

  # A token reported by a StringTokenizer: its range, its type flags, and
  # access to the attributes of the tokenizer's current token.
  class Token
    # Bits of the token-type mask, mapped to the symbols exposed in #types.
    TYPES = {
      # 0 => :none,
      1 => :normal,
      (1 << 1) => :has_sub_tokens,
      (1 << 2) => :has_derived_sub_tokens,
      (1 << 3) => :has_numbers,
      (1 << 4) => :has_non_letters,
      (1 << 5) => :is_cj_word
    }.freeze

    # Attribute selectors passed to CFStringTokenizerCopyCurrentTokenAttribute.
    ATTRIBUTES = {
      :latin_transcription => 1 << 16,
      :language => 1 << 17
    }.freeze

    # Expands a token-type bit mask into an array of the TYPES symbols whose
    # bit is set, in TYPES order. A mask of 0 yields an empty array.
    def self.type_mask_to_array(type_mask)
      result = []
      return result if type_mask == 0
      TYPES.each do |mask, type|
        result << type if (type_mask & mask) == mask
      end
      result
    end

    # The token's range, and the array of type symbols decoded from its mask.
    attr_reader :range, :types

    # Stores copies (+dup+) of the tokenizer and the range.
    #
    # NOTE(review): confirm that +dup+ is valid for a CFStringTokenizer
    # reference and captures the state the attribute lookups below rely on.
    def initialize(tokenizer, range, types)
      @tokenizer, @range, @types = tokenizer.dup, range.dup, types
    end

    # Language attribute of the tokenizer's current token.
    def language
      CFStringTokenizerCopyCurrentTokenAttribute(
        @tokenizer, ATTRIBUTES[:language]
      )
    end

    # Latin-transcription attribute of the tokenizer's current token.
    def latin_transcription
      CFStringTokenizerCopyCurrentTokenAttribute(
        @tokenizer, ATTRIBUTES[:latin_transcription]
      )
    end

    # Not implemented; always raises NotImplementedError.
    def sub_tokens
      raise NotImplementedError, "sub_tokens is not implemented"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment