Created
November 30, 2012 21:57
-
-
Save leejarvis/4178982 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Minimal word tagger: splits a string on whitespace, lets every tag class
# under W::Tags annotate the resulting tokens, and returns the tagged ones.
module W
  # A single word from the input together with the tags attached to it.
  class Token
    attr_reader :value, :tags

    # @param value [String] the raw word this token wraps
    def initialize(value)
      @value = value
      @tags = []
    end

    # @return [Boolean] true once at least one tag has been attached
    def tagged?
      @tags.any?
    end

    # A token prints as the word it wraps.
    alias to_s value
  end

  # Base class for tags. Each subclass declares the values it applies to with
  # `matches`, and `scan` attaches an instance of the subclass to every token
  # whose value is included in that declaration.
  class Tag
    class << self
      # The object passed to `matches`; stored per class, so each subclass
      # keeps its own list. nil until `matches` has been called.
      attr_accessor :match
    end

    # Declares which values this tag class applies to.
    #
    # @param thing [#include?] collection tested with `include?(token.value)`
    # @return [Object] the stored collection
    def self.matches(thing)
      self.match = thing
    end

    # Appends a new instance of this tag class to the `tags` of every token
    # whose value is included in `match`.
    #
    # @param tokens [Array<Token>] tokens to inspect; their tag lists are mutated
    # @return [Array<Token>] the same tokens array
    def self.scan(tokens)
      # A tag class that never called `matches` has nothing to attach.
      return tokens if match.nil?

      tokens.each do |token|
        token.tags << new if match.include?(token.value)
      end
    end

    # Since Ruby 2.0, Object#inspect no longer falls back to an overridden
    # #to_s, so without this an Array of tags (which renders its elements via
    # #inspect) would print as "#<W::Tags::Foo>" instead of the tag's label.
    #
    # @return [String] the same text as #to_s
    def inspect
      to_s
    end
  end

  # Namespace holding the concrete tag classes; the tokenizer runs every
  # constant defined here against the tokens.
  module Tags
    # Applies to the words "foo" and "baz".
    class FooBaz < Tag
      matches ["foo", "baz"]

      def to_s
        'foo-or-baz'
      end
    end

    # Applies to the word "foo" only.
    class Foo < Tag
      matches ["foo"]

      def to_s
        'foo-only'
      end
    end
  end

  # Turns a string into tokens and runs every tag class over them.
  class Tokenizer
    # @param options [Hash] stored on the instance; not read by any code in this module
    def initialize(options)
      @options = options
    end

    # @param string [String] text to split on runs of whitespace
    # @return [Array<Token>] only the tokens that received at least one tag
    def tokenize(string)
      tokens = string.split(/\s+/).map { |word| Token.new(word) }
      Tags.constants.each { |tag| Tags.const_get(tag).scan(tokens) }
      tokens.select(&:tagged?)
    end
  end

  # Convenience entry point: builds a Tokenizer and tokenizes `string`.
  #
  # @param string [String] text to tokenize
  # @param options [Hash] forwarded to Tokenizer.new
  # @return [Array<Token>] the tagged tokens
  def self.tokenize(string, options = {})
    Tokenizer.new(options).tokenize(string)
  end
end
# Demo: tokenize a sample string and print each tagged token with its tags.
# The tags are joined explicitly so each one is rendered with #to_s;
# interpolating the Array itself would render its elements with #inspect,
# which since Ruby 2.0 no longer falls back to an overridden #to_s.
W.tokenize("foo bar baz").each do |token|
  p "#{token} -> [#{token.tags.join(', ')}]"
end
# "foo -> [foo-or-baz, foo-only]"
# "baz -> [foo-or-baz]"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment