Scan free-form text for human-readable dates (according to Chronic)
require 'chronic'

module Chronic
  class << self
    def scan(text)
      return enum_for(:scan, text) unless block_given?

      # `words` is lossless: we need to be able to join('') it and get the
      # original string back.
      words = scan_by_re(text, /\s+/).to_a

      # Each word gets a begin...end range recording its position in the
      # original string.
      word_offsets = []

      # Each token we generate keeps a back-pointer to the word it came from.
      rev_map = []

      # The tokens we generate. pre_normalize can turn one word into two,
      # which will generate multiple tokens.
      tokens = []

      strpos = 0
      words.each.with_index do |word, word_index|
        word_offsets.push(strpos ... strpos + word.size)
        strpos += word.size

        normalized = pre_normalize(word)
        normalized.split(/\s+/).each do |v|
          tokens.push(Token.new(v.gsub(/\s+|[\[\]\(\)]|[?!"]\z/, '')))
          rev_map.push(word_index)
        end
      end
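      # For illustration (hypothetical input, not part of the original):
      # with text = "see you Tuesday!", words is ["see", " ", "you", " ",
      # "Tuesday!"] (separators included), word_offsets holds each slice's
      # range within text, and rev_map lets a token ("tuesday") be traced
      # back to its word ("Tuesday!") and thus to an exact substring of text.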
      # Use Chronic's normal tagger to find useful tokens
      tokens = Repeater.scan(tokens, {})
      [Grabber, Pointer, Scalar, Ordinal, Separator,
       TimeZone].each do |tokenizer|
        tokens = tokenizer.scan(tokens)
      end
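      # At this point tokens like "tuesday" or "5" should carry tags
      # (Repeater, Scalar, ...) while ordinary words remain untagged;
      # runs of tagged tokens are our date candidates.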
      # Walk through the tokens, finding long runs of tags.
      b, e = 0, 0
      joined = nil
      loop do
        # Walk until we find a tagged, non-separator token.
        b += 1 while (b < tokens.size) && (!tokens[b].tagged? ||
                                           tokens[b].get_tag(Separator))

        # Walk a second pointer forward and find where the run stops.
        e = b + 1
        has_parsed = false
        loop do
          break if e >= tokens.size
          break if !tokens[e].tagged?
          joined = tokens[b...e].map(&:word).join(' ')
          parse_result = (parse(joined) rescue nil)
          # Once something has parsed, back off as soon as growing the
          # window makes the parse fail again.
          e -= 1 and break if has_parsed && !parse_result
          has_parsed ||= !!parse_result
          e += 1
        end
        # e won't have moved if b was already at the end.
        break if b >= tokens.size

        # Set up for the next iteration.
        tok_range = b ... e
        b = e

        # These are parallel arrays; the word range must be inclusive.
        word_indices = rev_map[tok_range.begin] .. rev_map[tok_range.end - 1]
        out_range = word_offsets[word_indices]
        out_range = out_range.first.begin ... out_range.last.end

        result = text[out_range]
        # Trim trailing punctuation and shrink the range to match.
        result.gsub!(/[."?'@,;]*\z/, '')
        out_range = out_range.begin ... out_range.begin + result.size

        # Special case for English text: a bare "am" is almost always the
        # verb, not the meridiem.
        next if result.match(/\Aam\z/i)

        yield result, out_range, joined
      end
    end

    private
    # Like String#split(re), but lossless: the separator matches are yielded
    # into the stream too, so joining the pieces reproduces the input.
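    #   e.g. scan_by_re("a  b", /\s+/).to_a  #=> ["a", "  ", "b"]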
    def scan_by_re(text, re)
      return enum_for(:scan_by_re, text, re) unless block_given?

      pos = 0
      while (md = text.match(re, pos))
        b, e = md.begin(0), md.end(0)
        yield text[pos...b] unless pos == b
        yield text[b...e]
        pos = e
      end

      last_s = text[pos ... text.size]
      yield last_s unless last_s.empty?
    end
  end
end
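A minimal usage sketch, assuming the file is saved as chronic_scan.rb (the filename and sample sentence are illustrative, not part of the gist):

    require_relative 'chronic_scan' # hypothetical filename for this gist

    text = "Ship it next tuesday, then follow up on May 3rd at 9am."

    # With a block, scan yields each hit: the matched substring, its
    # begin...end character range in the input, and the normalized token
    # string Chronic parsed.
    Chronic.scan(text) do |match, range, normalized|
      puts "#{match.inspect} at #{range} (#{normalized.inspect})"
    end

    # Without a block it returns an Enumerator, so hits can be collected:
    hits = Chronic.scan(text).to_a

Because each hit carries its character range in the original string, callers can highlight or replace dates in place without disturbing the surrounding text.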