Created
May 22, 2010 15:19
-
-
Save whym/410142 to your computer and use it in GitHub Desktop.
classes for manipulating stand-off annotation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env ruby | |
| # -*- coding: utf-8; mode: ruby -*- | |
| # TODO: rewrite referring Range | |
# One stand-off annotation: a tag attached to a span that is described by
# integer start and end offsets. The tag is free-form text; its first
# whitespace-separated token is the tag name and the rest may carry
# key=value or key="value" attributes.
class Annotation
  include Comparable

  # Matches a single key=value or key="value" attribute inside a tag string.
  ATTRIBUTE_PATTERN = /(\S*?)=("(.*?)"|(.*?))(\s|$)/

  attr_accessor :start, :end, :tag

  # s and e are coerced with to_i (so nil becomes 0 and numeric strings are
  # accepted); tag is stored untouched.
  def initialize(s = nil, e = nil, tag = nil)
    @start, @end, @tag = s.to_i, e.to_i, tag
  end

  # Tab-separated "start<TAB>end<TAB>tag" representation.
  def to_s
    "#{@start}\t#{@end}\t#{@tag}"
  end

  # True when the span of o lies entirely inside this span (bounds inclusive).
  def contain?(o)
    @start <= o.start && @end >= o.end
  end

  # Two annotations are equal when start, end and tag are all equal.
  def eql?(o)
    o.is_a?(Annotation) && @start == o.start && @end == o.end && @tag == o.tag
  end
  alias :== :eql?

  # Ordering required by Comparable: start ascending, then end descending
  # (a wider span sorts before a narrower one with the same start), then the
  # tag text. Returns nil when o is not an Annotation.
  def <=>(o)
    return nil unless o.is_a?(Annotation)
    [@start, -@end, @tag.to_s] <=> [o.start, -o.end, o.tag.to_s]
  end

  # Hash consistent with eql?. Built from the ordered triple so that swapping
  # start and end does not produce the same value.
  def hash
    [@start, @end, @tag].hash
  end

  # [start, end, tag]
  def to_a
    [@start, @end, @tag]
  end

  # First whitespace-separated token of the tag, or nil when there is no tag.
  def tag_name
    name_source = tag
    return nil unless name_source
    name_source.split(/\s/)[0]
  end

  # Values of the named attributes, in the order requested, flattened into a
  # single array. A missing attribute contributes nil; a repeated attribute
  # contributes every one of its values.
  def attributes_at(*a)
    attributes.values_at(*a).flatten
  end

  private

  # Parses the tag into a Hash of attribute name => value. A name that occurs
  # more than once maps to an Array of its values in order of appearance.
  # TODO: the hash is rebuilt on every call, which is slow.
  def attributes
    return {} unless @tag
    parsed = {}
    @tag.scan(ATTRIBUTE_PATTERN) do |name, _raw, quoted, bare|
      value = quoted || bare
      if parsed[name]
        if parsed[name].is_a?(Array)
          parsed[name] << value
        else
          parsed[name] = [parsed[name], value]
        end
      else
        parsed[name] = value
      end
    end
    parsed
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env ruby | |
| # -*- coding: utf-8; mode: ruby -*- | |
| require 'set' | |
| require 'annotation' | |
| #require 'bsearch' | |
| # cf. http://0xcc.net/ruby-bsearch/index.html | |
# Binary-search helpers for arrays that are already ordered with respect to
# the supplied block. The block receives an element and returns a negative
# number, zero or a positive number, in the manner of <=>.
# cf. http://0xcc.net/ruby-bsearch/index.html
class Array
  # Index of the first element in [x, y) for which the block is not negative;
  # y when there is no such element.
  def bsearch_lower_boundary(x = 0, y = self.length, &block)
    below = x - 1
    candidate = y
    until (probe = below + (candidate - below) / 2) <= below
      if block.call(self[probe]) < 0
        below = probe
      else
        candidate = probe
      end
    end
    candidate
  end

  # Index just past the last element in [x, y) for which the block is zero or
  # negative.
  def bsearch_upper_boundary(x = 0, y = self.length, &block)
    last_kept = x - 1
    ceiling = y
    until (probe = last_kept + (ceiling - last_kept) / 2) <= last_kept
      if block.call(self[probe]) <= 0
        last_kept = probe
      else
        ceiling = probe
      end
    end
    last_kept + 1
  end

  # Half-open index range lower...upper of the elements in [x, y) for which
  # the block returns zero. The upper boundary is searched first.
  def bsearch_range(x = 0, y = self.length, &block)
    upper = bsearch_upper_boundary(x, y, &block)
    lower = bsearch_lower_boundary(x, y, &block)
    lower...upper
  end
  # TODO: bsearch_find can be defined in a faster way (terminating when equal)
end
# A text body together with a list of Annotation objects over it.
#
# Every query that uses the bsearch_* helpers assumes that @annotations is
# ordered as cmp_annotations_ defines: start ascending, then end descending.
# Regions may be given either as an Annotation or as an Array
# [start, end] / [start, end, tag].
class StandOff
  attr_reader :body

  # body: a String, or an object responding to :read whose content becomes
  #       the body (it is closed afterwards).
  # ann:  an Array of Annotation objects (stored as given, not copied), or an
  #       object responding to :read that holds tab-separated annotation lines.
  def initialize(body = '', ann = [])
    if body.respond_to?(:read)
      read_body_file(body)
    else
      @body = body
    end
    if ann.respond_to?(:read)
      @annotations = []
      read_annotation_file(ann)
    else
      @annotations = ann
    end
  end

  # Replaces the body with the whole content of file, then closes file.
  def read_body_file(file)
    @body = file.read
    file.close
  end

  # Appends the annotations parsed from file (no re-sorting), then closes it.
  def read_annotation_file(file)
    @annotations += read_annotation_file_(file)
  end

  # Adds several annotations.
  # x:      an Annotation, a (possibly nested) Array of them, or a readable
  #         annotation file.
  # method: :insert places each one at its ordered position by binary search;
  #         :sort appends everything and re-sorts the whole list stably.
  # sorted: with :insert, declares that x is itself ordered, so each search
  #         may start from the previous insertion point.
  def add_annotations(x, method = :insert, sorted = false)
    if x.respond_to?(:read)
      appends = read_annotation_file_(x)
    else
      appends = [x].flatten
    end
    case method
    when :insert
      pos = 0
      appends.each do |a|
        pos = @annotations.bsearch_upper_boundary(sorted ? pos : 0) { |b| cmp_annotations_(b, a) }
        @annotations.insert(pos, a)
        pos += 1
      end
    when :sort
      @annotations = annotation_stable_sort_(@annotations + appends)
    end
  end

  # TODO: implement read_xml(file) (see csjxml)

  # Writes one "start<TAB>end<TAB>tag" line per annotation and flushes file.
  # The file is left open.
  def write_annotation_file(file, annotations = @annotations)
    annotations.each do |e|
      file.puts "#{e.start}\t#{e.end}\t#{e.tag}"
    end
    file.flush
  end

  # Inserts a single annotation at its ordered position. Among annotations
  # with the same span it goes after them when inner is true and before them
  # otherwise. Anything that is not an Annotation is reported on STDERR and
  # ignored.
  def add_annotation(a, inner = true)
    if a.is_a? Annotation
      pos = if inner
              @annotations.bsearch_upper_boundary { |b| cmp_annotations_(b, a) }
            else
              @annotations.bsearch_lower_boundary { |b| cmp_annotations_(b, a) }
            end
      @annotations.insert(pos, a)
    else
      STDERR.puts "#{a.inspect} is not Annotation"
    end
  end

  # New StandOff over the same body holding only the annotations contained in
  # region, optionally filtered further by the block.
  def subset(region, &block)
    s = []
    each_contained_annotation(region) do |x|
      s << x
    end
    s = s.select(&block) if block
    StandOff.new(@body, s)
  end

  # Array of the annotations yielded by each_near_annotation. When a block is
  # given it is also called with each of them.
  def near_annotations(region, range = 0, &block)
    res = []
    each_near_annotation(region, range) do |a|
      res << a
      block.call(a) if block
    end
    res
  end

  # Yields, from the highest index down, every annotation whose start and end
  # both differ from those of region by at most range.
  def each_near_annotation(region, range = 0, &block)
    # TODO: accept a Range as the region
    region = region_(region)
    beginpos = @annotations.bsearch_lower_boundary { |x| x.start <=> region.start - range }
    # The window has to reach start + range as well, otherwise annotations
    # that begin slightly after the region would never be examined.
    endpos = @annotations.bsearch_upper_boundary { |x| x.start <=> region.start + range }
    (endpos - 1).downto(beginpos) do |i|
      a = @annotations[i]
      if (a.start - region.start).abs <= range && (a.end - region.end).abs <= range
        block.call(a)
      end
    end
  end

  # Array of the annotations yielded by each_matching_disjoint_annotation.
  # When a block is given, only those for which block.call(annotation,
  # matchdata) is truthy are kept.
  def matching_disjoint_annotations(pattern, region = nil, &block)
    res = []
    each_matching_disjoint_annotation(pattern, region) do |a, m|
      res << a if block.nil? || block.call(a, m)
    end
    res
  end

  # Yields (annotation, matchdata) for annotations whose tag matches pattern,
  # jumping after each hit to the first annotation that starts at or after the
  # end of the one just yielded. Without a region the whole body is scanned.
  def each_matching_disjoint_annotation(pattern, region = nil, &block)
    beginpos, endpos = 0, @annotations.length
    if region
      region = region_(region)
      beginpos = @annotations.bsearch_lower_boundary { |x| x.start <=> region.start }
      endpos = @annotations.bsearch_upper_boundary { |x| x.start <=> region.end - 1 }
    else
      region = Annotation.new(0, @body.length)
    end
    # First matching annotation that is not the region annotation itself.
    a = nil
    beginpos.upto(endpos - 1) do |i|
      cand = @annotations[i]
      if (cand.start != region.start || cand.end != region.end || cand.tag != region.tag) &&
         pattern.match(cand.tag)
        a = cand
        break
      end
    end
    return if a.nil?
    i = @annotations.bsearch_lower_boundary(beginpos) { |x| cmp_annotations_(x, a) }
    while i < endpos
      matchd = pattern.match(@annotations[i].tag)
      if matchd
        a = @annotations[i]
        block.call a, matchd
        i = @annotations.bsearch_lower_boundary(i + 1, endpos) { |x| x.start <=> a.end }
      else
        i += 1
      end
    end
  end

  # Slice of the body covered by an Annotation, or by the offsets x up to
  # (not including) y.
  def substr(x, y = nil)
    if x.is_a? Annotation
      @body[x.start, x.end - x.start]
    else
      @body[x, y - x]
    end
  end

  # Yields, in list order, every annotation that contains region. The scan is
  # limited to annotations ordered at or before region.
  def each_containing_annotation(region, &block)
    region = region_(region)
    endpos = @annotations.bsearch_upper_boundary { |x| cmp_annotations_(x, region) }
    i = 0
    while i < endpos
      # TODO: this part could be sped up with one more level of binary search
      a = @annotations[i]
      if a.contain?(region)
        block.call(a)
        i += 1
      else
        # Skip the remaining annotations that share this start offset.
        i = @annotations.bsearch_upper_boundary(i, endpos) { |x| x.start <=> a.start }
      end
    end
  end

  # Yields, in list order, every annotation that region contains.
  def each_contained_annotation(region, &block)
    region = region_(region)
    i = @annotations.bsearch_lower_boundary { |x| cmp_annotations_(x, region) }
    e = @annotations.bsearch_upper_boundary(i) { |x| x.start <=> region.end }
    @annotations[i...e].each do |a|
      block.call(a) if region.contain?(a)
    end
  end

  # The n-th annotation yielded by each_enclosing_annotation (1 is the first),
  # or nil when fewer than n are yielded.
  def enclosing_annotation(region, n = 1)
    i = 0
    each_enclosing_annotation(region) do |a|
      i += 1
      return a if i == n
    end
    nil
  end

  # Yields, walking backwards from the ordered position of region, every
  # annotation that contains region and is not region itself. An annotation
  # with exactly the same span is yielded only when region has a tag and the
  # tags differ.
  def each_enclosing_annotation(region, &block)
    reg = region_(region)
    i = @annotations.bsearch_upper_boundary { |x| cmp_annotations_(x, reg) } - 1
    while i >= 0
      a = @annotations[i]
      if a.contain?(reg) && a != reg &&
         (a.start != reg.start || a.end != reg.end || (reg.tag && a.tag != reg.tag))
        block.call a
      end
      i -= 1
    end
  end

  # A shallow copy of the annotation list, or with a block only the
  # annotations the block selects.
  def annotations(&block)
    if block
      @annotations.select(&block)
    else
      @annotations.clone
    end
  end

  # Up to n annotations obtained by repeatedly taking the annotation that
  # follows the current region (see following_annotation_idx_).
  def following_annotations(region, n = 1)
    res = []
    region = region_(region)
    while res.size < n && !(i = following_annotation_idx_(region)).nil?
      res << i
      region = @annotations[i]
    end
    res.map { |x| @annotations[x] }
  end

  # Up to n annotations obtained by repeatedly taking the annotation that
  # precedes the current region (see preceding_annotation_idx_), returned in
  # list order.
  def preceding_annotations(region, n = 1)
    res = []
    region = region_(region)
    while res.size < n && !(i = preceding_annotation_idx_(region)).nil?
      res << i
      region = @annotations[i]
    end
    res.reverse.map { |x| @annotations[x] }
  end

  # Renders the body with annotations turned into inline markup.
  #
  # print_begin_tag, print_end_tag, print_empty_tag: callables that receive an
  #   annotation and return the text for an opening tag, a closing tag and a
  #   zero-length annotation respectively.
  # region: the part of the body to render.
  # anns:   annotations to render; defaults to annotations(&block).
  # emit_outside: accepted but currently unused.
  #
  # NOTE(review): positions are compared against StringIO#pos, which counts
  # bytes, while substr slices by character - confirm behaviour for multibyte
  # bodies.
  def render(print_begin_tag = Proc.new { |a| "<#{a.tag_name}>" },
             print_end_tag = Proc.new { |a| "</#{a.tag_name}>" },
             print_empty_tag = Proc.new { |a| "<#{a.tag_name}/>" },
             region = [0, @body.length],
             anns = nil,
             emit_outside = false,
             &block)
    region = region_(region)
    anns = self.annotations(&block) unless anns
    i = 0
    endtagstack = [] # groups of open annotations sharing an end offset, innermost first
    require 'stringio' # loaded on first use
    input = StringIO.new(self.substr(0, region.end))
    input.read(region.start)
    # Annotations that begin before the region can never line up with the read
    # position; skip them so that the main loop cannot stall on them.
    i += 1 while i < anns.size && anns[i].start < region.start
    output = StringIO.new
    until input.eof
      break if i >= anns.size
      # empty tags
      while i < anns.size && input.pos == anns[i].start && anns[i].start == anns[i].end
        output.print print_empty_tag.call(anns[i])
        i += 1
      end
      # end tags
      if endtagstack.size > 0 && input.pos == endtagstack[0][0].end
        endtagstack[0].each do |emit|
          output.print print_end_tag.call(emit)
        end
        endtagstack.shift
      end
      # start tags
      while i < anns.size && input.pos == anns[i].start && anns[i].start != anns[i].end
        output.print print_begin_tag.call(anns[i])
        if endtagstack.size > 0 && endtagstack[0][0].end == anns[i].end
          endtagstack[0].unshift anns[i]
        else
          endtagstack.unshift [anns[i]]
        end
        i += 1
      end
      # plain text up to the next tag boundary
      while i < anns.size && input.pos < anns[i].start &&
            (endtagstack.size == 0 || input.pos < endtagstack[0][0].end)
        output.putc input.getc
      end
    end
    # end tags still open once the annotations are exhausted
    while endtagstack.size > 0
      endtagstack[0].each do |emit|
        while input.pos < emit.end
          output.putc input.getc
        end
        output.print print_end_tag.call(emit)
      end
      endtagstack.shift
    end
    output.print input.read unless input.eof
    output.string
  end

  # private methods
  private

  # Parses "start<TAB>end<TAB>tag" lines into Annotation objects, ignoring
  # lines that do not match, and closes file.
  def read_annotation_file_(file)
    ret = []
    file.each_line do |line|
      match = line.match(/(\d+)\t(\d+)\t(.*)/)
      next unless match
      s, e, tag = match[1..3]
      ret << Annotation.new(s, e, tag)
    end
    file.close
    ret
  end

  # Index of the last annotation among those sharing the smallest start
  # offset that is >= region.end, or nil when there is none.
  def following_annotation_idx_(region)
    endpos = region.end
    closest = @annotations.bsearch_lower_boundary(0) { |x| x.start >= endpos ? 0 : -1 }
    r = @annotations.bsearch_range(closest) { |x| x.start <=> @annotations[closest].start }
    r.none? ? nil : r.last - 1
  end

  # Largest start offset <= start among @annotations[b...e], or -1 when there
  # is none.
  def preceding_annotation_start_(b, e, start)
    r = @annotations.bsearch_range(b, e) { |x| x.start <= start ? 0 : 1 }
    r.none? ? -1 : @annotations[r.last - 1].start
  end

  # Index of the annotation preceding region: start offsets below
  # region.start are tried from the largest down, and within the first group
  # that has annotations ending at or before region.start the last such
  # annotation is chosen. nil when there is none.
  def preceding_annotation_idx_(region)
    r = @annotations.length...@annotations.length
    startpos = preceding_annotation_start_(0, r.first, region.start - 1)
    while startpos >= 0
      r = @annotations.bsearch_range(0, r.first) { |x| x.start <=> startpos }
      unless r.none?
        preds = @annotations.bsearch_range(r.first, r.last) { |x| x.end <= region.start ? 0 : -1 }
        return preds.last - 1 unless preds.none?
      end
      startpos = preceding_annotation_start_(0, r.first, startpos - 1)
    end
    nil
  end

  # Ordering used for the annotation list: start ascending, then end
  # descending. Returns a negative number, zero or a positive number.
  def cmp_annotations_(x, y)
    # TODO: keep a hash from annotation to depth (i.e. idx) so that containment
    # can be decided more strictly
    c = x.start - y.start
    return c if c != 0
    y.end - x.end
    #[x.start, -x.end] <=> [y.start, -y.end]
  end

  # Normalises a region argument to an Annotation. Accepts an Annotation or
  # an Array whose first two elements are non-nil; raises ArgumentError for
  # anything else.
  def region_(args)
    if args.is_a?(Annotation)
      return args
    elsif args.is_a?(Array) && args.length >= 2
      unless args[0] && args[1]
        raise ArgumentError, "#{self.class} #{caller[0]}: invalid argument : #{args.inspect}"
      end
      return Annotation.new(*args)
    end
    raise ArgumentError, args.inspect
  end

  # Sorts by start ascending then end descending; the running counter keeps
  # the relative order of annotations with the same span.
  def annotation_stable_sort_(a)
    n = 0
    a.sort_by { |x| [x.start, -x.end, n += 1, x.tag] }
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env ruby | |
| # -*- coding: utf-8; mode: ruby -*- | |
| require 'test/unit' | |
| require 'annotation' | |
| require 'set' | |
# Unit tests for Annotation.
class TC_Annotation < Test::Unit::TestCase
  # Constructor arguments are visible through the readers and through to_a.
  def test_0
    ann = Annotation.new(0, 1, 'mytag')
    fields = [ann.start, ann.end, ann.tag]
    assert_equal([0, 1, 'mytag'], fields)
    assert_equal([0, 1, 'mytag'], ann.to_a)
  end

  # A clone and a field-by-field rebuild both compare equal to the original.
  def test_copy
    original = Annotation.new(0, 1, 'mytag')
    rebuilt = Annotation.new(original.start, original.end, original.tag)
    cloned = original.clone
    assert_equal(rebuilt, cloned)
    assert_equal(original, rebuilt)
    assert_equal(cloned, original)
  end

  # attributes_at handles bare values, quoted values and repeated names.
  def test_attributes
    sized = Annotation.new(0, 1, 'width=10 height=20')
    assert_equal(['20', '10'], sized.attributes_at('height', 'width'))

    quoted = Annotation.new(0, 1, 'value="Push here"')
    assert_equal(['Push here'], quoted.attributes_at('value'))

    repeated = Annotation.new(0, 1, 'value="Push" value="Push here"')
    assert_equal(['Push', 'Push here'], repeated.attributes_at('value'))
  end

  # contain? accepts the span itself and an inner span, and rejects spans that
  # stick out on either side.
  def test_contain
    outer = Annotation.new(2, 3, 'mytag')
    inner_empty = Annotation.new(2, 2, 'mytag')
    starts_before = Annotation.new(0, 2, 'mytag')
    ends_after = Annotation.new(2, 4, 'mytag')
    assert(outer.contain?(outer))
    assert(outer.contain?(inner_empty))
    assert(!outer.contain?(starts_before))
    assert(!outer.contain?(ends_after))
  end

  # Adding an annotation equal to an existing member leaves a Set unchanged.
  def test_hash
    members = [[2, 3], [2, 2], [0, 2], [2, 4]].map do |s, e|
      Annotation.new(s, e, 'mytag')
    end
    base = Set.new(members)
    extended = base.clone
    extended << Annotation.new(2, 4, 'mytag')
    assert_equal(base, extended)
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/env ruby | |
| # -*- coding: utf-8; mode: ruby -*- | |
| require 'test/unit' | |
| require 'standoff' | |
| require 'pp' | |
| require 'stringio' | |
| require 'tempfile' | |
# Test-only helpers that collect what the each_* iterators yield.
class StandOff
  # Array of everything each_containing_annotation(x) yields, in order.
  def containings(x)
    found = []
    each_containing_annotation(x) { |ann| found.push(ann) }
    found
  end

  # Array of everything each_contained_annotation(x) yields, in order.
  def containeds(x)
    found = []
    each_contained_annotation(x) { |ann| found.push(ann) }
    found
  end
end
# Unit tests for StandOff.
#
# Annotation fixtures are "start<TAB>end<TAB>tag" lines; the tabs are written
# as \t escapes because the annotation parser only accepts tab separators.
class TC_StandOff < Test::Unit::TestCase
  TMP = Tempfile.new('tmp.standoff')
  LETTER_ANNON = {
    'one'  =>'1',
    'two'  =>'2',
    'three'=>'3',
    'four' =>'4' }

  # @so:  body '1234' with nested and overlapping number-word annotations.
  # @so2: a short phrase with part-of-speech and synonym annotations.
  def setup
    @so = StandOff.new('1234')
    @so.read_annotation_file(StringIO.new(<<EOD))
0\t3\thundread-and-twenty-three
0\t2\ttwelve
0\t1\tone
1\t3\ttwenty-three
1\t2\ttwo
2\t4\tthirty-four
2\t3\tthree
3\t4\tFour
3\t4\tfour
EOD
    s = <<EOD
0\t22\tptb_article id="mydoc"
0\t22\tNP
4\t8\tNN
4\t8\tsynonym
14\t17\tJJ
14\t17\tsynonym
18\t22\tNN
18\t22\tsynonym
EOD
    @so2 = StandOff.new('the girl with red eyes', StringIO.new(s))
  end

  # add_annotations merges a second annotation file into ordered position.
  def test_join
    s1 = <<EOD
0\t22\tptb_article id="mydoc"
0\t22\tNP
4\t8\tNN
14\t17\tJJ
18\t22\tNN
EOD
    s2 = <<EOD
4\t8\tsynonym
14\t17\tsynonym
18\t22\tsynonym
EOD
    so = StandOff.new()
    so.read_annotation_file(StringIO.new(s1))
    so.add_annotations(StringIO.new(s2))
    assert_equal(<<EOD, so.annotations.map{|x| x.to_s+"\n"}.join)
0\t22\tptb_article id="mydoc"
0\t22\tNP
4\t8\tNN
4\t8\tsynonym
14\t17\tJJ
14\t17\tsynonym
18\t22\tNN
18\t22\tsynonym
EOD
  end

  # substr accepts either two offsets or an Annotation.
  def test_substr
    assert_equal('123', @so.substr(0, 3))
    assert_equal( '23', @so.substr(1, 3))
    assert_equal('123', @so.substr(Annotation.new(0, 3)))
  end

  def test_matching_annotations
    assert_equal([Annotation.new(0, 1, 'a')],
                 StandOff.new('', [Annotation.new(0, 1, 'A'),
                                   Annotation.new(0, 1, 'a'),
                                   Annotation.new(0, 1, 'b')]).
                   matching_disjoint_annotations(/^[a-z]$/))
    assert_equal([Annotation.new(0, 0, '0'),
                  Annotation.new(1, 2, 'b'),
                  Annotation.new(2, 3, 'c')],
                 StandOff.new('', [Annotation.new(0, 3, '-'),
                                   Annotation.new(0, 0, '0'),
                                   Annotation.new(1, 2, 'b'),
                                   Annotation.new(2, 3, 'c__'),
                                   Annotation.new(2, 3, 'c'),
                                   Annotation.new(3, 4, 'd')]).
                   matching_disjoint_annotations(/^.$/, Annotation.new(0, 3, '-')))
    regexp = Regexp.new('^('+LETTER_ANNON.keys.join('|')+')$')
    assert_equal(Set.new(LETTER_ANNON.to_a),
                 Set.new(@so.matching_disjoint_annotations(regexp).map{|x| [x.tag, @so.substr(x)]}))
    s = Set.new
    @so.each_matching_disjoint_annotation(regexp) do |a,_|
      s << [a.tag, @so.substr(a)]
    end
    assert_equal(Set.new(LETTER_ANNON.to_a), s)
  end

  # annotations with a block returns only the selected annotations.
  def test_annotations_select
    short = StandOff.new('', [Annotation.new(0, 2, ''),
                              Annotation.new(0, 1, ''),
                              Annotation.new(0, 0, '')]).annotations { |a| a.end - a.start <= 1 }
    assert_equal([Annotation.new(0,1,''), Annotation.new(0,0,'')], short)
  end

  def test_enclosing_annotation
    assert_equal(Annotation.new(0,1,'b'),
                 StandOff.new('', [Annotation.new(0, 1, 'a'),
                                   Annotation.new(0, 1, 'b'),
                                   Annotation.new(0, 1, 'c')]).enclosing_annotation(Annotation.new(0, 1, 'c')))
    assert_equal(nil,
                 StandOff.new('', [Annotation.new(0, 1, 'a'),
                                   Annotation.new(0, 1, 'b'),
                                   Annotation.new(0, 1, 'c')]).enclosing_annotation(Annotation.new(0, 100, 'd')))
    assert_equal(Annotation.new(0,2,'twelve'),
                 @so.enclosing_annotation([0,1]))
    assert_equal(Annotation.new(2,4,'thirty-four'),
                 @so.enclosing_annotation([2,3]))
    assert_equal(Annotation.new(1,3,'twenty-three'),
                 @so.enclosing_annotation([2,3],2))
    assert_equal(nil,
                 @so.enclosing_annotation([2,4]))
    assert_equal(Annotation.new(3,4,'Four'),
                 @so.enclosing_annotation([3,4,'four']))
  end

  # With range 0 only annotations with exactly the same span are near.
  def test_near
    so = StandOff.new('123', StringIO.new(<<EOD))
1\t3\ttwenty-three
1\t2\ttwo
1\t2\tTwo
1\t2\tTWO
2\t3\tthree
EOD
    assert_equal([Annotation.new(1,2,'TWO'),
                  Annotation.new(1,2,'Two'),
                  Annotation.new(1,2,'two')],
                 so.near_annotations(Annotation.new(1,2,'two'), 0))
  end

  def test_subset
    assert_equal([ Annotation.new(1,3,'twenty-three'),
                   Annotation.new(1,2,'two'),
                   Annotation.new(2,3,'three') ],
                 @so.subset([1,3,'twenty-three']).annotations)
    assert_equal(@so2.annotations, @so2.subset(@so2.annotations[0]).annotations)
  end

  # Array#bsearch_range / bsearch_lower_boundary on plain arrays and on the
  # annotation list ordered by [start, -end].
  def test_bsearch
    assert_equal((1...3), [1,2,2,3,4].bsearch_range{|x| x <=> 2})
    assert_equal((0...0), [1,2,2,3,4].bsearch_range{|x| x <=> 0})
    assert_equal((4...4), [1,2,2,3,4].bsearch_range{|x| x <=> 3.5})
    assert_equal((5...5), [1,2,2,3,4].bsearch_range{|x| x <=> 5})
    assert_equal((4...5), [1,2,2,3,4].bsearch_range{|x| x <=> 4})
    assert_equal((0...0), [].bsearch_range{|x| x <=> 2})
    assert_equal((2...2), [[1,2],[2,1],[5,6],[5,1]].bsearch_range{|x| x[0] <=> 4})
    assert_equal(Annotation.new(2,4,'thirty-four'),
                 @so.annotations[@so.annotations.bsearch_lower_boundary{|x| [x.start, -x.end] <=> [2,-@so.body.length]}])
    assert_equal(Annotation.new(3,4,'Four'),
                 @so.annotations[@so.annotations.bsearch_lower_boundary{|x| [x.start, -x.end] <=> [2,0]}])
    assert_equal(0,
                 @so.annotations.bsearch_lower_boundary{|x| [x.start, -x.end] <=> [0,-@so.body.length]})
  end

  # Re-inserting the annotations one by one in reverse order, each before its
  # equals, rebuilds the same list.
  def test_add_annotation
    so = StandOff.new
    @so.annotations.reverse.each do |a|
      so.add_annotation(a, false)
    end
    assert_equal(@so.annotations,
                 so.annotations)
  end

  def test_following_preceding
    assert_equal([Annotation.new(2,3,'three'),Annotation.new(3,4,'four')],
                 @so.following_annotations([0,2],2))
    assert_equal([Annotation.new(2,3,'three')],
                 @so.following_annotations([0,2],1))
    assert_equal([Annotation.new(3,4,'four')],
                 @so.following_annotations([0,3],1))
    assert_equal([],
                 @so.following_annotations([0,4],1))
    assert_equal([@so.following_annotations([0,2],1)[0],
                  @so.following_annotations(@so.following_annotations([0,2],1)[0],1)[0]],
                 @so.following_annotations([0,2],2))
    assert_equal([Annotation.new(0,1,'one'),Annotation.new(1,2,'two'),Annotation.new(2,3,'three')],
                 @so.preceding_annotations([3,4],3))
    assert_equal([Annotation.new(0,1,'one'),Annotation.new(1,2,'two'),Annotation.new(2,3,'three')],
                 @so.preceding_annotations(@so.annotations.last,3))
    assert_equal([], @so.preceding_annotations([0,3]))
    assert_equal([], @so.following_annotations([2,4]))
    assert_equal([Annotation.new(2,3,'three'),Annotation.new(3,4,'four')],
                 @so.preceding_annotations([4,4],2))
  end

  def test_following_preceding2
    assert_equal([Annotation.new(4,8,'synonym')],
                 @so2.preceding_annotations([14,17], 3))
    assert_equal([Annotation.new(4,8,'synonym'),Annotation.new(14,17,'synonym')],
                 @so2.preceding_annotations([18,22], 3))
    assert_equal([], @so2.preceding_annotations([4,8], 3))
    assert_equal([Annotation.new(14,17,'synonym'), Annotation.new(18,22,'synonym')],
                 @so2.following_annotations([4,8], 3))
  end

  def test_following_preceding3
    # TODO: support the case where an empty annotation is preceding
    # so = StandOff.new(@so.body)
    # so.add_annotations(@so.annotations)
    # so.add_annotation(Annotation.new(0,0,'BEGIN'))
    # so.add_annotation(Annotation.new(4,4,'END'))
    # assert_equal([Annotation.new(0,0,'BEGIN')],
    #              so.preceding_annotations([0,1],2))
  end

  def test_following_preceding4
    so = StandOff.new('the girl with red eyes , girl and chicks')
    so.read_annotation_file(StringIO.new(<<EOD))
0\t40\tptb_article id="mydoc"
0\t22\tS
0\t22\tNP-SBJ-1
4\t8\tNN
4\t8\tNN2
14\t17\tJJ
14\t17\tJJ2
18\t22\tNN
18\t22\tNN2
25\t29\tNN
34\t40\tNN
EOD
    assert_equal([Annotation.new(14,17,'JJ2'),
                  Annotation.new(18,22,'NN2'),
                  Annotation.new(25,29,'NN')], so.following_annotations([4,8], 3))
  end

  def test_contain2
    assert_equal([Annotation.new(4,8,'NN'),
                  Annotation.new(4,8,'synonym')],
                 @so2.containeds([0,16]))
    assert_equal([Annotation.new(14,17,'JJ'),
                  Annotation.new(14,17,'synonym')],
                 @so2.containeds([14,18]))
    assert_equal([Annotation.new(0,22,'ptb_article id="mydoc"'),
                  Annotation.new(0,22,'NP')],
                 @so2.containings([14,18]))
    # assert_equal([Annotation.new(14,17,'synonym')],
    #              @so2.containeds([14,17,'synonym']))
    # assert_equal([Annotation.new(0,22,'ptb_article id="mydoc"'),
    #               Annotation.new(0,22,'NP')],
    #              @so2.containings([14,17,'JJ']))
  end

  def test_contained_annotations
    s = []
    @so.each_contained_annotation([1,3]) do |a|
      s << a
    end
    assert_equal([Annotation.new(1,3,'twenty-three'),
                  Annotation.new(1,2,'two'),
                  Annotation.new(2,3,'three')],
                 s)
  end

  def test_containing_annotations
    ans1 = [Annotation.new(0,3,'hundread-and-twenty-three'),
            Annotation.new(0,2,'twelve'),
            Annotation.new(0,1,'one')]
    assert_equal(ans1,
                 @so.containings([0,1]))
    s = []
    @so.each_containing_annotation([0,1]) do |a|
      s << a
    end
    assert_equal(ans1, s)
    [[2,3], Annotation.new(2,3)].each do |region|
      assert_equal([Annotation.new(0,3,'hundread-and-twenty-three'),
                    Annotation.new(1,3,'twenty-three'),
                    Annotation.new(2,4,'thirty-four'),
                    Annotation.new(2,3,'three')],
                   @so.containings(region))
    end
  end

  # Annotations written to a file and read back are unchanged.
  def test_write_annotation
    # Block form closes the write handle; write_annotation_file only flushes.
    File.open(TMP.path,'w') { |f| @so.write_annotation_file(f) }
    so = StandOff.new(@so.body)
    TMP.close
    so.read_annotation_file(File.open(TMP.path,'r'))
    assert_equal(@so.annotations,
                 so.annotations)
  end

  def test_render
    assert_equal("<one>1</one><two>2</two><three>3</three><four>4</four>",
                 @so.render{|a| a.tag =~ /^(one|two|three|four)$/})
  end

  # Zero-length annotations go through the empty-tag printer.
  def test_render_empties
    so = StandOff.new('1234.')
    so.add_annotations([
                        Annotation.new(0, 1, 'thousands'),
                        Annotation.new(1, 1, 'thousands-separator'),
                        Annotation.new(1, 4, 'ones'),
                       ])
    assert_equal("<THOUSANDS>1</THOUSANDS><ONES>234</ONES>.",
                 so.render(
                           Proc.new{|a| "<#{a.tag_name.upcase}>"},
                           Proc.new{|a| "</#{a.tag_name.upcase}>"},
                           Proc.new{|a| ""}
                           )
                 )
    assert_equal("<thousands>1</thousands><ones><thousands-separator/>234</ones>.",
                 so.render)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment