brendano · June 22, 2012 03:23
diff --git a/page.rb b/page.rb
 #!/usr/bin/env ruby
 #
 # Data structures and processing of documents to be indexed, e.g. wikipedia
 # pages.  ok, everything is wikipedia-specific.  :-)
 #
 # This file can be executed for various sorts of testing (see bottom)


 require File.dirname(__FILE__)+'/common'

 require 'ostruct'
 require 'digest/md5'
 require 'rubygems'
 gem 'facets'
 require 'xml/libxml'

 require File.dirname(__FILE__)+'/../sentbreaker/sentbreaker'
 require File.dirname(__FILE__)+'/semrep'

 # $sentence_breaker = SentenceBreaker.new
 # Placeholder parser: every call made before a real parser has been assigned
 # to $index_parser fails with an explanatory error.
 $index_parser = Object.new
 # method_missing receives the missing method's name plus its arguments, so it
 # must accept them.  A zero-parameter definition makes Ruby raise
 # ArgumentError and the message below is never shown.
 def $index_parser.method_missing(name, *args)
  raise "Error, no index parser was ever specified, please set $index_parser"
 end

 # "abstract" class
 class Page
  # Plain read/write fields; WikiPage fills them in while parsing.
  attr_accessor :title
  attr_accessor :sections
  attr_accessor :id
  attr_accessor :body
 end

 class WikiPage < Page
  attr_accessor :xmldoc
  attr_accessor :categories_etc   # just text

  # Parses +pagestr+ (XML text) with libxml, keeps the resulting document in
  # @xmldoc and immediately runs parse_basics! on it.
  def initialize(pagestr)
    xml_parser = XML::Parser.new
    xml_parser.string = pagestr
    @xmldoc = xml_parser.parse
    parse_basics!
  end

  # Asks every section, in order, to split its text into sentences.
  def break_sentences!
    sections.each(&:break_sentences!)
  end

  # xml/wiki parse
  #
  # Populates the page from the parsed XML: title, id, body, categories_etc,
  # the flat @sections list, and the section tree with per-section title paths.
  def parse_basics!
    parse_title!
    # NOTE(review): title is still nil if parse_title! found no title element;
    # hexdigest would then raise -- confirm such pages never reach this point.
    @id = Digest::MD5.hexdigest(title)

    # narrow down if appropriate: @body keeps the whole raw text, while
    # everything from the first "[[Category:" (case-insensitive) onward is
    # moved out of coretext into @categories_etc
    coretext = rawtext
    @body = coretext
    if rawtext =~ / \[\[Category: /imx
      coretext = $~.pre_match
      @categories_etc = $~[0] + $~.post_match
    end

    # sections: collect one SectionHeader per "= Title =" style line.
    # Regexp#matches is not core Ruby; presumably it comes from common or
    # facets and yields a MatchData per match -- TODO confirm.
    section_headers = []
    /^\s* = .*[^=].* = \s*$ /x.matches(coretext) do |match|
      section_headers << SectionHeader.create_from_match(match)
    end
    # synthetic header so the text before the first real header becomes a
    # section too; :end => -1 makes that section's text start at offset 0
    dummy_start = SectionHeader.new(:begin=>0, :end=>-1, :title => "ARTICLE_START")
    section_headers.unshift( dummy_start )
    # ppy section_headers

    # segment the text: each section runs from just past its own header to
    # the character before the next header (or to the end of coretext)
    @sections = []
    section_headers.each_with_index do |sh, i|
      section_end = section_headers[i+1] ? (section_headers[i+1].begin - 1) : -1
      text = coretext[(sh.end+1) .. section_end] || ""
      @sections << Section.new(
          self,
          :text => text,
          :text_for_indexer => WikiPage.wiki_cleanup_smallscale(WikiPage.wiki_cleanup_largescale(text)),
          # :header => sh,
          :title => sh.title,
          :title_for_indexer => WikiPage.wiki_cleanup_smallscale(sh.title),
          :depth => (sh.orig_str[/=+/] || OpenStruct.new).size
      )
    end
    # depth above is the length of the first run of '=' in the header line;
    # the dummy header has none, and its depth is overwritten just below

    # normalize depths: 1 depth for the top siblings, same as ARTICLE_START
    @sections[0].depth = 1
    mindepth = @sections[1..-1].map{|s| s.depth}.min || 0
    @sections[1..-1].each{|s| s.depth = s.depth - mindepth + 1}

    # build the tree, then give every node a "A -> B -> C" title path;
    # path[1..-1] drops the first ancestor (the SECTION_ROOT node)
    make_section_tree!
    dfs_walk(@section_root) do |path, sec|
      sec.title_path = ((path[1..-1] || []) + [sec]).map{|s| s.title}.join(" -> ")
      sec.title_path_for_indexer = WikiPage.wiki_cleanup_smallscale(sec.title_path)
    end

    # extract out non-core-textual sections?
    # non_core_section_headers = section_headers.select do |sh|
    #   sh.title =~ /^ (other websites | see also | references) $/ix
    # end
    # puts section_headers.map{|sh| sh.title}
  end
  
  # Depth-first walk starting at +node+.  Yields a [path, node] pair for each
  # node visited, where path lists the ancestors from the start node down to
  # the parent (empty for the start node itself).
  def dfs_walk(node)
    _dfs_walk([], node) do |ancestors, current|
      yield [ancestors, current]
    end
  end

  # Recursive worker for dfs_walk: yields [path, node] for +node+ first and
  # then for each descendant, extending the path by one ancestor per level.
  def _dfs_walk(path, node)
    yield [path, node]
    node.children.each do |child|
      trail = path + [node]
      _dfs_walk(trail, child) { |sub_path, sub_node| yield [sub_path, sub_node] }
    end
  end

  # Builds the section hierarchy under a fresh depth-0 SECTION_ROOT node.
  # Each section is attached to the closest preceding node (root included)
  # whose depth is strictly smaller than its own.
  def make_section_tree!
    @section_root = Section.new(self, :title => "SECTION_ROOT", :depth => 0)
    sections.each_with_index do |current, index|
      earlier = sections[0...index] || []
      candidates = ([@section_root] + earlier).reverse
      owner = candidates.find { |prev| prev.depth <= current.depth - 1 }
      raise "impossible to not find parent now" unless owner
      owner.children << current
    end
    # could do better depth normalization here if we wanted
  end

  # One section of a page: its raw and cleaned text and title, its position
  # in the section tree (depth, children, title path) and its sentences.
  class Section
    attr_accessor :id, :text, :text_for_indexer, :title, :title_for_indexer
    attr_accessor :sentences
    attr_accessor :children, :depth, :title_path, :title_path_for_indexer

    # +hash+ is handed to fill_attrs! (defined outside this file); the id is
    # the MD5 of the owning page's id followed by this section's title.
    def initialize(page, hash={})
      fill_attrs! hash
      @sentences = []
      @children = []
      @id = Digest::MD5.hexdigest(page.id + title)
    end

    # Cleans the section text (large-scale, then small-scale), splits it into
    # lines and lets $sentence_breaker cut each line into Sentence objects.
    def break_sentences!
      cleaned = WikiPage.wiki_cleanup_smallscale(
        WikiPage.wiki_cleanup_largescale(text || ""))  # super clean

      # if newlines can happen in the middle of sentences, we'd want to
      # strategically join back together lines right here.  this seems to
      # happen sometimes but not often, so let's not worry about it.
      per_line = (cleaned || "").split("\n").map do |line|
        $sentence_breaker.break(line).map { |sent_text| Sentence.new(sent_text, self) }
      end
      @sentences = per_line.flatten
    end

  end

  # A section header line located in the wiki text: the matched string, the
  # cleaned-up title, and the begin/end offsets of the match.
  class SectionHeader
    attr_accessor :title, :begin, :end, :orig_str

    # Starts with empty title/orig_str, then applies +hash+ via fill_attrs!
    # (defined outside this file).
    def initialize(hash={})
      @title = ""
      @orig_str = ""
      fill_attrs! hash
    end

    # Builds a header from a MatchData of the header-line regex.  The title
    # is the text between the '=' runs, stripped and with whitespace runs
    # collapsed; it stays "" when that text cannot be extracted.
    def self.create_from_match(match)
      header = SectionHeader.new
      header.orig_str = match[0]
      raw_name = header.orig_str[ /^\s*=+ ([^=] .* [^=] )  =+ \s*$/x, 1]
      # leave in [[ ]]  markup...
      header.title = raw_name.strip.gsub(/\s{2,99}/, ' ') if raw_name
      header.begin = match.begin(0)
      header.end = match.end(0)
      header
    end
  end

  # this function should only do cleanups that are prerequisite for the
  # sentence breaker
  # therefore, dont do any small scale cleanups that could be done in-sentence.
  # e.g. anchor text cleanup.  we'd like to use those potentially...
  #
  def self.wiki_cleanup_largescale(s)
    # Deletion passes, applied strictly in this order; whatever a pattern
    # matches is removed from the text.
    removals = [
      # infoboxes are complex:  {{ .. \n|key=val.. \n|key2=val2 .. \n}}\n
      / \{\{ [^\n]* \n 
                (\| [^\n]* \n )+
                  \}\}  /mx,
      # more tables, sloppier regex
      /^\s* \{\| \s* class="wikitable" .*?  \n\|\} /mx,
      # really sloppy, yikes
      /^\s* \{\| .*?  \n\|\} /mx,
      /<!-- .*? -->/mx,
      / <ref [^>]* \/> /mx,
      / <ref .*? <\/ref> /mx,
      / \{\{ cite  [^}]*  \}\}/mx,
      / \{\{  ([^}]*)  \}\}/mx,
      / < [^>]* > /x   # arbitrary html or html-like tags
    ]
    stripped = removals.inject(s) { |acc, pattern| acc.gsub(pattern, '') }
    # join together anchor texts that are spanning multiple lines
    stripped.gsub(/ \[\[ [^\]]* \n [^\]]*  \]\] /mx) { $~.to_s.gsub("\n"," ") }
  end

  # In-sentence wiki markup cleanup.  Removes bold/italic quote markers,
  # reduces wiki links and bracketed external links to their anchor text, and
  # strips leading list bullets.  Returns the cleaned string.
  def self.wiki_cleanup_smallscale(s)
    s = s.gsub(/'''/,"").gsub(/''/,"")
    s = s.gsub(/ \[\[   ([^\|\[\]]*)     \]\] /x,    '\1')  # [[Target]] -> Target
    s = s.gsub(/ \[\[  [^\]]*  \|  ([^\]]+)  \]\] /x,'\1')  # [[Target|label]] -> label
    # web links: accept https as well as http, otherwise [https://... label]
    # markup would be left in the text untouched
    s = s.gsub(/ \[https?:[^\s]+ \s* ([^\]]*) \] /x, '\1')
    s = s.gsub(/ ^\*+ \s* /x, '')       # a list
  end

  # A single sentence of a section: the wiki text, the cleaned text handed to
  # the parser, a stable id, and (after parse!) its semantic representation.
  class Sentence
    attr_accessor :text, :text_for_parser, :semrep, :id

    # The id is the MD5 of the owning section's id followed by the raw text.
    def initialize(text, section)
      @text = text
      @text_for_parser = get_text_for_parser(text)
      @id = Digest::MD5.hexdigest(section.id + text)
    end

    # in a smarter world, save the output in a standoff-y way to exploit it for
    # name/coref resolution
    def get_text_for_parser(wiki_text)
      WikiPage.wiki_cleanup_smallscale(wiki_text)
    end

    # Sends the cleaned text to $index_parser and wraps the returned triples
    # in the "<parser_name>SemRep" class.
    def parse!(no_lex=false)
      dep_triples = $index_parser.parse_dep(@text_for_parser)
      # NOTE(review): the class name comes from the parser object and is
      # resolved with eval -- only safe while that object is trusted.
      semrep_name = $index_parser.parser_name + "SemRep"
      @semrep = eval(semrep_name).new(dep_triples, no_lex)
    end
  end

  # Sets @title to the content of the first //title element's child node.
  # Returns nil and leaves @title untouched when there is no such element or
  # it has no child.
  def parse_title!
    # first gives nil for an empty result set; the each{|t| break t} form
    # only did so when each itself happened to return nil
    titleelt = @xmldoc.find('//title').first
    return unless titleelt
    c = titleelt.child
    return unless c
    @title = c.content
  end
      
  # Returns the content of the first //text element's child node, memoized in
  # @rawtext.  Yields "" when the element or its child is missing.
  def rawtext
    return @rawtext if @rawtext
    # first gives nil for an empty result set; the each{|t| break t} form
    # only did so when each itself happened to return nil
    textelt = @xmldoc.find('//text').first
    textnode = textelt && textelt.child
    @rawtext = textnode ? textnode.content : ""
  end

 end


  
 # Command-line test harness, active only when this file is run directly:
 # evaluates a Ruby snippet from the command line against every page (or,
 # with the sentence option, every sentence) produced from stdin.
 if __FILE__ == $0
  
 banner_msg = <<-EOS
 testing: put the code per page on the cmdline
 e.g.  (./w here is "bzcat enwiki...xml.bz2" or "cat test/obama.xml")

 examples ...

 test page title extraction:  
 ./w | lib/page.rb 'puts title'

 test section extraction:
 ./w | lib/page.rb 'sections.each{|sec| puts "DOC: #\{self.title\} -- SEC: #\{sec.title\}"}'

 test sentence extraction and cleanup:
 ./w | lib/page.rb -s 'puts "*** #\{text_for_parser\}"'

 test sentence parsing/semrepping:
 ./w | lib/page.rb -s 'parse!'

 view minipar triples per sentence:
 ./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.parse_triples'

 view sempairs per sentence semrep:
 ./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.sempairs'

 view sempairs per sentence semrep, without lex lookup:
 ./w | lib/page.rb -s 'parse! true; puts text_for_parser; puts semrep.sempairs'

 options ...
 EOS

 # unbuffered output
 STDOUT.sync = true
 require File.dirname(__FILE__)+'/wikidump'
 require 'trollop'

 # the parser default is read from $miniq_conf, which is not set in this file
 opts = Trollop::options do
  banner banner_msg
  opt :page, "run code per page", :default => true
  opt :sentence, "run code per sentence", :default => false
  opt :parser, "parser to use (as a drb uri)", :default => $miniq_conf['index_parser']
 end

 # replace the error-raising placeholder with a DRb proxy for the given URI
 # NOTE(review): nothing visible here requires 'drb'; presumably common or
 # wikidump loads it -- confirm.
 $index_parser = DRbObject.new(nil, opts[:parser])

 # the snippet to run is whatever remains on the command line; fall back to a
 # default printer for the selected mode when nothing was given
 cmd = ARGV.join(" ")
 cmd = 'puts "*** #{text}\n=== #{text_for_parser}"' if cmd == "" && opts[:sentence]
 cmd = 'puts "-- #{title}"' if cmd == "" && opts[:page]

 # require 'unprof'

 # NOTE: cmd is eval'ed with the sentence/page as self -- this deliberately
 # runs arbitrary code supplied by whoever invokes the script.
 if opts[:sentence]
  # NOTE(review): break_sentences! uses $sentence_breaker, whose assignment
  # is commented out near the top of this file -- confirm where it is set.
  WikiDump.yield_pages $stdin do |page|
    page.break_sentences!
    page.sections.each do |sec|
      sec.sentences.each do |sent|
        sent.instance_eval { eval cmd }
      end
    end
  end
 elsif opts[:page]
  WikiDump.yield_pages $stdin do |page|
    page.instance_eval { eval cmd }
  end
 else
  # reached only when both the sentence and page options are false
  Trollop::die "Illegal options"
 end

 end
	#!/usr/bin/env ruby
	#
	# Data structures and processing of documents to be indexed, e.g. wikipedia
	# pages. ok, everything is wikipedia-specific. :-)
	#
	# This file can be executed for various sorts of testing (see bottom)


	require File.dirname(__FILE__)+'/common'

	require 'ostruct'
	require 'digest/md5'
	require 'rubygems'
	gem 'facets'
	require 'xml/libxml'

	require File.dirname(__FILE__)+'/../sentbreaker/sentbreaker'
	require File.dirname(__FILE__)+'/semrep'

	# $sentence_breaker = SentenceBreaker.new
	# Placeholder parser: every call made before a real parser has been assigned
	# to $index_parser fails with an explanatory error.
	$index_parser = Object.new
	# method_missing receives the missing method's name plus its arguments, so it
	# must accept them.  A zero-parameter definition makes Ruby raise
	# ArgumentError and the message below is never shown.
	def $index_parser.method_missing(name, *args)
	  raise "Error, no index parser was ever specified, please set $index_parser"
	end

	# "abstract" class
	class Page
	  # Plain read/write fields; WikiPage fills them in while parsing.
	  attr_accessor :title
	  attr_accessor :sections
	  attr_accessor :id
	  attr_accessor :body
	end

	class WikiPage < Page
	attr_accessor :xmldoc
	attr_accessor :categories_etc # just text

	# Parses +pagestr+ (XML text) with libxml, keeps the resulting document in
	# @xmldoc and immediately runs parse_basics! on it.
	def initialize(pagestr)
	  xml_parser = XML::Parser.new
	  xml_parser.string = pagestr
	  @xmldoc = xml_parser.parse
	  parse_basics!
	end

	# Asks every section, in order, to split its text into sentences.
	# Returns the sections collection.
	def break_sentences!
	  sections.each { |sec| sec.break_sentences! }
	end

	# xml/wiki parse
	#
	# Populates the page from the parsed XML: title, id, body, categories_etc,
	# the flat @sections list, and the section tree with per-section title paths.
	def parse_basics!
	  parse_title!
	  # NOTE(review): title is still nil if parse_title! found no title element;
	  # hexdigest would then raise -- confirm such pages never reach this point.
	  @id = Digest::MD5.hexdigest(title)

	  # narrow down if appropriate: @body keeps the whole raw text, while
	  # everything from the first "[[Category:" (case-insensitive) onward is
	  # moved out of coretext into @categories_etc
	  coretext = rawtext
	  @body = coretext
	  if rawtext =~ / \[\[Category: /imx
	    coretext = $~.pre_match
	    @categories_etc = $~[0] + $~.post_match
	  end

	  # sections: collect one SectionHeader per "= Title =" style line.
	  # Regexp#matches is not core Ruby; presumably it comes from common or
	  # facets and yields a MatchData per match -- TODO confirm.
	  section_headers = []
	  /^\s* = .*[^=].* = \s*$ /x.matches(coretext) do |match|
	    section_headers << SectionHeader.create_from_match(match)
	  end
	  # synthetic header so the text before the first real header becomes a
	  # section too; :end => -1 makes that section's text start at offset 0
	  dummy_start = SectionHeader.new(:begin=>0, :end=>-1, :title => "ARTICLE_START")
	  section_headers.unshift( dummy_start )
	  # ppy section_headers

	  # segment the text: each section runs from just past its own header to
	  # the character before the next header (or to the end of coretext)
	  @sections = []
	  section_headers.each_with_index do |sh, i|
	    section_end = section_headers[i+1] ? (section_headers[i+1].begin - 1) : -1
	    text = coretext[(sh.end+1) .. section_end] || ""
	    @sections << Section.new(
	        self,
	        :text => text,
	        :text_for_indexer => WikiPage.wiki_cleanup_smallscale(WikiPage.wiki_cleanup_largescale(text)),
	        # :header => sh,
	        :title => sh.title,
	        :title_for_indexer => WikiPage.wiki_cleanup_smallscale(sh.title),
	        :depth => (sh.orig_str[/=+/] || OpenStruct.new).size
	    )
	  end
	  # depth above is the length of the first run of '=' in the header line;
	  # the dummy header has none, and its depth is overwritten just below

	  # normalize depths: 1 depth for the top siblings, same as ARTICLE_START
	  @sections[0].depth = 1
	  mindepth = @sections[1..-1].map{|s| s.depth}.min || 0
	  @sections[1..-1].each{|s| s.depth = s.depth - mindepth + 1}

	  # build the tree, then give every node a "A -> B -> C" title path;
	  # path[1..-1] drops the first ancestor (the SECTION_ROOT node)
	  make_section_tree!
	  dfs_walk(@section_root) do |path, sec|
	    sec.title_path = ((path[1..-1] || []) + [sec]).map{|s| s.title}.join(" -> ")
	    sec.title_path_for_indexer = WikiPage.wiki_cleanup_smallscale(sec.title_path)
	  end

	  # extract out non-core-textual sections?
	  # non_core_section_headers = section_headers.select do |sh|
	  #   sh.title =~ /^ (other websites | see also | references) $/ix
	  # end
	  # puts section_headers.map{|sh| sh.title}
	end

	# Depth-first walk starting at +node+.  Yields a [path, node] pair for each
	# node visited, where path lists the ancestors from the start node down to
	# the parent (empty for the start node itself).
	def dfs_walk(node)
	  _dfs_walk([], node) { |p, n| yield [p, n] }
	end

	# Recursive worker for dfs_walk: yields [path, node] for +node+ first and
	# then for each descendant, extending the path by one ancestor per level.
	def _dfs_walk(path, node)
	  yield [path, node]
	  node.children.each do |c|
	    _dfs_walk(path + [node], c) { |p, n| yield [p, n] }
	  end
	end

	# Builds the section hierarchy under a fresh depth-0 SECTION_ROOT node.
	# Each section is attached to the closest preceding node (root included)
	# whose depth is strictly smaller than its own.
	def make_section_tree!
	  @section_root = Section.new(self, :title=>"SECTION_ROOT", :depth=>0)
	  sections.each_with_index do |sec, i|
	    sections_before = [@section_root] + (sections[0...i] || [])
	    parent = sections_before.reverse.find { |s| s.depth <= sec.depth-1 }
	    raise "impossible to not find parent now" unless parent
	    parent.children << sec
	  end
	  # could do better depth normalization here if we wanted
	end

	# One section of a page: its raw and cleaned text and title, its position
	# in the section tree (depth, children, title path) and its sentences.
	class Section
	  attr_accessor :id, :text, :text_for_indexer, :title, :title_for_indexer
	  attr_accessor :sentences
	  attr_accessor :children, :depth, :title_path, :title_path_for_indexer

	  # +hash+ is handed to fill_attrs! (defined outside this file); the id is
	  # the MD5 of the owning page's id followed by this section's title.
	  def initialize(page, hash={})
	    fill_attrs! hash
	    @id = Digest::MD5.hexdigest(page.id + title)
	    @sentences = []
	    @children = []
	  end

	  # Cleans the section text (large-scale, then small-scale), splits it into
	  # lines and lets $sentence_breaker cut each line into Sentence objects.
	  def break_sentences!
	    # return (@sentences = []) if $evil_global_dont_sentence_break

	    text_first_clean = WikiPage.wiki_cleanup_largescale(text || "")
	    text_first_clean = WikiPage.wiki_cleanup_smallscale(text_first_clean)  # super clean
	    lines = (text_first_clean || "").split("\n")

	    # if newlines can happen in the middle of sentences, we'd want to
	    # strategically join back together lines right here.  this seems to
	    # happen sometimes but not often, so let's not worry about it.

	    @sentences = lines.map do |line|
	      # line = WikiPage.wiki_cleanup_largescale(line)
	      $sentence_breaker.break(line).map do |sent_text|
	        Sentence.new(sent_text, self)
	      end
	    end.flatten
	  end

	end

	# A section header line located in the wiki text: the matched string, the
	# cleaned-up title, and the begin/end offsets of the match.
	class SectionHeader
	  attr_accessor :title, :begin, :end, :orig_str

	  # Starts with empty title/orig_str, then applies +hash+ via fill_attrs!
	  # (defined outside this file).
	  def initialize(hash={})
	    @orig_str = ""
	    @title = ""
	    fill_attrs! hash
	  end

	  # Builds a header from a MatchData of the header-line regex.  The title
	  # is the text between the '=' runs, stripped and with whitespace runs
	  # collapsed; it stays "" when that text cannot be extracted.
	  def self.create_from_match(match)
	    sh = SectionHeader.new
	    sh.instance_eval do
	      @orig_str = match[0]
	      # the quantifiers matter: optional leading whitespace, then the
	      # whole title between the '=' runs
	      raw_name = @orig_str[ /^\s*=+ ([^=] .* [^=] )  =+ \s*$/x, 1]
	      if raw_name
	        @title = raw_name.strip.gsub(/\s{2,99}/, ' ')
	        # leave in [[ ]]  markup...
	      end
	      @begin = match.begin(0)
	      @end = match.end(0)
	    end
	    sh
	  end
	end

	# this function should only do cleanups that are prerequisite for the
	# sentence breaker
	# therefore, dont do any small scale cleanups that could be done in-sentence.
	# e.g. anchor text cleanup. we'd like to use those potentially...
	#
	# Removes multi-line wiki constructs (infoboxes, tables, comments, refs,
	# templates, html-like tags) and joins link anchors that span lines.
	# Returns the cleaned string.
	def self.wiki_cleanup_largescale(s)
	  # infoboxes are complex:  {{ .. \n|key=val.. \n|key2=val2 .. \n}}\n
	  s = s.gsub(/ \{\{ [^\n]* \n
	              (\| [^\n]* \n )+
	                \}\}  /mx,  '')
	  # more tables, sloppier regex
	  s = s.gsub(/^\s* \{\| \s* class="wikitable" .*?  \n\|\} /mx, '')
	  # really sloppy, yikes
	  s = s.gsub(/^\s* \{\| .*?  \n\|\} /mx, '')

	  s = s.gsub(/<!-- .*? -->/mx, '')
	  s = s.gsub(/ <ref [^>]* \/> /mx, '')
	  s = s.gsub(/ <ref .*? <\/ref> /mx, '')
	  s = s.gsub(/ \{\{ cite  [^}]*  \}\}/mx, '')
	  # s = s.gsub(/ \{\{  [^|]*  \|  ([^}]*)  \}\}/mx,  '\1')
	  s = s.gsub(/ \{\{  ([^}]*)  \}\}/mx,            '')
	  s = s.gsub(/ < [^>]* > /x, '')   # arbitrary html or html-like tags
	  # join together anchor texts that are spanning multiple lines
	  s = s.gsub(/ \[\[ [^\]]* \n [^\]]*  \]\] /mx) { $~.to_s.gsub("\n"," ") }
	end

	# In-sentence wiki markup cleanup.  Removes bold/italic quote markers,
	# reduces wiki links and bracketed external links to their anchor text, and
	# strips leading list bullets.  Returns the cleaned string.
	def self.wiki_cleanup_smallscale(s)
	  s = s.gsub(/'''/,"").gsub(/''/,"")
	  s = s.gsub(/ \[\[   ([^\|\[\]]*)     \]\] /x,    '\1')  # [[Target]] -> Target
	  s = s.gsub(/ \[\[  [^\]]*  \|  ([^\]]+)  \]\] /x,'\1')  # [[Target|label]] -> label
	  # web links: accept https as well as http, otherwise [https://... label]
	  # markup would be left in the text untouched
	  s = s.gsub(/ \[https?:[^\s]+ \s* ([^\]]*) \] /x, '\1')
	  s = s.gsub(/ ^\*+ \s* /x, '')       # a list
	end

	# A single sentence of a section: the wiki text, the cleaned text handed to
	# the parser, a stable id, and (after parse!) its semantic representation.
	class Sentence
	  attr_accessor :text, :text_for_parser, :semrep, :id

	  # The id is the MD5 of the owning section's id followed by the raw text.
	  def initialize(text, section)
	    @text = text
	    @text_for_parser = get_text_for_parser(text)
	    @id = Digest::MD5.hexdigest(section.id + text)
	  end

	  # in a smarter world, save the output in a standoff-y way to exploit it for
	  # name/coref resolution
	  def get_text_for_parser(wiki_text)
	    WikiPage.wiki_cleanup_smallscale(wiki_text)
	  end

	  # Sends the cleaned text to $index_parser and wraps the returned triples
	  # in the "<parser_name>SemRep" class.
	  def parse!(no_lex=false)
	    dep_triples = $index_parser.parse_dep(@text_for_parser)
	    # NOTE(review): the class name comes from the parser object and is
	    # resolved with eval -- only safe while that object is trusted.
	    semrep_name = $index_parser.parser_name + "SemRep"
	    @semrep = eval(semrep_name).new(dep_triples, no_lex)
	  end
	end

	# Sets @title to the content of the first //title element's child node.
	# Returns nil and leaves @title untouched when there is no such element or
	# it has no child.
	def parse_title!
	  # first gives nil for an empty result set; the each{|t| break t} form
	  # only did so when each itself happened to return nil
	  titleelt = @xmldoc.find('//title').first
	  return unless titleelt
	  c = titleelt.child
	  return unless c
	  @title = c.content
	end

	# Returns the content of the first //text element's child node, memoized in
	# @rawtext.  Yields "" when the element or its child is missing.
	def rawtext
	  return @rawtext if @rawtext
	  # first gives nil for an empty result set; the each{|t| break t} form
	  # only did so when each itself happened to return nil
	  textelt = @xmldoc.find('//text').first
	  textnode = textelt && textelt.child
	  @rawtext = textnode ? textnode.content : ""
	end

	end



	# Command-line test harness, active only when this file is run directly:
	# evaluates a Ruby snippet from the command line against every page (or,
	# with the sentence option, every sentence) produced from stdin.
	if __FILE__ == $0

	banner_msg = <<-EOS
	testing: put the code per page on the cmdline
	e.g. (./w here is "bzcat enwiki...xml.bz2" or "cat test/obama.xml")

	examples ...

	test page title extraction:
	./w | lib/page.rb 'puts title'

	test section extraction:
	./w | lib/page.rb 'sections.each{|sec| puts "DOC: #\{self.title\} -- SEC: #\{sec.title\}"}'

	test sentence extraction and cleanup:
	./w | lib/page.rb -s 'puts "*** #\{text_for_parser\}"'

	test sentence parsing/semrepping:
	./w | lib/page.rb -s 'parse!'

	view minipar triples per sentence:
	./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.parse_triples'

	view sempairs per sentence semrep:
	./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.sempairs'

	view sempairs per sentence semrep, without lex lookup:
	./w | lib/page.rb -s 'parse! true; puts text_for_parser; puts semrep.sempairs'

	options ...
	EOS

	# unbuffered output
	STDOUT.sync = true
	require File.dirname(__FILE__)+'/wikidump'
	require 'trollop'

	# the parser default is read from $miniq_conf, which is not set in this file
	opts = Trollop::options do
	  banner banner_msg
	  opt :page, "run code per page", :default => true
	  opt :sentence, "run code per sentence", :default => false
	  opt :parser, "parser to use (as a drb uri)", :default => $miniq_conf['index_parser']
	end

	# replace the error-raising placeholder with a DRb proxy for the given URI
	# NOTE(review): nothing visible here requires 'drb'; presumably common or
	# wikidump loads it -- confirm.
	$index_parser = DRbObject.new(nil, opts[:parser])

	# the snippet to run is whatever remains on the command line; fall back to a
	# default printer for the selected mode when nothing was given
	cmd = ARGV.join(" ")
	cmd = 'puts "*** #{text}\n=== #{text_for_parser}"' if cmd == "" && opts[:sentence]
	cmd = 'puts "-- #{title}"' if cmd == "" && opts[:page]

	# require 'unprof'

	# NOTE: cmd is eval'ed with the sentence/page as self -- this deliberately
	# runs arbitrary code supplied by whoever invokes the script.
	if opts[:sentence]
	  # NOTE(review): break_sentences! uses $sentence_breaker, whose assignment
	  # is commented out near the top of this file -- confirm where it is set.
	  WikiDump.yield_pages $stdin do |page|
	    page.break_sentences!
	    page.sections.each do |sec|
	      sec.sentences.each do |sent|
	        sent.instance_eval { eval cmd }
	      end
	    end
	  end
	elsif opts[:page]
	  WikiDump.yield_pages $stdin do |page|
	    page.instance_eval { eval cmd }
	  end
	else
	  # reached only when both the sentence and page options are false
	  Trollop::die "Illegal options"
	end

	end
No results found