Created
June 22, 2012 03:23
-
-
Save brendano/2969995 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env ruby | |
| # | |
| # Data structures and processing of documents to be indexed, e.g. wikipedia | |
| # pages. ok, everything is wikipedia-specific. :-) | |
| # | |
| # This file can be executed for various sorts of testing (see bottom) | |
| require File.dirname(__FILE__)+'/common' | |
| require 'ostruct' | |
| require 'digest/md5' | |
| require 'rubygems' | |
| gem 'facets' | |
| require 'xml/libxml' | |
| require File.dirname(__FILE__)+'/../sentbreaker/sentbreaker' | |
| require File.dirname(__FILE__)+'/semrep' | |
| # $sentence_breaker = SentenceBreaker.new | |
# Placeholder parser: until a real parser object is assigned to
# $index_parser, every method called on it fails with an explanatory error.
$index_parser = Object.new

# Ruby invokes method_missing with the missing method's name followed by the
# call's arguments, so the hook has to accept them. With a zero-parameter
# definition the call itself fails with ArgumentError and the message below
# is never shown.
def $index_parser.method_missing(_name, *_args)
  raise "Error, no index parser was ever specified, please set $index_parser"
end
# "abstract" class: the plain attribute holder that concrete page types
# (WikiPage below) inherit from.
class Page
  attr_accessor :title, :sections, :id, :body
end
| class WikiPage < Page | |
| attr_accessor :xmldoc | |
| attr_accessor :categories_etc # just text | |
# Builds a page from the XML of a single page.
#
# pagestr: the page's XML as a String. It is parsed into @xmldoc, after
# which parse_basics! fills in the remaining attributes.
def initialize(pagestr)
  parser = XML::Parser.new
  parser.string = pagestr
  @xmldoc = parser.parse
  parse_basics!
end
# Asks every section of this page to split its text into sentences.
# Returns the sections collection.
def break_sentences!
  sections.each(&:break_sentences!)
end
| # xml/wiki parse | |
# Derives the page's basic attributes from the parsed XML.
#
# Sets @title (via parse_title!), @id (MD5 of the title), @body,
# @categories_etc (only when a "[[Category:" link is present) and @sections,
# then builds the section tree and fills in each section's title path.
def parse_basics!
  parse_title!
  @id = Digest::MD5.hexdigest(title)

  # narrow down if appropriate: everything from the first "[[Category:" link
  # onwards is set aside and excluded from sectioning. @body is assigned
  # before the narrowing, so it holds the full raw text.
  coretext = rawtext
  @body = coretext
  category_match = / \[\[Category: /imx.match(rawtext)
  if category_match
    coretext = category_match.pre_match
    @categories_etc = category_match[0] + category_match.post_match
  end

  # sections
  section_headers = []
  /^\s* = .*[^=].* = \s*$ /x.matches(coretext) do |match|
    section_headers << SectionHeader.create_from_match(match)
  end
  # A synthetic header covering the text before the first real header.
  # end = -1 makes its text start at offset 0 below (end + 1).
  dummy_start = SectionHeader.new(:begin=>0, :end=>-1, :title => "ARTICLE_START")
  section_headers.unshift(dummy_start)
  # ppy section_headers

  # segment the text
  @sections = []
  section_headers.each_with_index do |sh, i|
    following = section_headers[i+1]
    # Exclusive upper bound. An inclusive bound of "next.begin - 1" turns into
    # -1 (meaning "through end of string") when the next header starts at
    # offset 0, which would hand the whole article to ARTICLE_START.
    section_stop = following ? following.begin : coretext.length
    text = coretext[(sh.end+1)...section_stop] || ""
    @sections << Section.new(
      self,
      :text => text,
      :text_for_indexer => WikiPage.wiki_cleanup_smallscale(WikiPage.wiki_cleanup_largescale(text)),
      # :header => sh,
      :title => sh.title,
      :title_for_indexer => WikiPage.wiki_cleanup_smallscale(sh.title),
      # number of leading "=" signs; 0 for the synthetic start header, whose
      # depth is overwritten just below anyway
      :depth => (sh.orig_str[/=+/] || "").size
    )
  end

  # normalize depths: 1 depth for the top siblings, same as ARTICLE_START
  @sections[0].depth = 1
  real_sections = @sections[1..-1]
  mindepth = real_sections.map { |s| s.depth }.min || 0
  real_sections.each { |s| s.depth = s.depth - mindepth + 1 }

  make_section_tree!
  dfs_walk(@section_root) do |path, sec|
    # path[0] is the synthetic root, so it is left out of the title path
    sec.title_path = ((path[1..-1] || []) + [sec]).map { |s| s.title }.join(" -> ")
    sec.title_path_for_indexer = WikiPage.wiki_cleanup_smallscale(sec.title_path)
  end

  # extract out non-core-textual sections?
  # non_core_section_headers = section_headers.select do |sh|
  #   sh.title =~ /^ (other websites | see also | references) $/ix
  # end
  # puts section_headers.map{|sh| sh.title}
end
# Depth-first, pre-order walk over a tree of nodes responding to #children.
# Yields one two-element array [path, node] per node, where path is the list
# of ancestors from the starting node down to the node's parent.
def dfs_walk(node, &visitor)
  _dfs_walk([], node, &visitor)
end

# Recursive worker for dfs_walk: visits node, then each child with node
# appended to the ancestor path.
def _dfs_walk(path, node, &visitor)
  yield [path, node]
  trail = path + [node]
  node.children.each { |child| _dfs_walk(trail, child, &visitor) }
end
# Arranges the flat section list into a tree rooted at a synthetic
# SECTION_ROOT node of depth 0, stored in @section_root. Each section is
# attached to the closest preceding node that is at least one level
# shallower than itself.
def make_section_tree!
  @section_root = Section.new(self, :title=>"SECTION_ROOT", :depth=>0)
  sections.each_with_index do |sec, idx|
    candidates = [@section_root] + (sections[0...idx] || [])
    max_parent_depth = sec.depth-1
    parent = candidates.reverse_each.find { |cand| cand.depth <= max_parent_depth }
    raise "impossible to not find parent now" unless parent
    parent.children << sec
  end
  # could do better depth normalization here if we wanted
end
# One section of a page: its raw and cleaned text, title information, place
# in the section tree (children, depth, title path) and its sentences.
class Section
  attr_accessor :id, :text, :text_for_indexer, :title, :title_for_indexer,
                :sentences,
                :children, :depth, :title_path, :title_path_for_indexer

  # page: the owning page; only page.id is read here.
  # hash: attribute values, handed to fill_attrs! (defined outside this file;
  #       presumably it assigns each key to the matching attribute -- confirm).
  def initialize(page, hash={})
    fill_attrs! hash
    @id = Digest::MD5.hexdigest(page.id + title)
    @sentences = []
    @children = []
  end

  # Cleans the section text, splits it into lines and runs the global
  # $sentence_breaker over each line. Stores and returns the resulting flat
  # list of Sentence objects.
  def break_sentences!
    # return (@sentences = []) if $evil_global_dont_sentence_break
    coarse = WikiPage.wiki_cleanup_largescale(text || "")
    cleaned = WikiPage.wiki_cleanup_smallscale(coarse) # super clean
    # if newlines can happen in the middle of sentences, we'd want to
    # strategically join back together lines right here. this seems to
    # happen sometimes but not often, so let's not worry about it.
    collected = []
    (cleaned || "").split("\n").each do |line|
      $sentence_breaker.break(line).each do |sent_text|
        collected << Sentence.new(sent_text, self)
      end
    end
    @sentences = collected
  end
end
# A section heading found in the wiki text ("== Title ==").
#
# title:    heading text with the "=" markers and padding removed
# begin:    offset at which the heading match starts
# end:      offset just past the heading match
# orig_str: the matched heading text, verbatim
class SectionHeader
  attr_accessor :title, :begin, :end, :orig_str

  # hash: attribute values, handed to fill_attrs! (defined outside this file).
  def initialize(hash={})
    @orig_str = ""
    @title = ""
    fill_attrs! hash
  end

  # Builds a header from a MatchData whose whole match (index 0) is the
  # heading line. The title stays "" when no heading text can be extracted.
  def self.create_from_match(match)
    sh = SectionHeader.new
    sh.orig_str = match[0]
    # The inner part of the title is optional so that a one-character title
    # with no padding ("==A==") is still recognised.
    raw_name = sh.orig_str[/^\s*=+ ( [^=] (?: .* [^=] )? ) =+ \s*$/x, 1]
    if raw_name
      # collapse runs of whitespace of any length
      sh.title = raw_name.strip.gsub(/\s{2,}/, ' ')
      # leave in [[ ]] markup...
    end
    sh.begin = match.begin(0)
    sh.end = match.end(0)
    sh
  end
end
# Coarse wiki-markup cleanup: removes infoboxes, tables, comments,
# references, templates and HTML-like tags, and joins link markup that spans
# several lines.
#
# this function should only do cleanups that are prerequisite for the
# sentence breaker
# therefore, don't do any small scale cleanups that could be done in-sentence.
# e.g. anchor text cleanup. we'd like to use those potentially...
#
# Every pattern below uses the /x flag, so whitespace inside it is ignored.
#
# s: wiki text (String). Returns a new String; s itself is not modified.
def self.wiki_cleanup_largescale(s)
  # infoboxes are complex: {{ .. \n|key=val.. \n|key2=val2 .. \n}}\n
  s = s.gsub(/ \{\{ [^\n]* \n
               (\| [^\n]* \n )+
               \}\} /mx, '')
  # more tables, sloppier regex
  s = s.gsub(/^\s* \{\| \s* class="wikitable" .*? \n\|\} /mx, '')
  # really sloppy, yikes
  s = s.gsub(/^\s* \{\| .*? \n\|\} /mx, '')
  s = s.gsub(/<!-- .*? -->/mx, '')
  s = s.gsub(/ <ref [^>]* \/> /mx, '')
  s = s.gsub(/ <ref .*? <\/ref> /mx, '')
  # Templates nest ({{a|{{b}}|c}}). A single pass that stops at the first "}"
  # would cut the outer template short and leave its tail ("|c}}") behind, so
  # strip innermost templates repeatedly until nothing changes. Each pass
  # that changes anything shortens s, so the loop terminates.
  loop do
    stripped = s.gsub(/ \{\{ [^{}]* \}\} /mx, '')
    break if stripped == s
    s = stripped
  end
  # leftovers the strict innermost pattern does not cover (e.g. a lone "{"
  # inside a template)
  s = s.gsub(/ \{\{ cite [^}]* \}\}/mx, '')
  # s = s.gsub(/ \{\{ [^|]* \| ([^}]*) \}\}/mx, '\1')
  s = s.gsub(/ \{\{ ([^}]*) \}\}/mx, '')
  s = s.gsub(/ < [^>]* > /x, '') # arbitrary html or html-like tags
  # join together anchor texts that are spanning multiple lines
  s.gsub(/ \[\[ [^\]]* \n [^\]]* \]\] /mx) { |link| link.gsub("\n", " ") }
end
# Fine-grained, in-sentence wiki-markup cleanup: drops bold/italic quote
# markers, replaces wiki and web links with their visible text, and removes
# leading list bullets.
#
# s: wiki text (String). Returns a new String; s itself is not modified.
def self.wiki_cleanup_smallscale(s)
  s = s.gsub(/'''/, "").gsub(/''/, "")
  s = s.gsub(/ \[\[ ([^\|\[\]]*) \]\] /x, '\1')       # [[Target]] -> Target
  s = s.gsub(/ \[\[ [^\]]* \| ([^\]]+) \]\] /x, '\1') # [[Target|label]] -> label
  # web links: [http://url label] -> label. https is accepted as well;
  # matching only "http:" left https links in the text as raw markup.
  s = s.gsub(/ \[https?:[^\s]+ \s* ([^\]]*) \] /x, '\1')
  s.gsub(/ ^\*+ \s* /x, '') # a list
end
# A single sentence of a section: the original wiki text, a cleaned version
# for the parser, an id derived from the section id plus the text, and
# (after parse!) its semantic representation.
class Sentence
  attr_accessor :text, :text_for_parser, :semrep, :id

  # text:    the sentence as it appears in the wiki text
  # section: the owning section; only section.id is read here
  def initialize(text, section)
    @text = text
    @text_for_parser = get_text_for_parser(text)
    @id = Digest::MD5.hexdigest(section.id + text)
  end

  # in a smarter world, save the output in a standoff-y way to exploit it for
  # name/coref resolution
  def get_text_for_parser(wiki_text)
    WikiPage.wiki_cleanup_smallscale(wiki_text)
  end

  # Sends the cleaned text to $index_parser and wraps the returned triples in
  # the parser's "<parser_name>SemRep" class. Stores and returns the result.
  #
  # no_lex: passed through as the second constructor argument of that class.
  def parse!(no_lex=false)
    # puts @text_for_parser
    dep_triples = $index_parser.parse_dep @text_for_parser
    # NOTE(review): this evals a string supplied by $index_parser, which the
    # script at the bottom of this file sets to a DRb remote object. Anything
    # that object returns from parser_name is executed as Ruby code.
    semrep_class = eval( $index_parser.parser_name + "SemRep" )
    @semrep = semrep_class.new(dep_triples, no_lex)
  end
end
# Reads the text of the first <title> element into @title.
# Leaves @title untouched (and returns nil) when there is no <title>
# element or the element has no child node.
def parse_title!
  # Take the first match with #first. Picking it with `each { break t }`
  # evaluates to the collection itself when each returns its receiver on an
  # empty result (as Array#each does), which defeats the nil guard below.
  titleelt = @xmldoc.find('//title').first
  return unless titleelt
  c = titleelt.child
  return unless c
  @title = c.content
end
# Returns the content of the first child node of the first <text> element,
# or "" when there is no <text> element or it has no child. The result is
# memoized in @rawtext.
def rawtext
  return @rawtext if @rawtext
  # Take the first match with #first. Picking it with `each { break t }`
  # evaluates to the collection itself when each returns its receiver on an
  # empty result (as Array#each does), which defeats the guard below.
  textelt = @xmldoc.find('//text').first
  return (@rawtext = "") unless textelt
  textnode = textelt.child
  return (@rawtext = "") unless textnode
  @rawtext = textnode.content
end
| end | |
# Command-line test driver: reads pages from stdin via WikiDump.yield_pages
# and evaluates a Ruby snippet given on the command line, with self set to
# each page or (with the sentence option) each sentence.
#
# The snippet is run with eval inside the blocks below, so it can see the
# local variables of this script (opts, cmd, page, sec, sent); their names
# are therefore kept stable.
if __FILE__ == $0
  banner_msg = <<-EOS
testing: put the code per page on the cmdline
e.g. (./w here is "bzcat enwiki...xml.bz2" or "cat test/obama.xml")
examples ...
test page title extraction:
./w | lib/page.rb 'puts title'
test section extraction:
./w | lib/page.rb 'sections.each{|sec| puts "DOC: #\{self.title\} -- SEC: #\{sec.title\}"}'
test sentence extraction and cleanup:
./w | lib/page.rb -s 'puts "*** #\{text_for_parser\}"'
test sentence parsing/semrepping:
./w | lib/page.rb -s 'parse!'
view minipar triples per sentence:
./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.parse_triples'
view sempairs per sentence semrep:
./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.sempairs'
view sempairs per sentence semrep, without lex lookup:
./w | lib/page.rb -s 'parse! true; puts text_for_parser; puts semrep.sempairs'
options ...
  EOS
  STDOUT.sync = true
  require File.dirname(__FILE__)+'/wikidump'
  require 'trollop'

  opts = Trollop::options do
    banner banner_msg
    opt :page, "run code per page", :default => true
    opt :sentence, "run code per sentence", :default => false
    opt :parser, "parser to use (as a drb uri)", :default => $miniq_conf['index_parser']
  end
  $index_parser = DRbObject.new(nil, opts[:parser])

  # Default snippet when none was given; sentence mode takes precedence.
  cmd = ARGV.join(" ")
  if cmd == ""
    if opts[:sentence]
      cmd = 'puts "*** #{text}\n=== #{text_for_parser}"'
    elsif opts[:page]
      cmd = 'puts "-- #{title}"'
    end
  end

  # require 'unprof'
  case
  when opts[:sentence]
    WikiDump.yield_pages $stdin do |page|
      page.break_sentences!
      page.sections.each do |sec|
        sec.sentences.each do |sent|
          sent.instance_eval { eval cmd }
        end
      end
    end
  when opts[:page]
    WikiDump.yield_pages $stdin do |page|
      page.instance_eval { eval cmd }
    end
  else
    Trollop::die "Illegal options"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment