Skip to content

Instantly share code, notes, and snippets.

@brendano
Created June 22, 2012 03:23
Show Gist options
  • Select an option

  • Save brendano/2969995 to your computer and use it in GitHub Desktop.

Select an option

Save brendano/2969995 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
#
# Data structures and processing of documents to be indexed, e.g. wikipedia
# pages. ok, everything is wikipedia-specific. :-)
#
# This file can be executed for various sorts of testing (see bottom)
require File.dirname(__FILE__)+'/common'
require 'ostruct'
require 'digest/md5'
require 'rubygems'
gem 'facets'
require 'xml/libxml'
require File.dirname(__FILE__)+'/../sentbreaker/sentbreaker'
require File.dirname(__FILE__)+'/semrep'
# $sentence_breaker = SentenceBreaker.new
# Placeholder parser: until a real parser object is assigned to
# $index_parser, every method called on it fails with an explanatory error.
$index_parser = Object.new
# Ruby invokes method_missing with the missing method's name followed by the
# call's arguments, so the hook has to accept them; a zero-parameter
# definition fails with an ArgumentError about arity before the message
# below can ever be raised.
def $index_parser.method_missing(_name, *_args)
  raise "Error, no index parser was ever specified, please set $index_parser"
end
# "abstract" class
# Holds the attributes shared by every kind of page: an id, a title, the
# body text and the list of sections. It defines no behaviour of its own.
class Page
  attr_accessor :id, :title, :body, :sections
end
class WikiPage < Page
attr_accessor :xmldoc
attr_accessor :categories_etc # just text
# Parses the given page source with the libxml parser, keeps the resulting
# document in @xmldoc, and immediately runs parse_basics! on it.
#
# pagestr - String handed to the XML parser as its input.
def initialize(pagestr)
  parser = XML::Parser.new
  parser.string = pagestr
  @xmldoc = parser.parse
  parse_basics!
end
# Asks every section of this page to split its text into sentences.
def break_sentences!
  sections.each(&:break_sentences!)
end
# xml/wiki parse
#
# Populates @title, @id, @body, @categories_etc (only when a category link
# is present) and @sections from the page XML, then builds the section tree
# and gives every section its title path.
def parse_basics!
  parse_title!
  # NOTE(review): title stays nil when parse_title! finds nothing, and
  # hexdigest(nil) raises TypeError -- confirm every page carries a title.
  @id = Digest::MD5.hexdigest(title)
  # narrow down if appropriate
  coretext = rawtext
  @body = coretext
  if rawtext =~ / \[\[Category: /imx
    # the text before the first category link is the article proper; the
    # link and everything after it are kept verbatim in @categories_etc
    coretext = $~.pre_match
    @categories_etc = $~[0] + $~.post_match
  end
  # sections
  section_headers = []
  # [^=\n] rather than [^=]: a negated class also matches a newline, which
  # let a header swallow the line break and merge with a header on the very
  # next line (so the second header was lost).
  /^\s* = .*[^=\n].* = \s*$ /x.matches(coretext) do |match|
    section_headers << SectionHeader.create_from_match(match)
  end
  dummy_start = SectionHeader.new(:begin=>0, :end=>-1, :title => "ARTICLE_START")
  section_headers.unshift( dummy_start )
  # ppy section_headers
  # segment the text
  @sections = []
  section_headers.each_with_index do |sh, i|
    next_header = section_headers[i+1]
    # the text starts one character past the header match (for the dummy
    # header, whose end is -1, that is offset 0)
    text_start = sh.end + 1
    # Exclusive upper bound instead of "begin - 1": when the next header
    # starts at offset 0 the old inclusive end became -1, i.e. "up to the end
    # of the string", and ARTICLE_START received the whole article.
    text = if next_header
      coretext[text_start...next_header.begin]
    else
      coretext[text_start..-1]
    end
    text ||= ""
    @sections << Section.new(
      self,
      :text => text,
      :text_for_indexer => WikiPage.wiki_cleanup_smallscale(WikiPage.wiki_cleanup_largescale(text)),
      # :header => sh,
      :title => sh.title,
      :title_for_indexer => WikiPage.wiki_cleanup_smallscale(sh.title),
      # number of leading '=' markers; 0 for the dummy header, whose depth is
      # overwritten just below
      :depth => sh.orig_str[/=+/].to_s.size
    )
  end
  # normalize depths: 1 depth for the top siblings, same as ARTICLE_START
  @sections[0].depth = 1
  mindepth = @sections[1..-1].map{|s| s.depth}.min || 0
  @sections[1..-1].each{|s| s.depth = s.depth - mindepth + 1}
  make_section_tree!
  dfs_walk(@section_root) do |path, sec|
    # path[0] is the synthetic root, so it is left out of the title path
    sec.title_path = ((path[1..-1] || []) + [sec]).map{|s| s.title}.join(" -> ")
    sec.title_path_for_indexer = WikiPage.wiki_cleanup_smallscale(sec.title_path)
  end
  # extract out non-core-textual sections?
  # non_core_section_headers = section_headers.select do |sh|
  # sh.title =~ /^ (other websites | see also | references) $/ix
  # end
  # puts section_headers.map{|sh| sh.title}
end
# Walks the tree rooted at +node+ depth-first, yielding one
# [ancestors, node] pair per visited node; the root is visited with an
# empty ancestor list.
def dfs_walk(node)
  _dfs_walk([], node) do |ancestors, current|
    yield [ancestors, current]
  end
end
# Recursive worker for dfs_walk.
#
# path - Array of the ancestors of +node+, outermost first.
# node - current node; must respond to #children.
#
# Yields [path, node] for the node itself before descending into its
# children in order.
def _dfs_walk(path, node)
  yield [path, node]
  node.children.each do |child|
    _dfs_walk(path + [node], child) do |trail, descendant|
      yield [trail, descendant]
    end
  end
end
# Arranges the flat section list into a tree under a synthetic root section
# (@section_root, depth 0). Each section becomes a child of the closest
# preceding section -- or the root -- whose depth is at most its own depth
# minus one.
def make_section_tree!
  @section_root = Section.new(self, :title=>"SECTION_ROOT", :depth=>0)
  sections.each_with_index do |sec, idx|
    candidates = [@section_root] + (sections[0...idx] || [])
    parent = candidates.reverse_each.find { |earlier| earlier.depth <= sec.depth - 1 }
    raise "impossible to not find parent now" unless parent
    parent.children << sec
  end
  # could do better depth normalization here if we wanted
end
# One section of a page: heading, raw and indexer-ready text, the sentences
# extracted from it, and its place (depth, children, title path) in the
# page's section tree.
class Section
  attr_accessor :id, :title, :title_for_indexer, :text, :text_for_indexer
  attr_accessor :sentences
  attr_accessor :depth, :children, :title_path, :title_path_for_indexer

  # page - the owning page; only its id is read here.
  # hash - attribute values passed on to fill_attrs! (defined outside this
  #        file; presumably it assigns each key to the matching attribute).
  #
  # The section id is the MD5 of the page id followed by the section title.
  def initialize(page, hash={})
    fill_attrs! hash
    @children = []
    @sentences = []
    @id = Digest::MD5.hexdigest(page.id + title)
  end

  # Cleans the section text, splits it into lines and lets the global
  # $sentence_breaker split each line, storing the resulting Sentence
  # objects in @sentences.
  def break_sentences!
    # return (@sentences = []) if $evil_global_dont_sentence_break
    largescale_clean = WikiPage.wiki_cleanup_largescale(text || "")
    super_clean = WikiPage.wiki_cleanup_smallscale(largescale_clean)
    # if newlines can happen in the middle of sentences, we'd want to
    # strategically join back together lines right here. this seems to
    # happen sometimes but not often, so let's not worry about it.
    per_line = (super_clean || "").split("\n").map do |line|
      # line = WikiPage.wiki_cleanup_largescale(line)
      $sentence_breaker.break(line).map { |sent_text| Sentence.new(sent_text, self) }
    end
    @sentences = per_line.flatten
  end
end
# A section heading found in the raw wiki text.
#
# title     - heading text without the surrounding '=' markers ("" if none
#             could be extracted)
# begin/end - start and end offsets of the heading match
# orig_str  - the matched heading text, markers included
class SectionHeader
  attr_accessor :title, :begin, :end, :orig_str

  # hash - attribute values passed to fill_attrs! (defined outside this
  #        file); orig_str and title default to "" when not supplied.
  def initialize(hash={})
    @orig_str = ""
    @title = ""
    fill_attrs! hash
  end

  # Builds a header from a match object: match[0] is taken as the heading
  # text and match.begin(0) / match.end(0) as its offsets.
  def self.create_from_match(match)
    sh = SectionHeader.new
    sh.orig_str = match[0]
    # The inner group is one non-'=' character optionally followed by more
    # text ending in a non-'=' character, so single-character headings such
    # as "==A==" get a title too (a mandatory two-character group left them
    # with an empty title).
    raw_name = sh.orig_str[ /^\s*=+ ([^=] (?: .* [^=] )? ) =+ \s*$/x, 1]
    if raw_name
      # collapse runs of whitespace inside the title
      sh.title = raw_name.strip.gsub(/\s{2,}/, ' ')
      # leave in [[ ]] markup...
    end
    sh.begin = match.begin(0)
    sh.end = match.end(0)
    sh
  end
end
# this function should only do cleanups that are prerequisite for the
# sentence breaker
# therefore, dont do any small scale cleanups that could be done in-sentence.
# e.g. anchor text cleanup. we'd like to use those potentially...
#
# s - wiki markup String. A cleaned copy is returned; s is not modified.
def self.wiki_cleanup_largescale(s)
  # infoboxes are complex: {{ .. \n|key=val.. \n|key2=val2 .. \n}}\n
  s = s.gsub(/ \{\{ [^\n]* \n
               (\| [^\n]* \n )+
               \}\} /mx, '')
  # more tables, sloppier regex
  s = s.gsub(/^\s* \{\| \s* class="wikitable" .*? \n\|\} /mx, '')
  # really sloppy, yikes
  s = s.gsub(/^\s* \{\| .*? \n\|\} /mx, '')
  s = s.gsub(/<!-- .*? -->/mx, '')
  s = s.gsub(/ <ref [^>]* \/> /mx, '')
  s = s.gsub(/ <ref .*? <\/ref> /mx, '')
  # Strip brace-free templates repeatedly, innermost first, so that nested
  # templates such as {{a|{{b}}|c}} vanish completely. A single pass of the
  # patterns below stops at the first "}}" and leaves "|c}}" in the text.
  loop do
    peeled = s.gsub(/ \{\{ [^{}]* \}\} /x, '')
    break if peeled == s
    s = peeled
  end
  # original single-pass patterns, kept as a fallback for templates that
  # contain a lone "{"
  s = s.gsub(/ \{\{ cite [^}]* \}\}/mx, '')
  # s = s.gsub(/ \{\{ [^|]* \| ([^}]*) \}\}/mx, '\1')
  s = s.gsub(/ \{\{ ([^}]*) \}\}/mx, '')
  s = s.gsub(/ < [^>]* > /x, '') # arbitrary html or html-like tags
  # join together anchor texts that are spanning multiple lines
  s.gsub(/ \[\[ [^\]]* \n [^\]]* \]\] /mx) { |link| link.gsub("\n", " ") }
end
# In-sentence cleanups: removes bold/italic quote markers, reduces wiki
# links and external links to their visible text, and strips leading list
# bullets.
#
# s - wiki markup String. A cleaned copy is returned; s is not modified.
def self.wiki_cleanup_smallscale(s)
  s = s.gsub(/'''/,"").gsub(/''/,"")
  s = s.gsub(/ \[\[ ([^\|\[\]]*) \]\] /x, '\1') # anchor text of wiki link
  s = s.gsub(/ \[\[ [^\]]* \| ([^\]]+) \]\] /x,'\1') # anchor text of wiki link
  # web links: [url label] -> label. https is accepted as well as http;
  # with http only, https links stayed in the text as raw markup.
  s = s.gsub(/ \[https?:[^\s]+ \s* ([^\]]*) \] /x, '\1')
  s.gsub(/ ^\*+ \s* /x, '') # a list
end
# A single sentence of a section: the text as found, the cleaned text that
# is sent to the parser, and (after parse!) its semantic representation.
class Sentence
  attr_accessor :id, :text, :text_for_parser, :semrep

  # text    - sentence String.
  # section - owning section; only its id is read here.
  #
  # The sentence id is the MD5 of the section id followed by the text.
  def initialize(text, section)
    @text = text
    @id = Digest::MD5.hexdigest(section.id + text)
    @text_for_parser = get_text_for_parser(text)
  end

  # in a smarter world, save the output in a standoff-y way to exploit it for
  # name/coref resolution
  def get_text_for_parser(wiki_text)
    WikiPage.wiki_cleanup_smallscale(wiki_text)
  end

  # Sends the cleaned text to $index_parser and wraps the result in the
  # "<parser_name>SemRep" class, stored in @semrep.
  #
  # no_lex - passed straight through to the SemRep constructor.
  def parse!(no_lex=false)
    # puts @text_for_parser
    triples = $index_parser.parse_dep(@text_for_parser)
    # NOTE(review): the class is resolved by eval'ing a name reported by the
    # parser object itself; if that object is remote or untrusted this
    # executes arbitrary code -- consider a constant lookup instead.
    semrep_class_name = $index_parser.parser_name + "SemRep"
    semrep_class = eval(semrep_class_name)
    @semrep = semrep_class.new(triples, no_lex)
  end
end
# Sets @title to the content of the first <title> element's first child.
# Leaves @title untouched when there is no such element or it has no child.
def parse_title!
  # first returns nil for an empty result set; the previous
  # each { |t| break t } idiom yields whatever each returns in that case
  # (for an Array-like result, the collection itself), which made the nil
  # guard below ineffective.
  titleelt = @xmldoc.find('//title').first
  return unless titleelt
  c = titleelt.child
  return unless c
  @title = c.content
end
# Returns the content of the first <text> element's first child, or "" when
# there is no such element or it has no child. The result is memoized in
# @rawtext.
def rawtext
  return @rawtext if @rawtext
  # first returns nil for an empty result set, whereas each { |t| break t }
  # yields whatever each returns (for an Array-like result, the collection
  # itself) and so slipped past the nil check.
  textelt = @xmldoc.find('//text').first
  textnode = textelt && textelt.child
  @rawtext = textnode ? textnode.content : ""
end
end
# Command-line test driver, run only when this file is executed directly.
# It reads pages from stdin via WikiDump.yield_pages and evaluates a Ruby
# snippet (taken from the command line) once per page or once per sentence.
if __FILE__ == $0
banner_msg = <<-EOS
testing: put the code per page on the cmdline
e.g. (./w here is "bzcat enwiki...xml.bz2" or "cat test/obama.xml")
examples ...
test page title extraction:
./w | lib/page.rb 'puts title'
test section extraction:
./w | lib/page.rb 'sections.each{|sec| puts "DOC: #\{self.title\} -- SEC: #\{sec.title\}"}'
test sentence extraction and cleanup:
./w | lib/page.rb -s 'puts "*** #\{text_for_parser\}"'
test sentence parsing/semrepping:
./w | lib/page.rb -s 'parse!'
view minipar triples per sentence:
./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.parse_triples'
view sempairs per sentence semrep:
./w | lib/page.rb -s 'parse!; puts text_for_parser; puts semrep.sempairs'
view sempairs per sentence semrep, without lex lookup:
./w | lib/page.rb -s 'parse! true; puts text_for_parser; puts semrep.sempairs'
options ...
EOS
# unbuffered output, so results show up as each page is processed
STDOUT.sync = true
require File.dirname(__FILE__)+'/wikidump'
require 'trollop'
opts = Trollop::options do
banner banner_msg
opt :page, "run code per page", :default => true
opt :sentence, "run code per sentence", :default => false
opt :parser, "parser to use (as a drb uri)", :default => $miniq_conf['index_parser']
end
# replace the placeholder $index_parser with a DRb proxy for the chosen URI
$index_parser = DRbObject.new(nil, opts[:parser])
# the snippet to run is whatever is on the command line; when nothing was
# given, fall back to a default that fits the selected mode
cmd = ARGV.join(" ")
cmd = 'puts "*** #{text}\n=== #{text_for_parser}"' if cmd == "" && opts[:sentence]
cmd = 'puts "-- #{title}"' if cmd == "" && opts[:page]
# require 'unprof'
if opts[:sentence]
# sentence mode takes precedence over page mode
WikiDump.yield_pages $stdin do |page|
page.break_sentences!
page.sections.each do |sec|
sec.sentences.each do |sent|
# the snippet runs with the sentence as self; because it is eval'ed inside
# this block it can also see the surrounding locals (page, sec, sent, ...)
sent.instance_eval { eval cmd }
end
end
end
elsif opts[:page]
WikiDump.yield_pages $stdin do |page|
# the snippet runs with the page as self
page.instance_eval { eval cmd }
end
else
# neither mode is selected
Trollop::die "Illegal options"
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment