Created
March 1, 2013 04:19
-
-
Save myitcv/5062452 to your computer and use it in GitHub Desktop.
Some interesting text processing: combines XPath and some unstructured extraction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri'
require 'open-uri'

# Download and parse the portfolio page.
# URI.open (provided by open-uri) is called explicitly: Kernel#open stopped
# accepting URLs in Ruby 3.0, where it would instead look for a local file
# with that name.
doc = Nokogiri::HTML(URI.open('http://a16z.com/portfolio/'))
# Pulls the text that follows each label out of a loosely structured string.
#
# string - the text to scan (e.g. "Founders: A Founded: 2009").
# lvals  - an Array of label strings (e.g. ["Founders:", "Founded:"]); the
#          labels may appear in +string+ in any order, or not at all.
#
# Returns a Hash keyed by label. Each value is an Array with one entry per
# occurrence of that label, holding the text between the label and the next
# label from +lvals+ (or the end of the string). Whitespace is not trimmed.
# A label that does not occur maps to an empty Array; a label that is
# immediately followed by another label maps to an empty String entry.
def collect_up(string, lvals)
  # Fragments of the input not yet cut by the labels processed so far.
  progress = [string]
  result = {}

  lvals.each do |split|
    # Cut every remaining fragment at the current label.
    pieces = progress.map { |item| item.split(split) }

    # Only fragments that actually contained the label are matches; everything
    # after the first piece follows an occurrence of the label (there may be
    # several occurrences, hence several trailing pieces).
    result[split] = pieces.select { |parts| parts.length > 1 }
                          .flat_map { |parts| parts[1..-1] }

    # Keep every non-empty piece: we cannot yet tell whether a later label
    # still needs to cut it further.
    progress = pieces.flatten.reject { |item| item.empty? }

    # Values captured for earlier labels may still contain the current label,
    # so trim each of them back to the text before it. This makes the whole
    # routine quadratic in the number of labels.
    result.each_value do |values|
      # "".split(x) is [], whose first element is nil; fall back to '' so an
      # empty value does not raise NoMethodError on the next label.
      values.map! { |item| item.split(split).first || '' }
    end
  end

  result
end
# Labels expected in the first paragraph of each portfolio entry; the text
# after each one is pulled out by collect_up.
lvals = ["Founders:", "Founded:", "Headquarters:", "Website:", "Type of business:"]

# Print one tab-separated line per portfolio entry: company, founded, website,
# founders, headquarters, type of business, description.
doc.xpath('//div[h1/@class = "title entry-title"]').each do |x|
  # Evaluate the paragraph query once and reuse it below.
  paragraphs = x.xpath('div/p')

  company = x.xpath('h1').first.content

  # First link inside the entry's paragraphs, if any; Node#[] gives nil rather
  # than raising when the link has no href attribute.
  link = x.at_xpath('div/p/a')
  website = link && link['href']

  # Every paragraph after the first is treated as free-text description.
  description = paragraphs.drop(1).map { |y| y.content }.join(" ")

  # The first paragraph carries the labelled details; tolerate entries that
  # have no paragraphs at all by scanning an empty string.
  summary = paragraphs.first
  detail = collect_up(summary ? summary.content : '', lvals)

  founders = detail["Founders:"].first
  founded = detail["Founded:"].first
  hq = detail["Headquarters:"].first
  type = detail["Type of business:"].first

  puts "#{company}\t#{founded}\t#{website}\t#{founders}\t#{hq}\t#{type}\t#{description}"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment