myitcv · March 1, 2013 04:19
diff --git a/a16z_portfolio.rb b/a16z_portfolio.rb
 require 'nokogiri'
 require 'open-uri'

 # Fetch and parse the portfolio page.
 # URI.open (Ruby 2.5+) is called explicitly: open-uri's ability to open URLs
 # through Kernel#open was deprecated in Ruby 2.7 and removed in Ruby 3.0.
 doc = Nokogiri::HTML(URI.open('http://a16z.com/portfolio/'))

 # Pulls the text that follows each label out of a single string.
 #
 # string - the text to scan (e.g. "Founders: A Founded: 2010").
 # lvals  - Array of label strings; each is used as a String#split separator.
 #
 # Returns a Hash mapping every label in lvals to an Array holding the text
 # found after each occurrence of that label, cut off where any label from
 # lvals starts. Values are NOT stripped of surrounding whitespace. A label
 # that never matches maps to an empty Array; a label directly followed by
 # another label maps to [""].
 def collect_up(string, lvals)
   progress = [string]
   result = {}

   lvals.each do |split|
     # Cut every remaining fragment at the current label.
     pieces = progress.map { |item| item.split(split) }

     # Only fragments that actually contained the label yield values; a
     # fragment may contain the label several times, hence parts[1..-1].
     result[split] = pieces.select { |parts| parts.length > 1 }
                           .flat_map { |parts| parts[1..-1] }

     # Keep every non-empty fragment for the labels still to come; we cannot
     # yet tell whether a fragment has been reduced far enough.
     progress = pieces.flatten.reject { |item| item.empty? }

     # Trim the values captured so far at the current label. This revisits
     # all earlier labels on every pass, so the work grows quadratically
     # with the number of labels.
     result.each_value do |values|
       # "".split(...) is [], whose first element is nil; to_s keeps the
       # value a String so the next pass does not fail on nil.split.
       values.map! { |item| item.split(split).first.to_s }
     end
   end

   result
 end

 # Print one tab-separated row per portfolio entry:
 # company, founded, website, founders, headquarters, type, description.
 doc.xpath('//div[h1/@class = "title entry-title"]').each do |x|
   company = x.xpath('h1').first.content

   # website stays nil (an empty column) when the entry has no link.
   links = x.xpath('div/p/a')
   website = links.attribute('href').value unless links.empty?

   # The first paragraph is scanned for the labelled fields; the remaining
   # paragraphs are joined into the description.
   paragraphs = x.xpath('div/p')
   description = paragraphs[1..-1].map { |y| y.content }.join(" ")

   lvals = ["Founders:","Founded:","Headquarters:","Website:","Type of business:"]
   detail = collect_up(paragraphs[0].content, lvals)

   # collect_up returns the raw text after each label, including surrounding
   # whitespace, so trim it; to_s covers a label that was not found (nil).
   founders = detail["Founders:"].first.to_s.strip
   founded = detail["Founded:"].first.to_s.strip
   hq = detail["Headquarters:"].first.to_s.strip
   type = detail["Type of business:"].first.to_s.strip

   puts "#{company}\t#{founded}\t#{website}\t#{founders}\t#{hq}\t#{type}\t#{description}"
 end
	require 'nokogiri'
	require 'open-uri'

	# Fetch and parse the portfolio page.
	# URI.open (Ruby 2.5+) is called explicitly: open-uri's ability to open URLs
	# through Kernel#open was deprecated in Ruby 2.7 and removed in Ruby 3.0.
	doc = Nokogiri::HTML(URI.open('http://a16z.com/portfolio/'))

	# Pulls the text that follows each label out of a single string.
	#
	# string - the text to scan (e.g. "Founders: A Founded: 2010").
	# lvals  - Array of label strings; each is used as a String#split separator.
	#
	# Returns a Hash mapping every label in lvals to an Array holding the text
	# found after each occurrence of that label, cut off where any label from
	# lvals starts. Values are NOT stripped of surrounding whitespace. A label
	# that never matches maps to an empty Array; a label directly followed by
	# another label maps to [""].
	def collect_up(string, lvals)
	  progress = [string]
	  result = {}

	  lvals.each do |split|
	    # Cut every remaining fragment at the current label.
	    pieces = progress.map { |item| item.split(split) }

	    # Only fragments that actually contained the label yield values; a
	    # fragment may contain the label several times, hence parts[1..-1].
	    result[split] = pieces.select { |parts| parts.length > 1 }
	                          .flat_map { |parts| parts[1..-1] }

	    # Keep every non-empty fragment for the labels still to come; we cannot
	    # yet tell whether a fragment has been reduced far enough.
	    progress = pieces.flatten.reject { |item| item.empty? }

	    # Trim the values captured so far at the current label. This revisits
	    # all earlier labels on every pass, so the work grows quadratically
	    # with the number of labels.
	    result.each_value do |values|
	      # "".split(...) is [], whose first element is nil; to_s keeps the
	      # value a String so the next pass does not fail on nil.split.
	      values.map! { |item| item.split(split).first.to_s }
	    end
	  end

	  result
	end

	# Print one tab-separated row per portfolio entry:
	# company, founded, website, founders, headquarters, type, description.
	doc.xpath('//div[h1/@class = "title entry-title"]').each do |x|
	  company = x.xpath('h1').first.content

	  # website stays nil (an empty column) when the entry has no link.
	  links = x.xpath('div/p/a')
	  website = links.attribute('href').value unless links.empty?

	  # The first paragraph is scanned for the labelled fields; the remaining
	  # paragraphs are joined into the description.
	  paragraphs = x.xpath('div/p')
	  description = paragraphs[1..-1].map { |y| y.content }.join(" ")

	  lvals = ["Founders:","Founded:","Headquarters:","Website:","Type of business:"]
	  detail = collect_up(paragraphs[0].content, lvals)

	  # collect_up returns the raw text after each label, including surrounding
	  # whitespace, so trim it; to_s covers a label that was not found (nil).
	  founders = detail["Founders:"].first.to_s.strip
	  founded = detail["Founded:"].first.to_s.strip
	  hq = detail["Headquarters:"].first.to_s.strip
	  type = detail["Type of business:"].first.to_s.strip

	  puts "#{company}\t#{founded}\t#{website}\t#{founders}\t#{hq}\t#{type}\t#{description}"
	end