moflo · December 8, 2018 01:55 · moflo · Dec 8, 2018
diff --git a/Webcrawl YCombinator Database, ycdb.co b/Webcrawl YCombinator Database, ycdb.co
 require 'airrecord'
 require 'kimurai'



 # Use the Capybara-based Kimurai web crawler to iterate through the YCDB.co data and populate an Airtable spreadsheet.

 # Spider that walks a YCDB.co batch listing page, visits every company page
 # linked from it, and stores each scraped company both in results.json and as
 # a new row in an Airtable table.
 class YCDBSpider < Kimurai::Base
   @name = "example_spider"
   @engine = :mechanize
   @start_urls = ["https://www.ycdb.co/batch/w00"]
   @config = {
     user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
     after_request: { delay: 4..7 }
   }

   # Status label => pattern looked for in each ".badge" text. Order matters:
   # later entries overwrite earlier ones when several patterns match.
   STATUS_PATTERNS = { "Dead" => /dead/i, "Exited" => /exited/i, "Live" => /live/i }.freeze

   # Item key => heading text of the "d-flex" stat box the value is read from.
   STAT_HEADINGS = { funding: "Funding", employees: "Employees", alexa: "Alexa" }.freeze

   # Handles a batch listing page: every link found in a table cell is sent to
   # parse_repo_page, then the first "page-link" anchor (if any) is followed
   # back into this method.
   #
   # @param response [Object] parsed page exposing #xpath / #at_xpath
   # @param url [String] URL of the page, used as base for relative links
   # @param data [Hash] unused
   def parse(response, url:, data: {})
     response.xpath("//tr/td/a").each do |link|
       request_to :parse_repo_page, url: absolute_url(link[:href], base: url)
     end

     # NOTE(review): this always takes the FIRST page-link anchor on the page;
     # confirm against the site markup that it is really the "next" link.
     next_page = response.at_xpath("//a[@class='page-link']")
     request_to :parse, url: absolute_url(next_page[:href], base: url) if next_page
   end

   # Scrapes one company page into a hash, appends it to results.json and
   # saves it as a new Airtable record.
   #
   # @param response [Object] parsed page exposing #xpath / #at_xpath / #css
   # @param url [String] URL of the company page (currently unused)
   # @param data [Hash] unused
   # @return [Hash] the scraped item
   def parse_repo_page(response, url:, data: {})
     item = {}
     item[:title] = response.xpath("//title").text.squish
     item[:name] = response.css("h1").text.squish

     response.css(".badge").each do |badge|
       text = badge.text
       STATUS_PATTERNS.each { |status, pattern| item[:status] = status if text.match?(pattern) }
     end

     response.css("img").each do |img|
       src = img[:src]
       next unless src # an <img> without a src attribute has nothing to classify

       item[:landing] = src if src.match?(/screenshots/)
       item[:logo] = src if src.match?(/logo/)
     end

     # The last ".btn-primary" element carrying an href wins.
     response.css('.btn-primary').each do |button|
       href = button[:href]
       item[:www] = href if href
     end

     item[:category] = response.xpath("//p[contains(., 'Category:')]/a").text
     item[:batch] = response.xpath("//p[contains(., 'Batch:')]/a").text

     # Each stat lives in a badge next to an <h6> heading; the key is only set
     # when the heading exists on the page.
     STAT_HEADINGS.each do |key, heading|
       h6 = response.at_xpath("//div[@class='d-flex']/h6[contains(., '#{heading}')]")
       item[key] = h6.parent.css('.badge').text.squish if h6
     end

     save_to "results.json", item, format: :pretty_json

     # NOTE(review): "YCDBUrl" receives the company website (item[:www]), not
     # the YCDB page URL passed in as `url` - confirm which one is intended.
     record = yc_table.new("Name" => item[:name], "YCDBUrl" => item[:www])
     assign_present(record, "LandingPage", item[:landing])
     record["LandingImage"] = [{ url: item[:landing] }] if item[:landing]
     # Every column is written when its OWN value is present. Employees go to
     # an "employees" column (the table must have one) instead of overwriting
     # the "logo" column.
     %i[logo www category batch funding employees alexa status].each do |field|
       assign_present(record, field.to_s, item[field])
     end
     record.save

     item
   end

   private

   # Airtable table handle, built on first use so that parse_repo_page also
   # works when it is entered without going through parse first.
   def yc_table
     @yc_table ||= Airrecord.table("key000000000", "app000000000", "Table 1")
   end

   # Sets record[column] only when value is neither nil nor an empty string.
   def assign_present(record, column, value)
     record[column] = value unless value.nil? || value.empty?
   end
 end

 # Crawl through the YC batch data: Summer ("s") and Winter ("w") cohorts, 06 through 18

 # Visit the "s" batch page and then the "w" batch page for every two-digit
 # year from 06 to 18, running the spider's parse handler on each.
 6.upto(18) do |year|
   suffix = year.to_s.rjust(2, '0')

   %w[s w].each do |season|
     YCDBSpider.parse!(:parse, url: "https://www.ycdb.co/batch/#{season}#{suffix}")
   end
 end
	require 'airrecord'
	require 'kimurai'



	# Use the Capybara-based Kimurai web crawler to iterate through the YCDB.co data and populate an Airtable spreadsheet.

	# Spider that walks a YCDB.co batch listing page, visits every company page
	# linked from it, and stores each scraped company both in results.json and as
	# a new row in an Airtable table.
	class YCDBSpider < Kimurai::Base
	  @name = "example_spider"
	  @engine = :mechanize
	  @start_urls = ["https://www.ycdb.co/batch/w00"]
	  @config = {
	    user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
	    after_request: { delay: 4..7 }
	  }

	  # Status label => pattern looked for in each ".badge" text. Order matters:
	  # later entries overwrite earlier ones when several patterns match.
	  STATUS_PATTERNS = { "Dead" => /dead/i, "Exited" => /exited/i, "Live" => /live/i }.freeze

	  # Item key => heading text of the "d-flex" stat box the value is read from.
	  STAT_HEADINGS = { funding: "Funding", employees: "Employees", alexa: "Alexa" }.freeze

	  # Handles a batch listing page: every link found in a table cell is sent to
	  # parse_repo_page, then the first "page-link" anchor (if any) is followed
	  # back into this method.
	  #
	  # @param response [Object] parsed page exposing #xpath / #at_xpath
	  # @param url [String] URL of the page, used as base for relative links
	  # @param data [Hash] unused
	  def parse(response, url:, data: {})
	    response.xpath("//tr/td/a").each do |link|
	      request_to :parse_repo_page, url: absolute_url(link[:href], base: url)
	    end

	    # NOTE(review): this always takes the FIRST page-link anchor on the page;
	    # confirm against the site markup that it is really the "next" link.
	    next_page = response.at_xpath("//a[@class='page-link']")
	    request_to :parse, url: absolute_url(next_page[:href], base: url) if next_page
	  end

	  # Scrapes one company page into a hash, appends it to results.json and
	  # saves it as a new Airtable record.
	  #
	  # @param response [Object] parsed page exposing #xpath / #at_xpath / #css
	  # @param url [String] URL of the company page (currently unused)
	  # @param data [Hash] unused
	  # @return [Hash] the scraped item
	  def parse_repo_page(response, url:, data: {})
	    item = {}
	    item[:title] = response.xpath("//title").text.squish
	    item[:name] = response.css("h1").text.squish

	    response.css(".badge").each do |badge|
	      text = badge.text
	      STATUS_PATTERNS.each { |status, pattern| item[:status] = status if text.match?(pattern) }
	    end

	    response.css("img").each do |img|
	      src = img[:src]
	      next unless src # an <img> without a src attribute has nothing to classify

	      item[:landing] = src if src.match?(/screenshots/)
	      item[:logo] = src if src.match?(/logo/)
	    end

	    # The last ".btn-primary" element carrying an href wins.
	    response.css('.btn-primary').each do |button|
	      href = button[:href]
	      item[:www] = href if href
	    end

	    item[:category] = response.xpath("//p[contains(., 'Category:')]/a").text
	    item[:batch] = response.xpath("//p[contains(., 'Batch:')]/a").text

	    # Each stat lives in a badge next to an <h6> heading; the key is only set
	    # when the heading exists on the page.
	    STAT_HEADINGS.each do |key, heading|
	      h6 = response.at_xpath("//div[@class='d-flex']/h6[contains(., '#{heading}')]")
	      item[key] = h6.parent.css('.badge').text.squish if h6
	    end

	    save_to "results.json", item, format: :pretty_json

	    # NOTE(review): "YCDBUrl" receives the company website (item[:www]), not
	    # the YCDB page URL passed in as `url` - confirm which one is intended.
	    record = yc_table.new("Name" => item[:name], "YCDBUrl" => item[:www])
	    assign_present(record, "LandingPage", item[:landing])
	    record["LandingImage"] = [{ url: item[:landing] }] if item[:landing]
	    # Every column is written when its OWN value is present. Employees go to
	    # an "employees" column (the table must have one) instead of overwriting
	    # the "logo" column.
	    %i[logo www category batch funding employees alexa status].each do |field|
	      assign_present(record, field.to_s, item[field])
	    end
	    record.save

	    item
	  end

	  private

	  # Airtable table handle, built on first use so that parse_repo_page also
	  # works when it is entered without going through parse first.
	  def yc_table
	    @yc_table ||= Airrecord.table("key000000000", "app000000000", "Table 1")
	  end

	  # Sets record[column] only when value is neither nil nor an empty string.
	  def assign_present(record, column, value)
	    record[column] = value unless value.nil? || value.empty?
	  end
	end

	# Crawl through the YC batch data: Summer ("s") and Winter ("w") cohorts, 06 through 18

	# Visit the "s" batch page and then the "w" batch page for every two-digit
	# year from 06 to 18, running the spider's parse handler on each.
	6.upto(18) do |year|
	  suffix = year.to_s.rjust(2, '0')

	  %w[s w].each do |season|
	    YCDBSpider.parse!(:parse, url: "https://www.ycdb.co/batch/#{season}#{suffix}")
	  end
	end
No results found