Created
November 12, 2023 22:44
-
-
Save fsndzomga/04ca773d6cf944e72432472b1c60b8d6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'selenium-webdriver' | |
require 'webdrivers' | |
require 'yaml' | |
require 'json' | |
chromedriver_path = '/Users/fsndzomga/Downloads/chromedriver-mac-arm64/chromedriver' | |
Selenium::WebDriver::Chrome::Service.driver_path = chromedriver_path | |
class YcScraper | |
YC_BASE_URL = 'https://www.ycombinator.com' | |
YC_COMPANIES_PATH = '/companies' | |
def self.setup_driver | |
options = Selenium::WebDriver::Chrome::Options.new | |
options.add_argument('--headless') | |
options.add_argument('--disable-gpu') | |
options.add_argument('--no-sandbox') # This disables the sandbox for the ChromeDriver session | |
options.add_argument('--disable-dev-shm-usage') # This forces Chrome to use the /tmp directory for shared memory | |
Selenium::WebDriver.for(:chrome, options: options) | |
end | |
def self.scroll_to_load_all_content(driver) | |
last_height = driver.execute_script("return document.body.scrollHeight") | |
no_new_content_loads = 0 # Counter to keep track of consecutive failed content loads | |
loop do | |
# Scroll down to the bottom and wait for the page to load | |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") | |
sleep 5 # Increase if necessary | |
# Check if there are any indicators that no more content is available | |
break if driver.execute_script("return document.body.innerText").include?("No more content") | |
new_height = driver.execute_script("return document.body.scrollHeight") | |
if new_height == last_height | |
no_new_content_loads += 1 | |
# Try reloading 3 times before breaking the loop, in case it's a temporary issue | |
break if no_new_content_loads >= 3 | |
else | |
# Reset the counter if new content is loaded | |
no_new_content_loads = 0 | |
end | |
# Update the last_height to the new height for the next iteration | |
last_height = new_height | |
end | |
end | |
def self.generate_batch_params(year) | |
# Formats the year with leading zeros if less than 10 | |
summer_batch = year < 10 ? "S0#{year}" : "S#{year}" | |
winter_batch = year < 10 ? "W0#{year}" : "W#{year}" | |
[summer_batch, winter_batch] | |
end | |
def self.scrape_by_batches | |
(5..24).each do |year_suffix| | |
generate_batch_params(year_suffix).each do |batch| | |
driver = setup_driver | |
batch_url = "#{YC_BASE_URL}#{YC_COMPANIES_PATH}?batch=#{batch}" | |
puts batch_url | |
driver.navigate.to(batch_url) | |
# Use the refactored scrolling method to ensure all content is loaded | |
scroll_to_load_all_content(driver) | |
begin | |
scrape_startups(driver) # Make sure this method accepts the driver as an argument | |
rescue => e | |
puts "An error occurred: #{e.message}" | |
puts e.backtrace | |
end | |
driver.quit | |
# Pause for 10 seconds before starting the next WebDriver session | |
sleep 30 | |
end | |
end | |
end | |
def self.scrape_startups(driver) | |
counter = 0 | |
# driver = setup_driver | |
# driver.navigate.to(YC_BASE_URL + YC_COMPANIES_PATH) | |
# Parse the page with Nokogiri after scrolling | |
doc = Nokogiri::HTML(driver.page_source) | |
# Initialize an empty array to hold startup data | |
startups = [] | |
doc.css('._company_lx3q7_339').each do |startup_div| | |
name = startup_div.at_css('._coName_lx3q7_454')&.text&.strip || 'N/A' | |
location = startup_div.at_css('._coLocation_lx3q7_470')&.text&.strip || 'N/A' | |
description = startup_div.at_css('._coDescription_lx3q7_479')&.text&.strip || 'N/A' | |
batch_element = startup_div.at_css('.fa-y-combinator') | |
batch = batch_element ? batch_element.parent.text.strip : 'N/A' | |
industry_elements = startup_div.css('._tagLink_lx3q7_1013 .pill') | |
if batch == 'N/A' | |
industry = industry_elements.any? ? industry_elements.map(&:text) : [] | |
else | |
industry = industry_elements.any? ? industry_elements.map(&:text).drop(1) : [] | |
end | |
# Get the outermost a tag href attribute for the startup | |
profile_url = startup_div[:href] ? YC_BASE_URL + startup_div[:href] : nil | |
extended_description = profile_url ? scrape_extended_description(driver, profile_url) : 'N/A' | |
puts "Name: #{name}" | |
puts "Location: #{location}" | |
puts "Description: #{description}" | |
puts "Batch: #{batch}" | |
puts "Industry: #{industry}" | |
puts "Extended Description: #{extended_description}" | |
puts "-----------------------------------" | |
# Add the startup hash to the startups array | |
startup_data = { | |
name: name, | |
location: location, | |
description: description, | |
batch: batch, | |
industry: industry, | |
extended_description: extended_description | |
} | |
startups << startup_data | |
# Write the current state of startups array to the YAML file | |
File.open("startups.yml", "a") { |file| file.write(startup_data.to_yaml) } | |
# Write the current startup data to the JSON file | |
File.open("startups.json", "a") do |file| | |
file.puts(JSON.generate(startup_data)) # Appends each startup data as a JSON object | |
end | |
counter += 1 | |
end | |
# driver.quit # Make sure to quit the driver to free resources | |
puts "Total startups scraped: #{counter}" | |
end | |
def self.scrape_extended_description(driver, profile_url) | |
driver.navigate.to(profile_url) | |
sleep 2 # Wait to load page | |
profile_doc = Nokogiri::HTML(driver.page_source) | |
# Return the text or nil if not found | |
profile_doc.at_css('div.prose p')&.text&.strip | |
rescue OpenURI::HTTPError => e | |
puts "Failed to open #{profile_url}: #{e}" | |
nil | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment