Skip to content

Instantly share code, notes, and snippets.

@hcarreras
Created June 5, 2023 13:10
Show Gist options
  • Save hcarreras/169d8f7a2eed96a0f2eff90e162cbb27 to your computer and use it in GitHub Desktop.
Save hcarreras/169d8f7a2eed96a0f2eff90e162cbb27 to your computer and use it in GitHub Desktop.
Scrape data
require 'capybara'
require 'selenium-webdriver'
# Pages to scrape; each entry is a full URL.
urls = %w[
  https://example.com/kontakt/om-os/miljoe-csr
  https://example.com/kontakt/om-os
]
# Capybara setup.
# Register a driver named :selenium that launches Chrome through Selenium.
Capybara.register_driver(:selenium) do |app|
  Capybara::Selenium::Driver.new(app, browser: :chrome)
end

# Select that driver both as the default and as the JavaScript driver.
Capybara.configure do |config|
  config.default_driver = :selenium
  config.javascript_driver = :selenium
end

# Browser session used for the page visits.
session = Capybara.current_session
# Builds the output file name for +url+: the text after its last "/" plus
# ".txt". Characters that are not allowed in file names on Windows
# (< > : " \ | ? *), e.g. the "?" of a query string, are replaced with "_".
#
# @param url [String] address of the page being scraped
# @return [String] file name for the scraped text
# @raise [ArgumentError] if +url+ has no "/"-separated segment to use as a name
def output_file_name(url)
  last_segment = url.split('/').last
  raise ArgumentError, "cannot derive a file name from #{url.inspect}" unless last_segment

  safe_segment = last_segment.gsub(/[<>:"\\|?*]/, '_')
  "#{safe_segment}.txt"
end

# Returns the text of the page currently shown by +session+ with the text of
# every <footer> element removed.
#
# The pages this was written for had two footers with irrelevant info, and the
# relevant content could not simply be selected because the markup had no
# structure (no class names, no hierarchy), so the footer text is cut out of
# the full page text instead.
#
# @param session [Capybara::Session] session that has already visited the page
# @return [String] page text without the footer text
def text_without_footers(session)
  page_text = session.text
  # `all` gives an empty result when the page has no footer, so no separate
  # existence check is needed. Note that gsub removes every occurrence of a
  # footer's text, wherever it appears in the page text.
  session.all('footer').map(&:text).reduce(page_text) do |text, footer_text|
    text.gsub(footer_text, '')
  end
end

# Link text of the cookie banner's accept button.
cookie_accept_label = 'Tillad alle'

# Visit each URL, accept the cookie banner if present, and save the page text
# (minus footers) into a .txt file named after the URL's last segment.
urls.each do |url|
  puts "Visiting #{url}"
  session.visit url

  # Accept cookies
  if session.has_link?(cookie_accept_label)
    puts 'Accepting cookies'
    session.click_link(cookie_accept_label)
  end
  # Fixed pause to let the page settle after the banner is dismissed; it runs
  # on every page, whether or not a banner was shown.
  sleep 2

  page_text = text_without_footers(session)
  file_name = output_file_name(url)
  puts "saving into #{file_name}"
  File.write(file_name, page_text)
rescue StandardError => e
  # Report the failure and carry on with the next URL.
  puts "Error processing #{url}: #{e}"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment