hcarreras · June 5, 2023 13:10
diff --git a/scrape.rb b/scrape.rb
 require 'capybara'
 require 'selenium-webdriver'

 # Scrapes the visible text of every page listed in `urls` using a Chrome
 # browser driven through Capybara/Selenium. For each page it accepts the
 # cookie banner (if one is shown), removes the text of all <footer> elements
 # from the page text, and writes the remainder to "<last URL segment>.txt"
 # in the current working directory. A failure on one URL is reported and the
 # script continues with the next one.

 # List of URLs to be iterated
 urls = [
   'https://example.com/kontakt/om-os/miljoe-csr',
   'https://example.com/kontakt/om-os',
 ]

 # Capybara setup: register a Selenium driver backed by Chrome and use it for
 # both regular and JavaScript sessions.
 Capybara.register_driver :selenium do |app|
   Capybara::Selenium::Driver.new(app, browser: :chrome)
 end

 Capybara.default_driver = :selenium
 Capybara.javascript_driver = :selenium

 session = Capybara.current_session

 begin
   urls.each do |url|
     begin
       puts "Visiting #{url}"
       session.visit url

       # Accept cookies
       if session.has_link?("Tillad alle")
         puts "Accepting cookies"
         session.click_link("Tillad alle")
       end

       sleep 2 # Wait for the page to load after clicking the accept button

       valid_text = session.text

       # The pages I was scraping had 2 footers with irrelevant info, so I removed the text from them.
       # It was not possible to simply select the relevant content because it had no structure (no class names, no hierarchy)
       if session.has_css?('footer')
         session.all('footer').each do |footer|
           footer_text = footer.text
           # An empty footer has nothing to strip; skip the no-op substitution.
           next if footer_text.empty?

           valid_text = valid_text.gsub(footer_text, '')
         end
       end

       # Name the output file after the last path segment of the URL.
       file_name = "#{url.split('/').last}.txt"
       puts "saving into #{file_name}"
       File.write(file_name, valid_text)
     rescue => e
       puts "Error processing #{url}: #{e}"
     end
   end
 ensure
   # Always shut the browser down, even if the loop is interrupted; otherwise
   # the Chrome/chromedriver processes are left running after the script ends.
   session.quit
 end
	require 'capybara'
	require 'selenium-webdriver'

	# Scrapes the visible text of every page listed in `urls` using a Chrome
	# browser driven through Capybara/Selenium. For each page it accepts the
	# cookie banner (if one is shown), removes the text of all <footer> elements
	# from the page text, and writes the remainder to "<last URL segment>.txt"
	# in the current working directory. A failure on one URL is reported and the
	# script continues with the next one.

	# List of URLs to be iterated
	urls = [
	  'https://example.com/kontakt/om-os/miljoe-csr',
	  'https://example.com/kontakt/om-os',
	]

	# Capybara setup: register a Selenium driver backed by Chrome and use it for
	# both regular and JavaScript sessions.
	Capybara.register_driver :selenium do |app|
	  Capybara::Selenium::Driver.new(app, browser: :chrome)
	end

	Capybara.default_driver = :selenium
	Capybara.javascript_driver = :selenium

	session = Capybara.current_session

	begin
	  urls.each do |url|
	    begin
	      puts "Visiting #{url}"
	      session.visit url

	      # Accept cookies
	      if session.has_link?("Tillad alle")
	        puts "Accepting cookies"
	        session.click_link("Tillad alle")
	      end

	      sleep 2 # Wait for the page to load after clicking the accept button

	      valid_text = session.text

	      # The pages I was scraping had 2 footers with irrelevant info, so I removed the text from them.
	      # It was not possible to simply select the relevant content because it had no structure (no class names, no hierarchy)
	      if session.has_css?('footer')
	        session.all('footer').each do |footer|
	          footer_text = footer.text
	          # An empty footer has nothing to strip; skip the no-op substitution.
	          next if footer_text.empty?

	          valid_text = valid_text.gsub(footer_text, '')
	        end
	      end

	      # Name the output file after the last path segment of the URL.
	      file_name = "#{url.split('/').last}.txt"
	      puts "saving into #{file_name}"
	      File.write(file_name, valid_text)
	    rescue => e
	      puts "Error processing #{url}: #{e}"
	    end
	  end
	ensure
	  # Always shut the browser down, even if the loop is interrupted; otherwise
	  # the Chrome/chromedriver processes are left running after the script ends.
	  session.quit
	end
No results found