Created
May 31, 2025 07:26
-
-
Save swombat/e3d96562cb1508bda6d14433165af45b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "faraday" | |
require "json" | |
# Small client for the Browserless HTTP API.
#
# Wraps the `/content` and `/scrape` endpoints and runs every request through
# #retry_block, which retries failures and returns nil when it gives up.
class BrowserlessApi
  # A timed-out request is attempted at most this many times (with backoff).
  MAX_TIMEOUT_TRIES = 6
  # Any other failing request is attempted at most this many times (no backoff).
  MAX_ERROR_TRIES = 3

  # @param api_key [String] token appended to every request URL; defaults to the
  #   Rails credential `browserless.api_key`, or a placeholder when that is unset
  # @param debug [Boolean] when true, requests/responses are logged to $stdout
  #   with the token redacted
  def initialize(
    api_key: Rails.application.credentials.dig(:browserless, :api_key) || "<BROWSERLESS_API_KEY>",
    debug: false
  )
    @api_key = api_key
    @base_url = "https://production-sfo.browserless.io"
    @debug = debug
  end

  # Fetches the rendered page for +url+ via POST /content.
  #
  # @param url [String] page to render
  # @return [Object, nil] the response body, or nil once #retry_block gives up
  def get_full_html(url)
    params = {
      url: url,
      waitForTimeout: 10_000
    }

    retry_block { http_client.post("/content", params.to_json).body }
  end

  # Scrapes every `a` element of +url+ via POST /scrape.
  #
  # @param url [String] page to scrape
  # @return [Object, nil] the response body, or nil once #retry_block gives up
  def get_links(url)
    params = {
      url: url,
      elements: [
        { selector: "a" }
      ],
      waitForSelector: {
        selector: "a",
        timeout: 5000
      }
    }

    retry_block { http_client.post("/scrape", params.to_json).body }
  end

  # Collects the distinct `href` values of the links found on +url+.
  #
  # @param url [String] page to scrape
  # @return [Array<String>] unique href values; empty when the request failed
  #   or the response carries no results
  def extract_link_urls(url)
    links_response = get_links(url)
    # get_links returns nil after retries are exhausted; only a Hash can be
    # dug into with string keys, so anything else means "no links".
    return [] unless links_response.is_a?(Hash)

    results = links_response.dig("data", 0, "results") || []
    hrefs = results.map do |link|
      href_attr = link["attributes"]&.find { |attr| attr["name"] == "href" }
      href_attr && href_attr["value"]
    end
    hrefs.compact.uniq
  end

  # Faraday connection used for all requests; built once and then reused.
  #
  # @return [Faraday::Connection]
  def http_client
    @http_client ||= Faraday.new(base_url) do |faraday|
      faraday.request :url_encoded
      faraday.headers["Content-Type"] = "application/json"
      faraday.headers["Cache-Control"] = "no-cache"
      if @debug
        faraday.response :logger, ::Logger.new($stdout), { bodies: true } do |logger|
          # Keep the API token out of the debug log.
          logger.filter(/(token=)(\S+)/, '\1REDACTED')
        end
      end
      faraday.response :raise_error
      faraday.response :json
    end
  end

  # @return [String] the service URL with the API token as a query parameter
  def base_url
    "#{@base_url}?token=#{@api_key}"
  end

  # Runs the given block, retrying when it raises.
  #
  # * Faraday::TimeoutError: sleeps 2**attempt seconds between attempts, up to
  #   MAX_TIMEOUT_TRIES attempts.
  # * Any other StandardError: retried immediately, up to MAX_ERROR_TRIES attempts.
  #
  # Both paths share one attempt counter. Errors are logged, never re-raised.
  #
  # @yield the work to attempt
  # @return [Object, nil] the block's value, or nil when every attempt failed
  def retry_block
    tries_counter = 0
    begin
      yield
    rescue Faraday::TimeoutError => e
      Rails.logger.error "Faraday::TimeoutError: #{e}"
      tries_counter += 1
      if tries_counter < MAX_TIMEOUT_TRIES
        sleep_time = 2**tries_counter
        Rails.logger.info "Sleeping for #{sleep_time} seconds and trying again"
        sleep sleep_time
        Rails.logger.info "Done - trying again"
        retry
      end
      Rails.logger.error "Giving up after #{tries_counter} tries"
      nil
    rescue StandardError => e
      Rails.logger.error "Unknown Error (tries: #{tries_counter}): #{e}"
      tries_counter += 1
      retry if tries_counter < MAX_ERROR_TRIES
      Rails.logger.error "Giving up after #{tries_counter} tries"
      nil
    end
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "test_helper" | |
require "support/vcr_setup" | |
# Tests for BrowserlessApi.
#
# Tests that hit the service replay VCR cassettes. The retry_block tests never
# touch the network, so they need no cassette; `sleep` is replaced on the
# instance so the backoff can be asserted without actually waiting.
class BrowserlessApiTest < ActiveSupport::TestCase
  SERVICE_URL = "https://production-sfo.browserless.io"
  SITE_URL = "https://granttree.co.uk"

  setup do
    @api = BrowserlessApi.new

    # Record requested sleep durations instead of sleeping. A local is captured
    # because instance variables inside the block would belong to @api.
    @sleeps = []
    sleeps = @sleeps
    @api.define_singleton_method(:sleep) { |seconds| sleeps << seconds }
  end

  test "initialization sets correct default parameters" do
    expected_key = Rails.application.credentials.dig(:browserless, :api_key) || "<BROWSERLESS_API_KEY>"

    assert_equal expected_key, @api.instance_variable_get(:@api_key)
    assert_equal SERVICE_URL, @api.instance_variable_get(:@base_url)
    assert_equal false, @api.instance_variable_get(:@debug)
    assert_not_nil @api.instance_variable_get(:@api_key)
  end

  test "initialization sets custom parameters correctly" do
    custom_api_key = "custom_api_key_123"
    custom_api = BrowserlessApi.new(api_key: custom_api_key, debug: true)

    assert_equal custom_api_key, custom_api.instance_variable_get(:@api_key)
    assert_equal SERVICE_URL, custom_api.instance_variable_get(:@base_url)
    assert_equal true, custom_api.instance_variable_get(:@debug)
  end

  test "base_url includes API token" do
    api_key = @api.instance_variable_get(:@api_key)

    assert_equal "#{SERVICE_URL}?token=#{api_key}", @api.base_url
  end

  test "get_full_html retrieves HTML content from a URL" do
    VCR.use_cassette("browserless_api/get_full_html") do
      response = @api.get_full_html(SITE_URL)

      assert_not_nil response
      assert_kind_of String, response, "Response should be a String but was #{response.class}"
      assert_operator response.length, :>, 0, "Response should not be empty"
      assert_includes response, "GrantTree", "Response should contain expected content"
    end
  end

  test "get_links retrieves links data from a URL" do
    VCR.use_cassette("browserless_api/get_links") do
      response = @api.get_links(SITE_URL)

      assert_not_nil response
      assert_kind_of Hash, response, "Response should be a Hash but was #{response.class}"
      assert response.key?("data"), "Response should have 'data' key"

      results = response.dig("data", 0, "results")
      assert_not_nil results, "Response should have results"
      assert_kind_of Array, results, "Results should be an Array"
      assert_operator results.length, :>, 0, "Should find at least one link"
    end
  end

  test "extract_link_urls extracts unique URLs from links response" do
    VCR.use_cassette("browserless_api/extract_link_urls") do
      urls = @api.extract_link_urls(SITE_URL)

      assert_not_nil urls
      assert_kind_of Array, urls, "URLs should be an Array but was #{urls.class}"
      assert_operator urls.length, :>, 0, "Should find at least one URL"

      urls.each do |link_url|
        assert_kind_of String, link_url, "Each URL should be a String"
        assert_operator link_url.length, :>, 0, "URLs should not be empty"
      end

      assert_includes urls, "https://granttree.co.uk/services/rd-tax-credits/"
      assert_equal urls.uniq, urls, "URLs should be unique"
    end
  end

  test "retry_block handles timeouts and retries correctly" do
    execution_count = 0

    result = @api.retry_block do
      execution_count += 1
      raise Faraday::TimeoutError, "Simulated timeout" if execution_count == 1

      :succeeded
    end

    assert_equal :succeeded, result, "The block should have been retried and succeeded"
    assert_equal 2, execution_count, "Block should have executed twice (once failing, once succeeding)"
    assert_equal [2], @sleeps, "Should have backed off once before the retry"
  end

  test "retry_block respects maximum retry attempts" do
    max_retries = 5
    execution_count = 0

    @api.retry_block do
      execution_count += 1
      raise Faraday::TimeoutError, "Simulated timeout" if execution_count <= max_retries
    end

    assert_equal max_retries + 1, execution_count,
                 "Should have attempted #{max_retries + 1} times (original + max retries)"
    assert_equal [2, 4, 8, 16, 32], @sleeps, "Backoff should double on every retry"
  end

  test "retry_block gives up and returns nil once timeouts are exhausted" do
    execution_count = 0

    result = @api.retry_block do
      execution_count += 1
      raise Faraday::TimeoutError, "Simulated timeout"
    end

    assert_nil result, "An exhausted retry_block should return nil"
    assert_equal 6, execution_count, "Should stop after six attempts"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment