Skip to content

Instantly share code, notes, and snippets.

@swombat
Created May 31, 2025 07:26
Show Gist options
  • Save swombat/e3d96562cb1508bda6d14433165af45b to your computer and use it in GitHub Desktop.
Save swombat/e3d96562cb1508bda6d14433165af45b to your computer and use it in GitHub Desktop.
require "faraday"
require "json"
# Small client for the Browserless HTTP API.
#
# Every request is a JSON POST against a fixed base URL, with the API key sent
# as a `token` query parameter. Network calls go through #retry_block, which
# retries failures and returns nil (it does not raise) once it gives up.
class BrowserlessApi
  # Total attempts (the first call included) made while the block keeps
  # raising Faraday::TimeoutError.
  MAX_TIMEOUT_ATTEMPTS = 6

  # Total attempts made while the block keeps raising any other StandardError.
  MAX_ERROR_ATTEMPTS = 3

  # @param api_key [String] Browserless token; defaults to the
  #   `browserless.api_key` Rails credential, or a placeholder string when
  #   that credential is not set.
  # @param debug [Boolean] when true, requests and responses (bodies
  #   included) are logged to $stdout with the token redacted.
  def initialize(api_key: Rails.application.credentials.dig(:browserless, :api_key) || "<BROWSERLESS_API_KEY>", debug: false)
    @api_key = api_key
    @base_url = "https://production-sfo.browserless.io"
    @debug = debug
  end

  # POSTs to `/content` for the given page and returns the response body.
  #
  # @param url [String] page to load
  # @return [Object, nil] the response body (the accompanying tests expect an
  #   HTML String), or nil when every retry failed
  def get_full_html(url)
    params = {
      url: url,
      waitForTimeout: 10000 # presumably milliseconds - confirm against the Browserless docs
    }

    retry_block do
      http_client.post("/content", params.to_json).body
    end
  end

  # POSTs to `/scrape`, asking for every element matching the `a` selector.
  #
  # @param url [String] page to load
  # @return [Object, nil] the response body (parsed by the :json middleware
  #   when the response is JSON), or nil when every retry failed
  def get_links(url)
    params = {
      url: url,
      elements: [
        { selector: "a" }
      ],
      waitForSelector: {
        selector: "a",
        timeout: 5000
      }
    }

    retry_block do
      http_client.post("/scrape", params.to_json).body
    end
  end

  # Collects the distinct `href` values found by #get_links.
  #
  # @param url [String] page to load
  # @return [Array<String>] unique href values in first-seen order; empty when
  #   the request failed or the response carries no results
  def extract_link_urls(url)
    links_response = get_links(url)
    # get_links yields nil once retries are exhausted, and a non-JSON reply is
    # left as a String; neither can be dug into, so report "no links" instead
    # of raising NoMethodError.
    return [] unless links_response.is_a?(Hash)

    results = links_response.dig("data", 0, "results") || []
    results.map do |link|
      href_attr = link["attributes"]&.find { |attr| attr["name"] == "href" }
      href_attr&.dig("value")
    end.compact.uniq
  end

  # Faraday connection used for every request. It is built once per instance
  # and reused, so the middleware stack is not rebuilt on each call or retry.
  #
  # @return [Faraday::Connection]
  def http_client
    @http_client ||= Faraday.new(base_url) do |faraday|
      faraday.request :url_encoded
      faraday.headers["Content-Type"] = "application/json"
      faraday.headers["Cache-Control"] = "no-cache"

      if @debug
        faraday.response :logger, ::Logger.new($stdout), { bodies: true } do |logger|
          # Keep the API token out of the debug log.
          logger.filter(/(token=)(\S+)/, '\1REDACTED')
        end
      end

      faraday.response :raise_error
      faraday.response :json
    end
  end

  # @return [String] the service URL with the API key appended as `token`
  def base_url
    "#{@base_url}?token=#{@api_key}"
  end

  # Runs the block, retrying when it raises.
  #
  # * Faraday::TimeoutError: up to MAX_TIMEOUT_ATTEMPTS attempts in total,
  #   sleeping 2**n seconds before retry n (2, 4, 8, 16, 32).
  # * Any other StandardError: retried immediately, up to MAX_ERROR_ATTEMPTS
  #   attempts in total.
  #
  # Both branches share a single attempt counter, so timeouts that happened
  # earlier count against the budget for other errors and vice versa.
  #
  # @yield the work to attempt
  # @return [Object, nil] the block's value, or nil once retries are exhausted
  #   (the failure is logged, not re-raised)
  def retry_block
    attempts = 0

    # `retry` re-runs this begin block only, so `attempts` survives each retry.
    begin
      yield
    rescue Faraday::TimeoutError => e
      Rails.logger.error "Faraday::TimeoutError: #{e}"
      attempts += 1

      if attempts < MAX_TIMEOUT_ATTEMPTS
        sleep_time = 2**attempts
        Rails.logger.info "Sleeping for #{sleep_time} seconds and trying again"
        sleep sleep_time
        Rails.logger.info "Done - trying again"
        retry
      end

      Rails.logger.error "Giving up after #{attempts} attempts"
      nil
    rescue StandardError => e
      Rails.logger.error "Unknown Error (tries: #{attempts}): #{e}"
      attempts += 1
      retry if attempts < MAX_ERROR_ATTEMPTS

      Rails.logger.error "Giving up after #{attempts} attempts"
      nil
    end
  end
end
require "test_helper"
require "support/vcr_setup"
# Tests for BrowserlessApi: constructor defaults, URL building, the three
# HTTP-backed calls (run inside VCR cassettes) and the retry helper.
class BrowserlessApiTest < ActiveSupport::TestCase
  setup do
    @api = BrowserlessApi.new
  end

  test "initialization sets correct default parameters" do
    assert_equal Rails.application.credentials.dig(:browserless, :api_key) || "<BROWSERLESS_API_KEY>", @api.instance_variable_get(:@api_key)
    assert_equal "https://production-sfo.browserless.io", @api.instance_variable_get(:@base_url)
    assert_equal false, @api.instance_variable_get(:@debug)
    assert_not_nil @api.instance_variable_get(:@api_key)
  end

  test "initialization sets custom parameters correctly" do
    custom_api_key = "custom_api_key_123"
    custom_api = BrowserlessApi.new(api_key: custom_api_key, debug: true)

    assert_equal custom_api_key, custom_api.instance_variable_get(:@api_key)
    assert_equal "https://production-sfo.browserless.io", custom_api.instance_variable_get(:@base_url)
    assert_equal true, custom_api.instance_variable_get(:@debug)
  end

  test "base_url includes API token" do
    api_key = @api.instance_variable_get(:@api_key)
    expected_url = "https://production-sfo.browserless.io?token=#{api_key}"

    assert_equal expected_url, @api.base_url
  end

  test "get_full_html retrieves HTML content from a URL" do
    VCR.use_cassette("browserless_api/get_full_html") do
      response = @api.get_full_html("https://granttree.co.uk")

      assert_not_nil response
      assert_kind_of String, response, "Response should be a String but was #{response.class}"
      assert_not_empty response, "Response should not be empty"
      assert_includes response, "GrantTree", "Response should contain expected content"
    end
  end

  test "get_links retrieves links data from a URL" do
    VCR.use_cassette("browserless_api/get_links") do
      response = @api.get_links("https://granttree.co.uk")

      assert_not_nil response
      assert_kind_of Hash, response, "Response should be a Hash but was #{response.class}"
      assert response.key?("data"), "Response should have 'data' key"

      results = response.dig("data", 0, "results")
      assert_not_nil results, "Response should have results"
      assert_kind_of Array, results, "Results should be an Array"
      assert_not_empty results, "Should find at least one link"
    end
  end

  test "extract_link_urls extracts unique URLs from links response" do
    VCR.use_cassette("browserless_api/extract_link_urls") do
      urls = @api.extract_link_urls("https://granttree.co.uk")

      assert_not_nil urls
      assert_kind_of Array, urls, "URLs should be an Array but was #{urls.class}"
      assert_not_empty urls, "Should find at least one URL"

      urls.each do |link_url|
        assert_kind_of String, link_url, "Each URL should be a String"
        assert_not_empty link_url, "URLs should not be empty"
      end

      assert_includes urls, "https://granttree.co.uk/services/rd-tax-credits/"
      assert_equal urls.uniq, urls, "URLs should be unique"
    end
  end

  # The retry tests make no HTTP request, so they need no cassette; they do
  # need `sleep` replaced, otherwise the back-off makes them wait for real.
  test "retry_block handles timeouts and retries correctly" do
    sleeps = record_sleeps
    execution_count = 0
    successful_execution = false

    @api.retry_block do
      execution_count += 1
      raise Faraday::TimeoutError, "Simulated timeout" if execution_count == 1

      successful_execution = true
    end

    assert successful_execution, "The block should have been retried and succeeded"
    assert_equal 2, execution_count, "Block should have executed twice (once failing, once succeeding)"
    assert_equal [2], sleeps, "Should have backed off once before the retry"
  end

  test "retry_block respects maximum retry attempts" do
    max_retries = 5
    sleeps = record_sleeps
    execution_count = 0

    # Fail on every attempt so the cap itself is what stops the loop.
    result = @api.retry_block do
      execution_count += 1
      raise Faraday::TimeoutError, "Simulated timeout"
    end

    assert_nil result, "Should return nil once retries are exhausted"
    assert_equal max_retries + 1, execution_count,
                 "Should have attempted #{max_retries + 1} times (original + max retries)"
    assert_equal [2, 4, 8, 16, 32], sleeps, "Should back off exponentially between attempts"
  end

  private

  # Replaces `sleep` on @api with a recorder so retry tests run instantly.
  # Returns the array that collects the requested delays, in call order.
  def record_sleeps
    recorded = []
    @api.define_singleton_method(:sleep) { |seconds| recorded << seconds }
    recorded
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment