Skip to content

Instantly share code, notes, and snippets.

@alexgodin
Created November 14, 2024 19:39
Show Gist options
  • Save alexgodin/9a4e25a8fdab85f718146d061263bbd4 to your computer and use it in GitHub Desktop.
Save alexgodin/9a4e25a8fdab85f718146d061263bbd4 to your computer and use it in GitHub Desktop.
# Sidekiq job that visits the external URL of PatternImport records that have
# not been scraped yet, collects candidate image URLs from the rendered page,
# stores the sufficiently large ones and records the outcome on the import.
class FindProductImagesJob
  include Sidekiq::Job

  # Images whose width or height is at or below this many pixels are skipped.
  MIN_IMAGE_DIMENSION = 250

  # JavaScript globals that are probed for lists of image URLs.
  JS_IMAGE_VARIABLES = %w[productImageUrls galleryImages productImages].freeze

  # Element attributes that are probed for image URLs.
  IMAGE_DATA_ATTRIBUTES = %w[
    data-full-size-url data-large-image data-zoom-image data-original data-image data-image-source
  ].freeze

  # Job entry point: opens one browser and processes a batch of imports with it.
  def perform
    with_browser do |browser|
      find_and_process_urls(browser)
    end
  end

  # Starts a headless Chrome session, yields it and always quits it afterwards.
  #
  # The Chrome binary and chromedriver paths are read from
  # ENV['GOOGLE_CHROME_SHIM'] and ENV['CHROMEDRIVER_PATH'].
  #
  # @yieldparam browser [Selenium::WebDriver::Driver]
  # @return [Object] whatever the block returns
  def with_browser
    options = Selenium::WebDriver::Chrome::Options.new
    options.binary = ENV['GOOGLE_CHROME_SHIM']
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    browser = Selenium::WebDriver.for(:chrome,
                                      options:,
                                      service: Selenium::WebDriver::Chrome::Service.new(path: ENV['CHROMEDRIVER_PATH']))
    browser.manage.timeouts.implicit_wait = 10
    browser.manage.window.resize_to(800, 1200)
    yield(browser)
  ensure
    # browser is nil when driver start-up itself failed.
    browser&.quit
  end

  # Scrapes up to 100 randomly chosen imports that have no last_image_scrape.
  # A failure on one import is logged on that import and does not stop the batch.
  #
  # @param browser [Selenium::WebDriver::Driver]
  def find_and_process_urls(browser)
    # TODO: schedule this job to run with concurrency 1
    imports = PatternImport.where(last_image_scrape: nil).order('RANDOM()').limit(100)
    imports.each do |import|
      puts import.external_url
      result = process_single_url(import, browser)
      screenshot = take_screenshot(browser)
      save_images(import, result[:images])
      log_success(import, result[:html], screenshot)
    rescue StandardError => e
      log_errors(import, e)
    ensure
      # take_screenshot hands back an open File; release the descriptor once
      # this import is done (screenshot is nil if we failed before taking it).
      screenshot&.close
    end
  end

  # Loads the import's page and gathers its image candidates and HTML.
  #
  # @param import [PatternImport]
  # @param browser [Selenium::WebDriver::Driver]
  # @return [Hash] { images: Array<Hash>, html: String }
  def process_single_url(import, browser)
    browser.get(import.external_url)
    image_urls = extract_image_urls(browser)
    images = image_urls.filter_map { |u| filter_size_and_build_image_object(u) }
    html = browser.page_source
    { images:, html: }
  end

  # Runs every extraction strategy and merges the results.
  #
  # @param browser [Selenium::WebDriver::Driver]
  # @return [Array<String>] unique, non-empty candidate URLs
  def extract_image_urls(browser)
    [
      extract_via_data_attributes(browser),
      extract_pure_images(browser),
      extract_via_js(browser),
      extract_srcsets(browser),
      extract_woocommerce_images(browser)
    ].flatten
      .compact
      # Blank values and inline data: URIs would be turned into unusable
      # "https://..." URLs by add_http, so drop them before any network call.
      .reject { |u| u.empty? || u.start_with?('data:') }
      .uniq
  end

  # Reads image URLs from well-known JavaScript globals, when the page defines them.
  #
  # @param browser [Selenium::WebDriver::Driver]
  # @return [Array<String>]
  def extract_via_js(browser)
    JS_IMAGE_VARIABLES.flat_map do |variable|
      script = "return typeof #{variable} !== 'undefined' ? JSON.stringify(#{variable}) : null;"
      result = browser.execute_script(script)
      next [] unless result

      # The global may hold anything (object, nested arrays, null); keep only
      # plain strings so later String-only calls such as add_http cannot fail.
      [JSON.parse(result)].flatten.grep(String)
    end
  end

  # @param browser [Selenium::WebDriver::Driver]
  # @return [Array<String, nil>] the src attribute of every <img> element
  def extract_pure_images(browser)
    browser.find_elements(css: 'img').map { |i| i.attribute('src') }
  end

  # Collects the values of IMAGE_DATA_ATTRIBUTES found on div, img and a elements.
  #
  # @param browser [Selenium::WebDriver::Driver]
  # @return [Array<String>]
  def extract_via_data_attributes(browser)
    xpath_attributes = IMAGE_DATA_ATTRIBUTES.map { |attr| "@#{attr}" }.join(' or ')
    xpath = "//*[self::div or self::img or self::a][#{xpath_attributes}]"
    browser.find_elements(:xpath, xpath).flat_map do |el|
      IMAGE_DATA_ATTRIBUTES.filter_map { |attr| el.attribute(attr) }
    end.uniq
  end

  # Collects the URL part of every candidate in img/source srcset attributes.
  #
  # @param browser [Selenium::WebDriver::Driver]
  # @return [Array<String>]
  def extract_srcsets(browser)
    browser.find_elements(css: 'img[srcset], source[srcset]').flat_map do |el|
      srcset = el.attribute('srcset')
      # `next`, not `return`: `return` would leave the whole method and throw
      # away the URLs already collected from the other elements.
      next [] if srcset.nil? || srcset.empty?

      # Each candidate is "<url> <descriptor>"; keep the URL. filter_map drops
      # the nil produced by an empty candidate (e.g. after a trailing comma).
      srcset.split(',').filter_map { |src| src.strip.split(/\s+/).first }
    end
  end

  # @param browser [Selenium::WebDriver::Driver]
  # @return [Array<String, nil>] href of every link inside a WooCommerce gallery image
  def extract_woocommerce_images(browser)
    browser.find_elements(css: '.woocommerce-product-gallery__image a').map { |link| link.attribute('href') }
  end

  # Creates one pattern_import_images record per image and attaches the
  # downloaded file to it. A failure on one image is logged and skipped.
  #
  # @param url [PatternImport] the import that owns the images
  # @param images [Array<Hash>] hashes as built by filter_size_and_build_image_object
  def save_images(url, images)
    puts 'SAVING'
    images.each do |image|
      record = url.pattern_import_images.create!(image)
      filename = File.basename(URI.parse(image[:url]).path)
      io = Down.download(image[:url])
      record.image.attach(io:, filename:)
    rescue StandardError => e
      # Log and move on to the next image instead of dropping into a debugger,
      # which would stall (or crash) a background worker.
      puts 'ERROR'
      puts e
    ensure
      # Remove the downloaded temp file right away (io is nil if we failed
      # before the download).
      io&.close!
    end
  end

  # Saves a screenshot of the current page to tmp/screenshot.png.
  # NOTE: the path is fixed, so concurrent runs would overwrite each other's file.
  #
  # @param browser [Selenium::WebDriver::Driver]
  # @return [File] the opened screenshot; the caller is responsible for closing it
  def take_screenshot(browser)
    screenshot_path = Rails.root.join('tmp', 'screenshot.png').to_s
    browser.save_screenshot(screenshot_path)
    File.open(screenshot_path, 'rb')
  end

  # Builds the attributes hash for an image, or nil when its size cannot be
  # determined or either dimension is not larger than MIN_IMAGE_DIMENSION.
  #
  # @param image_url [String]
  # @return [Hash, nil] { url:, size_x:, size_y: }
  def filter_size_and_build_image_object(image_url)
    url = add_http(image_url)
    # Measure the normalised URL: the raw value may lack a scheme, in which
    # case it is not a fetchable address.
    size_x, size_y = FastImage.size(url)
    return if size_x.nil? || size_y.nil?
    return if size_x <= MIN_IMAGE_DIMENSION || size_y <= MIN_IMAGE_DIMENSION

    { url:, size_x:, size_y: }
  end

  # Ensures the URL carries a scheme, defaulting to https.
  #
  # @param url [String]
  # @return [String]
  def add_http(url)
    if url.start_with?('http:', 'https:')
      url
    elsif url.start_with?('//')
      # Protocol-relative URL: only the scheme is missing.
      "https:#{url}"
    else
      "https://#{url}"
    end
  end

  # Prints the error and marks the import as scraped unsuccessfully.
  #
  # @param url [PatternImport]
  # @param e [StandardError]
  def log_errors(url, e)
    puts 'ERROR'
    puts e
    puts e.backtrace
    url.update(last_image_scrape: Time.current, success: false)
  end

  # Marks the import as scraped successfully, stores its HTML and attaches the
  # screenshot. A failing screenshot attachment is logged but does not undo
  # the success status.
  #
  # @param url [PatternImport]
  # @param html [String]
  # @param screenshot [File]
  def log_success(url, html, screenshot)
    puts 'SUCCESS'
    url.update(last_image_scrape: Time.current, success: true, html:)
    begin
      url.screenshot.attach(io: screenshot, filename: 'screenshot.png')
    rescue StandardError => e
      puts 'ERROR'
      puts e
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment