Created
November 14, 2024 19:39
-
-
Save alexgodin/9a4e25a8fdab85f718146d061263bbd4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FindProductImagesJob | |
include Sidekiq::Job | |
# Job entry point: opens a browser session via with_browser and runs the
# scrape pass (find_and_process_urls) inside it.
def perform
  with_browser { |session| find_and_process_urls(session) }
end
# Builds a headless Chrome WebDriver, yields it to the caller's block, and
# always quits the driver afterwards.
#
# The Chrome binary is taken from ENV['GOOGLE_CHROME_SHIM'] and the
# chromedriver executable from ENV['CHROMEDRIVER_PATH'].
#
# @yield [browser] the configured driver returned by Selenium::WebDriver.for
# @return whatever the block returns
def with_browser
  chrome_options = Selenium::WebDriver::Chrome::Options.new
  chrome_options.binary = ENV['GOOGLE_CHROME_SHIM']
  %w[--headless --no-sandbox --disable-dev-shm-usage --disable-gpu].each do |flag|
    chrome_options.add_argument(flag)
  end

  chromedriver = Selenium::WebDriver::Chrome::Service.new(path: ENV['CHROMEDRIVER_PATH'])
  driver = Selenium::WebDriver.for(:chrome, options: chrome_options, service: chromedriver)
  driver.manage.timeouts.implicit_wait = 10
  driver.manage.window.resize_to(800, 1200)

  yield(driver)
ensure
  # driver is still nil when construction itself raised, hence the safe call.
  driver&.quit
end
# Scrapes up to 100 randomly chosen PatternImport rows that have never been
# scraped (last_image_scrape is nil).
#
# For each import: loads the page, collects candidate images, takes a
# screenshot, stores the images and records the outcome. A failure on one
# import is logged via log_errors and does not stop the remaining imports.
#
# @param browser the WebDriver session shared by every import in this pass
def find_and_process_urls(browser)
  # TODO: schedule this job to run with concurrency 1
  imports = PatternImport.where(last_image_scrape: nil).order('RANDOM()').limit(100)
  imports.each do |import|
    puts import.external_url
    result = process_single_url(import, browser)
    screenshot = take_screenshot(browser)
    save_images(import, result[:images])
    log_success(import, result[:html], screenshot)
  rescue StandardError => e
    log_errors(import, e)
  ensure
    # take_screenshot hands back an open File; release it on every path so a
    # 100-import pass does not accumulate open descriptors.
    screenshot&.close
  end
end
# Navigates the browser to the import's external_url and gathers what the
# scrape needs from the loaded page.
#
# @param import record exposing #external_url
# @param browser WebDriver session
# @return [Hash] :images => image hashes that passed the size filter,
#   :html => the page source after extraction
def process_single_url(import, browser)
  browser.get(import.external_url)
  candidates = extract_image_urls(browser)
  {
    images: candidates.filter_map { |candidate| filter_size_and_build_image_object(candidate) },
    html: browser.page_source
  }
end
# Runs every extraction strategy against the current page and merges the
# results into one flat list with duplicates and nils removed.
#
# @param browser WebDriver session positioned on the page to inspect
# @return [Array] candidate image URLs in strategy order
def extract_image_urls(browser)
  strategies = %i[
    extract_via_data_attributes
    extract_pure_images
    extract_via_js
    extract_srcsets
    extract_woocommerce_images
  ]
  strategies.map { |strategy| send(strategy, browser) }.flatten.uniq.compact
end
# Reads well-known JavaScript globals (productImageUrls, galleryImages,
# productImages) from the page and returns the URL strings found in them.
#
# The globals are controlled by the scraped page, so their shape is not
# trusted: a bare string is accepted as a single URL, arrays are flattened
# and reduced to their String entries, and anything else (objects, numbers,
# null) is ignored so it cannot break URL handling further down the pipeline.
#
# @param browser WebDriver session responding to #execute_script
# @return [Array<String>] URLs found across all globals (may contain duplicates)
def extract_via_js(browser)
  %w[productImageUrls galleryImages productImages].flat_map do |variable|
    script = "return typeof #{variable} !== 'undefined' ? JSON.stringify(#{variable}) : null;"
    payload = browser.execute_script(script)
    next [] unless payload

    parsed = JSON.parse(payload)
    case parsed
    when String then [parsed]
    when Array then parsed.flatten.grep(String)
    else []
    end
  end
end
# Collects the src attribute of every <img> element on the page.
#
# @param browser WebDriver session responding to #find_elements
# @return [Array] one entry per <img>, in document order as returned by the
#   driver; entries are whatever #attribute('src') returns for that element
def extract_pure_images(browser)
  img_tags = browser.find_elements(css: 'img')
  img_tags.map { |img| img.attribute('src') }
end
# Finds <div>, <img> and <a> elements carrying at least one of the known
# image-bearing data-* attributes and returns the values of those attributes.
#
# @param browser WebDriver session responding to #find_elements
# @return [Array] distinct non-nil attribute values
def extract_via_data_attributes(browser)
  attribute_names = %w[data-full-size-url data-large-image data-zoom-image data-original data-image data-image-source]
  predicate = attribute_names.map { |name| "@#{name}" }.join(' or ')
  matches = browser.find_elements(:xpath, "//*[self::div or self::img or self::a][#{predicate}]")

  collected = matches.flat_map do |node|
    values = attribute_names.map { |name| node.attribute(name) }
    values.compact
  end
  collected.uniq
end
# Collects the URL part of every candidate listed in the srcset attribute of
# <img> and <source> elements.
#
# Each srcset is split on commas and the first whitespace-delimited token of
# every candidate (the URL, without its width/density descriptor) is kept.
#
# @param browser WebDriver session responding to #find_elements
# @return [Array<String>] candidate URLs across all matching elements
def extract_srcsets(browser)
  browser.find_elements(css: 'img[srcset], source[srcset]').flat_map do |el|
    srcset = el.attribute('srcset')
    # `next`, not `return`: an element with an empty srcset must only be
    # skipped, not end the whole method and throw away URLs already gathered.
    next [] if srcset.nil? || srcset.empty?

    # filter_map drops the nil produced by an empty candidate (e.g. a
    # trailing comma).
    srcset.split(',').filter_map { |candidate| candidate.strip.split(/\s+/).first }
  end
end
# Collects the href of every link matched by the
# '.woocommerce-product-gallery__image a' selector.
#
# @param browser WebDriver session responding to #find_elements
# @return [Array] one entry per matching link, as returned by #attribute('href')
def extract_woocommerce_images(browser)
  gallery_links = browser.find_elements(css: '.woocommerce-product-gallery__image a')
  gallery_links.map { |anchor| anchor.attribute('href') }
end
# Persists each scraped image: creates a pattern_import_images record from
# the image hash, downloads the file and attaches it to the record.
#
# Saving is best-effort per image: a failure is logged and the remaining
# images are still processed. No interactive debugger is invoked here, since
# this runs inside a background job where a debugger prompt would stall the
# worker (or raise if the debugger is not loaded).
#
# @param url the import record owning the images (responds to #pattern_import_images)
# @param images [Array<Hash>] hashes with at least a :url key
# @return [Array<Hash>] the images array that was passed in
def save_images(url, images)
  puts 'SAVING'
  images.each do |image|
    record = url.pattern_import_images.create!(image)
    filename = File.basename(URI.parse(image[:url]).path)
    io = Down.download(image[:url])
    record.image.attach(io:, filename:)
  rescue StandardError => e
    puts 'ERROR'
    puts e
  end
end
# Saves a screenshot of the current page to tmp/screenshot.png under
# Rails.root and returns an open File on it.
#
# The path is fixed, so each call overwrites the previous screenshot. The
# caller owns the returned File and is responsible for closing it.
#
# @param browser WebDriver session responding to #save_screenshot
# @return [File] the screenshot file, opened for reading
def take_screenshot(browser)
  Rails.root.join('tmp', 'screenshot.png').to_s.then do |target|
    browser.save_screenshot(target)
    File.open(target)
  end
end
# Normalises an image URL, measures the image and returns a descriptor hash
# when both dimensions exceed 250 pixels.
#
# The size lookup uses the normalised URL (with scheme), not the raw input,
# so scheme-less URLs produced by the extractors can actually be fetched.
#
# @param image_url [String] raw URL as found on the page
# @return [Hash, nil] { url:, size_x:, size_y: } for a large-enough image;
#   nil when the size is unavailable or either dimension is <= 250
def filter_size_and_build_image_object(image_url)
  url = add_http(image_url)
  # presumably FastImage.size yields nil when the image cannot be read, which
  # leaves both dimensions nil — TODO confirm against the FastImage docs.
  size_x, size_y = FastImage.size(url)
  return if size_x.nil? || size_y.nil?
  return if size_x <= 250 || size_y <= 250

  { url:, size_x:, size_y: }
end
# Ensures a URL carries an explicit scheme.
#
# - URLs already starting with http: or https: (any letter case) are
#   returned unchanged.
# - Protocol-relative URLs ("//host/path") get "https:" prepended, rather
#   than "https://", which would produce "https:////host/path".
# - Anything else is treated as scheme-less and prefixed with "https://".
#
# @param url [String]
# @return [String]
def add_http(url)
  return url if url.match?(/\Ahttps?:/i)
  return "https:#{url}" if url.start_with?('//')

  "https://#{url}"
end
# Prints the error with its backtrace and marks the import as scraped
# unsuccessfully (success: false, last_image_scrape: now).
#
# @param url the import record being processed (responds to #update)
# @param e [Exception] the error raised while processing it
# @return the result of url.update
def log_errors(url, e)
  ['ERROR', e].each { |line| puts line }
  puts e.backtrace
  scraped_at = DateTime.now
  url.update(last_image_scrape: scraped_at, success: false)
end
# Marks the import as scraped successfully, stores the page HTML and attaches
# the screenshot.
#
# Attaching the screenshot is best-effort: if it fails, the error is printed
# and the import stays marked as successful. The rescue must not raise
# itself, otherwise the caller would record a successful scrape as failed.
#
# @param url the import record (responds to #update and #screenshot)
# @param html [String] page source to store on the record
# @param screenshot an IO holding the screenshot image
def log_success(url, html, screenshot)
  puts 'SUCCESS'
  url.update(last_image_scrape: DateTime.now, success: true, html:)
  begin
    url.screenshot.attach(io: screenshot, filename: 'screenshot.png')
  rescue StandardError => e
    puts 'ERROR'
    puts e
  end
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment