alexgodin · November 14, 2024 19:39
diff --git a/find_product_images_job.rb b/find_product_images_job.rb
 # Sidekiq job that visits the external URL of every PatternImport whose
 # last_image_scrape is nil, collects candidate product image URLs from the
 # rendered page, stores the ones that pass the size filter, and records the
 # page HTML and a screenshot on the import.
 class FindProductImagesJob
  include Sidekiq::Job

  # Job entry point: opens one browser session and processes a batch of
  # pending imports with it.
  def perform
    with_browser do |browser|
      find_and_process_urls(browser)
    end
  end

  # Starts a headless Chrome WebDriver session, yields it to the block, and
  # quits the browser afterwards even when the block raises.
  #
  # The Chrome binary and chromedriver paths are read from the
  # GOOGLE_CHROME_SHIM and CHROMEDRIVER_PATH environment variables.
  #
  # @yield [browser] the driver returned by Selenium::WebDriver.for
  def with_browser
    options = Selenium::WebDriver::Chrome::Options.new
    options.binary = ENV['GOOGLE_CHROME_SHIM']
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')

    browser = Selenium::WebDriver.for(:chrome,
                                      options:,
                                      service: Selenium::WebDriver::Chrome::Service.new(path: ENV['CHROMEDRIVER_PATH']))
    browser.manage.timeouts.implicit_wait = 10
    browser.manage.window.resize_to(800, 1200)
    yield(browser)
  ensure
    # browser is nil when driver start-up itself failed.
    browser&.quit
  end

  # Scrapes up to 100 randomly ordered imports that have never been scraped.
  # A failure on one import is recorded through log_errors and does not stop
  # the rest of the batch.
  #
  # @param browser the WebDriver session reused for every import
  def find_and_process_urls(browser)
    # TODO: schedule this job to run with concurrency 1
    imports = PatternImport.where(last_image_scrape: nil).order('RANDOM()').limit(100)
    imports.each do |import|
      screenshot = nil
      puts import.external_url
      result = process_single_url(import, browser)
      screenshot = take_screenshot(browser)
      save_images(import, result[:images])
      log_success(import, result[:html], screenshot)
    rescue StandardError => e
      log_errors(import, e)
    ensure
      # take_screenshot returns an open File; close it once the attach attempt
      # is over so file handles do not accumulate across the batch.
      screenshot&.close
    end
  end

  # Loads the import's external URL and gathers the image candidates and the
  # page source.
  #
  # @param import record responding to #external_url
  # @param browser the WebDriver session
  # @return [Hash] :images => array of { url:, size_x:, size_y: } hashes that
  #   passed the size filter, :html => the page source
  def process_single_url(import, browser)
    browser.get(import.external_url)
    images = extract_image_urls(browser).filter_map { |u| filter_size_and_build_image_object(u) }

    { images:, html: browser.page_source }
  end

  # Runs every extraction strategy against the current page and merges the
  # results into one de-duplicated list.
  #
  # @param browser the WebDriver session
  # @return [Array<String>] candidate image URLs, without nil or blank entries
  def extract_image_urls(browser)
    [
      extract_via_data_attributes(browser),
      extract_pure_images(browser),
      extract_via_js(browser),
      extract_srcsets(browser),
      extract_woocommerce_images(browser)
    ].flatten.uniq.reject(&:blank?) # blank entries would only cost a pointless size probe
  end

  # Reads image URLs from the page globals productImageUrls, galleryImages
  # and productImages, when they are defined.
  #
  # @param browser the WebDriver session
  # @return [Array<String>]
  def extract_via_js(browser)
    js_variables = %w[productImageUrls galleryImages productImages]
    js_variables.flat_map do |variable|
      script = "return typeof #{variable} !== 'undefined' ? JSON.stringify(#{variable}) : null;"
      result = browser.execute_script(script)
      next [] unless result

      # The page controls these globals, so they can hold any JSON value.
      # Keep only strings: anything else would raise later in add_http and
      # fail the whole import.
      case (parsed = JSON.parse(result))
      when Array then parsed.flatten.grep(String)
      when String then [parsed]
      else []
      end
    end
  end

  # @param browser the WebDriver session
  # @return [Array<String, nil>] the src attribute of every <img> element
  def extract_pure_images(browser)
    browser.find_elements(css: 'img').map { |i| i.attribute('src') }
  end

  # Collects the values of the listed data-* attributes from div, img and a
  # elements that carry at least one of them.
  #
  # @param browser the WebDriver session
  # @return [Array<String>]
  def extract_via_data_attributes(browser)
    data_attributes = %w[data-full-size-url data-large-image data-zoom-image data-original data-image data-image-source]
    xpath_attributes = data_attributes.map { |attr| "@#{attr}" }.join(' or ')
    xpath = "//*[self::div or self::img or self::a][#{xpath_attributes}]"
    browser.find_elements(:xpath, xpath).flat_map do |el|
      data_attributes.map { |attr| el.attribute(attr) }.compact
    end.uniq
  end

  # Collects the URL part of every srcset candidate on img and source
  # elements.
  #
  # @param browser the WebDriver session
  # @return [Array<String>]
  def extract_srcsets(browser)
    browser.find_elements(css: 'img[srcset], source[srcset]').flat_map do |el|
      srcset = el.attribute('srcset')
      # `next`, not `return`: a `return` inside this block would exit the
      # whole method and throw away the URLs gathered from earlier elements.
      next [] if srcset.nil? || srcset.empty?

      # Each candidate is "<url> <descriptor>"; empty segments yield nil and
      # are dropped.
      srcset.split(',').filter_map { |src| src.strip.split(/\s+/).first }
    end
  end

  # @param browser the WebDriver session
  # @return [Array<String, nil>] href of every link inside a
  #   .woocommerce-product-gallery__image element
  def extract_woocommerce_images(browser)
    browser.find_elements(css: '.woocommerce-product-gallery__image a').map { |link| link.attribute('href') }
  end

  # Creates one pattern_import_images record per image hash, downloads the
  # file and attaches it. Best effort: a failure on one image is printed and
  # the remaining images are still processed.
  #
  # @param url the import record owning the pattern_import_images association
  # @param images [Array<Hash>] hashes built by filter_size_and_build_image_object
  def save_images(url, images)
    puts 'SAVING'
    images.each do |image|
      record = url.pattern_import_images.create!(image)
      filename = File.basename(URI.parse(image[:url]).path)
      io = Down.download(image[:url])
      record.image.attach(io:, filename:)
    rescue StandardError => e
      puts 'ERROR'
      puts e
      # Breakpoint only in development; unguarded, it would raise or block the
      # worker elsewhere and abort the remaining images.
      binding.pry if Rails.env.development?
    end
  end

  # Saves a screenshot of the current page to tmp/screenshot.png under
  # Rails.root and returns it opened for reading.
  #
  # @param browser the WebDriver session
  # @return [File] open handle in binary mode; the caller must close it
  def take_screenshot(browser)
    screenshot_path = Rails.root.join('tmp', 'screenshot.png').to_s
    browser.save_screenshot(screenshot_path)
    File.open(screenshot_path, 'rb')
  end

  # Probes the image dimensions and builds the attribute hash for images that
  # are larger than 250px on both axes.
  #
  # @param image_url [String] candidate URL, possibly without a scheme
  # @return [Hash, nil] { url:, size_x:, size_y: }, or nil when the size could
  #   not be determined or either dimension is 250px or less
  def filter_size_and_build_image_object(image_url)
    url = add_http(image_url)
    # Probe the normalised URL, which is also the one that gets stored and
    # downloaded later.
    size_x, size_y = FastImage.size(url)
    return if size_x.blank? || size_x <= 250 || size_y <= 250

    {
      url:,
      size_x:,
      size_y:
    }
  end

  # Ensures the URL carries a scheme.
  #
  # @param url [String]
  # @return [String] the URL unchanged when it already starts with http: or
  #   https:, otherwise prefixed with https
  def add_http(url)
    return url if url.start_with?('http:', 'https:')
    # Protocol-relative URLs already contain the "//" separator.
    return "https:#{url}" if url.start_with?('//')

    "https://#{url}"
  end

  # Prints the error with its backtrace and marks the import as scraped
  # without success.
  #
  # @param url the import record
  # @param e [StandardError]
  def log_errors(url, e)
    puts 'ERROR'
    puts e
    puts e.backtrace
    url.update(last_image_scrape: Time.current, success: false)
  end

  # Marks the import as successfully scraped, stores the page HTML and tries
  # to attach the screenshot. An attachment failure is printed and does not
  # undo the success flag.
  #
  # @param url the import record
  # @param html [String] page source
  # @param screenshot [IO] readable screenshot file
  def log_success(url, html, screenshot)
    puts 'SUCCESS'
    url.update(last_image_scrape: Time.current, success: true, html:)
    begin
      url.screenshot.attach(io: screenshot, filename: 'screenshot.png')
    rescue StandardError => e
      puts 'ERROR'
      puts e
      # `Rails`, not `rails`: the lowercase call raised NameError here, which
      # escaped to the caller and re-marked the import as failed.
      binding.pry if Rails.env.development?
    end
  end
 end
	# Sidekiq job that visits the external URL of every PatternImport whose
	# last_image_scrape is nil, collects candidate product image URLs from the
	# rendered page, stores the ones that pass the size filter, and records the
	# page HTML and a screenshot on the import.
	class FindProductImagesJob
	  include Sidekiq::Job

	  # Job entry point: opens one browser session and processes a batch of
	  # pending imports with it.
	  def perform
	    with_browser do |browser|
	      find_and_process_urls(browser)
	    end
	  end

	  # Starts a headless Chrome WebDriver session, yields it to the block, and
	  # quits the browser afterwards even when the block raises.
	  #
	  # The Chrome binary and chromedriver paths are read from the
	  # GOOGLE_CHROME_SHIM and CHROMEDRIVER_PATH environment variables.
	  #
	  # @yield [browser] the driver returned by Selenium::WebDriver.for
	  def with_browser
	    options = Selenium::WebDriver::Chrome::Options.new
	    options.binary = ENV['GOOGLE_CHROME_SHIM']
	    options.add_argument('--headless')
	    options.add_argument('--no-sandbox')
	    options.add_argument('--disable-dev-shm-usage')
	    options.add_argument('--disable-gpu')

	    browser = Selenium::WebDriver.for(:chrome,
	                                      options:,
	                                      service: Selenium::WebDriver::Chrome::Service.new(path: ENV['CHROMEDRIVER_PATH']))
	    browser.manage.timeouts.implicit_wait = 10
	    browser.manage.window.resize_to(800, 1200)
	    yield(browser)
	  ensure
	    # browser is nil when driver start-up itself failed.
	    browser&.quit
	  end

	  # Scrapes up to 100 randomly ordered imports that have never been scraped.
	  # A failure on one import is recorded through log_errors and does not stop
	  # the rest of the batch.
	  #
	  # @param browser the WebDriver session reused for every import
	  def find_and_process_urls(browser)
	    # TODO: schedule this job to run with concurrency 1
	    imports = PatternImport.where(last_image_scrape: nil).order('RANDOM()').limit(100)
	    imports.each do |import|
	      screenshot = nil
	      puts import.external_url
	      result = process_single_url(import, browser)
	      screenshot = take_screenshot(browser)
	      save_images(import, result[:images])
	      log_success(import, result[:html], screenshot)
	    rescue StandardError => e
	      log_errors(import, e)
	    ensure
	      # take_screenshot returns an open File; close it once the attach attempt
	      # is over so file handles do not accumulate across the batch.
	      screenshot&.close
	    end
	  end

	  # Loads the import's external URL and gathers the image candidates and the
	  # page source.
	  #
	  # @param import record responding to #external_url
	  # @param browser the WebDriver session
	  # @return [Hash] :images => array of { url:, size_x:, size_y: } hashes that
	  #   passed the size filter, :html => the page source
	  def process_single_url(import, browser)
	    browser.get(import.external_url)
	    images = extract_image_urls(browser).filter_map { |u| filter_size_and_build_image_object(u) }

	    { images:, html: browser.page_source }
	  end

	  # Runs every extraction strategy against the current page and merges the
	  # results into one de-duplicated list.
	  #
	  # @param browser the WebDriver session
	  # @return [Array<String>] candidate image URLs, without nil or blank entries
	  def extract_image_urls(browser)
	    [
	      extract_via_data_attributes(browser),
	      extract_pure_images(browser),
	      extract_via_js(browser),
	      extract_srcsets(browser),
	      extract_woocommerce_images(browser)
	    ].flatten.uniq.reject(&:blank?) # blank entries would only cost a pointless size probe
	  end

	  # Reads image URLs from the page globals productImageUrls, galleryImages
	  # and productImages, when they are defined.
	  #
	  # @param browser the WebDriver session
	  # @return [Array<String>]
	  def extract_via_js(browser)
	    js_variables = %w[productImageUrls galleryImages productImages]
	    js_variables.flat_map do |variable|
	      script = "return typeof #{variable} !== 'undefined' ? JSON.stringify(#{variable}) : null;"
	      result = browser.execute_script(script)
	      next [] unless result

	      # The page controls these globals, so they can hold any JSON value.
	      # Keep only strings: anything else would raise later in add_http and
	      # fail the whole import.
	      case (parsed = JSON.parse(result))
	      when Array then parsed.flatten.grep(String)
	      when String then [parsed]
	      else []
	      end
	    end
	  end

	  # @param browser the WebDriver session
	  # @return [Array<String, nil>] the src attribute of every <img> element
	  def extract_pure_images(browser)
	    browser.find_elements(css: 'img').map { |i| i.attribute('src') }
	  end

	  # Collects the values of the listed data-* attributes from div, img and a
	  # elements that carry at least one of them.
	  #
	  # @param browser the WebDriver session
	  # @return [Array<String>]
	  def extract_via_data_attributes(browser)
	    data_attributes = %w[data-full-size-url data-large-image data-zoom-image data-original data-image data-image-source]
	    xpath_attributes = data_attributes.map { |attr| "@#{attr}" }.join(' or ')
	    xpath = "//*[self::div or self::img or self::a][#{xpath_attributes}]"
	    browser.find_elements(:xpath, xpath).flat_map do |el|
	      data_attributes.map { |attr| el.attribute(attr) }.compact
	    end.uniq
	  end

	  # Collects the URL part of every srcset candidate on img and source
	  # elements.
	  #
	  # @param browser the WebDriver session
	  # @return [Array<String>]
	  def extract_srcsets(browser)
	    browser.find_elements(css: 'img[srcset], source[srcset]').flat_map do |el|
	      srcset = el.attribute('srcset')
	      # `next`, not `return`: a `return` inside this block would exit the
	      # whole method and throw away the URLs gathered from earlier elements.
	      next [] if srcset.nil? || srcset.empty?

	      # Each candidate is "<url> <descriptor>"; empty segments yield nil and
	      # are dropped.
	      srcset.split(',').filter_map { |src| src.strip.split(/\s+/).first }
	    end
	  end

	  # @param browser the WebDriver session
	  # @return [Array<String, nil>] href of every link inside a
	  #   .woocommerce-product-gallery__image element
	  def extract_woocommerce_images(browser)
	    browser.find_elements(css: '.woocommerce-product-gallery__image a').map { |link| link.attribute('href') }
	  end

	  # Creates one pattern_import_images record per image hash, downloads the
	  # file and attaches it. Best effort: a failure on one image is printed and
	  # the remaining images are still processed.
	  #
	  # @param url the import record owning the pattern_import_images association
	  # @param images [Array<Hash>] hashes built by filter_size_and_build_image_object
	  def save_images(url, images)
	    puts 'SAVING'
	    images.each do |image|
	      record = url.pattern_import_images.create!(image)
	      filename = File.basename(URI.parse(image[:url]).path)
	      io = Down.download(image[:url])
	      record.image.attach(io:, filename:)
	    rescue StandardError => e
	      puts 'ERROR'
	      puts e
	      # Breakpoint only in development; unguarded, it would raise or block the
	      # worker elsewhere and abort the remaining images.
	      binding.pry if Rails.env.development?
	    end
	  end

	  # Saves a screenshot of the current page to tmp/screenshot.png under
	  # Rails.root and returns it opened for reading.
	  #
	  # @param browser the WebDriver session
	  # @return [File] open handle in binary mode; the caller must close it
	  def take_screenshot(browser)
	    screenshot_path = Rails.root.join('tmp', 'screenshot.png').to_s
	    browser.save_screenshot(screenshot_path)
	    File.open(screenshot_path, 'rb')
	  end

	  # Probes the image dimensions and builds the attribute hash for images that
	  # are larger than 250px on both axes.
	  #
	  # @param image_url [String] candidate URL, possibly without a scheme
	  # @return [Hash, nil] { url:, size_x:, size_y: }, or nil when the size could
	  #   not be determined or either dimension is 250px or less
	  def filter_size_and_build_image_object(image_url)
	    url = add_http(image_url)
	    # Probe the normalised URL, which is also the one that gets stored and
	    # downloaded later.
	    size_x, size_y = FastImage.size(url)
	    return if size_x.blank? || size_x <= 250 || size_y <= 250

	    {
	      url:,
	      size_x:,
	      size_y:
	    }
	  end

	  # Ensures the URL carries a scheme.
	  #
	  # @param url [String]
	  # @return [String] the URL unchanged when it already starts with http: or
	  #   https:, otherwise prefixed with https
	  def add_http(url)
	    return url if url.start_with?('http:', 'https:')
	    # Protocol-relative URLs already contain the "//" separator.
	    return "https:#{url}" if url.start_with?('//')

	    "https://#{url}"
	  end

	  # Prints the error with its backtrace and marks the import as scraped
	  # without success.
	  #
	  # @param url the import record
	  # @param e [StandardError]
	  def log_errors(url, e)
	    puts 'ERROR'
	    puts e
	    puts e.backtrace
	    url.update(last_image_scrape: Time.current, success: false)
	  end

	  # Marks the import as successfully scraped, stores the page HTML and tries
	  # to attach the screenshot. An attachment failure is printed and does not
	  # undo the success flag.
	  #
	  # @param url the import record
	  # @param html [String] page source
	  # @param screenshot [IO] readable screenshot file
	  def log_success(url, html, screenshot)
	    puts 'SUCCESS'
	    url.update(last_image_scrape: Time.current, success: true, html:)
	    begin
	      url.screenshot.attach(io: screenshot, filename: 'screenshot.png')
	    rescue StandardError => e
	      puts 'ERROR'
	      puts e
	      # `Rails`, not `rails`: the lowercase call raised NameError here, which
	      # escaped to the caller and re-marked the import as failed.
	      binding.pry if Rails.env.development?
	    end
	  end
	end