zaid · July 21, 2016 20:45
diff --git a/html_to_pdf_converter.rb b/html_to_pdf_converter.rb
 require 'fileutils'

 require 'pdfkit'
 require 'nokogiri'
 require 'charlock_holmes'

 # Set PDFKit's default options: ignore load errors for pages and for media,
 # and disable JavaScript.
 PDFKit.configure do |pdfkit|
   defaults = pdfkit.default_options
   defaults[:load_error_handling] = 'ignore'
   defaults[:load_media_error_handling] = 'ignore'
   defaults[:disable_javascript] = true
 end

 # Converts every `*.htm` file directly inside +path+ into a PDF.
 #
 # PDFs are written to an `extracted_files` directory located next to +path+
 # (inside +path+'s parent directory) and are named after the source file's
 # basename. A source file whose PDF already exists is skipped, so a run can
 # be repeated without redoing finished files. A '.' is printed after each
 # file that is converted.
 #
 # @param path [String] directory containing the `.htm` files
 # @return [Array<String>] the `.htm` paths matched by the glob
 def extract_pdf_files(path)
   extraction_directory = File.join(File.dirname(path), 'extracted_files')
   # mkdir_p does nothing when the directory already exists.
   FileUtils.mkdir_p(extraction_directory)

   Dir.glob(File.join(path, '*.htm')).each do |file_path|
     pdf_filepath = File.join(extraction_directory, "#{File.basename(file_path, '.*')}.pdf")
     # File.exist? is used because File.exists? was removed in Ruby 3.2.
     next if File.exist?(pdf_filepath)

     # Read raw bytes: the source encoding is unknown until it is detected.
     raw = File.binread(file_path)

     detection = CharlockHolmes::EncodingDetector.detect(raw)
     raw_utf8 = CharlockHolmes::Converter.convert(raw, detection[:encoding], 'UTF-8')

     # Document.parse actually parses the markup; Document.new only creates
     # an empty document and would discard the page content.
     document = Nokogiri::HTML::Document.parse(raw_utf8, nil, 'UTF-8')
     raw_html = document.to_html(encoding: 'UTF-8')

     PDFKit.new(raw_html).to_file(pdf_filepath)

     print '.'
   end
 end
	require 'pdfkit'
	require 'nokogiri'
	require 'charlock_holmes'

	# Set PDFKit's default options: ignore load errors for pages and for media,
	# and disable JavaScript.
	PDFKit.configure do |config|
	  config.default_options[:load_error_handling] = 'ignore'
	  config.default_options[:load_media_error_handling] = 'ignore'
	  config.default_options[:disable_javascript] = true
	end

	# Converts every `*.htm` file directly inside +path+ into a PDF.
	#
	# PDFs are written to an `extracted_files` directory located next to +path+
	# (inside +path+'s parent directory) and are named after the source file's
	# basename. A source file whose PDF already exists is skipped, so a run can
	# be repeated without redoing finished files. A '.' is printed after each
	# file that is converted.
	#
	# @param path [String] directory containing the `.htm` files
	# @return [Array<String>] the `.htm` paths matched by the glob
	def extract_pdf_files(path)
	  extraction_directory = File.join(File.dirname(path), 'extracted_files')
	  # mkdir_p does nothing when the directory already exists.
	  FileUtils.mkdir_p(extraction_directory)

	  Dir.glob(File.join(path, '*.htm')).each do |file_path|
	    pdf_filepath = File.join(extraction_directory, "#{File.basename(file_path, '.*')}.pdf")
	    # File.exist? is used because File.exists? was removed in Ruby 3.2.
	    next if File.exist?(pdf_filepath)

	    # Read raw bytes: the source encoding is unknown until it is detected.
	    raw = File.binread(file_path)

	    detection = CharlockHolmes::EncodingDetector.detect(raw)
	    raw_utf8 = CharlockHolmes::Converter.convert(raw, detection[:encoding], 'UTF-8')

	    # Document.parse actually parses the markup; Document.new only creates
	    # an empty document and would discard the page content.
	    document = Nokogiri::HTML::Document.parse(raw_utf8, nil, 'UTF-8')
	    raw_html = document.to_html(encoding: 'UTF-8')

	    PDFKit.new(raw_html).to_file(pdf_filepath)

	    print '.'
	  end
	end
No results found