Created
October 19, 2016 19:19
-
-
Save jakevose/57dcc26765f38bb9b138164301c99993 to your computer and use it in GitHub Desktop.
Rewriting munged PDF files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'fileutils'

# Scans the +in/+ directory tree for +*.pdf+ files and checks that each one
# begins with the +%PDF+ header. Files that do are listed in +good.log+.
# Files that do not are listed in +bad.log+, and a repaired copy -- everything
# from the first line starting with +%PDF+ onward -- is written to the same
# relative location under +out/+. Progress and errors go to +error.log+.
#
# All paths are relative to the current working directory.
class PdfFixer
  # Byte sequence that must open a well-formed PDF file.
  PDF_MAGIC = '%PDF'.freeze

  # Collects the input paths (sorted) and recreates an empty +out/+ directory.
  # Any existing +out/+ tree is deleted.
  def initialize
    @paths = Dir.glob('in/**/*.pdf').sort
    # FileUtils instead of shelling out: no quoting problems, no shell needed.
    FileUtils.rm_rf('out')
    FileUtils.mkdir_p('out')
  end

  # Validates every collected file, repairing the ones with a bad header.
  # The log files are closed even if processing raises.
  #
  # @return [Array<String>] the processed paths
  def process_files
    start_logging
    @paths.each { |path| validate_and_repair(path) }
  ensure
    stop_logging
  end

  private

  # Opens (truncating) the three log files in the working directory.
  def start_logging
    @log = File.open('error.log', 'w')
    @good = File.open('good.log', 'w')
    @bad = File.open('bad.log', 'w')
    @log.puts('Beginning pdf fix.')
  end

  # Reports whether the file begins with the PDF header, then rewinds it.
  #
  # Only the header-sized prefix is read, so an empty file yields +false+
  # instead of raising EOFError, and a binary file without newlines is not
  # slurped into memory.
  #
  # @param file [File] readable handle positioned at the start
  # @return [Boolean]
  def good_start?(file)
    prefix = file.read(PDF_MAGIC.bytesize)
    file.rewind
    prefix == PDF_MAGIC
  end

  # Records +path+ as good or bad and attempts a repair for bad files.
  # The input handle is closed when the block exits. An unreadable input is
  # logged and skipped so one bad path does not abort the whole batch.
  #
  # @param path [String] path of the PDF to check
  def validate_and_repair(path)
    # Binary mode: PDFs are not text, so no newline or encoding translation.
    File.open(path, 'rb') do |file|
      if good_start?(file)
        @good.puts(path)
      else
        @bad.puts(path)
        try_repair(file)
      end
    end
  rescue SystemCallError, IOError => e
    @log.puts("Error in validate_and_repair for file #{path}: #{e.message}")
  end

  # Copies +file+ to its mirror location under +out/+, dropping every line
  # that precedes the first line starting with the PDF header. The output
  # file is created lazily, so nothing is written when no header is found.
  # Errors are logged rather than raised.
  #
  # @param file [File] readable handle positioned at the start
  def try_repair(file)
    out_file = nil
    pdf_started = false
    file.each_line do |line|
      pdf_started ||= line.start_with?(PDF_MAGIC)
      next unless pdf_started

      out_file ||= open_output(file.path)
      out_file.write(line)
    end
    @log.puts("No #{PDF_MAGIC} header found in #{file.path}; nothing written.") unless out_file
  rescue StandardError => e
    @log.puts("Error in try_repair for file #{file.path}: #{e.message}")
  ensure
    out_file.close if out_file
  end

  # Opens the output file that mirrors +source_path+ under +out/+, creating
  # intermediate directories as needed.
  #
  # @param source_path [String] input path whose first component is the input root
  # @return [File] writable binary handle
  def open_output(source_path)
    # Drop the leading input-root component and re-root the rest under out/.
    target = File.join('out', *source_path.split('/').drop(1))
    FileUtils.mkdir_p(File.dirname(target))
    File.open(target, 'wb')
  end

  # Writes the closing log line and closes whichever logs were opened.
  # Tolerates a partially failed start_logging so the original error is not
  # masked by a NoMethodError on nil.
  def stop_logging
    @log.puts('Finishing pdf fix.') if @log
    [@log, @good, @bad].each { |io| io.close if io }
  end
end
# Script entry point: rebuild out/ and process every PDF found under in/.
PdfFixer.new.process_files
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment