This script sorts the pages of a PDF by the titles of the PDF's bookmarks.
It was hacked together for a one-off task, so hardcoding the input file name (and pointing it at the real file with a symlink) made sense. It was the first time I'd scripted anything involving PDFs.
#!/usr/bin/env ruby
# Sorts the pages of a PDF by the titles of its bookmarks (outline items) and
# writes the reordered pages, together with a rebuilt outline, to a new PDF.
#
# Usage: <this script> [INPUT_PDF] [OUTPUT_PDF]
#   INPUT_PDF   defaults to 'input.pdf'
#   OUTPUT_PDF  defaults to 'sorted.pdf'
#
# Written locally against Ruby 3.0.2.
require 'hexapdf'
require 'byebug' # only useful for ad-hoc debugging; no breakpoint is set below

input_path = ARGV.fetch(0, 'input.pdf')
output_path = ARGV.fetch(1, 'sorted.pdf')

doc = HexaPDF::Document.open(input_path)

# Maps [oid, gen] of every source page to the page object itself, so that a
# bookmark destination can be resolved with one hash lookup instead of a
# linear scan over all pages.
pages_by_ref = {}
print "iterating through pages, '.' per 100 pages "
doc.pages.each_with_index do |page, index|
  pages_by_ref[[page.oid, page.gen]] = page
  print "." if ((index + 1) % 100).zero?
end
puts " done!"

# Maps bookmark title to the [oid, gen] of the page it points to.
# Titles are hash keys, so a later bookmark with the same title replaces an
# earlier one.
bookmarks_to_ref = {}
unusable_destinations = 0
counter = 0
print "parsing bookmarks / outlines, . per 100 "
# The nested traversal is intentional: only items that sit below another
# outline item are collected; the top-level entries themselves are not.
doc.outline.each_item do |item|
  item.each_item do |inner_item|
    # Example from the original input file:
    #   inner_item.value[:Title]   => "040922 01"
    #   inner_item.value[:Dest][0] => #<HexaPDF::Reference [4, 0]>
    title = inner_item.value[:Title]
    next if title.nil? || title.empty?
    next if title == "DUPE"

    # Only explicit destinations whose first element references a page can be
    # handled here; bookmarks without :Dest, or with a named or indirect
    # destination, are skipped instead of raising NoMethodError.
    dest = inner_item.value[:Dest]
    page_ref = dest.respond_to?(:[]) ? dest[0] : nil
    unless page_ref.respond_to?(:oid) && page_ref.respond_to?(:gen)
      unusable_destinations += 1
      next
    end

    bookmarks_to_ref[title] = [page_ref.oid, page_ref.gen]
    counter += 1
    print "." if (counter % 100).zero?
  end
end
puts " done!"
puts "total bookmark count: #{bookmarks_to_ref.keys.count}"
if unusable_destinations.positive?
  warn "skipped #{unusable_destinations} bookmark(s) without a usable page destination"
end

# Resolve every bookmark to its page once, in title order. Both the page copy
# and the outline rebuild below iterate this same list, which keeps bookmark
# number i pointing at page number i of the target document.
entries = bookmarks_to_ref.sort_by(&:first).filter_map do |title, ref|
  page = pages_by_ref[ref]
  warn "skipping bookmark #{title.inspect}: its destination is not a page of this document" unless page
  [title, page] if page
end
abort "no bookmarks with a resolvable page found in #{input_path}; nothing written" if entries.empty?

# Copy the pages into the target document in sorted-title order.
target = HexaPDF::Document.new
print "copying pages, . per 100 "
entries.each_with_index do |(_title, page), index|
  target.pages << target.import(page)
  print "." if ((index + 1) % 100).zero?
end
puts " done!"

# Recreate the bookmarks below a single "Main" entry. The integer passed as
# destination is the position of the page in the target document.
print "copying bookmarks, . per 100 "
main = target.outline.add_item("Main")
entries.each_with_index do |(title, _page), index|
  main.add_item(title, destination: index)
  print "." if ((index + 1) % 100).zero?
end
puts " done!"

puts "writing target pdf..."
target.write(output_path)
puts " done!"
# OCR of the resulting PDF is done outside of this script.