Last active
August 26, 2022 17:20
-
-
Save danlucraft/5277732 to your computer and use it in GitHub Desktop.
Extract annotations from PDFs with pdf-reader gem
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'pdf-reader'
require './markup_receiver'

# The PDF to inspect is named by the first command-line argument; stop with a
# usage message instead of an obscure library error when it is missing.
abort "Usage: #{$PROGRAM_NAME} FILE.pdf" if ARGV.empty?

doc = PDF::Reader.new(ARGV[0])

# Object table of the document, kept global so that the reference-resolving
# helpers defined after this point can reach it without extra parameters.
$objects = doc.objects
# Tells whether a PDF object is a textual note annotation.
#
# object - annotation dictionary, indexed with the Symbols :Type and :Subtype.
#
# Returns true when :Type is :Annot and :Subtype is :Text or :FreeText,
# false otherwise.
def is_note?(object)
  object[:Type] == :Annot && %i[Text FreeText].include?(object[:Subtype])
end
# Tells whether a PDF object is a text-markup annotation.
#
# object - annotation dictionary, indexed with the Symbols :Type and :Subtype.
#
# Returns true when :Type is :Annot and :Subtype is :Highlight or :Underline,
# false otherwise.
def is_markup?(object)
  object[:Type] == :Annot && %i[Highlight Underline].include?(object[:Subtype])
end
# Collects every annotation object attached to a page.
#
# page - object whose #attributes hash may carry an :Annots entry.
#
# Returns a flat Array of the resolved annotation objects; empty when the
# page has no :Annots entry.
def annots_on_page(page)
  references = page.attributes[:Annots] || []
  # Entries that resolve to nil are dropped so the annotation predicates never
  # index into nil.
  lookup_all(references).flatten.compact
end
# Resolves one reference or a list of references through the global object
# table.
#
# refs - a single key or an Array of keys; nil is treated as no keys.
#
# Returns an Array with one entry per key. Entries are themselves Arrays
# when the looked-up object was an Array (see #lookup).
def lookup_all(refs)
  [*refs].map { |ref| lookup(ref) }
end

# Resolves a single reference through the global object table.
#
# ref - key understood by $objects#[].
#
# Returns the stored object, or, when that object is an Array, the Array of
# its recursively resolved elements.
def lookup(ref)
  object = $objects[ref]
  object.is_a?(Array) ? lookup_all(object) : object
end
# Selects the note annotations of a page.
#
# page - page object accepted by #annots_on_page.
#
# Returns the Array of annotations for which #is_note? holds.
def notes_on_page(page)
  annots_on_page(page).select { |annot| is_note?(annot) }
end
# Builds Markup objects for the highlight/underline annotations of a page and
# fills in the text each one covers.
#
# page - page object accepted by #annots_on_page that also responds to #walk.
#
# Returns an Array of Markup objects; empty when the page has no markup
# annotations, in which case the page content is not walked at all.
def markups_on_page(page)
  markups = annots_on_page(page)
            .select { |annot| is_markup?(annot) }
            .map { |annot| Markup.new(annot) }
  return markups if markups.empty?

  # The receiver gathers glyphs while the page is walked; the texts are
  # assigned to the markups only once the walk has finished.
  receiver = MarkupReceiver.new(markups)
  page.walk(receiver)
  receiver.set_markup_texts
  markups
end
# A markup annotation (its raw attribute dictionary) plus the page text that
# was found inside it.
class Markup
  # One quadrilateral of a markup, given as four [x, y] points.
  #
  # The points are sorted on construction; the corner accessors rely on that
  # order and therefore assume an axis-aligned box (two distinct x values,
  # two distinct y values).
  class Rectangle
    attr_reader :quad_points

    # points - Array of four [x, y] pairs in any order.
    def initialize(points)
      @quad_points = points.sort
    end

    def bottom_left
      quad_points[0]
    end

    def top_left
      quad_points[1]
    end

    def bottom_right
      quad_points[2]
    end

    def top_right
      quad_points[3]
    end

    # coords - [x, y] pair.
    #
    # Returns true when the point lies inside the box, edges included.
    def contains?(coords)
      x, y = *coords
      x.between?(bottom_left.first, top_right.first) &&
        y.between?(bottom_left.last, top_right.last)
    end

    # Returns true when the y coordinate of the bottom-left corner lies in
    # the closed range bottom..top.
    def within?(bottom, top)
      bottom_left[1].between?(bottom, top)
    end
  end

  attr_reader :attributes
  attr_accessor :text

  # attributes - annotation dictionary; :QuadPoints and :C are read from it.
  def initialize(attributes)
    @attributes = attributes
  end

  # Returns the Array of Rectangles described by :QuadPoints, consumed eight
  # numbers (four points) at a time. The result is built once and cached,
  # because #contains? is evaluated repeatedly against the same markup.
  # A missing :QuadPoints entry yields an empty Array.
  def rectangles
    @rectangles ||= (attributes[:QuadPoints] || []).each_slice(8).map do |coords|
      Rectangle.new(coords.each_slice(2).to_a)
    end
  end

  # Returns the :C entry formatted by #rgb_to_hex, or nil when the
  # annotation carries no :C entry.
  def color
    rgb = attributes[:C]
    rgb && rgb_to_hex(rgb)
  end

  # Returns true when any rectangle of the markup contains the point (x, y).
  def contains?(x, y)
    rectangles.any? { |rect| rect.contains?([x, y]) }
  end

  # Returns true when any rectangle of the markup satisfies
  # Rectangle#within? for the given bounds.
  def within?(bottom, top)
    rectangles.any? { |rect| rect.within?(bottom, top) }
  end

  # rgb - Array of numeric components; each is multiplied by 255 and
  #       truncated to an integer.
  #
  # Returns a String such as "#FF7F00" (two upper-case hex digits per
  # component).
  def rgb_to_hex(rgb)
    "#" + rgb.map { |component| (component * 255).to_i.to_s(16).rjust(2, "0").upcase }.join
  end
end
# Print, page by page, the note contents and the text under each markup.
# Pages that have neither are skipped entirely.
doc.pages.each do |page|
  notes = notes_on_page(page)
  markups = markups_on_page(page)
  next if notes.empty? && markups.empty?

  puts "# Page #{page.number}"
  # Interpolation prints an empty string for a note without :Contents
  # instead of raising on String + nil.
  notes.each { |note| puts " * #{note[:Contents]}" }
  markups.each { |markup| puts " - #{markup.text}" }
  puts
  puts
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'forwardable'
require 'pdf/reader/page_layout'
# Builds a UTF-8 string of all the text on a single page that lies within the
# given markups, by processing all the operators in a content stream.
#
# Intended use, as seen in the calling script: pass an instance to
# page.walk, then call #set_markup_texts to store the collected text on each
# markup.
class MarkupReceiver
  extend Forwardable

  SPACE = " "

  attr_reader :state, :content, :options

  # markups - Array of objects responding to #contains?(x, y) and #text=.
  def initialize(markups)
    @markups = markups
  end

  ########## BEGIN FORWARDERS ##########
  # Graphics State Operators
  def_delegators :@state, :save_graphics_state, :restore_graphics_state

  # Matrix Operators
  def_delegators :@state, :concatenate_matrix

  # Text Object Operators
  def_delegators :@state, :begin_text_object, :end_text_object

  # Text State Operators
  def_delegators :@state, :set_character_spacing, :set_horizontal_text_scaling
  def_delegators :@state, :set_text_font_and_size, :font_size
  def_delegators :@state, :set_text_leading, :set_text_rendering_mode
  def_delegators :@state, :set_text_rise, :set_word_spacing

  # Text Positioning Operators
  def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
  def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
  ########## END FORWARDERS ##########

  # Starting a new page: resets the graphics state and everything collected
  # so far. Must have run before any text operator or #set_markup_texts,
  # since those read the state and the per-markup table created here
  # (presumably the page walker calls this first -- confirm against
  # pdf-reader).
  def page=(page)
    @state = PDF::Reader::PageState.new(page)
    @content = []
    # markup => text runs found inside it; the list is created on first use.
    @characters = Hash.new { |hash, markup| hash[markup] = [] }
    @mediabox = page.attributes[:MediaBox]
  end

  # Lays out the runs collected for each markup and stores the resulting
  # string in markup.text. Markups for which no glyph was recorded are not
  # touched.
  def set_markup_texts
    @characters.each do |markup, text_runs|
      markup.text = PDF::Reader::PageLayout.new(text_runs, @mediabox).to_s
    end
  end

  #####################################################
  # Text Showing Operators
  #####################################################

  # record text that is drawn on the page
  def show_text(string) # Tj (AWAY)
    internal_show_text(string)
  end

  # Strings are shown; every other element is passed to the state as a
  # glyph displacement.
  def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
    params.each do |arg|
      if arg.is_a?(String)
        internal_show_text(arg)
      else
        @state.process_glyph_displacement(0, arg, false)
      end
    end
  end

  def move_to_next_line_and_show_text(str) # '
    @state.move_to_start_of_next_line
    show_text(str)
  end

  def set_spacing_next_line_show_text(aw, ac, string) # "
    @state.set_word_spacing(aw)
    @state.set_character_spacing(ac)
    move_to_next_line_and_show_text(string)
  end

  #####################################################
  # XObjects
  #####################################################

  # Form XObjects are walked with this same receiver, so text drawn inside
  # them is recorded too; other XObject kinds are ignored.
  def invoke_xobject(label)
    @state.invoke_xobject(label) do |xobj|
      xobj.walk(self) if xobj.is_a?(PDF::Reader::FormXObject)
    end
  end

  private

  # Walks the glyphs of a shown string, recording every non-space glyph whose
  # origin lies inside a markup and advancing the text position after each
  # glyph.
  #
  # Raises PDF::Reader::MalformedPDFError when no font is current.
  def internal_show_text(string)
    font = @state.current_font
    raise PDF::Reader::MalformedPDFError, "current font is invalid" if font.nil?

    font.unpack(string).each do |glyph_code|
      # position at which the current glyph is painted
      newx, newy = @state.trm_transform(0, 0)
      utf8_chars = font.to_utf8(glyph_code)

      # glyph width divided by 1000, then multiplied by the font size for
      # the recorded run
      glyph_width = font.glyph_width(glyph_code) / 1000.0
      scaled_glyph_width = glyph_width * @state.font_size
      is_space = utf8_chars == SPACE

      record_glyph(newx, newy, scaled_glyph_width, utf8_chars) unless is_space

      # apply the glyph displacement for the current glyph so the next
      # glyph will appear in the correct position
      @state.process_glyph_displacement(glyph_width, 0, is_space)
    end
  end

  # Appends a text run for the glyph to every markup containing (x, y).
  def record_glyph(x, y, width, utf8_chars)
    @markups.each do |markup|
      next unless markup.contains?(x, y)

      @characters[markup] << PDF::Reader::TextRun.new(x, y, width, @state.font_size, utf8_chars)
    end
  end
end
Thank you danlucraft, your script helped me a lot to extract hyperlinks from the PDF page.
require 'pdf-reader'
puts 'Running...'

# The PDF path may be given as the first command-line argument; without one
# the original sample path is used.
file = ARGV[0] || "/Users/diego/Downloads/DiegoMarinoProfile.pdf"
puts "File: #{file}"

doc = PDF::Reader.new(file)

# Object table of the document, kept global so the lookup helpers can
# resolve references without extra parameters.
$objects = doc.objects
# Tells whether a PDF object is a link annotation.
#
# object - annotation dictionary, indexed with the Symbols :Type and :Subtype.
#
# Returns true when :Type is :Annot and :Subtype is :Link, false otherwise.
def is_link?(object)
  return false unless object[:Type] == :Annot

  link_subtypes = [:Link]
  link_subtypes.include?(object[:Subtype])
end
# Tells whether a PDF object is a textual note annotation.
#
# object - annotation dictionary, indexed with the Symbols :Type and :Subtype.
#
# Returns true when :Type is :Annot and :Subtype is :Text or :FreeText,
# false otherwise.
def is_note?(object)
  return false unless object[:Type] == :Annot

  note_subtypes = [:Text, :FreeText]
  note_subtypes.include?(object[:Subtype])
end
# Collects every annotation object attached to a page.
#
# page - object whose #attributes hash may carry an :Annots entry.
#
# Returns a flat Array of the resolved annotation objects; empty when the
# page has no :Annots entry.
def annots_on_page(page)
  references = page.attributes[:Annots] || []
  # Entries that resolve to nil are dropped so the annotation predicates never
  # index into nil.
  lookup_all(references).flatten.compact
end
# Resolves one reference or a list of references through the global object
# table.
#
# refs - a single key or an Array of keys; nil is treated as no keys.
#
# Returns an Array with one entry per key. Entries are themselves Arrays
# when the looked-up object was an Array (see #lookup).
def lookup_all(refs)
  [*refs].map { |key| lookup(key) }
end

# Resolves a single reference through the global object table.
#
# ref - key understood by $objects#[].
#
# Returns the stored object, or, when that object is an Array, the Array of
# its recursively resolved elements.
def lookup(ref)
  resolved = $objects[ref]
  resolved.is_a?(Array) ? lookup_all(resolved) : resolved
end
# Selects the note annotations of a page.
#
# page - page object accepted by #annots_on_page.
#
# Returns the Array of annotations for which #is_note? holds.
def notes_on_page(page)
  annots_on_page(page).select { |annot| is_note?(annot) }
end
# Selects the link annotations of a page.
#
# page - page object accepted by #annots_on_page.
#
# Returns the Array of annotations for which #is_link? holds.
def links_on_page(page)
  annots_on_page(page).select { |annot| is_link?(annot) }
end
# Print, page by page, the target of every URI link followed by the contents
# of every note. Pages that have neither links nor notes are skipped.
doc.pages.each do |page|
  links = links_on_page(page)
  notes = notes_on_page(page)
  next if notes.empty? && links.empty?

  puts "# Page #{page.number}"
  links.each do |link|
    # :A may be an inline dictionary or something to resolve through the
    # object table; it may also be absent, or describe an action without a
    # :URI entry. Such links are skipped rather than crashing the run.
    action = link[:A]
    action = $objects[action] unless action.is_a?(Hash)
    uri = action && action[:URI]
    next unless uri

    puts " * #{uri}"
  end
  # Interpolation prints an empty string for a note without :Contents
  # instead of raising on String + nil.
  notes.each { |note| puts " * #{note[:Contents]}" }
  puts
  puts
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, is it possible to extract other information from an annotation, such as the user who annotated it or the date?
thanks
Fabio