Created
September 11, 2009 02:06
-
-
Save jashkenas/185010 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'fileutils'
require 'shellwords'

# Depends on working pdftk, gm (GraphicsMagick), and pdftotext (Poppler) commands.
# Splits a pdf into batches of N pages, creates their thumbnails and icons,
# as specified in the Job options, gets the text for every page, and merges
# it all back into a tar archive for convenient download.
#
# The helpers +input+, +input_path+, +file_name+, +options+, +download+ and
# +save+ are not defined in this file; presumably they are inherited from
# CloudCrowd::Action.
#
# Every file name is shell-escaped before it is interpolated into a command
# line, so names containing spaces or shell metacharacters are handled safely.
#
# See <tt>examples/process_pdfs_example.rb</tt> for more information.
class ProcessPdfs < CloudCrowd::Action

  # Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
  # chunks for processing. The double pdftk shuffle fixes the document xrefs.
  #
  # Returns an array holding the result of +save+ for every batch archive.
  # Raises ArgumentError unless options['batch_size'] is a positive integer
  # (a numeric string such as "7" is accepted).
  def split
    batch_size = options['batch_size'].to_i
    raise ArgumentError, "'batch_size' option must be a positive integer" unless batch_size > 0

    burst_pattern = Shellwords.escape("#{file_name}_%05d.pdf_temp")
    `pdftk #{Shellwords.escape(input_path)} burst output #{burst_pattern}`
    FileUtils.rm input_path

    # Second pdftk pass: rewrite every burst page into its final .pdf name.
    Dir["*.pdf_temp"].each do |temp|
      page = "#{File.basename(temp, '.pdf_temp')}.pdf"
      `pdftk #{Shellwords.escape(temp)} output #{Shellwords.escape(page)}`
    end

    # Glob order is platform-dependent; sort so that the zero-padded page
    # numbers keep consecutive pages together in the same batch.
    pdfs = Dir["*.pdf"].sort
    pdfs.each_slice(batch_size).each_with_index do |batch_pdfs, batch_num|
      tar_path = sprintf('%05d.tar', batch_num)
      `tar -czf #{tar_path} #{Shellwords.join(batch_pdfs)}`
    end
    Dir["*.tar"].sort.map {|tar| save(tar) }
  end

  # Convert a pdf page into different-sized thumbnails. Grab the text.
  #
  # Unpacks the batch archive at +input_path+, runs the image and text
  # commands chained with '&&' (so the first failing command stops the rest),
  # removes the page pdfs, and saves everything that is left as
  # "<file_name>.tar". Returns the result of +save+.
  def process
    `tar -xzf #{Shellwords.escape(input_path)}`
    FileUtils.rm input_path
    cmds = []
    generate_images_commands(cmds)
    generate_text_commands(cmds)
    # Nothing to run when the archive held no pdfs.
    system cmds.join(' && ') unless cmds.empty?
    FileUtils.rm Dir['*.pdf']
    `tar -czf #{Shellwords.escape(file_name)}.tar *`
    save("#{file_name}.tar")
  end

  # Merge all of the resulting images, all of the resulting text files, and
  # the concatenated merge of the full-text into a single tar archive, ready
  # for download.
  #
  # Resulting layout, per document name:
  #   <name>/text/pages/<name>_<page>.txt
  #   <name>/text/full/<name>.txt
  #   <name>/images/<image name>/<name>_<page>.<extension>
  #
  # Returns the result of saving "processed_pdfs.tar".
  def merge
    input.each do |batch_url|
      batch_path = File.basename(batch_url)
      download(batch_url, batch_path)
      `tar -xzf #{Shellwords.escape(batch_path)}`
      FileUtils.rm batch_path
    end

    # Page texts are named "<name>_<page>.txt". The greedy (.+) anchors on the
    # LAST "_<digits>" group, so a document called "report_2009" is not
    # mistaken for a document called "report".
    names = Dir['*.txt'].map {|fn| fn[/\A(.+)_\d+\.txt\z/, 1] }.compact.uniq
    dirs  = names.map {|n| ["#{n}/text/full", "#{n}/text/pages"] + options['images'].map {|i| "#{n}/images/#{i['name']}" } }.flatten
    FileUtils.mkdir_p(dirs)

    Dir['*.*'].each do |file|
      # The glob also matches the directories just created when a document
      # name itself contains a dot; only plain files are moved.
      next unless File.file?(file)
      ext = File.extname(file)
      if ext == '.txt'
        name = file[/\A(.+)_\d+\.txt\z/, 1]
        next unless name
        FileUtils.mv(file, "#{name}/text/pages/#{file}")
      else
        # Images are named "<name>_<page>_<image name><ext>". Files that do
        # not follow that pattern are left where they are.
        parts = file.match(/\A(.+)_(\d+)_([^_]+)#{Regexp.escape(ext)}\z/)
        next unless parts
        name, page, suffix = parts[1], parts[2], parts[3]
        FileUtils.mv(file, "#{name}/images/#{suffix}/#{name}_#{page}#{ext}")
      end
    end

    # The shell expands *.txt in sorted order, which is page order thanks to
    # the zero-padded page numbers.
    names.each do |n|
      dir = Shellwords.escape(n)
      `cat #{dir}/text/pages/*.txt > #{dir}/text/full/#{dir}.txt`
    end
    `tar -czf processed_pdfs.tar *`
    save("processed_pdfs.tar")
  end


  private

  # Appends one "gm convert" command to +command_list+ for every pdf in the
  # current directory and every entry of options['images']. The output file
  # is named "<pdf basename>_<image name>.<image extension>".
  def generate_images_commands(command_list)
    Dir["*.pdf"].each do |pdf|
      name = File.basename(pdf, File.extname(pdf))
      options['images'].each do |i|
        target = "#{name}_#{i['name']}.#{i['extension']}"
        # NOTE: i['options'] is a fragment of gm command-line flags and is
        # passed to the shell verbatim, so it must come from a trusted source.
        command_list << "gm convert #{i['options']} #{Shellwords.escape(pdf)} #{Shellwords.escape(target)}"
      end
    end
  end

  # Appends one "pdftotext" command to +command_list+ for every pdf in the
  # current directory, writing UTF-8 text to "<pdf basename>.txt".
  def generate_text_commands(command_list)
    Dir["*.pdf"].each do |pdf|
      name = File.basename(pdf, File.extname(pdf))
      command_list << "pdftotext -enc UTF-8 -layout -q #{Shellwords.escape(pdf)} #{Shellwords.escape("#{name}.txt")}"
    end
  end

end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby -rubygems | |
require 'restclient' | |
require 'json' | |
# This example demonstrates a fairly complicated PDF-processing action, designed | |
# to extract the PDF's text, and produce GIF versions of each page. The action | |
# (actions/process_pdfs.rb) shows an example of using all three steps, | |
# split, process, and merge. | |
# Settings shared by every image rendition; only the target width differs.
image_settings = '-density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03'

# One GIF rendition per page at each of these widths.
page_images = [
  {
    'name'      => '700',
    'options'   => "-resize 700x #{image_settings}",
    'extension' => 'gif'
  },
  {
    'name'      => '1000',
    'options'   => "-resize 1000x #{image_settings}",
    'extension' => 'gif'
  }
]

# The PDFs to process.
pdf_urls = [
  'http://tigger.uic.edu/~victor/personal/futurism.pdf',
  'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
  'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
  'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
]

# The job description: which action to run, on which inputs, with what options.
job = {
  'action'  => 'process_pdfs',
  'inputs'  => pdf_urls,
  'options' => {
    'batch_size' => 7,
    'images'     => page_images
  }
}

# Submit the job, JSON-encoded, as the 'job' parameter of the request.
RestClient.post('http://localhost:9173/jobs', {:job => job.to_json})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment