jashkenas · September 11, 2009 02:06
diff --git a/process_pdfs.rb b/process_pdfs.rb
 require 'shellwords'

 # Depends on working pdftk, gm (GraphicsMagick), and pdftotext (Poppler) commands.
 # Splits a pdf into batches of N pages, creates their thumbnails and icons,
 # as specified in the Job options, gets the text for every page, and merges
 # it all back into a tar archive for convenient download.
 #
 # External commands are run without a shell wherever possible (the
 # argument-list form of +system+), and file names are escaped wherever a shell
 # command line is still required, so names containing spaces or shell
 # metacharacters cannot break or inject into a command.
 #
 # See <tt>examples/process_pdfs_example.rb</tt> for more information.
 class ProcessPdfs < CloudCrowd::Action

   # Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
   # chunks for processing. The double pdftk shuffle fixes the document xrefs.
   #
   # Pages are sorted by name so that every batch holds consecutive pages.
   # A missing or non-positive 'batch_size' option puts all pages in one batch.
   # Returns the results of +save+ for each batch archive.
   def split
     system('pdftk', input_path, 'burst', 'output', "#{file_name}_%05d.pdf_temp")
     FileUtils.rm input_path
     Dir['*.pdf_temp'].sort.each do |temp|
       system('pdftk', temp, 'output', "#{File.basename(temp, '.pdf_temp')}.pdf")
     end

     pdfs = Dir['*.pdf'].sort
     batch_size = options['batch_size'].to_i
     batch_size = [pdfs.length, 1].max if batch_size < 1
     pdfs.each_slice(batch_size).each_with_index do |batch_pdfs, batch_num|
       system('tar', '-czf', format('%05d.tar', batch_num), *batch_pdfs)
     end
     Dir['*.tar'].sort.map { |tar| save(tar) }
   end

   # Convert a pdf page into different-sized thumbnails. Grab the text.
   #
   # Unpacks the batch archive, runs the image and text commands as a single
   # '&&' chain (so the chain stops at the first failing command), removes the
   # page pdfs, and saves everything that is left as one archive.
   def process
     system('tar', '-xzf', input_path)
     FileUtils.rm input_path

     cmds = []
     generate_images_commands(cmds)
     generate_text_commands(cmds)
     # Nothing to run when the batch contained no pdfs.
     system(cmds.join(' && ')) unless cmds.empty?

     FileUtils.rm Dir['*.pdf']
     archive = "#{file_name}.tar"
     # The file list is collected before tar creates the archive, so the
     # archive never includes itself.
     system('tar', '-czf', archive, *Dir['*'].sort)
     save(archive)
   end

   # Merge all of the resulting images, all of the resulting text files, and
   # the concatenated merge of the full-text into a single tar archive, ready
   # for download.
   #
   # Resulting layout, per document name:
   #   name/text/pages/   one text file per page
   #   name/text/full/    name.txt, all pages concatenated in name order
   #   name/images/<image name>/   page images, with the image name stripped
   def merge
     input.each do |batch_url|
       batch_path = File.basename(batch_url)
       download(batch_url, batch_path)
       system('tar', '-xzf', batch_path)
       FileUtils.rm batch_path
     end

     page_text = /_\d+(_\w+)?\.txt\Z/
     image_names = options['images'].map { |i| i['name'] }
     # Only text files carrying a page suffix identify a document.
     names = Dir['*.txt'].grep(page_text).map { |fn| fn.sub(page_text, '') }.uniq
     dirs = names.flat_map do |n|
       ["#{n}/text/full", "#{n}/text/pages"] + image_names.map { |img| "#{n}/images/#{img}" }
     end
     FileUtils.mkdir_p(dirs)

     Dir['*.*'].sort.each do |file|
       # A document directory whose name contains a dot also matches '*.*'.
       next unless File.file?(file)
       ext = File.extname(file)
       ext_pattern = Regexp.escape(ext)
       name = file.sub(/_\d+(_\w+)?#{ext_pattern}\Z/, '')
       # Leave files without a page suffix where they are.
       next if name == file

       if ext == '.txt'
         target = "#{name}/text/pages/#{file}"
       else
         suffix = file[/_([^_]+)#{ext_pattern}\Z/, 1]
         next unless suffix
         sans_suffix = file.sub(/_([^_]+)#{ext_pattern}\Z/, ext)
         target = "#{name}/images/#{suffix}/#{sans_suffix}"
       end
       # Covers images whose document produced no text file.
       FileUtils.mkdir_p(File.dirname(target))
       FileUtils.mv(file, target)
     end

     names.each do |n|
       pages_dir = "#{n}/text/pages"
       File.open("#{n}/text/full/#{n}.txt", 'wb') do |full|
         # Same selection as the shell glob '*.txt': no dotfiles, sorted.
         Dir.entries(pages_dir).grep(/\A[^.].*\.txt\z/).sort.each do |page|
           IO.copy_stream(File.join(pages_dir, page), full)
         end
       end
     end

     system('tar', '-czf', 'processed_pdfs.tar', *Dir['*'].sort)
     save('processed_pdfs.tar')
   end


   private

   # Append one 'gm convert' shell command per pdf and per configured image
   # to +command_list+. The image's 'options' value is inserted verbatim as a
   # shell fragment; the file names are shell-escaped.
   def generate_images_commands(command_list)
     Dir['*.pdf'].sort.each do |pdf|
       name = File.basename(pdf, File.extname(pdf))
       options['images'].each do |i|
         image = "#{name}_#{i['name']}.#{i['extension']}"
         command_list << "gm convert #{i['options']} #{Shellwords.escape(pdf)} #{Shellwords.escape(image)}"
       end
     end
   end

   # Append one 'pdftotext' shell command per pdf to +command_list+, writing
   # the text next to the pdf as <name>.txt. File names are shell-escaped.
   def generate_text_commands(command_list)
     Dir['*.pdf'].sort.each do |pdf|
       name = File.basename(pdf, File.extname(pdf))
       command_list << "pdftotext -enc UTF-8 -layout -q #{Shellwords.escape(pdf)} #{Shellwords.escape("#{name}.txt")}"
     end
   end

 end
diff --git a/process_pdfs_example.rb b/process_pdfs_example.rb
 #!/usr/bin/env ruby -rubygems

 require 'restclient'
 require 'json'

 # This example demonstrates a fairly complicated PDF-processing action, designed
 # to extract the PDF's text, and produce GIF versions of each page. The action
 # (actions/process_pdfs.rb) shows an example of using all three steps,
 # split, process, and merge.

 # The pdfs to process; each URL becomes one input of the job.
 pdf_urls = [
   'http://tigger.uic.edu/~victor/personal/futurism.pdf',
   'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
   'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
   'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
 ]

 # One entry per image variant to produce for every page.
 image_specs = [
   {
     'name'      => '700',
     'options'   => '-resize 700x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
     'extension' => 'gif'
   },
   {
     'name'      => '1000',
     'options'   => '-resize 1000x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
     'extension' => 'gif'
   }
 ]

 job = {
   'action'  => 'process_pdfs',
   'inputs'  => pdf_urls,
   'options' => {
     'batch_size' => 7,
     'images'     => image_specs
   }
 }

 # The job description travels as a JSON string under the :job parameter.
 RestClient.post('http://localhost:9173/jobs', {:job => job.to_json})
	require 'shellwords'

	# Depends on working pdftk, gm (GraphicsMagick), and pdftotext (Poppler) commands.
	# Splits a pdf into batches of N pages, creates their thumbnails and icons,
	# as specified in the Job options, gets the text for every page, and merges
	# it all back into a tar archive for convenient download.
	#
	# External commands are run without a shell wherever possible (the
	# argument-list form of +system+), and file names are escaped wherever a shell
	# command line is still required, so names containing spaces or shell
	# metacharacters cannot break or inject into a command.
	#
	# See <tt>examples/process_pdfs_example.rb</tt> for more information.
	class ProcessPdfs < CloudCrowd::Action

	  # Split up a large pdf into single-page pdfs. Batch them into 'batch_size'
	  # chunks for processing. The double pdftk shuffle fixes the document xrefs.
	  #
	  # Pages are sorted by name so that every batch holds consecutive pages.
	  # A missing or non-positive 'batch_size' option puts all pages in one batch.
	  # Returns the results of +save+ for each batch archive.
	  def split
	    system('pdftk', input_path, 'burst', 'output', "#{file_name}_%05d.pdf_temp")
	    FileUtils.rm input_path
	    Dir['*.pdf_temp'].sort.each do |temp|
	      system('pdftk', temp, 'output', "#{File.basename(temp, '.pdf_temp')}.pdf")
	    end

	    pdfs = Dir['*.pdf'].sort
	    batch_size = options['batch_size'].to_i
	    batch_size = [pdfs.length, 1].max if batch_size < 1
	    pdfs.each_slice(batch_size).each_with_index do |batch_pdfs, batch_num|
	      system('tar', '-czf', format('%05d.tar', batch_num), *batch_pdfs)
	    end
	    Dir['*.tar'].sort.map { |tar| save(tar) }
	  end

	  # Convert a pdf page into different-sized thumbnails. Grab the text.
	  #
	  # Unpacks the batch archive, runs the image and text commands as a single
	  # '&&' chain (so the chain stops at the first failing command), removes the
	  # page pdfs, and saves everything that is left as one archive.
	  def process
	    system('tar', '-xzf', input_path)
	    FileUtils.rm input_path

	    cmds = []
	    generate_images_commands(cmds)
	    generate_text_commands(cmds)
	    # Nothing to run when the batch contained no pdfs.
	    system(cmds.join(' && ')) unless cmds.empty?

	    FileUtils.rm Dir['*.pdf']
	    archive = "#{file_name}.tar"
	    # The file list is collected before tar creates the archive, so the
	    # archive never includes itself.
	    system('tar', '-czf', archive, *Dir['*'].sort)
	    save(archive)
	  end

	  # Merge all of the resulting images, all of the resulting text files, and
	  # the concatenated merge of the full-text into a single tar archive, ready
	  # for download.
	  #
	  # Resulting layout, per document name:
	  #   name/text/pages/   one text file per page
	  #   name/text/full/    name.txt, all pages concatenated in name order
	  #   name/images/<image name>/   page images, with the image name stripped
	  def merge
	    input.each do |batch_url|
	      batch_path = File.basename(batch_url)
	      download(batch_url, batch_path)
	      system('tar', '-xzf', batch_path)
	      FileUtils.rm batch_path
	    end

	    page_text = /_\d+(_\w+)?\.txt\Z/
	    image_names = options['images'].map { |i| i['name'] }
	    # Only text files carrying a page suffix identify a document.
	    names = Dir['*.txt'].grep(page_text).map { |fn| fn.sub(page_text, '') }.uniq
	    dirs = names.flat_map do |n|
	      ["#{n}/text/full", "#{n}/text/pages"] + image_names.map { |img| "#{n}/images/#{img}" }
	    end
	    FileUtils.mkdir_p(dirs)

	    Dir['*.*'].sort.each do |file|
	      # A document directory whose name contains a dot also matches '*.*'.
	      next unless File.file?(file)
	      ext = File.extname(file)
	      ext_pattern = Regexp.escape(ext)
	      name = file.sub(/_\d+(_\w+)?#{ext_pattern}\Z/, '')
	      # Leave files without a page suffix where they are.
	      next if name == file

	      if ext == '.txt'
	        target = "#{name}/text/pages/#{file}"
	      else
	        suffix = file[/_([^_]+)#{ext_pattern}\Z/, 1]
	        next unless suffix
	        sans_suffix = file.sub(/_([^_]+)#{ext_pattern}\Z/, ext)
	        target = "#{name}/images/#{suffix}/#{sans_suffix}"
	      end
	      # Covers images whose document produced no text file.
	      FileUtils.mkdir_p(File.dirname(target))
	      FileUtils.mv(file, target)
	    end

	    names.each do |n|
	      pages_dir = "#{n}/text/pages"
	      File.open("#{n}/text/full/#{n}.txt", 'wb') do |full|
	        # Same selection as the shell glob '*.txt': no dotfiles, sorted.
	        Dir.entries(pages_dir).grep(/\A[^.].*\.txt\z/).sort.each do |page|
	          IO.copy_stream(File.join(pages_dir, page), full)
	        end
	      end
	    end

	    system('tar', '-czf', 'processed_pdfs.tar', *Dir['*'].sort)
	    save('processed_pdfs.tar')
	  end


	  private

	  # Append one 'gm convert' shell command per pdf and per configured image
	  # to +command_list+. The image's 'options' value is inserted verbatim as a
	  # shell fragment; the file names are shell-escaped.
	  def generate_images_commands(command_list)
	    Dir['*.pdf'].sort.each do |pdf|
	      name = File.basename(pdf, File.extname(pdf))
	      options['images'].each do |i|
	        image = "#{name}_#{i['name']}.#{i['extension']}"
	        command_list << "gm convert #{i['options']} #{Shellwords.escape(pdf)} #{Shellwords.escape(image)}"
	      end
	    end
	  end

	  # Append one 'pdftotext' shell command per pdf to +command_list+, writing
	  # the text next to the pdf as <name>.txt. File names are shell-escaped.
	  def generate_text_commands(command_list)
	    Dir['*.pdf'].sort.each do |pdf|
	      name = File.basename(pdf, File.extname(pdf))
	      command_list << "pdftotext -enc UTF-8 -layout -q #{Shellwords.escape(pdf)} #{Shellwords.escape("#{name}.txt")}"
	    end
	  end

	end
	#!/usr/bin/env ruby -rubygems

	require 'restclient'
	require 'json'

	# This example demonstrates a fairly complicated PDF-processing action, designed
	# to extract the PDF's text, and produce GIF versions of each page. The action
	# (actions/process_pdfs.rb) shows an example of using all three steps,
	# split, process, and merge.

	# The pdfs to process; each URL becomes one input of the job.
	pdf_urls = [
	  'http://tigger.uic.edu/~victor/personal/futurism.pdf',
	  'http://www.jonasmekas.com/Catalog_excerpt/The%20Avant-Garde%20From%20Futurism%20to%20Fluxus.pdf',
	  'http://www.dzignism.com/articles/Futurist.Manifesto.pdf',
	  'http://benfry.com/phd/dissertation-050312b-acrobat.pdf'
	]

	# One entry per image variant to produce for every page.
	image_specs = [
	  {
	    'name' => '700',
	    'options' => '-resize 700x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
	    'extension' => 'gif'
	  },
	  {
	    'name' => '1000',
	    'options' => '-resize 1000x -density 220 -depth 4 -unsharp 0.5x0.5+0.5+0.03',
	    'extension' => 'gif'
	  }
	]

	job = {
	  'action' => 'process_pdfs',
	  'inputs' => pdf_urls,
	  'options' => {
	    'batch_size' => 7,
	    'images' => image_specs
	  }
	}

	# The job description travels as a JSON string under the :job parameter.
	RestClient.post('http://localhost:9173/jobs', {:job => job.to_json})