Created
April 26, 2012 00:49
-
-
Save zph/2494870 to your computer and use it in GitHub Desktop.
Script to transform images to scanned documents (In Progress)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# Author : [email protected] | |
# License: MIT | |
# Function: Wrapper for unpaper & imagemagick to allow easy conversions | |
# Date 2012.04.06 | |
# Credits: | |
# Requirements: | |
# Linux | |
# unpaper | |
# imagemagick | |
# RMagick Ruby Gem | |
# Optional (faster temp files): 'mkdir /media/ramdisk' && add 'mount -t tmpfs -o size=500M tmpfs /media/ramdisk' to /etc/rc.local; without it, temp files go to the current directory
# => Source - http://neo4j.rubyforge.org/guides/configuration.html | |
# Commands in order | |
# convert INPUT.jpg -colorspace Gray OUTPUT.ppm | |
# unpaper --grayfilter-size 1,1 --grayfilter-step 1,1 --grayfilter-threshold 0.4 tmp.ppm tmp_out.ppm | |
# convert tmp_out.ppm final.jpg | |
# | |
# TODO | |
# | |
# use unix pipes instead of saving to hard drive, pipe STDIN / STDOUT to unpaper and from unpaper | |
# Implement RMagick conversions in memory | |
# Calculate image histograms to convert images based on prototypes (Range of 5 image styles) | |
# Localthresh command with very good though speckled results: bash localthresh -n yes -m 3 -r 35 -b 20 ../test_photos/IMG_2519.JPG IMG12.jpg | |
# source: http://peterhansen.ca/blog/using-imagemagick-to-filter-page-scans.html | |
# Then run isonoise to remove flecks | |
#======================================================= | |
# Notes on RMagick | |
# case sensitive require 'RMagick' | |
require 'pathname'
require 'shellwords'

require 'pry'
require 'systemu'
require 'RMagick'
# binding.pry | |
# Runs +command+ through the systemu gem.
#
# @param command [String] complete shell command line
# @return [Array] the [status, stdout, stderr] triple returned by systemu
def system_u(command)
  status, stdout, stderr = systemu(command)
  [status, stdout, stderr]
end

# Turns one photographed page into a heavily compressed black-and-white JPEG.
#
# Pipeline (each step is an external command):
#   1. localthresh - local thresholding of the input image
#   2. isonoise    - removes the flecks left by step 1
#   3. convert     - re-encodes the result at JPEG quality 10
#
# The result is written to "bw_<name>.jpg" in the current directory, where
# <name> is the input's basename without its extension. Intermediate images
# go to /media/ramdisk when that directory exists, otherwise to the current
# directory, and are removed afterwards.
#
# The earlier unpaper/PDF pipeline (convert to .ppm, unpaper, convert to .pdf)
# is disabled; its unused command strings and the "testing hack" that deleted
# an existing "<name>.pdf" have been dropped, since nothing here creates a PDF.
#
# If a step exits unsuccessfully, a warning is printed and the remaining steps
# are skipped; no exception is raised for a failed command.
#
# @param file [String] path to the input image (relative or absolute)
# @return [Array<String>] regular files removed from /media/ramdisk
#   (empty when the ramdisk is absent or already empty)
def process_file(file)
  tmp_dir = File.directory?('/media/ramdisk') ? '/media/ramdisk/' : "#{Dir.pwd}/"
  full_name = Pathname.new(File.expand_path(file))
  # basename(extname) strips only a trailing extension; splitting on the
  # extension broke for extension-less names and names containing it twice.
  name = full_name.basename(full_name.extname).to_s
  jpg_name = "bw_#{name}.jpg"
  intermediary_jpg = "#{tmp_dir}#{name}_tmp.jpg"
  intermediary_jpg2 = "#{tmp_dir}#{name}_tmp2.jpg"
  # NOTE(review): both scripts are resolved against the current working
  # directory, not the script's own directory - confirm this is intended.
  isonoise_location = File.expand_path('docs/isonoise')
  localthresh_location = File.expand_path('docs/localthresh')

  # Shellwords.join escapes every argument, so paths containing spaces or
  # shell metacharacters reach the tools as single, literal arguments.
  steps = [
    Shellwords.join(['bash', localthresh_location, '-n', 'yes', '-m', '3', '-r', '35', '-b', '20',
                     full_name.to_s, intermediary_jpg]),
    Shellwords.join(['bash', isonoise_location, intermediary_jpg, intermediary_jpg2]),
    Shellwords.join(['convert', intermediary_jpg2, '-quality', '10', jpg_name])
  ]

  begin
    steps.each do |command|
      status, _stdout, stderr = system_u(command)
      next if status.success?

      # Later steps consume the output of earlier ones, so stop at the first failure.
      warn "process_file: command failed, skipping remaining steps: #{command}\n#{stderr}"
      break
    end
  ensure
    # Remove this run's intermediates even when they live in the current directory.
    [intermediary_jpg, intermediary_jpg2].each { |path| File.unlink(path) if File.exist?(path) }
  end

  # Clear the ramdisk; only regular files, because File.unlink raises on directories.
  Dir.glob('/media/ramdisk/*').select { |f| File.file?(f) }.each { |f| File.unlink(f) }
end
# Command-line entry point: runs only when this file is executed directly
# (File.identical? also matches when the script is started through a symlink).
# Every command-line argument is treated as an image path and processed in order.
if File.identical?(__FILE__, $0)
  if ARGV.empty?
    # ARGV is always an Array, so the emptiness check is the only special case.
    warn "Usage: #{File.basename($0)} IMAGE [IMAGE ...]"
  else
    ARGV.each { |f| process_file(f) }
  end
end
=begin | |
# Notes gathered from internet | |
#======================================= | |
# => Links | |
http://www.imagemagick.org/discourse-server/viewtopic.php?f=1&t=7867&start=0 | |
http://www.imagemagick.org/discourse-server/viewtopic.php?f=1&t=16916&sid=a7cc7c8175c3af78ff0dd0486922821e&start=15 | |
http://stackoverflow.com/questions/9608279/cleaning-scanned-grayscale-images-with-imagemagick | |
http://www.imagemagick.org/Usage/color_mods/#sigmoidal | |
http://www.imagemagick.org/Usage/color_mods/#level | |
http://www.simplesystems.org/RMagick/doc/constants.html#ColorspaceType | |
#!/bin/bash | |
# | |
# Digital Camera + This Software + Printer = A Document Photocopier | |
# | |
# Input: pictures of B&W Text documents taken with a digital camera using | |
# flash from about 3 feet away with no dark border around the page. | |
# | |
# Output1: b-file.tif (a very small B&W TIF file) | |
# Output2: g-file.jpg (a alternative grayscale file) | |
# | |
# If input is purely black and white, Output1 should be better | |
# If input is not purely black and white, Output2 may be better | |
# Source: http://staff.washington.edu/corey/camscan/scancvt | |
# Corey Satten, corey @ cac.washington.edu, March 2007 | |
do1 () { | |
echo starting $1 1>&2 | |
BASE="${1##*/}"; NAME=${BASE%.[jJ][pP][gG]}; TMP1="t-$BASE"; TMP2="x-$BASE" | |
trap 'rm -f "$TMP1" "$TMP2"; exit' 0 1 2 13 15 | |
CGQ="-colorspace gray -quality" | |
CGT="-compress group4 -density 480x480" | |
convert $CGQ 99 "$1" -resize 5120x5120 "$TMP2" | |
convert $CGQ 99 "$1" -resize 1024x1024 -negate -blur 15,15 -resize 5120x5120 "$TMP1" | |
composite $CGQ 99 -compose plus "$TMP2" "$TMP1" "$TMP1" | |
convert $CGQ 60 "$TMP1" -normalize -level 50,85% "g-$BASE" | |
convert $CGT "$TMP1" -normalize -threshold 85% "b-$NAME.tif" | |
rm -f "$TMP1" "$TMP2" | |
} | |
# This tries to detect multiprocessors and run 2 conversions in parallel | |
# Move CPUS=1 after the test to effectively disable the test. | |
CPUS=1 | |
if [ -f /proc/cpuinfo ] ;then | |
CPUS=`grep ^processor /proc/cpuinfo | wc -l` | |
if [ "$CPUS" -lt 2 ] ;then CPUS=1; fi | |
fi | |
for i in "$@"; do | |
case $#/$CPUS in | |
0/*) exit;; # done | |
1/*) do1 "$1"; shift;; # only one file to do | |
*/1) do1 "$1"; shift;; # only one cpu to use | |
*) do1 "$1" & do1 "$2"; wait; shift; shift;; # process 2 files at once | |
esac | |
done | |
exit 0 | |
__________________ | |
OR, from the same source:
================== | |
mogrify -format tif -colorspace gray -compress group4 -resize 5120x5120 -normalize -threshold 65% *.jpg | |
=end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment