Created
March 13, 2013 14:15
-
-
Save jronallo/5152518 to your computer and use it in GitHub Desktop.
A toy command-line utility for running OCR and cleaning up the OCR output.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# tesse: command-line tool for looking at tesseract OCR and cleaning the output.
#
# Besides the gems required below, it needs the following Linux programs:
#   eog:    for viewing the images
#   wmctrl: for resizing and positioning the image viewing window
#
# Usage: tesse IMAGE_FILE
require 'tesseract'
require 'ffi/aspell'
require 'tempfile'
require 'formatador'

# Without an argument File.expand_path(nil) would raise a TypeError; print a
# usage line and exit instead.
abort "Usage: #{File.basename($0)} IMAGE_FILE" if ARGV.empty?

# Absolute path of the image to OCR, taken from the first command-line argument.
file = File.expand_path(ARGV[0])
system('clear')

# English-language engine; recognition is limited to ASCII letters, digits,
# space and period, and the pipe character is explicitly blacklisted.
engine = Tesseract::Engine.new do |e|
  e.language  = :eng
  e.blacklist = '|'
  e.whitelist = [*'a'..'z', *'A'..'Z', *0..9, ' .'].join
end

# Shared aspell handle (en_US dictionary) used by spelling_percentage.
Speller = FFI::Aspell::Speller.new('en_US')
# Percentage of the space-separated words in +text+ that the shared Speller
# reports as correctly spelled.
#
# text - String to check (one line of OCR output in this script).
#
# Returns a Float between 0.0 and 100.0. Text with no words yields 0.0
# rather than the NaN that dividing by a zero word count would produce.
def spelling_percentage(text)
  words = text.split(' ')
  return 0.0 if words.empty?

  correct = words.count { |word| Speller.correct?(word) }
  correct.to_f / words.length * 100.0
end
# Asks the user what to do with one OCR line.
#
# text - the OCR text of the line currently on screen.
#
# Returns the replacement typed by the user after answering 'e', nil after
# answering 'd' (drop the line), and +text+ unchanged for any other answer.
# End of input (gets returning nil) is treated as "keep the line as is"
# instead of raising NoMethodError on nil.
def ask_for_correction(text)
  print "Continue? "
  case STDIN.gets.to_s.chomp
  when 'e'
    puts "Type out a better transcript of this line:"
    replacement = STDIN.gets
    replacement ? replacement.chomp : text
  when 'd'
    nil
  else
    text
  end
end

# Stops the image viewer started for one line and reaps the forked child so
# it does not linger as a zombie.
#
# NOTE(review): `killall eog` also closes any unrelated eog windows. It is
# kept because the forked pid may not be the process that owns the window.
def close_viewer(pid)
  Process.kill 'ABRT', pid
  `killall eog`
  Process.wait pid
rescue Errno::ESRCH, Errno::ECHILD
  # The child is already gone; nothing left to clean up.
end

# Formatador markup for the green rule printed under each line and before
# the final output.
separator = "[green]\n-----------------------------------------------\n[/]"

# Text of every reviewed line; dropped lines are stored as nil.
cleaned_ocr = []

engine.each_line_for(file) do |line|
  text = line.text
  puts text
  Formatador.display separator
  Formatador.display_table([{ confidence: line.confidence,
                              spelling: spelling_percentage(text) }])

  # Write the line image to its own temp file. A separate variable is used so
  # the outer `file` (the input image path) is not overwritten.
  image_file = Tempfile.new('tesse_image')
  eog_pid = nil
  begin
    image_file.binmode
    image_file.write line.image.to_blob
    # Flush buffered bytes so the viewer does not open an empty/partial file.
    image_file.flush

    # show the image in eog and position the windows nicely
    eog_pid = fork do
      exec %Q|eog #{image_file.path} 2>/dev/null|
    end
    system "sleep 1; wmctrl -a tesse; wmctrl -r tesse_image -e 0,0,700,1500,300;"

    cleaned_ocr << ask_for_correction(text)
  ensure
    # Runs even if the body raises, so no viewer or temp file is left behind.
    close_viewer(eog_pid) if eog_pid
    system('clear')
    image_file.close
    image_file.unlink
  end
end

puts 'Done! Following is the cleaned output'
Formatador.display separator
# Dropped lines were recorded as nil; leave them out of the final output.
puts cleaned_ocr.compact
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment