fiee · September 30, 2011 08:36
diff --git a/ocerrors.py b/ocerrors.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
 Replace common (G)OCR errors (in German)

 Usage: python ocerrors.py inputfile [outputfile]

 The input file is read as UTF-8.  Without an output file the result is
 written back to the input path and the original is kept next to it with a
 ``.bak`` extension.
 """

 import io
 import locale
 import os
 import re
 import shutil
 import sys

 # Plain-text replacements applied before the regular expressions.
 textre = (
     # (u'\r', u'\n'),  # not necessary, the file is read with universal newlines
     (u'-\n', u''),  # remove hyphenation
 )

 # Ordered (pattern, replacement) rules.  Every rule sees the output of the
 # previous one, so the order matters.  The literals are u'' strings with
 # doubled backslashes (not raw strings) so the same source yields unicode
 # patterns on both Python 2 and Python 3.
 rere = (
     # combine lines: mark paragraph ends with ###, join the remaining lines,
     # then turn the markers back into blank lines
     (u'([^.][.!?])\n', u'\\1\n###\n'),
     (u'\n\n+', u'\n###\n'),
     (u'\n+', u' '),
     (u'###\\s*', u'\n\n'),
     # typical (G)OCR errors
     (u'_', u''),  # _ are always errors (?)
     (u'(\\w)(N)([a-zäöüß])', u'\\1v\\3'),  # N-v
     (u'(\\w)(F)([a-zäöüß])', u'\\1f\\3'),  # F-f
     (u'(\\w)(5|S)([a-zäöüß])', u'\\1s\\3'),  # 5/S-s
     (u'(\\w)(2)([a-zäöüß])', u'\\1z\\3'),  # 2-z
     (u'(\\w)(0)([a-zäöüß])', u'\\1o\\3'),  # 0-o
     (u'(\\w)(1|I)([a-zäöüß ])', u'\\1l\\3'),  # 1/I-l

     (u'([A-ZÄÖÜ])(l)([A-ZÖÄÜ])', u'\\1I\\3'),  # l-I
     (u'([A-ZÄÖÜ])(0)([A-ZÖÄÜ])', u'\\1O\\3'),  # 0-O

     (u'lc([^h])', u'k\\1'),  # lc-k
     (u'[1Il]C', u'K'),  # 1/I/lC-K

     # one pass fixes only the first o after a number ("1oo" -> "10o"),
     # so the rule is applied twice
     (u'(\\d+)(o|O)', u'\\g<1>0'),  # o-0
     (u'(\\d+)(o|O)', u'\\g<1>0'),  # again

     (u'[ \t]+', u' '),
     # The first group is lazy so that the optional "u" is really consumed:
     # "fu''r" becomes "für" instead of "fuür".
     (u"(\\w+?)u?\\s?'\\s?'\\s?(\\w+)", u'\\1ü\\2'),  # u''-ü
     (u'(ie|ei|eu|au|äu|ai|rö)ss', u'\\1ß'),  # manuscript ß errors
     # NOTE: the "_<" / "<_" / "_>" / ">_" alternatives cannot match any more
     # because all underscores were removed by an earlier rule.
     (u'(<<|_<|<_|«)', u'„'),  # opening quotes
     (u'\\s?(>>|_>|>_|»)', u'“'),  # closing quotes

     (u'\n\\. ?\n', u''),  # whitespace
 )


 def fix_ocr_errors(text):
     """Return the unicode string *text* with all correction rules applied.

     The plain replacements in ``textre`` run first, then every rule of
     ``rere`` in order.

     Raises ValueError (naming the pattern) if a rule is not a valid
     regular expression.
     """
     for old, new in textre:
         text = text.replace(old, new)
     for pattern, replacement in rere:
         try:
             # re.U only: the text is unicode, and Python 3 rejects the
             # LOCALE flag for text patterns.
             text = re.compile(pattern, re.U).sub(replacement, text)
         except re.error as ex:
             raise ValueError('%s (pattern: %r)' % (ex, pattern))
     return text


 def backup_name(filename):
     """Return the backup path for *filename*.

     Only a trailing ``.txt`` is replaced by ``.bak``; any other name just
     gets ``.bak`` appended, and ``.txt`` elsewhere in the path is untouched.
     """
     if filename.endswith('.txt'):
         filename = filename[:-len('.txt')]
     return filename + '.bak'


 def main(argv):
     """Run the command line tool and return its exit status.

     Exit codes: 1 = missing argument, 2 = input is not a file,
     3 = a correction rule failed, 0 = success.
     """
     if len(argv) < 2:
         print(__doc__)
         return 1

     infilename = argv[1]
     if not os.path.isfile(infilename):
         print(u"%s is not a file!" % infilename)
         return 2

     # Decode explicitly; the default newline handling of io.open maps
     # \r\n and \r to \n, like the former 'rU' mode.
     with io.open(infilename, 'r', encoding='utf-8') as infile:
         text = infile.read()

     # Process before touching any file so that a failing rule does not
     # leave the input renamed without a result.
     try:
         text = fix_ocr_errors(text)
     except ValueError as ex:
         print(ex)
         return 3

     if len(argv) > 2:
         outfilename = argv[2]
     else:
         shutil.move(infilename, backup_name(infilename))
         outfilename = infilename

     with open(outfilename, 'wb') as outfile:
         outfile.write(text.encode('utf-8'))
     return 0


 if __name__ == '__main__':
     sys.exit(main(sys.argv))
diff --git a/ocr.sh b/ocr.sh
 #!/bin/bash
 # Run gocr over every pbm/<prefix>*.pbm scan, collect the text in
 # <prefix>.txt, then clean it with ocerrors.py, spell-check it with aspell
 # and open it with "edit".
 #
 # Usage: ocr.sh [prefix]
 #   prefix  common start of the scan file names; when omitted, all scans in
 #           pbm/ are processed and the text goes to noname.txt
 prefix=${1:-noname}
 inputfilter=$prefix

 if [ "$prefix" = "noname" ]
 then
 	inputfilter=
 fi
 outputfile=$prefix.txt

 # The variable is quoted, the glob itself stays outside the quotes.
 for scan in pbm/"$inputfilter"*.pbm
 do
 	# An unmatched pattern is left as literal text by bash; skip it.
 	[ -e "$scan" ] || continue
 	echo "processing $scan to $outputfile"
 	gocr -i "$scan" -f UTF8 -l 0 -d -1 -m 130 -p ~/.db/ >> "$outputfile"
 done

 # Nothing to post-process when no scan produced any output file.
 if [ ! -f "$outputfile" ]
 then
 	echo "no scans found for pbm/${inputfilter}*.pbm" >&2
 	exit 1
 fi

 python ocerrors.py "$outputfile"

 aspell check "$outputfile" -lde

 edit "$outputfile"
diff --git a/tif2pbm.sh b/tif2pbm.sh
 #!/bin/bash
 # gocr understands only PBM pictures
 #
 # Convert every *.tif in the current directory to pbm/<name>.pbm and move
 # the original into tif/.
 # -p: do not fail when the directories already exist from an earlier run
 mkdir -p tif pbm
 for tif in *.tif
 do
 	# An unmatched pattern is left as literal "*.tif" by bash; skip it.
 	[ -e "$tif" ] || continue
 	pbm=${tif%tif}pbm
 	echo "converting $tif to $pbm ..."
 	convert "$tif" "pbm/$pbm" || echo "conversion of $tif failed" >&2
 	# As before, the original is moved whether or not the conversion worked.
 	mv -- "$tif" tif/
 done
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	"""
	Replace common (G)OCR errors (in German)

	Usage: python ocerrors.py inputfile [outputfile]

	The input file is read as UTF-8.  Without an output file the result is
	written back to the input path and the original is kept next to it with a
	``.bak`` extension.
	"""

	import io
	import locale
	import os
	import re
	import shutil
	import sys

	# Plain-text replacements applied before the regular expressions.
	textre = (
	    # (u'\r', u'\n'),  # not necessary, the file is read with universal newlines
	    (u'-\n', u''),  # remove hyphenation
	)

	# Ordered (pattern, replacement) rules.  Every rule sees the output of the
	# previous one, so the order matters.  The literals are u'' strings with
	# doubled backslashes (not raw strings) so the same source yields unicode
	# patterns on both Python 2 and Python 3.  Alternations use a plain "|";
	# an escaped "\|" would only match a literal pipe character.
	rere = (
	    # combine lines: mark paragraph ends with ###, join the remaining lines,
	    # then turn the markers back into blank lines
	    (u'([^.][.!?])\n', u'\\1\n###\n'),
	    (u'\n\n+', u'\n###\n'),
	    (u'\n+', u' '),
	    (u'###\\s*', u'\n\n'),
	    # typical (G)OCR errors
	    (u'_', u''),  # _ are always errors (?)
	    (u'(\\w)(N)([a-zäöüß])', u'\\1v\\3'),  # N-v
	    (u'(\\w)(F)([a-zäöüß])', u'\\1f\\3'),  # F-f
	    (u'(\\w)(5|S)([a-zäöüß])', u'\\1s\\3'),  # 5/S-s
	    (u'(\\w)(2)([a-zäöüß])', u'\\1z\\3'),  # 2-z
	    (u'(\\w)(0)([a-zäöüß])', u'\\1o\\3'),  # 0-o
	    (u'(\\w)(1|I)([a-zäöüß ])', u'\\1l\\3'),  # 1/I-l

	    (u'([A-ZÄÖÜ])(l)([A-ZÖÄÜ])', u'\\1I\\3'),  # l-I
	    (u'([A-ZÄÖÜ])(0)([A-ZÖÄÜ])', u'\\1O\\3'),  # 0-O

	    (u'lc([^h])', u'k\\1'),  # lc-k
	    (u'[1Il]C', u'K'),  # 1/I/lC-K

	    # one pass fixes only the first o after a number ("1oo" -> "10o"),
	    # so the rule is applied twice
	    (u'(\\d+)(o|O)', u'\\g<1>0'),  # o-0
	    (u'(\\d+)(o|O)', u'\\g<1>0'),  # again

	    (u'[ \t]+', u' '),
	    # The first group is lazy so that the optional "u" is really consumed:
	    # "fu''r" becomes "für" instead of "fuür".
	    (u"(\\w+?)u?\\s?'\\s?'\\s?(\\w+)", u'\\1ü\\2'),  # u''-ü
	    (u'(ie|ei|eu|au|äu|ai|rö)ss', u'\\1ß'),  # manuscript ß errors
	    # NOTE: the "_<" / "<_" / "_>" / ">_" alternatives cannot match any more
	    # because all underscores were removed by an earlier rule.
	    (u'(<<|_<|<_|«)', u'„'),  # opening quotes
	    (u'\\s?(>>|_>|>_|»)', u'“'),  # closing quotes

	    (u'\n\\. ?\n', u''),  # whitespace
	)


	def fix_ocr_errors(text):
	    """Return the unicode string *text* with all correction rules applied.

	    The plain replacements in ``textre`` run first, then every rule of
	    ``rere`` in order.

	    Raises ValueError (naming the pattern) if a rule is not a valid
	    regular expression.
	    """
	    for old, new in textre:
	        text = text.replace(old, new)
	    for pattern, replacement in rere:
	        try:
	            # re.U only: the text is unicode, and Python 3 rejects the
	            # LOCALE flag for text patterns.
	            text = re.compile(pattern, re.U).sub(replacement, text)
	        except re.error as ex:
	            raise ValueError('%s (pattern: %r)' % (ex, pattern))
	    return text


	def backup_name(filename):
	    """Return the backup path for *filename*.

	    Only a trailing ``.txt`` is replaced by ``.bak``; any other name just
	    gets ``.bak`` appended, and ``.txt`` elsewhere in the path is untouched.
	    """
	    if filename.endswith('.txt'):
	        filename = filename[:-len('.txt')]
	    return filename + '.bak'


	def main(argv):
	    """Run the command line tool and return its exit status.

	    Exit codes: 1 = missing argument, 2 = input is not a file,
	    3 = a correction rule failed, 0 = success.
	    """
	    if len(argv) < 2:
	        print(__doc__)
	        return 1

	    infilename = argv[1]
	    if not os.path.isfile(infilename):
	        print(u"%s is not a file!" % infilename)
	        return 2

	    # Decode explicitly; the default newline handling of io.open maps
	    # \r\n and \r to \n, like the former 'rU' mode.
	    with io.open(infilename, 'r', encoding='utf-8') as infile:
	        text = infile.read()

	    # Process before touching any file so that a failing rule does not
	    # leave the input renamed without a result.
	    try:
	        text = fix_ocr_errors(text)
	    except ValueError as ex:
	        print(ex)
	        return 3

	    if len(argv) > 2:
	        outfilename = argv[2]
	    else:
	        shutil.move(infilename, backup_name(infilename))
	        outfilename = infilename

	    with open(outfilename, 'wb') as outfile:
	        outfile.write(text.encode('utf-8'))
	    return 0


	if __name__ == '__main__':
	    sys.exit(main(sys.argv))
	#!/bin/bash
	# Run gocr over every pbm/<prefix>*.pbm scan, collect the text in
	# <prefix>.txt, then clean it with ocerrors.py, spell-check it with aspell
	# and open it with "edit".
	#
	# Usage: ocr.sh [prefix]
	#   prefix  common start of the scan file names; when omitted, all scans in
	#           pbm/ are processed and the text goes to noname.txt
	prefix=${1:-noname}
	inputfilter=$prefix

	if [ "$prefix" = "noname" ]
	then
		inputfilter=
	fi
	outputfile=$prefix.txt

	# The variable is quoted, the glob itself stays outside the quotes.
	for scan in pbm/"$inputfilter"*.pbm
	do
		# An unmatched pattern is left as literal text by bash; skip it.
		[ -e "$scan" ] || continue
		echo "processing $scan to $outputfile"
		gocr -i "$scan" -f UTF8 -l 0 -d -1 -m 130 -p ~/.db/ >> "$outputfile"
	done

	# Nothing to post-process when no scan produced any output file.
	if [ ! -f "$outputfile" ]
	then
		echo "no scans found for pbm/${inputfilter}*.pbm" >&2
		exit 1
	fi

	python ocerrors.py "$outputfile"

	aspell check "$outputfile" -lde

	edit "$outputfile"
	#!/bin/bash
	# gocr understands only PBM pictures
	#
	# Convert every *.tif in the current directory to pbm/<name>.pbm and move
	# the original into tif/.
	# -p: do not fail when the directories already exist from an earlier run
	mkdir -p tif pbm
	for tif in *.tif
	do
		# An unmatched pattern is left as literal "*.tif" by bash; skip it.
		[ -e "$tif" ] || continue
		pbm=${tif%tif}pbm
		echo "converting $tif to $pbm ..."
		convert "$tif" "pbm/$pbm" || echo "conversion of $tif failed" >&2
		# As before, the original is moved whether or not the conversion worked.
		mv -- "$tif" tif/
	done