Created
April 12, 2011 12:49
-
-
Save yatt/915443 to your computer and use it in GitHub Desktop.
Simple C# class for Optical Character Recognition (OCR) using tesseract (http://code.google.com/p/tesseract-ocr/). Usage: pass the path of the tesseract .exe to the constructor.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// usage:
//
// TesseractOCR ocr = new TesseractOCR(@"C:\bin\tesseract.exe");
// string result = ocr.OCRFromBitmap(bmp);
// textBox1.Text = result;
//
using System;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Text;
using System.Threading;
/// <summary>
/// Runs the tesseract command-line executable on an image file and returns
/// the text it writes to its output file.
/// </summary>
/// <remarks>
/// Every instance uses the same two fixed file names ("out.tif" and "out.txt")
/// in the current user's ApplicationData folder, so two OCR runs executing at
/// the same time will overwrite each other's files.
/// </remarks>
public class TesseractOCR
{
    private readonly string commandpath;
    private readonly string outpath;
    private readonly string tmppath;

    /// <summary>Creates a wrapper around the given tesseract executable.</summary>
    /// <param name="commandpath">Path of the tesseract executable to launch.</param>
    /// <exception cref="ArgumentNullException"><paramref name="commandpath"/> is null.</exception>
    public TesseractOCR(string commandpath)
    {
        if (commandpath == null)
        {
            throw new ArgumentNullException("commandpath");
        }

        this.commandpath = commandpath;

        string appData = Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData);
        tmppath = Path.Combine(appData, "out.tif");
        outpath = Path.Combine(appData, "out.txt");
    }

    /// <summary>
    /// Runs tesseract on <paramref name="filename"/>, waits for it to exit and
    /// returns the content of the text file it produced. The text file is
    /// deleted afterwards, whether or not reading it succeeded.
    /// </summary>
    /// <param name="filename">Path of the image file to recognise.</param>
    /// <returns>The full content of the output text file.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
    /// <exception cref="FileNotFoundException">tesseract did not create the output file.</exception>
    public string analyze(string filename)
    {
        if (filename == null)
        {
            throw new ArgumentNullException("filename");
        }

        // The second argument is the output path WITHOUT ".txt"; the result is
        // then read back from the ".txt" path. Only the extension is removed,
        // so a ".txt" elsewhere in the path is left alone.
        string outputBase = Path.ChangeExtension(outpath, null);

        // Quote both paths so that directories containing spaces
        // (common for per-user folders) are passed as single arguments.
        string args = Quote(filename) + " " + Quote(outputBase);

        ProcessStartInfo startinfo = new ProcessStartInfo(commandpath, args);
        startinfo.CreateNoWindow = true;
        startinfo.UseShellExecute = false;

        using (Process process = Process.Start(startinfo))
        {
            process.WaitForExit();
        }

        try
        {
            return File.ReadAllText(outpath);
        }
        finally
        {
            // File.Delete is a no-op when the file does not exist.
            File.Delete(outpath);
        }
    }

    /// <summary>
    /// Saves <paramref name="bmp"/> as a temporary TIFF file, runs OCR on it
    /// and removes the temporary file again, even when OCR fails.
    /// </summary>
    /// <param name="bmp">Bitmap to recognise.</param>
    /// <returns>The recognised text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="bmp"/> is null.</exception>
    public string OCRFromBitmap(Bitmap bmp)
    {
        if (bmp == null)
        {
            throw new ArgumentNullException("bmp");
        }

        try
        {
            bmp.Save(tmppath, System.Drawing.Imaging.ImageFormat.Tiff);
            return analyze(tmppath);
        }
        finally
        {
            File.Delete(tmppath);
        }
    }

    /// <summary>Runs OCR on an existing image file.</summary>
    /// <param name="filename">Path of the image file to recognise.</param>
    /// <returns>The recognised text.</returns>
    public string OCRFromFile(string filename)
    {
        return analyze(filename);
    }

    // Wraps a path in double quotes unless the caller already quoted it.
    private static string Quote(string path)
    {
        bool alreadyQuoted = path.Length >= 2
            && path[0] == '"'
            && path[path.Length - 1] == '"';
        return alreadyQuoted ? path : "\"" + path + "\"";
    }
}
/// <summary>
/// Runs the tesseract command-line executable on an image file with a
/// one-minute timeout and returns the trimmed text it writes to its output file.
/// </summary>
/// <remarks>
/// Each instance picks one GUID-based file name (".tif" for the temporary
/// image, ".txt" for the result) inside the caller-supplied directory.
/// </remarks>
public class TesseractOCR
{
    // Upper bound, in milliseconds, for each wait on the child process.
    private const int TimeoutMilliseconds = 1000 * 60;

    private readonly string commandpath;
    private readonly string outpath;
    private readonly string tmppath;

    /// <summary>Creates a wrapper around the given tesseract executable.</summary>
    /// <param name="commandpath">Path of the tesseract executable to launch.</param>
    /// <param name="tempDir">Directory in which the temporary image and result files are created.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public TesseractOCR(string commandpath, string tempDir)
    {
        if (commandpath == null)
        {
            throw new ArgumentNullException("commandpath");
        }
        if (tempDir == null)
        {
            throw new ArgumentNullException("tempDir");
        }

        this.commandpath = commandpath;

        string baseName = Guid.NewGuid().ToString();
        tmppath = Path.Combine(tempDir, baseName + ".tif");
        outpath = Path.Combine(tempDir, baseName + ".txt");
    }

    /// <summary>
    /// Runs tesseract on <paramref name="filename"/> and returns the trimmed
    /// content of the text file it produced. The text file is deleted afterwards.
    /// </summary>
    /// <param name="filename">Path of the image file to recognise.</param>
    /// <returns>The content of the output text file with surrounding whitespace removed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
    /// <exception cref="TimeoutException">
    /// The process or its output streams did not finish within the timeout;
    /// the message carries whatever the process wrote to standard error.
    /// </exception>
    /// <exception cref="FileNotFoundException">
    /// The process finished but the expected output file does not exist;
    /// the message carries whatever the process wrote to standard error.
    /// </exception>
    public string analyze(string filename)
    {
        if (filename == null)
        {
            throw new ArgumentNullException("filename");
        }

        // The second argument is the output path WITHOUT ".txt"; the result is
        // then read back from the ".txt" path. Both paths are quoted so that
        // directories containing spaces are passed as single arguments.
        string args = Quote(filename) + " " + Quote(Path.ChangeExtension(outpath, null));

        StringBuilder error = new StringBuilder();
        bool finished;

        using (Process process = new Process())
        using (AutoResetEvent outputWaitHandle = new AutoResetEvent(false))
        using (AutoResetEvent errorWaitHandle = new AutoResetEvent(false))
        {
            process.StartInfo.FileName = commandpath;
            process.StartInfo.Arguments = args;
            process.StartInfo.UseShellExecute = false;
            process.StartInfo.RedirectStandardOutput = true;
            process.StartInfo.RedirectStandardError = true;

            // Standard output must be drained so the child cannot block on a
            // full pipe, but its content is not needed. A null Data marks
            // end-of-stream.
            process.OutputDataReceived += (sender, e) =>
            {
                if (e.Data == null)
                {
                    SignalSafely(outputWaitHandle);
                }
            };
            process.ErrorDataReceived += (sender, e) =>
            {
                if (e.Data == null)
                {
                    SignalSafely(errorWaitHandle);
                }
                else
                {
                    lock (error)
                    {
                        error.AppendLine(e.Data);
                    }
                }
            };

            process.Start();
            process.BeginOutputReadLine();
            process.BeginErrorReadLine();

            finished = process.WaitForExit(TimeoutMilliseconds)
                && outputWaitHandle.WaitOne(TimeoutMilliseconds)
                && errorWaitHandle.WaitOne(TimeoutMilliseconds);

            if (!finished)
            {
                // Do not leave a stuck tesseract process running.
                KillQuietly(process);
            }
        }

        string stderr;
        lock (error)
        {
            stderr = error.ToString();
        }

        if (!finished)
        {
            throw new TimeoutException("Time out" + stderr);
        }

        try
        {
            if (!File.Exists(outpath))
            {
                // Surface what the process reported instead of a bare
                // "file not found" from the reader.
                throw new FileNotFoundException(
                    "tesseract did not produce the expected output file. " + stderr,
                    outpath);
            }

            return File.ReadAllText(outpath).Trim();
        }
        finally
        {
            // File.Delete is a no-op when the file does not exist.
            File.Delete(outpath);
        }
    }

    /// <summary>
    /// Saves <paramref name="bmp"/> as a temporary TIFF file, runs OCR on it
    /// and removes the temporary file again, even when OCR fails.
    /// </summary>
    /// <param name="bmp">Bitmap to recognise.</param>
    /// <returns>The recognised text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="bmp"/> is null.</exception>
    public string OCRFromBitmap(Bitmap bmp)
    {
        if (bmp == null)
        {
            throw new ArgumentNullException("bmp");
        }

        try
        {
            bmp.Save(tmppath, System.Drawing.Imaging.ImageFormat.Tiff);
            return analyze(tmppath);
        }
        finally
        {
            File.Delete(tmppath);
        }
    }

    /// <summary>Runs OCR on an existing image file.</summary>
    /// <param name="filename">Path of the image file to recognise.</param>
    /// <returns>The recognised text.</returns>
    public string OCRFromFile(string filename)
    {
        return analyze(filename);
    }

    // Wraps a path in double quotes unless the caller already quoted it.
    private static string Quote(string path)
    {
        bool alreadyQuoted = path.Length >= 2
            && path[0] == '"'
            && path[path.Length - 1] == '"';
        return alreadyQuoted ? path : "\"" + path + "\"";
    }

    // Sets the handle, tolerating a late stream callback that arrives after
    // the handle has been disposed on the timeout path.
    private static void SignalSafely(EventWaitHandle handle)
    {
        try
        {
            handle.Set();
        }
        catch (ObjectDisposedException)
        {
        }
    }

    // Best-effort termination of a process that exceeded the timeout.
    private static void KillQuietly(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill();
            }
        }
        catch (InvalidOperationException)
        {
            // The process exited between the check and the kill.
        }
        catch (System.ComponentModel.Win32Exception)
        {
            // The process could not be terminated; nothing more to do here.
        }
    }
}
Hello
It throws an exception because the outpath file cannot be found; in particular, this code does not work for me (I have tried several different kinds of outpath).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Shouldn't there be a "new" keyword in the usage before TesseractOCR("path")?