Created
June 13, 2019 10:50
-
-
Save giacomelli/ae80c1ed9b374894b21890f1d2dee42a to your computer and use it in GitHub Desktop.
Using Tesseract4 with C#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.IO; | |
using System.Linq; | |
using System.Text; | |
namespace Ocr
{
    /// <summary>
    /// Service to read text from images by running the Tesseract OCR command-line executable.
    /// </summary>
    public class TesseractService
    {
        /// <summary>Name of the environment variable Tesseract reads to locate its trained models.</summary>
        private const string TessdataPrefixVariable = "TESSDATA_PREFIX";

        private readonly string _tesseractExePath;
        private readonly string _language;
        private readonly string _dataDir;

        /// <summary>
        /// Initializes a new instance of the <see cref="TesseractService"/> class.
        /// </summary>
        /// <param name="tesseractDir">The path of the Tesseract4 installation folder (for example C:\Program Files\Tesseract-OCR).</param>
        /// <param name="language">
        /// The language used to extract text from images (eng, por, etc).
        /// NOTE(review): the default "en" is kept for backward compatibility, but the trained models are
        /// normally named with three-letter codes such as "eng" - confirm the default is usable.
        /// </param>
        /// <param name="dataDir">
        /// The folder with the trained models (tessdata). When null or empty, the "tessdata" folder inside
        /// <paramref name="tesseractDir"/> is used. Download the models from https://github.com/tesseract-ocr/tessdata_fast
        /// </param>
        public TesseractService(string tesseractDir, string language = "en", string dataDir = null)
        {
            // Tesseract configs.
            _tesseractExePath = Path.Combine(tesseractDir, "tesseract.exe");
            _language = language;

            if (String.IsNullOrEmpty(dataDir))
            {
                dataDir = Path.Combine(tesseractDir, "tessdata");
            }

            _dataDir = dataDir;

            // Kept for backward compatibility with callers that rely on the process-wide variable.
            // The value is also passed explicitly to every child process (see RunTesseract), so several
            // instances configured with different data folders do not overwrite each other's setting.
            Environment.SetEnvironmentVariable(TessdataPrefixVariable, dataDir);
        }

        /// <summary>
        /// Reads the text from the image streams.
        /// </summary>
        /// <param name="images">The image streams. Every stream is closed after it has been copied to disk.</param>
        /// <returns>The text recognized in the images, or an empty string when no image is given.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="images"/> is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// Tesseract exited with a non-zero exit code; the message is its standard error output.
        /// </exception>
        public string GetText(params Stream[] images)
        {
            if (images == null)
            {
                throw new ArgumentNullException(nameof(images));
            }

            if (images.Length == 0)
            {
                return string.Empty;
            }

            // Every call works inside its own uniquely named temp folder, removed in the finally block.
            var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
            Directory.CreateDirectory(tempPath);

            try
            {
                var tempInputFile = NewTempFileName(tempPath);
                var tempOutputFile = NewTempFileName(tempPath);

                WriteInputFiles(images, tempPath, tempInputFile);

                return RunTesseract(tempInputFile, tempOutputFile);
            }
            finally
            {
                Directory.Delete(tempPath, true);
            }
        }

        /// <summary>
        /// Runs tesseract.exe for the given input file and returns the text it wrote.
        /// </summary>
        /// <param name="inputFile">The image file, or the text file listing the image files.</param>
        /// <param name="outputBase">The output file path without extension; ".txt" is appended to read the result.</param>
        /// <returns>The content of the generated text file.</returns>
        private string RunTesseract(string inputFile, string outputBase)
        {
            var info = new ProcessStartInfo
            {
                FileName = _tesseractExePath,
                // Paths are quoted because the temp folder may contain spaces.
                Arguments = $"\"{inputFile}\" \"{outputBase}\" -l {_language}",
                RedirectStandardError = true,
                RedirectStandardOutput = true,
                CreateNoWindow = true,
                UseShellExecute = false
            };

            // Give the child process its own copy of the setting instead of relying on the global one.
            info.EnvironmentVariables[TessdataPrefixVariable] = _dataDir;

            using (var ps = Process.Start(info))
            {
                // Both redirected pipes must be drained BEFORE waiting for the exit; otherwise the child
                // blocks as soon as a pipe buffer is full and WaitForExit never returns.
                // Standard output is drained asynchronously (and discarded), standard error synchronously.
                ps.BeginOutputReadLine();
                var stderr = ps.StandardError.ReadToEnd();
                ps.WaitForExit();

                if (ps.ExitCode != 0)
                {
                    throw new InvalidOperationException(stderr);
                }

                return File.ReadAllText(outputBase + ".txt");
            }
        }

        /// <summary>
        /// Copies the input streams to disk in the form passed to Tesseract as its input argument.
        /// </summary>
        /// <param name="inputStreams">The image streams (at least one).</param>
        /// <param name="tempPath">The temp folder where the image files are created.</param>
        /// <param name="tempInputFile">The file passed to Tesseract as its input argument.</param>
        private static void WriteInputFiles(Stream[] inputStreams, string tempPath, string tempInputFile)
        {
            if (inputStreams.Length == 1)
            {
                // With only one image, the image file itself is the input file.
                using (var tempStream = File.Create(tempInputFile))
                {
                    CopyStream(inputStreams[0], tempStream);
                }

                return;
            }

            // With more than one image, the input file is a list file with one image file path per line.
            var imagesListFileContent = new StringBuilder();

            foreach (var inputStream in inputStreams)
            {
                var imageFile = NewTempFileName(tempPath);

                using (var tempStream = File.Create(imageFile))
                {
                    CopyStream(inputStream, tempStream);
                }

                imagesListFileContent.AppendLine(imageFile);
            }

            File.WriteAllText(tempInputFile, imagesListFileContent.ToString());
        }

        /// <summary>
        /// Copies the whole input stream to the output stream and then closes the input stream.
        /// </summary>
        /// <param name="input">The source stream; rewound to its beginning first when it supports seeking.</param>
        /// <param name="output">The destination stream; left open for the caller to dispose.</param>
        private static void CopyStream(Stream input, Stream output)
        {
            if (input.CanSeek)
            {
                input.Seek(0, SeekOrigin.Begin);
            }

            input.CopyTo(output);
            input.Close();
        }

        /// <summary>
        /// Builds a new, GUID-named file path inside the given folder (the file is not created).
        /// </summary>
        /// <param name="tempPath">The folder of the new file.</param>
        /// <returns>The full path of the new file.</returns>
        private static string NewTempFileName(string tempPath)
        {
            return Path.Combine(tempPath, Guid.NewGuid().ToString());
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Create the service pointing to the Tesseract installation folder, the language and the tessdata folder.
var service = new TesseractService(@"C:\Program Files\Tesseract-OCR", "eng", @"C:\Program Files\Tesseract-OCR\tessdata");

// Pick exactly ONE of the following ways to obtain the image stream:
// var stream = File.OpenRead(path);                                       // from a file path
// var stream = WebRequest.Create(url).GetResponse().GetResponseStream();  // from a URL
// var stream = new MemoryStream(buffer);                                  // from a byte array

// GetText accepts one or more image streams and returns the recognized text.
var text = service.GetText(stream);
How to read coordinates for each word fetched from the document?
@h-Ebr the code was tested on Windows only and uses some APIs that are only available on it.
@sshankhdhar you need to use the GetIterator and the TryGetBoundingBox. Here is a good SO answer about it https://stackoverflow.com/a/51285474/956886
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Your code throws an exception at line 65 on the UWP platform.
Here is the exception:
System.ComponentModel.Win32Exception
HResult=0x80004005
Message={Illegal System DLL Relocation}
The system DLL %hs was relocated in memory. The application will not run properly.
The relocation occurred because the DLL %hs occupied an address range reserved for Windows system DLLs. The vendor supplying the DLL should be contacted for a new DLL
Source=System.Diagnostics.Process
StackTrace:
at System.Diagnostics.Process.StartWithCreateProcess(ProcessStartInfo startInfo)
at System.Diagnostics.Process.Start(ProcessStartInfo startInfo)
at Ocr.TesseractService.GetText(Stream[] images) in E:\Developing\C#\Test projects 19\OCRCoreTest6\OCRCoreTest6\TesseractService.cs:line 65
at OCRCoreTest6.MainPage.btnProcess_Click(Object sender, RoutedEventArgs e) in E:\Developing\C#\Test projects 19\OCRCoreTest6\OCRCoreTest6\MainPage.xaml.cs:line 38