matthewjberger · March 14, 2018 00:29 · MarceloNascimento · Mar 14, 2018
diff --git a/Extract.cs b/Extract.cs
 using System;
 using System.Collections.Generic;
 using System.Drawing;
 using System.IO;
 using iTextSharp.text.pdf;
 using iTextSharp.text.pdf.parser;
 using Tesseract;
 using System.Drawing.Imaging;

 namespace ExtractInvoice
 {
    class Program
    {
        // Height, in pixels, of the strip kept from the top of each extracted image when it is cropped.
        // The header ends at 1600 pixels on all invoices
        private const int headerHeight_ = 1600;

        /// <summary>
        /// Entry point: saves the header images of three invoice PDFs, then runs OCR on the
        /// image stored as c0.jpg. Any failure is reported on the console by its message.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string tessDataPath =
                @"C:\Users\Berger_MA\Documents\visual studio 2015\Projects\ExtractInvoice\tessdata";
            const string imagesDirectory = @"C:\Users\Berger_MA\Downloads\Invoices\Images";

            // Processed in this order; the first failure stops the run.
            string[] invoicePdfs =
            {
                @"C:\Users\Berger_MA\Downloads\Invoices\a.pdf",
                @"C:\Users\Berger_MA\Downloads\Invoices\b.pdf",
                @"C:\Users\Berger_MA\Downloads\Invoices\c.pdf"
            };

            try
            {
                foreach (string invoicePdf in invoicePdfs)
                {
                    SaveImages(invoicePdf, imagesDirectory);
                }

                ExtractText(@"C:\Users\Berger_MA\Downloads\Invoices\Images\c0.jpg", tessDataPath);
            }
            catch (Exception failure)
            {
                Console.WriteLine(failure.Message);
            }
        }

        #region Methods

        /// <summary>
        /// Extracts every image stream found in a PDF and returns the decoded images.
        /// </summary>
        /// <remarks>
        /// Walks the reader's cross-reference entries and decodes each stream whose subtype is
        /// Image. A stream that cannot be decoded is reported on standard error and skipped.
        /// The caller owns the returned images and should dispose them.
        /// </remarks>
        /// <param name="PDFSourcePath">Specify PDF Source Path</param>
        /// <returns>The decoded images in cross-reference order; empty when none were found.</returns>
        /// <exception cref="Exception">
        /// The PDF could not be opened or read. The original failure is kept as the inner exception.
        /// </exception>
        private static List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
        {
            List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();

            RandomAccessFileOrArray RAFObj = null;
            PdfReader PDFReaderObj = null;

            try
            {
                RAFObj = new RandomAccessFileOrArray(PDFSourcePath);
                PDFReaderObj = new PdfReader(RAFObj, null);

                for (int i = 0; i < PDFReaderObj.XrefSize; i++)
                {
                    PdfObject PDFObj = PDFReaderObj.GetPdfObject(i);
                    if (PDFObj == null || !PDFObj.IsStream())
                    {
                        continue;
                    }

                    PdfStream PDFStremObj = (PdfStream)PDFObj;
                    PdfObject subtype = PDFStremObj.Get(PdfName.SUBTYPE);
                    if (subtype == null || subtype.ToString() != PdfName.IMAGE.ToString())
                    {
                        continue;
                    }

                    try
                    {
                        PdfImageObject PdfImageObj = new PdfImageObject((PRStream)PDFStremObj);
                        ImgList.Add(PdfImageObj.GetDrawingImage());
                    }
                    catch (Exception decodeError)
                    {
                        // Best effort: one undecodable image must not stop the others,
                        // but the skip is reported instead of being hidden.
                        Console.Error.WriteLine("Skipping image object " + i + " in '" + PDFSourcePath + "': " + decodeError.Message);
                    }
                }
            }
            catch (Exception ex)
            {
                // The partial result is discarded, so release what was already decoded.
                foreach (System.Drawing.Image decoded in ImgList)
                {
                    decoded.Dispose();
                }

                // Same exception type and message as before, now with the cause attached.
                throw new Exception(ex.Message, ex);
            }
            finally
            {
                // Release the file on every path, not only on success.
                if (PDFReaderObj != null)
                {
                    PDFReaderObj.Close();
                }
                else if (RAFObj != null)
                {
                    // The reader was never created, so the file source must be closed directly.
                    RAFObj.Close();
                }
            }

            return ImgList;
        }

        /// <summary>
        /// Extracts the images from a pdf, whites out the logo area, sharpens each image, crops
        /// it to the invoice header and saves the result as a JPEG file.
        /// </summary>
        /// <remarks>
        /// Files are named after the pdf followed by the zero-based image index (for example
        /// "a0.jpg"). An image that cannot be processed is reported on standard error and
        /// skipped so that the remaining images are still saved.
        /// </remarks>
        /// <param name="pathToPdf">Path of the pdf to read.</param>
        /// <param name="outputPath">Directory that receives the JPEG files; created when missing.</param>
        /// <exception cref="Exception">
        /// The directory could not be created or the pdf could not be read. The original failure
        /// is kept as the inner exception.
        /// </exception>
        private static void SaveImages(string pathToPdf, string outputPath)
        {
            try
            {
                string name = System.IO.Path.GetFileNameWithoutExtension(pathToPdf);

                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(outputPath);

                // Get a List of Image
                List<System.Drawing.Image> ListImage = ExtractImages(pathToPdf);

                for (int i = 0; i < ListImage.Count; i++)
                {
                    string currentName = name + i + ".jpg";

                    try
                    {
                        // The extracted image is only needed here, so it is released with the copy.
                        using (System.Drawing.Image sourceImage = ListImage[i])
                        using (Bitmap bmpImage = new Bitmap(sourceImage))
                        {
                            // White out logo
                            using (Graphics graphics = Graphics.FromImage(bmpImage))
                            {
                                graphics.FillRectangle(Brushes.White, 0, 0, 930, 540);
                            }

                            using (Bitmap sharpened = Sharpen(bmpImage))
                            {
                                // Crop to the header, but never beyond the image itself:
                                // Clone throws when the rectangle is taller than the bitmap.
                                int cropHeight = Math.Min(headerHeight_, sharpened.Height);
                                Rectangle cropRect = new Rectangle(0, 0, sharpened.Width, cropHeight);

                                using (Bitmap croppedimage = sharpened.Clone(cropRect, sharpened.PixelFormat))
                                {
                                    // Save the image to a file
                                    croppedimage.Save(System.IO.Path.Combine(outputPath, currentName), System.Drawing.Imaging.ImageFormat.Jpeg);
                                }
                            }
                        }
                    }
                    catch (Exception imageError)
                    {
                        // Keep going with the next image, but say which one was lost and why.
                        Console.Error.WriteLine("Could not save '" + currentName + "': " + imageError.Message);
                    }
                }
            }
            catch (Exception ex)
            {
                // Same exception type and message as before, now with the cause attached.
                throw new Exception(ex.Message, ex);
            }
        }

        /// <summary>
        /// Runs Tesseract OCR on an image, prints the recognized text and the mean confidence
        /// to the console, waits for a key press and returns the text.
        /// </summary>
        /// <param name="pathToImage">The path to the image to extract text from.</param>
        /// <param name="tessDataPath">Data path handed to the Tesseract engine.</param>
        /// <returns>
        /// The extracted text, or an empty string when nothing was recognized before a failure.
        /// Failures are written to the console and not rethrown.
        /// </returns>
        private static string ExtractText(string pathToImage, string tessDataPath)
        {
            string text = "";

            try
            {
                // Creating the tesseract OCR engine with English as the language
                using (var tEngine = new TesseractEngine(tessDataPath, "eng", EngineMode.Default))
                {
                    // Letters, all ten digits and the listed punctuation. The digit 6 used to be
                    // missing from this list, so it could never be recognized.
                    tEngine.SetVariable("tessedit_char_whitelist", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,-/@");

                    using (var img = Pix.LoadFromFile(pathToImage))
                    using (var page = tEngine.Process(img))
                    {
                        // Keep the text in a local declared outside the try so it is returned
                        // even if a later console call fails.
                        text = page.GetText() ?? "";

                        Console.WriteLine(text);
                        Console.WriteLine(page.GetMeanConfidence());
                        Console.ReadKey();
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Unexpected Error: " + e.Message);
            }

            return text;
        }

        /// <summary>
        /// Returns a sharpened copy of a bitmap, produced with a 3x3 convolution kernel whose
        /// centre weight is 9 and whose eight surrounding weights are -1.
        /// </summary>
        /// <remarks>
        /// The copy is processed as 24-bit RGB. Neighbours that fall outside the image wrap
        /// around to the opposite edge, and every channel is clamped to 0..255. The input
        /// bitmap is not modified.
        /// </remarks>
        /// <param name="image">The bitmap to sharpen.</param>
        /// <returns>A new, sharpened bitmap of the same size; the caller should dispose it.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="image"/> is null.</exception>
        public static Bitmap Sharpen(Bitmap image)
        {
            if (image == null)
            {
                throw new ArgumentNullException("image");
            }

            const int filterWidth = 3;
            const int filterHeight = 3;
            const double factor = 1.0;
            const double bias = 0.0;

            // Sharpening kernel; the weights sum to 1, so flat areas keep their colour.
            double[,] filter =
            {
                { -1, -1, -1 },
                { -1,  9, -1 },
                { -1, -1, -1 }
            };

            int width = image.Width;
            int height = image.Height;

            Bitmap sharpenImage = (Bitmap)image.Clone();
            try
            {
                // Lock image bits for read/write.
                BitmapData pbits = sharpenImage.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.ReadWrite, PixelFormat.Format24bppRgb);
                try
                {
                    int stride = pbits.Stride;
                    int bytes = stride * height;

                    // Unmodified pixels; every neighbourhood is read from this buffer.
                    byte[] rgbValues = new byte[bytes];
                    System.Runtime.InteropServices.Marshal.Copy(pbits.Scan0, rgbValues, 0, bytes);

                    // Output buffer. Starting from a copy keeps the row padding bytes as they were.
                    byte[] sharpened = (byte[])rgbValues.Clone();

                    for (int y = 0; y < height; ++y)
                    {
                        for (int x = 0; x < width; ++x)
                        {
                            double red = 0.0, green = 0.0, blue = 0.0;

                            for (int filterX = 0; filterX < filterWidth; filterX++)
                            {
                                // Adding the dimension before the modulo wraps negative offsets.
                                int imageX = (x - filterWidth / 2 + filterX + width) % width;

                                for (int filterY = 0; filterY < filterHeight; filterY++)
                                {
                                    int imageY = (y - filterHeight / 2 + filterY + height) % height;
                                    int source = imageY * stride + 3 * imageX;
                                    double weight = filter[filterX, filterY];

                                    // Each pixel is three bytes: blue, green, red.
                                    red += rgbValues[source + 2] * weight;
                                    green += rgbValues[source + 1] * weight;
                                    blue += rgbValues[source + 0] * weight;
                                }
                            }

                            // Clamp and store once, after the whole kernel has been accumulated.
                            int target = y * stride + 3 * x;
                            sharpened[target + 2] = (byte)Math.Min(Math.Max((int)(factor * red + bias), 0), 255);
                            sharpened[target + 1] = (byte)Math.Min(Math.Max((int)(factor * green + bias), 0), 255);
                            sharpened[target + 0] = (byte)Math.Min(Math.Max((int)(factor * blue + bias), 0), 255);
                        }
                    }

                    // Copy the RGB values back to the bitmap.
                    System.Runtime.InteropServices.Marshal.Copy(sharpened, 0, pbits.Scan0, bytes);
                }
                finally
                {
                    // Release image bits even when the processing above fails.
                    sharpenImage.UnlockBits(pbits);
                }
            }
            catch
            {
                // The copy is never handed to the caller on failure, so release it here.
                sharpenImage.Dispose();
                throw;
            }

            return sharpenImage;
        }
        #endregion
    }
 }
	using System;
	using System.Collections.Generic;
	using System.Drawing;
	using System.IO;
	using iTextSharp.text.pdf;
	using iTextSharp.text.pdf.parser;
	using Tesseract;
	using System.Drawing.Imaging;

	namespace ExtractInvoice
	{
	class Program
	{
	// Height, in pixels, of the strip kept from the top of each extracted image when it is cropped.
	// The header ends at 1600 pixels on all invoices
	private const int headerHeight_ = 1600;

	/// <summary>
	/// Entry point: saves the header images of three invoice PDFs, then runs OCR on the
	/// image stored as c0.jpg. Any failure is reported on the console by its message.
	/// </summary>
	/// <param name="args">Command-line arguments (not used).</param>
	static void Main(string[] args)
	{
	    const string tessDataPath =
	        @"C:\Users\Berger_MA\Documents\visual studio 2015\Projects\ExtractInvoice\tessdata";
	    const string imagesDirectory = @"C:\Users\Berger_MA\Downloads\Invoices\Images";

	    // Processed in this order; the first failure stops the run.
	    string[] invoicePdfs =
	    {
	        @"C:\Users\Berger_MA\Downloads\Invoices\a.pdf",
	        @"C:\Users\Berger_MA\Downloads\Invoices\b.pdf",
	        @"C:\Users\Berger_MA\Downloads\Invoices\c.pdf"
	    };

	    try
	    {
	        foreach (string invoicePdf in invoicePdfs)
	        {
	            SaveImages(invoicePdf, imagesDirectory);
	        }

	        ExtractText(@"C:\Users\Berger_MA\Downloads\Invoices\Images\c0.jpg", tessDataPath);
	    }
	    catch (Exception failure)
	    {
	        Console.WriteLine(failure.Message);
	    }
	}

	#region Methods

	/// <summary>
	/// Extracts every image stream found in a PDF and returns the decoded images.
	/// </summary>
	/// <remarks>
	/// Walks the reader's cross-reference entries and decodes each stream whose subtype is
	/// Image. A stream that cannot be decoded is reported on standard error and skipped.
	/// The caller owns the returned images and should dispose them.
	/// </remarks>
	/// <param name="PDFSourcePath">Specify PDF Source Path</param>
	/// <returns>The decoded images in cross-reference order; empty when none were found.</returns>
	/// <exception cref="Exception">
	/// The PDF could not be opened or read. The original failure is kept as the inner exception.
	/// </exception>
	private static List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
	{
	    List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();

	    RandomAccessFileOrArray RAFObj = null;
	    PdfReader PDFReaderObj = null;

	    try
	    {
	        RAFObj = new RandomAccessFileOrArray(PDFSourcePath);
	        PDFReaderObj = new PdfReader(RAFObj, null);

	        for (int i = 0; i < PDFReaderObj.XrefSize; i++)
	        {
	            PdfObject PDFObj = PDFReaderObj.GetPdfObject(i);
	            if (PDFObj == null || !PDFObj.IsStream())
	            {
	                continue;
	            }

	            PdfStream PDFStremObj = (PdfStream)PDFObj;
	            PdfObject subtype = PDFStremObj.Get(PdfName.SUBTYPE);
	            if (subtype == null || subtype.ToString() != PdfName.IMAGE.ToString())
	            {
	                continue;
	            }

	            try
	            {
	                PdfImageObject PdfImageObj = new PdfImageObject((PRStream)PDFStremObj);
	                ImgList.Add(PdfImageObj.GetDrawingImage());
	            }
	            catch (Exception decodeError)
	            {
	                // Best effort: one undecodable image must not stop the others,
	                // but the skip is reported instead of being hidden.
	                Console.Error.WriteLine("Skipping image object " + i + " in '" + PDFSourcePath + "': " + decodeError.Message);
	            }
	        }
	    }
	    catch (Exception ex)
	    {
	        // The partial result is discarded, so release what was already decoded.
	        foreach (System.Drawing.Image decoded in ImgList)
	        {
	            decoded.Dispose();
	        }

	        // Same exception type and message as before, now with the cause attached.
	        throw new Exception(ex.Message, ex);
	    }
	    finally
	    {
	        // Release the file on every path, not only on success.
	        if (PDFReaderObj != null)
	        {
	            PDFReaderObj.Close();
	        }
	        else if (RAFObj != null)
	        {
	            // The reader was never created, so the file source must be closed directly.
	            RAFObj.Close();
	        }
	    }

	    return ImgList;
	}

	/// <summary>
	/// Extracts the images from a pdf, whites out the logo area, sharpens each image, crops
	/// it to the invoice header and saves the result as a JPEG file.
	/// </summary>
	/// <remarks>
	/// Files are named after the pdf followed by the zero-based image index (for example
	/// "a0.jpg"). An image that cannot be processed is reported on standard error and
	/// skipped so that the remaining images are still saved.
	/// </remarks>
	/// <param name="pathToPdf">Path of the pdf to read.</param>
	/// <param name="outputPath">Directory that receives the JPEG files; created when missing.</param>
	/// <exception cref="Exception">
	/// The directory could not be created or the pdf could not be read. The original failure
	/// is kept as the inner exception.
	/// </exception>
	private static void SaveImages(string pathToPdf, string outputPath)
	{
	    try
	    {
	        string name = System.IO.Path.GetFileNameWithoutExtension(pathToPdf);

	        // CreateDirectory does nothing when the directory already exists.
	        Directory.CreateDirectory(outputPath);

	        // Get a List of Image
	        List<System.Drawing.Image> ListImage = ExtractImages(pathToPdf);

	        for (int i = 0; i < ListImage.Count; i++)
	        {
	            string currentName = name + i + ".jpg";

	            try
	            {
	                // The extracted image is only needed here, so it is released with the copy.
	                using (System.Drawing.Image sourceImage = ListImage[i])
	                using (Bitmap bmpImage = new Bitmap(sourceImage))
	                {
	                    // White out logo
	                    using (Graphics graphics = Graphics.FromImage(bmpImage))
	                    {
	                        graphics.FillRectangle(Brushes.White, 0, 0, 930, 540);
	                    }

	                    using (Bitmap sharpened = Sharpen(bmpImage))
	                    {
	                        // Crop to the header, but never beyond the image itself:
	                        // Clone throws when the rectangle is taller than the bitmap.
	                        int cropHeight = Math.Min(headerHeight_, sharpened.Height);
	                        Rectangle cropRect = new Rectangle(0, 0, sharpened.Width, cropHeight);

	                        using (Bitmap croppedimage = sharpened.Clone(cropRect, sharpened.PixelFormat))
	                        {
	                            // Save the image to a file
	                            croppedimage.Save(System.IO.Path.Combine(outputPath, currentName), System.Drawing.Imaging.ImageFormat.Jpeg);
	                        }
	                    }
	                }
	            }
	            catch (Exception imageError)
	            {
	                // Keep going with the next image, but say which one was lost and why.
	                Console.Error.WriteLine("Could not save '" + currentName + "': " + imageError.Message);
	            }
	        }
	    }
	    catch (Exception ex)
	    {
	        // Same exception type and message as before, now with the cause attached.
	        throw new Exception(ex.Message, ex);
	    }
	}

	/// <summary>
	/// Runs Tesseract OCR on an image, prints the recognized text and the mean confidence
	/// to the console, waits for a key press and returns the text.
	/// </summary>
	/// <param name="pathToImage">The path to the image to extract text from.</param>
	/// <param name="tessDataPath">Data path handed to the Tesseract engine.</param>
	/// <returns>
	/// The extracted text, or an empty string when nothing was recognized before a failure.
	/// Failures are written to the console and not rethrown.
	/// </returns>
	private static string ExtractText(string pathToImage, string tessDataPath)
	{
	    string text = "";

	    try
	    {
	        // Creating the tesseract OCR engine with English as the language
	        using (var tEngine = new TesseractEngine(tessDataPath, "eng", EngineMode.Default))
	        {
	            // Letters, all ten digits and the listed punctuation. The digit 6 used to be
	            // missing from this list, so it could never be recognized.
	            tEngine.SetVariable("tessedit_char_whitelist", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,-/@");

	            using (var img = Pix.LoadFromFile(pathToImage))
	            using (var page = tEngine.Process(img))
	            {
	                // Keep the text in a local declared outside the try so it is returned
	                // even if a later console call fails.
	                text = page.GetText() ?? "";

	                Console.WriteLine(text);
	                Console.WriteLine(page.GetMeanConfidence());
	                Console.ReadKey();
	            }
	        }
	    }
	    catch (Exception e)
	    {
	        Console.WriteLine("Unexpected Error: " + e.Message);
	    }

	    return text;
	}

	/// <summary>
	/// Returns a sharpened copy of a bitmap, produced with a 3x3 convolution kernel whose
	/// centre weight is 9 and whose eight surrounding weights are -1.
	/// </summary>
	/// <remarks>
	/// The copy is processed as 24-bit RGB. Neighbours that fall outside the image wrap
	/// around to the opposite edge, and every channel is clamped to 0..255. The input
	/// bitmap is not modified.
	/// </remarks>
	/// <param name="image">The bitmap to sharpen.</param>
	/// <returns>A new, sharpened bitmap of the same size; the caller should dispose it.</returns>
	/// <exception cref="ArgumentNullException"><paramref name="image"/> is null.</exception>
	public static Bitmap Sharpen(Bitmap image)
	{
	    if (image == null)
	    {
	        throw new ArgumentNullException("image");
	    }

	    const int filterWidth = 3;
	    const int filterHeight = 3;
	    const double factor = 1.0;
	    const double bias = 0.0;

	    // Sharpening kernel; the weights sum to 1, so flat areas keep their colour.
	    double[,] filter =
	    {
	        { -1, -1, -1 },
	        { -1,  9, -1 },
	        { -1, -1, -1 }
	    };

	    int width = image.Width;
	    int height = image.Height;

	    Bitmap sharpenImage = (Bitmap)image.Clone();
	    try
	    {
	        // Lock image bits for read/write.
	        BitmapData pbits = sharpenImage.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.ReadWrite, PixelFormat.Format24bppRgb);
	        try
	        {
	            int stride = pbits.Stride;
	            int bytes = stride * height;

	            // Unmodified pixels; every neighbourhood is read from this buffer.
	            byte[] rgbValues = new byte[bytes];
	            System.Runtime.InteropServices.Marshal.Copy(pbits.Scan0, rgbValues, 0, bytes);

	            // Output buffer. Starting from a copy keeps the row padding bytes as they were.
	            byte[] sharpened = (byte[])rgbValues.Clone();

	            for (int y = 0; y < height; ++y)
	            {
	                for (int x = 0; x < width; ++x)
	                {
	                    double red = 0.0, green = 0.0, blue = 0.0;

	                    for (int filterX = 0; filterX < filterWidth; filterX++)
	                    {
	                        // Adding the dimension before the modulo wraps negative offsets.
	                        int imageX = (x - filterWidth / 2 + filterX + width) % width;

	                        for (int filterY = 0; filterY < filterHeight; filterY++)
	                        {
	                            int imageY = (y - filterHeight / 2 + filterY + height) % height;
	                            int source = imageY * stride + 3 * imageX;
	                            double weight = filter[filterX, filterY];

	                            // Each pixel is three bytes: blue, green, red.
	                            red += rgbValues[source + 2] * weight;
	                            green += rgbValues[source + 1] * weight;
	                            blue += rgbValues[source + 0] * weight;
	                        }
	                    }

	                    // Clamp and store once, after the whole kernel has been accumulated.
	                    int target = y * stride + 3 * x;
	                    sharpened[target + 2] = (byte)Math.Min(Math.Max((int)(factor * red + bias), 0), 255);
	                    sharpened[target + 1] = (byte)Math.Min(Math.Max((int)(factor * green + bias), 0), 255);
	                    sharpened[target + 0] = (byte)Math.Min(Math.Max((int)(factor * blue + bias), 0), 255);
	                }
	            }

	            // Copy the RGB values back to the bitmap.
	            System.Runtime.InteropServices.Marshal.Copy(sharpened, 0, pbits.Scan0, bytes);
	        }
	        finally
	        {
	            // Release image bits even when the processing above fails.
	            sharpenImage.UnlockBits(pbits);
	        }
	    }
	    catch
	    {
	        // The copy is never handed to the caller on failure, so release it here.
	        sharpenImage.Dispose();
	        throw;
	    }

	    return sharpenImage;
	}
	#endregion
	}
	}