Last active
March 14, 2018 00:29
-
-
Save matthewjberger/f4f6fc5e56ec78057949bf8c76fbf977 to your computer and use it in GitHub Desktop.
Extract images from a PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Drawing; | |
using System.IO; | |
using iTextSharp.text.pdf; | |
using iTextSharp.text.pdf.parser; | |
using Tesseract; | |
using System.Drawing.Imaging; | |
namespace ExtractInvoice | |
{ | |
class Program | |
{ | |
/// <summary>
/// Height, in pixels, of the strip kept from the top of each extracted image when it is cropped.
/// The header ends at 1600 pixels on all invoices.
/// </summary>
private const int headerHeight_ = 1600;
/// <summary>
/// Program entry point. Saves the images found in three invoice PDFs (a.pdf, b.pdf, c.pdf)
/// into the Images folder, then runs text extraction on the first image saved for c.pdf.
/// If anything throws, only the exception message is written to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // All locations are fixed; the pieces below concatenate to exactly the paths used by each call.
    const string invoiceFolder = @"C:\Users\Berger_MA\Downloads\Invoices";
    const string imageFolder = invoiceFolder + @"\Images";
    const string tessDataPath =
        @"C:\Users\Berger_MA\Documents\visual studio 2015\Projects\ExtractInvoice\tessdata";

    try
    {
        foreach (string pdfFileName in new[] { "a.pdf", "b.pdf", "c.pdf" })
        {
            SaveImages(invoiceFolder + @"\" + pdfFileName, imageFolder);
        }

        ExtractText(imageFolder + @"\c0.jpg", tessDataPath);
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}
#region Methods | |
/// <summary>
/// Extracts the images from a PDF: every stream object whose Subtype entry is Image
/// is decoded and added to the returned list.
/// </summary>
/// <param name="PDFSourcePath">Path of the PDF file to read.</param>
/// <returns>
/// The decoded images, in the order their objects appear in the reader's cross-reference table.
/// Image streams that cannot be decoded are skipped. The caller is responsible for disposing the images.
/// </returns>
/// <remarks>
/// Errors raised while opening or walking the PDF propagate to the caller unchanged,
/// so the original exception type and stack trace are preserved.
/// </remarks>
private static List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
{
    List<System.Drawing.Image> images = new List<System.Drawing.Image>();
    PdfReader reader = null;
    try
    {
        reader = new PdfReader(new RandomAccessFileOrArray(PDFSourcePath), null);
        for (int i = 0; i < reader.XrefSize; i++)
        {
            PdfObject pdfObject = reader.GetPdfObject(i);
            if (pdfObject == null || !pdfObject.IsStream())
            {
                continue;
            }

            PdfStream stream = (PdfStream)pdfObject;
            PdfObject subtype = stream.Get(PdfName.SUBTYPE);
            if (subtype == null || subtype.ToString() != PdfName.IMAGE.ToString())
            {
                continue;
            }

            try
            {
                PdfImageObject imageObject = new PdfImageObject((PRStream)stream);
                images.Add(imageObject.GetDrawingImage());
            }
            catch (Exception ex)
            {
                // Best effort: one undecodable image must not abort the whole extraction,
                // but report it instead of hiding the failure completely.
                Console.Error.WriteLine(
                    "Skipping image object " + i + " in " + PDFSourcePath + ": " + ex.Message);
            }
        }
    }
    finally
    {
        // Close the reader even when reading fails part-way through.
        if (reader != null)
        {
            reader.Close();
        }
    }

    return images;
}
/// <summary>
/// Extracts the images from a PDF and saves each one as a JPEG in <paramref name="outputPath"/>.
/// Before saving, a 930x540 rectangle at the top-left corner is painted white (the logo),
/// the image is sharpened, and it is cropped to its top <c>headerHeight_</c> pixels.
/// </summary>
/// <param name="pathToPdf">Path of the PDF to read. Its file name without extension prefixes every output file.</param>
/// <param name="outputPath">Directory that receives the files; it is created if it does not exist.</param>
/// <remarks>
/// Output files are named <c>{pdf name}{image index}.jpg</c>. A failure while processing one image is
/// reported on the error stream and the remaining images are still processed. Failures while
/// creating the directory or reading the PDF propagate to the caller unchanged.
/// </remarks>
private static void SaveImages(string pathToPdf, string outputPath)
{
    string name = System.IO.Path.GetFileNameWithoutExtension(pathToPdf);

    // CreateDirectory does nothing when the directory already exists.
    Directory.CreateDirectory(outputPath);

    List<System.Drawing.Image> images = ExtractImages(pathToPdf);
    try
    {
        for (int i = 0; i < images.Count; i++)
        {
            try
            {
                string currentName = name + i + ".jpg";
                using (Bitmap page = new Bitmap(images[i]))
                {
                    // White out logo
                    using (Graphics graphics = Graphics.FromImage(page))
                    {
                        graphics.FillRectangle(Brushes.White, 0, 0, 930, 540);
                    }

                    using (Bitmap sharpened = Sharpen(page))
                    {
                        // Never ask for more rows than the image has: Bitmap.Clone throws
                        // when the rectangle extends past the bitmap bounds.
                        int cropHeight = Math.Min(headerHeight_, sharpened.Height);
                        Rectangle cropRect = new Rectangle(0, 0, sharpened.Width, cropHeight);

                        using (Bitmap cropped = sharpened.Clone(cropRect, sharpened.PixelFormat))
                        {
                            cropped.Save(
                                System.IO.Path.Combine(outputPath, currentName),
                                System.Drawing.Imaging.ImageFormat.Jpeg);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep going with the next image, but make the failure visible.
                Console.Error.WriteLine(
                    "Could not save image " + i + " of " + pathToPdf + ": " + ex.Message);
            }
        }
    }
    finally
    {
        // The extracted images are only needed here; release their GDI+ resources.
        foreach (System.Drawing.Image image in images)
        {
            image.Dispose();
        }
    }
}
/// <summary>
/// Runs the Tesseract OCR engine (English) over an image file, writes the recognised text
/// and the page's mean confidence to the console, and returns the text.
/// </summary>
/// <param name="pathToImage">The path to the image to extract text from.</param>
/// <param name="tessDataPath">Data path handed to the Tesseract engine (the "tessdata" directory).</param>
/// <returns>
/// The recognised text. An empty string is returned when recognition fails before any text
/// was obtained; in that case the error message is written to the console.
/// </returns>
/// <remarks>
/// Recognition is restricted to ASCII letters, the digits 0-9 and the characters <c>,-/@</c>.
/// When standard input is interactive the method waits for a key press after printing.
/// </remarks>
private static string ExtractText(string pathToImage, string tessDataPath)
{
    string text = "";
    try
    {
        // Creating the tesseract OCR engine with English as the language
        using (var tEngine = new TesseractEngine(tessDataPath, "eng", EngineMode.Default))
        {
            // The digit list must contain all ten digits; the previous whitelist omitted '6'.
            tEngine.SetVariable(
                "tessedit_char_whitelist",
                "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,-/@");

            using (var img = Pix.LoadFromFile(pathToImage))
            using (var page = tEngine.Process(img))
            {
                text = page.GetText() ?? "";
                Console.WriteLine(text);
                Console.WriteLine(page.GetMeanConfidence());
            }
        }

        // Pause so the output can be read, but only when a key press can actually arrive:
        // Console.ReadKey throws when standard input is redirected.
        if (!Console.IsInputRedirected)
        {
            Console.ReadKey();
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Unexpected Error: " + e.Message);
    }

    return text;
}
/// <summary>
/// Returns a sharpened copy of a bitmap, produced by convolving it with a 3x3 kernel
/// whose centre weight is 9 and whose eight surrounding weights are -1.
/// </summary>
/// <param name="image">The bitmap to sharpen. It is not modified.</param>
/// <returns>A new bitmap of the same size holding the sharpened pixels; the caller owns it.</returns>
/// <exception cref="ArgumentNullException"><paramref name="image"/> is null.</exception>
/// <remarks>
/// Neighbour coordinates wrap around the image edges, so pixels on one border are combined
/// with pixels from the opposite border. Pixel data is read and written as 24-bit RGB.
/// </remarks>
public static Bitmap Sharpen(Bitmap image)
{
    if (image == null)
    {
        throw new ArgumentNullException("image");
    }

    const int filterWidth = 3;
    const int filterHeight = 3;
    const double factor = 1.0;
    const double bias = 0.0;

    // Sharpening kernel: the weights sum to 1, so flat areas keep their colour.
    double[,] filter =
    {
        { -1, -1, -1 },
        { -1,  9, -1 },
        { -1, -1, -1 }
    };

    int width = image.Width;
    int height = image.Height;

    Bitmap sharpenImage = (Bitmap)image.Clone();
    try
    {
        // Lock image bits for read/write.
        BitmapData pbits = sharpenImage.LockBits(
            new Rectangle(0, 0, width, height), ImageLockMode.ReadWrite, PixelFormat.Format24bppRgb);
        try
        {
            int stride = pbits.Stride;
            int bytes = stride * height;

            // "source" stays untouched while "target" receives the results, so every
            // pixel is computed from original neighbours only.
            byte[] source = new byte[bytes];
            System.Runtime.InteropServices.Marshal.Copy(pbits.Scan0, source, 0, bytes);
            byte[] target = (byte[])source.Clone();

            for (int y = 0; y < height; ++y)
            {
                for (int x = 0; x < width; ++x)
                {
                    double red = 0.0, green = 0.0, blue = 0.0;
                    for (int filterX = 0; filterX < filterWidth; filterX++)
                    {
                        for (int filterY = 0; filterY < filterHeight; filterY++)
                        {
                            // Wrap around the edges instead of reading outside the image.
                            int imageX = (x - filterWidth / 2 + filterX + width) % width;
                            int imageY = (y - filterHeight / 2 + filterY + height) % height;
                            int neighbour = imageY * stride + 3 * imageX;
                            double weight = filter[filterX, filterY];

                            // Bytes of a 24bpp pixel are stored blue, green, red.
                            red += source[neighbour + 2] * weight;
                            green += source[neighbour + 1] * weight;
                            blue += source[neighbour + 0] * weight;
                        }
                    }

                    // Clamp once per pixel, after the whole kernel has been accumulated.
                    int pixel = y * stride + 3 * x;
                    target[pixel + 2] = (byte)Math.Min(Math.Max((int)(factor * red + bias), 0), 255);
                    target[pixel + 1] = (byte)Math.Min(Math.Max((int)(factor * green + bias), 0), 255);
                    target[pixel + 0] = (byte)Math.Min(Math.Max((int)(factor * blue + bias), 0), 255);
                }
            }

            // Copy the RGB values back to the bitmap.
            System.Runtime.InteropServices.Marshal.Copy(target, 0, pbits.Scan0, bytes);
        }
        finally
        {
            // Release image bits even if the filtering above fails.
            sharpenImage.UnlockBits(pbits);
        }
    }
    catch
    {
        // The copy never reaches the caller on failure, so release it here.
        sharpenImage.Dispose();
        throw;
    }

    return sharpenImage;
}
#endregion | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @matthewjberger, which version of the iTextSharp library are you using?