Last active
December 21, 2015 10:59
-
-
Save justinAurand/6296086 to your computer and use it in GitHub Desktop.
Run regex on PDF file. Return results to console.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Credit: http://stackoverflow.com/questions/17601176/read-a-pdf-and-find-a-specific-column-to-add-to-a-list
// You'll need to download the iTextSharp dll and add a reference to it.
using System; | |
using System.IO; | |
using System.Text; | |
using System.Collections.Generic; | |
using System.Text.RegularExpressions; | |
using iTextSharp.text.pdf; | |
using iTextSharp.text.pdf.parser; | |
/// <summary>
/// Console utility that extracts the text of a PDF file with iTextSharp and
/// prints every policy number found in that text.
/// </summary>
class RegexOnPDF
{
    // Matches the literal "Policy Number: " followed by "CSA" and exactly eight
    // digits; the identifier is captured in the named group "number".
    // Built once and reused instead of being constructed inside Main.
    private static readonly Regex PolicyNumberRegex =
        new Regex(@"Policy Number: (?<number>CSA\d{8})");

    /// <summary>
    /// Reads C:\SomePDF.pdf, writes each captured policy number to the console
    /// on its own line, then prints "Complete." and waits for a key press.
    /// </summary>
    public static void Main()
    {
        string pdfText = ReadPdfFile(@"C:\SomePDF.pdf");

        foreach (Match match in PolicyNumberRegex.Matches(pdfText))
        {
            Console.WriteLine(match.Groups["number"].Value);
        }

        Console.WriteLine("Complete.");
        Console.ReadKey();
    }

    /// <summary>
    /// Extracts the text of every page of a PDF and returns it as one string.
    /// Page texts are appended back to back with no separator between pages.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to read.</param>
    /// <returns>
    /// The concatenated text of all pages, or an empty string when
    /// <paramref name="fileName"/> does not refer to an existing file.
    /// </returns>
    public static string ReadPdfFile(string fileName)
    {
        // A missing file is not treated as an error: callers get an empty string.
        if (!File.Exists(fileName))
        {
            return string.Empty;
        }

        var stringBuilder = new StringBuilder();
        var pdfReader = new PdfReader(fileName);
        try
        {
            // Pages are addressed starting at 1, up to and including NumberOfPages.
            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                // A new strategy instance is created for every page
                // (presumably because a strategy accumulates the text it has
                // seen - verify against the iTextSharp documentation).
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                // The extracted value is already a .NET string, so it is appended
                // as-is. Re-encoding it through Encoding.Default would replace
                // characters outside the system ANSI code page with '?'.
                stringBuilder.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
            }
        }
        finally
        {
            // Close the reader even when text extraction throws, so the
            // underlying file is not left open.
            pdfReader.Close();
        }

        return stringBuilder.ToString();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment