Created
July 25, 2018 03:21
-
-
Save dburriss/a9a5dec2abde55fe4ab3d8364901e014 to your computer and use it in GitHub Desktop.
Helper F# script file for extracting text from a PDF file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r "packages/PdfSharp/lib/net20/PdfSharp.dll" | |
open PdfSharp.Pdf.IO | |
open System.Text | |
open PdfSharp.Pdf.Content.Objects | |
open PdfSharp.Pdf.Content | |
/// Recursively walks a parsed PDF content object and appends the text it
/// carries to `sb`.
///
/// - Containers (`CArray`, `CSequence`) are traversed child by child.
/// - `CString` values are appended verbatim.
/// - `Tj` / `TJ` (show text) operators have their operands traversed, after
///   which the separator " | " is appended.
/// - Comments, names, numbers and every other operator contribute nothing.
///
/// Raises `System.NotImplementedException` (message: the object's `ToString()`)
/// for any object that matches none of the cases above.
let rec extractText(content:CObject, sb:StringBuilder) =
    // Shared traversal for every node that holds child objects.
    let walk (children: seq<CObject>) =
        for child in children do
            extractText(child, sb)

    match content with
    | :? CArray as items -> walk items
    // Leaf objects that never hold displayable text.
    | :? CComment | :? CInteger | :? CName | :? CNumber -> ()
    | :? COperator as op ->
        let code = op.OpCode.OpCodeName
        // Tj/TJ = Show text; all other operators are skipped.
        if code = OpCodeName.Tj || code = OpCodeName.TJ then
            walk op.Operands
            sb.Append(" | ") |> ignore
    | :? CSequence as items -> walk items
    | :? CString as text -> sb.Append(text.Value) |> ignore
    | other -> raise (System.NotImplementedException(other.ToString()))
/// Opens the PDF at `pdfPath` in read-only mode using `password` and returns
/// the text collected from all of its pages as one string.
///
/// Each page's content is read with `ContentReader.ReadContent` and passed to
/// `extractText`; a line break is appended after every page.
let readAllText password (pdfPath:string) =
    // `use` disposes the document when the function returns.
    use pdf = PdfReader.Open(pdfPath, password, PdfDocumentOpenMode.ReadOnly)
    let buffer = StringBuilder()
    for pdfPage in pdf.Pages do
        extractText(ContentReader.ReadContent(pdfPage), buffer)
        // One output line per page.
        buffer.AppendLine() |> ignore
    buffer.ToString()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment