Skip to content

Instantly share code, notes, and snippets.

@dburriss
Created July 25, 2018 03:21
Show Gist options
  • Save dburriss/a9a5dec2abde55fe4ab3d8364901e014 to your computer and use it in GitHub Desktop.
Save dburriss/a9a5dec2abde55fe4ab3d8364901e014 to your computer and use it in GitHub Desktop.
Helper F# script file for extracting text from a PDF file
#r "packages/PdfSharp/lib/net20/PdfSharp.dll"
open PdfSharp.Pdf.IO
open System.Text
open PdfSharp.Pdf.Content.Objects
open PdfSharp.Pdf.Content
/// Recursively walks a PdfSharp content object and appends the string
/// values it finds to `sb`.
///
/// - Arrays and sequences are traversed element by element.
/// - Tj/TJ (show text) operators have their operands traversed, after
///   which the separator " | " is appended.
/// - Comments, names, numbers and every other operator add nothing.
/// - Any other kind of content object raises NotImplementedException
///   carrying the object's ToString() text.
let rec extractText(content:CObject, sb:StringBuilder) =
    match content with
    // Node types that never contribute text.
    | :? CComment | :? CInteger | :? CName | :? CNumber -> ()
    | :? CString as text -> sb.Append(text.Value) |> ignore
    | :? COperator as op ->
        // Tj/TJ = Show text; all other operators are skipped.
        let code = op.OpCode.OpCodeName
        if code = OpCodeName.Tj || code = OpCodeName.TJ then
            for operand in op.Operands do
                extractText(operand, sb)
            sb.Append(" | ") |> ignore
    | :? CArray as items ->
        for item in items do
            extractText(item, sb)
    | :? CSequence as items ->
        for item in items do
            extractText(item, sb)
    | unknown -> raise <| System.NotImplementedException(unknown.ToString())
/// Opens the PDF at `pdfPath` read-only using `password` and returns the
/// text gathered by `extractText` from every page, with a line break
/// appended after each page's text.
let readAllText password (pdfPath:string) =
    // `use` disposes the document when the function returns.
    use pdf = PdfReader.Open(pdfPath, password, PdfDocumentOpenMode.ReadOnly)
    let buffer = StringBuilder()
    for page in pdf.Pages do
        extractText(ContentReader.ReadContent(page), buffer)
        // NOTE(review): assumes the line break is emitted once per page;
        // the original indentation was lost - confirm against the gist.
        buffer.AppendLine() |> ignore
    buffer.ToString()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment