Created
July 11, 2021 17:35
-
-
Save facebookegypt/e59f8df09004351cbc77542d1d10e428 to your computer and use it in GitHub Desktop.
A text extraction renderer that keeps track of relative position of text on page '' * The resultant text will be relatively consistent with the physical layout that most '' * PDF files have on screen.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Namespace LocTextExtraction | |
Public Class LocTextExtractionStrategy | |
Implements ITextExtractionStrategy | |
'* set to true for debugging | |
Private _UndercontentCharacterSpacing = 0 | |
Private _UndercontentHorizontalScaling = 0 | |
Private ThisPdfDocFonts As SortedList(Of String, DocumentFont) | |
Public Shared DUMP_STATE As Boolean = False | |
'* a summary of all found text | |
Private locationalResult As New List(Of TextChunk)() | |
'Creates a new text extraction renderer. | |
Public Sub New() | |
ThisPdfDocFonts = New SortedList(Of String, DocumentFont) | |
End Sub | |
Public Overridable Sub BeginTextBlock() Implements ITextExtractionStrategy.BeginTextBlock | |
End Sub | |
Public Overridable Sub EndTextBlock() Implements ITextExtractionStrategy.EndTextBlock | |
End Sub | |
'@param str | |
'@return true if the string starts with a space character, | |
'false if the string is empty or starts with a non-space character | |
Private Function StartsWithSpace(ByVal str As String) As Boolean | |
If str.Length = 0 Then | |
Return False | |
End If | |
Return str(0) = " "c | |
End Function | |
' * @param str | |
' * @return true if the string ends with a space character, | |
'false if the string is empty or ends with a non-space character | |
Private Function EndsWithSpace(ByVal str As String) As Boolean | |
If str.Length = 0 Then | |
Return False | |
End If | |
Return str(str.Length - 1) = " "c | |
End Function | |
Public Property UndercontentCharacterSpacing | |
Get | |
Return _UndercontentCharacterSpacing | |
End Get | |
Set(ByVal value) | |
_UndercontentCharacterSpacing = value | |
End Set | |
End Property | |
Public Property UndercontentHorizontalScaling | |
Get | |
Return _UndercontentHorizontalScaling | |
End Get | |
Set(ByVal value) | |
_UndercontentHorizontalScaling = value | |
End Set | |
End Property | |
Public Overridable Function GetResultantText() As String Implements ITextExtractionStrategy.GetResultantText | |
If DUMP_STATE Then | |
DumpState() | |
End If | |
locationalResult.Sort() | |
Dim sb As New StringBuilder() | |
Dim lastChunk As TextChunk = Nothing | |
Try | |
For Each chunk As TextChunk In locationalResult | |
If lastChunk Is Nothing Then | |
sb.Append(chunk.text) | |
Else | |
If chunk.SameLine(lastChunk) Then | |
Dim dist As Single = chunk.DistanceFromEndOf(lastChunk) | |
If dist < -chunk.charSpaceWidth Then | |
sb.Append(" "c) | |
' we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space | |
ElseIf dist > chunk.charSpaceWidth / 2.0F AndAlso Not StartsWithSpace(chunk.text) AndAlso Not EndsWithSpace(lastChunk.text) Then | |
sb.Append(" "c) | |
End If | |
sb.Append(chunk.text) | |
Else | |
sb.Append(ControlChars.Lf) | |
sb.Append(chunk.text) | |
End If | |
End If | |
lastChunk = chunk | |
Next | |
Catch ex As Exception | |
MsgBox(ex.Message) | |
End Try | |
Return sb.ToString() | |
End Function | |
Public Function GetTextLocations(ByVal pSearchString As String, ByVal pStrComp As System.StringComparison) As List(Of iTextSharp.text.Rectangle) | |
Dim FoundMatches As New List(Of iTextSharp.text.Rectangle) | |
Dim sb As New StringBuilder() | |
Dim ThisLineChunks As List(Of TextChunk) = New List(Of TextChunk) | |
Dim bStart As Boolean, bEnd As Boolean | |
Dim FirstChunk As TextChunk = Nothing, LastChunk As TextChunk = Nothing | |
Dim sTextInUsedChunks As String = vbNullString | |
For Each chunk As TextChunk In locationalResult | |
If ThisLineChunks.Count > 0 AndAlso Not chunk.SameLine(ThisLineChunks.Last) Then | |
If sb.ToString.IndexOf(pSearchString, pStrComp) > -1 Then | |
Dim sLine As String = sb.ToString | |
'Check how many times the Search String is present in this line: | |
Dim iCount As Integer = 0 | |
Dim lPos As Integer | |
lPos = sLine.IndexOf(pSearchString, 0, pStrComp) | |
Do While lPos > -1 | |
iCount += 1 | |
If lPos + pSearchString.Length > sLine.Length Then Exit Do Else lPos = lPos + pSearchString.Length | |
lPos = sLine.IndexOf(pSearchString, lPos, pStrComp) | |
Loop | |
'Process each match found in this Text line: | |
Dim curPos As Integer = 0 | |
For i As Integer = 1 To iCount | |
Dim sCurrentText As String, iFromChar As Integer, iToChar As Integer | |
iFromChar = sLine.IndexOf(pSearchString, curPos, pStrComp) | |
curPos = iFromChar | |
iToChar = iFromChar + pSearchString.Length - 1 | |
sCurrentText = vbNullString | |
sTextInUsedChunks = vbNullString | |
FirstChunk = Nothing | |
LastChunk = Nothing | |
'Get first and last Chunks corresponding to this match found, from all Chunks in this line | |
For Each chk As TextChunk In ThisLineChunks | |
sCurrentText = sCurrentText & chk.text | |
'Check if we entered the part where we had found a matching String then get this Chunk (First Chunk) | |
If Not bStart AndAlso sCurrentText.Length - 1 >= iFromChar Then | |
FirstChunk = chk | |
bStart = True | |
End If | |
'Keep getting Text from Chunks while we are in the part where the matching String had been found | |
If bStart And Not bEnd Then | |
sTextInUsedChunks = sTextInUsedChunks & chk.text | |
End If | |
'If we get out the matching String part then get this Chunk (last Chunk) | |
If Not bEnd AndAlso sCurrentText.Length - 1 >= iToChar Then | |
LastChunk = chk | |
bEnd = True | |
End If | |
'If we already have first and last Chunks enclosing the Text where our String pSearchString has been found | |
'then it's time to get the rectangle, GetRectangleFromText Function below this Function, there we extract the pSearchString locations | |
If bStart And bEnd Then | |
FoundMatches.Add(GetRectangleFromText(FirstChunk, LastChunk, pSearchString, sTextInUsedChunks, iFromChar, iToChar, pStrComp)) | |
curPos = curPos + pSearchString.Length | |
bStart = False : bEnd = False | |
Exit For | |
End If | |
Next | |
Next | |
End If | |
sb.Clear() | |
ThisLineChunks.Clear() | |
End If | |
ThisLineChunks.Add(chunk) | |
sb.Append(chunk.text) | |
Next | |
Return FoundMatches | |
End Function | |
Private Function GetRectangleFromText(ByVal FirstChunk As TextChunk, ByVal LastChunk As TextChunk, ByVal pSearchString As String, _ | |
ByVal sTextinChunks As String, ByVal iFromChar As Integer, ByVal iToChar As Integer, ByVal pStrComp As System.StringComparison) As iTextSharp.text.Rectangle | |
'There are cases where Chunk contains extra text at begining and end, we don't want this text locations, we need to extract the pSearchString location inside | |
'for these cases we need to crop this String (left and Right), and measure this excedent at left and right, at this point we don't have any direct way to make a | |
'Transformation from text space points to User Space units, the matrix for making this transformation is not accesible from here, so for these special cases when | |
'the String needs to be cropped (Left/Right) We'll interpolate between the width from Text in Chunk (we have this value in User Space units), then i'll measure Text corresponding | |
'to the same String but in Text Space units, finally from the relation betweeenthese 2 values I get the TransformationValue I need to use for all cases | |
'Text Width in User Space Units | |
Dim LineRealWidth As Single = LastChunk.PosRight - FirstChunk.PosLeft | |
'Text Width in Text Units | |
Dim LineTextWidth As Single = GetStringWidth(sTextinChunks, LastChunk.curFontSize, _ | |
LastChunk.charSpaceWidth, _ | |
ThisPdfDocFonts.Values.ElementAt(LastChunk.FontIndex)) | |
'TransformationValue value for Interpolation | |
Dim TransformationValue As Single = LineRealWidth / LineTextWidth | |
'In the worst case, we'll need to crop left and right: | |
Dim iStart As Integer = sTextinChunks.IndexOf(pSearchString, pStrComp) | |
Dim iEnd As Integer = iStart + pSearchString.Length - 1 | |
Dim sLeft As String | |
If iStart = 0 Then sLeft = vbNullString Else sLeft = sTextinChunks.Substring(0, iStart) | |
Dim sRight As String | |
If iEnd = sTextinChunks.Length - 1 Then sRight = vbNullString Else sRight = sTextinChunks.Substring(iEnd + 1, sTextinChunks.Length - iEnd - 1) | |
'Measure cropped Text at left: | |
Dim LeftWidth As Single = 0 | |
If iStart > 0 Then | |
LeftWidth = GetStringWidth(sLeft, LastChunk.curFontSize, _ | |
LastChunk.charSpaceWidth, _ | |
ThisPdfDocFonts.Values.ElementAt(LastChunk.FontIndex)) | |
LeftWidth = LeftWidth * TransformationValue | |
End If | |
'Measure cropped Text at right: | |
Dim RightWidth As Single = 0 | |
If iEnd < sTextinChunks.Length - 1 Then | |
RightWidth = GetStringWidth(sRight, LastChunk.curFontSize, _ | |
LastChunk.charSpaceWidth, _ | |
ThisPdfDocFonts.Values.ElementAt(LastChunk.FontIndex)) | |
RightWidth = RightWidth * TransformationValue | |
End If | |
'LeftWidth is the text width at left we need to exclude, FirstChunk.distParallelStart is the distance to left margin, both together will give us this LeftOffset | |
Dim LeftOffset As Single = FirstChunk.distParallelStart + LeftWidth | |
'RightWidth is the text width at right we need to exclude, FirstChunk.distParallelEnd is the distance to right margin, we substract RightWidth from distParallelEnd to get RightOffset | |
Dim RightOffset As Single = LastChunk.distParallelEnd - RightWidth | |
'Return this Rectangle | |
Return New iTextSharp.text.Rectangle(LeftOffset, FirstChunk.PosBottom, RightOffset, FirstChunk.PosTop) | |
End Function | |
Private Function GetStringWidth(ByVal str As String, ByVal curFontSize As Single, ByVal pSingleSpaceWidth As Single, ByVal pFont As DocumentFont) As Single | |
Dim chars() As Char = str.ToCharArray() | |
Dim totalWidth As Single = 0 | |
Dim w As Single = 0 | |
For Each c As Char In chars | |
w = pFont.GetWidth(c) / 1000 | |
totalWidth += (w * curFontSize + Me.UndercontentCharacterSpacing) * Me.UndercontentHorizontalScaling / 100 | |
Next | |
Return totalWidth | |
End Function | |
Private Sub DumpState() | |
For Each location As TextChunk In locationalResult | |
location.PrintDiagnostics() | |
Console.WriteLine() | |
Next | |
End Sub | |
Public Overridable Sub RenderText(ByVal renderInfo As TextRenderInfo) Implements ITextExtractionStrategy.RenderText | |
Dim segment As LineSegment = renderInfo.GetBaseline() | |
Dim location As New TextChunk(renderInfo.GetText(), segment.GetStartPoint(), segment.GetEndPoint(), renderInfo.GetSingleSpaceWidth()) | |
With location | |
'Chunk Location: | |
Debug.Print(renderInfo.GetText) | |
.PosLeft = renderInfo.GetDescentLine.GetStartPoint(Vector.I1) | |
.PosRight = renderInfo.GetAscentLine.GetEndPoint(Vector.I1) | |
.PosBottom = renderInfo.GetDescentLine.GetStartPoint(Vector.I2) | |
.PosTop = renderInfo.GetAscentLine.GetEndPoint(Vector.I2) | |
'Chunk Font Size: (Height) | |
.curFontSize = .PosTop - segment.GetStartPoint()(Vector.I2) | |
'Use Font name and Size as Key in the SortedList | |
Dim StrKey As String = renderInfo.GetFont.PostscriptFontName & .curFontSize.ToString | |
'Add this font to ThisPdfDocFonts SortedList if it's not already present | |
If Not ThisPdfDocFonts.ContainsKey(StrKey) Then ThisPdfDocFonts.Add(StrKey, renderInfo.GetFont) | |
'Store the SortedList index in this Chunk, so we can get it later | |
.FontIndex = ThisPdfDocFonts.IndexOfKey(StrKey) | |
End With | |
locationalResult.Add(location) | |
End Sub | |
'* | |
' * Represents a chunk of text, it's orientation, and location relative to the orientation vector | |
' | |
Public Class TextChunk | |
Implements IComparable(Of TextChunk) | |
'* the text of the chunk | |
Friend text As [String] | |
'* the starting location of the chunk | |
Friend startLocation As Vector | |
'* the ending location of the chunk | |
Friend endLocation As Vector | |
'* unit vector in the orientation of the chunk | |
Friend orientationVector As Vector | |
'* the orientation as a scalar for quick sorting | |
Friend orientationMagnitude As Integer | |
'* perpendicular distance to the orientation unit vector (i.e. the Y position in an unrotated coordinate system) | |
' * we round to the nearest integer to handle the fuzziness of comparing floats | |
Friend distPerpendicular As Integer | |
'* distance of the start of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) | |
Friend distParallelStart As Single | |
'* distance of the end of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) | |
Friend distParallelEnd As Single | |
'* the width of a single space character in the font of the chunk | |
Friend charSpaceWidth As Single | |
Private _PosLeft As Single | |
Private _PosRight As Single | |
Private _PosTop As Single | |
Private _PosBottom As Single | |
Private _curFontSize As Single | |
Private _FontIndex As Integer | |
Public Property FontIndex As Integer | |
Get | |
Return _FontIndex | |
End Get | |
Set(ByVal value As Integer) | |
_FontIndex = value | |
End Set | |
End Property | |
Public Property PosLeft As Single | |
Get | |
Return _PosLeft | |
End Get | |
Set(ByVal value As Single) | |
_PosLeft = value | |
End Set | |
End Property | |
Public Property PosRight As Single | |
Get | |
Return _PosRight | |
End Get | |
Set(ByVal value As Single) | |
_PosRight = value | |
End Set | |
End Property | |
Public Property PosTop As Single | |
Get | |
Return _PosTop | |
End Get | |
Set(ByVal value As Single) | |
_PosTop = value | |
End Set | |
End Property | |
Public Property PosBottom As Single | |
Get | |
Return _PosBottom | |
End Get | |
Set(ByVal value As Single) | |
_PosBottom = value | |
End Set | |
End Property | |
Public Property curFontSize As Single | |
Get | |
Return _curFontSize | |
End Get | |
Set(ByVal value As Single) | |
_curFontSize = value | |
End Set | |
End Property | |
Public Sub New(ByVal str As [String], ByVal startLocation As Vector, ByVal endLocation As Vector, ByVal charSpaceWidth As Single) | |
Me.text = str | |
Me.startLocation = startLocation | |
Me.endLocation = endLocation | |
Me.charSpaceWidth = charSpaceWidth | |
Dim oVector As Vector = endLocation.Subtract(startLocation) | |
If oVector.Length = 0 Then | |
oVector = New Vector(1, 0, 0) | |
End If | |
orientationVector = oVector.Normalize() | |
orientationMagnitude = CInt(Math.Truncate(Math.Atan2(orientationVector(Vector.I2), orientationVector(Vector.I1)) * 1000)) | |
Dim origin As New Vector(0, 0, 1) | |
distPerpendicular = CInt((startLocation.Subtract(origin)).Cross(orientationVector)(Vector.I3)) | |
distParallelStart = orientationVector.Dot(startLocation) | |
distParallelEnd = orientationVector.Dot(endLocation) | |
End Sub | |
Public Sub PrintDiagnostics() | |
Console.WriteLine("Text (@" & Convert.ToString(startLocation) & " -> " & Convert.ToString(endLocation) & "): " & text) | |
Console.WriteLine("orientationMagnitude: " & orientationMagnitude) | |
Console.WriteLine("distPerpendicular: " & distPerpendicular) | |
Console.WriteLine("distParallel: " & distParallelStart) | |
End Sub | |
'* | |
' * @param as the location to compare to | |
' * @return true is this location is on the the same line as the other | |
' | |
Public Function SameLine(ByVal a As TextChunk) As Boolean | |
If orientationMagnitude <> a.orientationMagnitude Then | |
Return False | |
End If | |
If distPerpendicular <> a.distPerpendicular Then | |
Return False | |
End If | |
Return True | |
End Function | |
'* | |
' * Computes the distance between the end of 'other' and the beginning of this chunk | |
' * in the direction of this chunk's orientation vector. Note that it's a bad idea | |
' * to call this for chunks that aren't on the same line and orientation, but we don't | |
' * explicitly check for that condition for performance reasons. | |
' * @param other | |
' * @return the number of spaces between the end of 'other' and the beginning of this chunk | |
' | |
Public Function DistanceFromEndOf(ByVal other As TextChunk) As Single | |
Dim distance As Single = distParallelStart - other.distParallelEnd | |
Return distance | |
End Function | |
'* | |
' * Compares based on orientation, perpendicular distance, then parallel distance | |
' * @see java.lang.Comparable#compareTo(java.lang.Object) | |
' | |
Public Function CompareTo(ByVal rhs As TextChunk) As Integer Implements System.IComparable(Of TextChunk).CompareTo | |
If Me Is rhs Then | |
Return 0 | |
End If | |
' not really needed, but just in case | |
Dim rslt As Integer | |
rslt = CompareInts(orientationMagnitude, rhs.orientationMagnitude) | |
If rslt <> 0 Then | |
Return rslt | |
End If | |
rslt = CompareInts(distPerpendicular, rhs.distPerpendicular) | |
If rslt <> 0 Then | |
Return rslt | |
End If | |
' note: it's never safe to check floating point numbers for equality, and if two chunks | |
' are truly right on top of each other, which one comes first or second just doesn't matter | |
' so we arbitrarily choose this way. | |
rslt = If(distParallelStart < rhs.distParallelStart, -1, 1) | |
Return rslt | |
End Function | |
'* | |
' * | |
' * @param int1 | |
' * @param int2 | |
' * @return comparison of the two integers | |
' | |
Private Shared Function CompareInts(ByVal int1 As Integer, ByVal int2 As Integer) As Integer | |
Return If(int1 = int2, 0, If(int1 < int2, -1, 1)) | |
End Function | |
End Class | |
'* | |
' * no-op method - this renderer isn't interested in image events | |
' * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo) | |
' * @since 5.0.1 | |
' | |
Public Sub RenderImage(ByVal renderInfo As ImageRenderInfo) Implements IRenderListener.RenderImage | |
' do nothing | |
End Sub | |
End Class | |
End Namespace |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment