Created
May 13, 2025 02:59
-
-
Save documentprocessing/abe76e3660fa698d30b1a2abc5fe80e7 to your computer and use it in GitHub Desktop.
Text Positional Analysis in .NET
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System;
using System.Linq;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
/// <summary>
/// Heuristic table detector: finds words on each PDF page whose left edges line up
/// and prints every such group to the console as a candidate table column.
/// </summary>
class TableExtractor
{
    /// <summary>
    /// Minimum number of left-aligned words required before a group is reported
    /// as a column (equivalent to the original "more than 3" rule).
    /// </summary>
    private const int MinWordsPerColumn = 4;

    /// <summary>
    /// Width of the bucket used to decide that two left edges are "the same", expressed
    /// in the page's coordinate units (presumably PDF points — confirm against the
    /// documents being processed).
    /// </summary>
    private const double AlignmentTolerance = 1.0;

    /// <summary>
    /// Opens the PDF at <paramref name="filePath"/> and, for every page, writes each
    /// detected column (its X position followed by its words, top to bottom) to
    /// standard output.
    /// </summary>
    /// <param name="filePath">Path of the PDF document to analyze.</param>
    /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="filePath"/> is empty.</exception>
    public void AnalyzeDocument(string filePath)
    {
        if (filePath is null)
        {
            throw new ArgumentNullException(nameof(filePath));
        }

        if (filePath.Length == 0)
        {
            throw new ArgumentException("The PDF file path must not be empty.", nameof(filePath));
        }

        using var document = PdfDocument.Open(filePath);
        foreach (var page in document.GetPages())
        {
            // Detect potential tables by aligned words. Left edges are floating-point
            // values, so grouping on the raw double would almost never put two visually
            // aligned words in the same group; snap each edge to a tolerance-sized bucket.
            // Note: fixed buckets can still split two edges that straddle a bucket boundary.
            var columnGroups = page.GetWords()
                .GroupBy(w => Math.Round(w.BoundingBox.Left / AlignmentTolerance) * AlignmentTolerance)
                .Where(g => g.Count() >= MinWordsPerColumn);

            foreach (var column in columnGroups)
            {
                Console.WriteLine($"Found column at X={column.Key}:");

                // PDF page coordinates grow upwards from the bottom-left corner, so a
                // larger Top means higher on the page; sort descending to read top-down.
                foreach (var word in column.OrderByDescending(w => w.BoundingBox.Top))
                {
                    Console.WriteLine($" {word.Text}");
                }
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment