Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save documentprocessing/abe76e3660fa698d30b1a2abc5fe80e7 to your computer and use it in GitHub Desktop.

Select an option

Save documentprocessing/abe76e3660fa698d30b1a2abc5fe80e7 to your computer and use it in GitHub Desktop.
Text Positional Analysis in .NET
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
/// <summary>
/// Heuristically detects table-like columns in a PDF by looking for words
/// whose left edges line up vertically, and prints each detected column.
/// </summary>
class TableExtractor
{
    // Width of the bucket used to decide that two left edges are "aligned",
    // expressed in the same units as Word.BoundingBox (PDF user-space units).
    // Exact equality on doubles almost never matches for real documents because
    // glyph positions carry small fractional differences.
    private const double ColumnTolerance = 2.0;

    // A run of aligned words must contain at least this many entries before it
    // is reported as a potential column (same threshold as the former "> 3").
    private const int MinWordsPerColumn = 4;

    /// <summary>
    /// Opens the PDF at <paramref name="filePath"/> and, for every page, writes
    /// to the console each group of left-aligned words that looks like a column.
    /// </summary>
    /// <param name="filePath">Path of the PDF document to analyze.</param>
    public void AnalyzeDocument(string filePath)
    {
        using var document = PdfDocument.Open(filePath);

        foreach (var page in document.GetPages())
        {
            // Snap each word's left edge to the nearest tolerance bucket so that
            // nearly-aligned words share a grouping key. Words that straddle a
            // bucket boundary can still be split; this is a heuristic, not a
            // full clustering pass.
            var columnGroups = page.GetWords()
                .GroupBy(w => Math.Round(w.BoundingBox.Left / ColumnTolerance) * ColumnTolerance)
                .Where(g => g.Count() >= MinWordsPerColumn);

            foreach (var column in columnGroups)
            {
                Console.WriteLine($"Found column at X={column.Key}:");

                // PDF page coordinates have their origin at the bottom-left, so a
                // larger Top value is higher on the page. Sort descending to list
                // the words in reading order (top of the page first).
                foreach (var word in column.OrderByDescending(w => w.BoundingBox.Top))
                {
                    Console.WriteLine($" {word.Text}");
                }
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment