Created
January 25, 2022 19:42
-
-
Save jhgbrt/f161badbfdd7c73fb544cdef81e1c32f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using iText.Kernel.Pdf; | |
using iText.Kernel.Pdf.Canvas.Parser; | |
using iText.Kernel.Pdf.Canvas.Parser.Listener; | |
using Net.Code.Csv; | |
using System.Globalization; | |
using System.Text.RegularExpressions; | |
// Entry point. Expects two arguments:
//   args[0] - folder that is scanned for *.pdf files
//   args[1] - output file name, combined with the folder via Path.Combine
if (args.Length < 2)
{
    // Fail with a readable message instead of an IndexOutOfRangeException.
    Console.Error.WriteLine("Usage: <folder> <output-file>");
    Environment.ExitCode = 1;
    return;
}

var folder = args[0];
var file = args[1];

// The same culture is used for parsing the extracted lines and is handed to the CSV writer.
var cultureInfo = CultureInfo.GetCultureInfo("nl-BE");

WriteCsv.ToFile(
    ExtractTransactions(folder, cultureInfo),
    Path.Combine(folder, file),
    delimiter: ';',
    hasHeaders: true,
    cultureInfo: cultureInfo
);
// Runs every text line of every *.pdf file in the folder through Transaction.Parse
// and keeps only the lines that produced a transaction.
static IEnumerable<Transaction> ExtractTransactions(string folder, CultureInfo cultureInfo)
    => Directory.GetFiles(folder, "*.pdf")
        .SelectMany(pdf => ExtractLines(pdf))
        .Select(text => Transaction.Parse(text, cultureInfo))
        .Where(parsed => parsed.HasValue)
        // HasValue was checked by the preceding filter, so Value cannot throw here.
        .Select(parsed => parsed!.Value);
// Lazily yields the text of the given PDF line by line, page after page.
// The reader and document stay open until the enumeration completes or is disposed.
static IEnumerable<string> ExtractLines(string file)
{
    using var reader = new PdfReader(file);
    using var document = new PdfDocument(reader);

    // iText page numbers start at 1.
    var pageNumber = 1;
    while (pageNumber <= document.GetNumberOfPages())
    {
        // A fresh extraction strategy is created for each page.
        var text = PdfTextExtractor.GetTextFromPage(
            document.GetPage(pageNumber),
            new SimpleTextExtractionStrategy());

        using (var lines = new StringReader(text))
        {
            // ReadLine returns null once the page text is exhausted.
            while (lines.ReadLine() is { } line)
            {
                yield return line;
            }
        }

        pageNumber++;
    }
}
/// <summary>
/// One extracted line: a date (<c>Datum</c>), a description (<c>Omschrijving</c>)
/// and an amount (<c>Bedrag</c>).
/// </summary>
readonly record struct Transaction([CsvFormat("yyyy-MM-dd")]DateTime Datum, string Omschrijving, decimal Bedrag)
{
    // "<dd/MM/yyyy> <description> <amount>", where the amount ends the line, may be negative,
    // may contain '.' characters and always has a ',' followed by exactly two digits.
    // Compiled because it is matched against every extracted line.
    private static readonly Regex regex = new(
        @"(?<date>\d\d/\d\d/\d\d\d\d) (?<description>.*) (?<amount>-?[.\d]+,\d{2})$",
        RegexOptions.Compiled);

    /// <summary>
    /// Tries to interpret <paramref name="s"/> as a transaction line.
    /// </summary>
    /// <param name="s">A single line of text.</param>
    /// <param name="cultureInfo">Culture used to parse the date and the amount.</param>
    /// <returns>
    /// The parsed transaction, or <c>null</c> when the line does not match the expected
    /// shape or when its date or amount is not a valid value (for example <c>99/99/2022</c>).
    /// </returns>
    public static Transaction? Parse(string s, CultureInfo cultureInfo)
    {
        var m = regex.Match(s);
        if (!m.Success)
        {
            return null;
        }

        // The slashes are quoted: an unquoted '/' stands for the culture's date separator,
        // which is not necessarily the literal '/' the regex requires.
        if (!DateTime.TryParseExact(
                m.Groups["date"].Value,
                "dd'/'MM'/'yyyy",
                cultureInfo,
                DateTimeStyles.None,
                out var date))
        {
            return null;
        }

        // NumberStyles.Number is the style decimal.Parse(string, IFormatProvider) uses by default.
        if (!decimal.TryParse(m.Groups["amount"].Value, NumberStyles.Number, cultureInfo, out var amount))
        {
            return null;
        }

        return new Transaction(date, m.Groups["description"].Value, amount);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- Console application project for the PDF-to-CSV extractor. -->
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <!-- The program uses Path, Directory and LINQ without explicit using directives. -->
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>
  <ItemGroup>
    <!-- PDF reading and text extraction (iText.Kernel.* namespaces). -->
    <PackageReference Include="itext7" Version="7.2.1" />
    <!-- CSV writing (WriteCsv, CsvFormat). -->
    <PackageReference Include="Net.Code.Csv" Version="5.0.0-preview.1" />
  </ItemGroup>
</Project>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Parses all PDF files in a folder, extracts the lines of the form [dd/MM/yyyy] [description] [amount], and writes them to a semicolon-delimited CSV file.