Created
November 13, 2024 00:01
-
-
Save rodrigorgs/7b40b33806008dabf84ba6e6f5ed0722 to your computer and use it in GitHub Desktop.
Converte extrato de cartão do Banco do Brasil (PDF) para CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html>
<head>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf-lib/1.17.1/pdf-lib.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.10.377/pdf.min.js"></script>
<script>
// pdf.js parses documents in a web worker; tell it where to load the worker
// script from. The version in this URL (2.10.377) matches the pdf.min.js
// build referenced by the <script> tag earlier in this file.
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.10.377/pdf.worker.min.js';
/**
 * Extracts statement rows from the flattened text of one PDF page.
 *
 * A row is recognised as: a date (dd/MM), a description, a two-letter
 * upper-case code (treated as the country column), and an amount made of
 * digits, dots and commas with an optional leading minus sign.
 *
 * Rows whose amount starts with '-' are skipped (presumably payments or
 * credits; confirm against a real statement). The country code is matched
 * only to delimit the description and is not part of the output.
 *
 * @param {string} pageText - All text items of a page joined by single spaces.
 * @returns {string[]} One `date<TAB>description<TAB>value` string per kept
 *   row, in the order the rows appear in `pageText`.
 */
function extractStatementRows(pageText) {
  // A fresh regex per call keeps the stateful `lastIndex` of the `g` flag
  // local to this scan. The trailing lookahead accepts an amount followed by
  // whitespace OR by the end of the text, so a row that ends the page is
  // not lost.
  const rowPattern = /(\d{2}\/\d{2})\s+(.+?)\s+([A-Z]{2})\s+(-?[\d.,]+)(?=\s|$)/g;
  const rows = [];
  let match = rowPattern.exec(pageText);
  while (match !== null) {
    const date = match[1];
    const description = match[2];
    const value = match[4];
    if (value.charAt(0) !== '-') {
      rows.push(`${date}\t${description}\t${value}`);
    }
    match = rowPattern.exec(pageText);
  }
  return rows;
}

/**
 * Reads the PDF selected in the `#pdfUpload` file input, extracts the
 * statement rows from every page and writes them, tab-separated and one per
 * line, into the `#csvOutput` element.
 *
 * The collected rows are reversed before being written, so the output lists
 * them in the opposite order to the one in which they appear in the document.
 *
 * If no file is selected, an alert is shown and nothing else happens.
 *
 * @returns {Promise<void>} Resolves once the output element has been updated;
 *   rejects if pdf.js fails to load or read the document.
 */
async function convertPdfToCsv() {
  const fileInput = document.getElementById('pdfUpload');
  const output = document.getElementById('csvOutput');

  if (!fileInput.files.length) {
    alert('Please upload a PDF file.');
    return;
  }

  const pdfData = await fileInput.files[0].arrayBuffer();
  const pdf = await pdfjsLib.getDocument({ data: pdfData }).promise;

  const csvLines = [];
  // pdf.js page numbers are 1-based.
  for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
    const page = await pdf.getPage(pageNumber);
    const textContent = await page.getTextContent();
    const pageText = textContent.items.map((item) => item.str).join(' ');
    for (const row of extractStatementRows(pageText)) {
      csvLines.push(row);
    }
  }

  csvLines.reverse();
  output.textContent = csvLines.join('\n');
}
</script>
</head>
<body>
<input type="file" id="pdfUpload" accept=".pdf">
<button onclick="convertPdfToCsv()">Convert to CSV</button>
<pre id="csvOutput"></pre>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment