Created
July 24, 2025 01:35
-
-
Save celsowm/94f08b37446c383558781ff8470c2e44 to your computer and use it in GitHub Desktop.
Js PDF Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// PDFParser.js - Parser de PDF com exportação para HTML | |
class PDFParser { | |
constructor() { | |
this.buffer = null; | |
this.position = 0; | |
this.objects = new Map(); | |
this.trailer = {}; | |
this.xref = {}; | |
this.pages = []; | |
this.fonts = new Map(); | |
this.graphicsState = {}; | |
this.images = new Map(); // Armazena informações de imagens | |
} | |
/** | |
* Carrega o PDF a partir de um ArrayBuffer | |
*/ | |
async loadPDF(arrayBuffer) { | |
this.buffer = new Uint8Array(arrayBuffer); | |
this.position = 0; | |
try { | |
await this.parsePDF(); | |
await this.extractPagesWithFullLayout(); | |
await this.extractImages(); // Extrai informações de imagens | |
return this; | |
} catch (error) { | |
throw new Error(`Erro ao parsear PDF: ${error.message}`); | |
} | |
} | |
/** | |
* Parseia o PDF completo | |
*/ | |
async parsePDF() { | |
if (!this.isValidPDF()) { | |
throw new Error('Arquivo não é um PDF válido'); | |
} | |
this.skipHeader(); | |
await this.parseXRef(); | |
this.parseTrailer(); | |
await this.parseObjects(); | |
} | |
/** | |
* Extrai páginas com layout completo | |
*/ | |
async extractPagesWithFullLayout() { | |
const rootRef = this.trailer.Root; | |
if (!rootRef) return; | |
const rootId = rootRef.split(' ')[0]; | |
const rootObj = this.getObject(rootId); | |
if (!rootObj || !rootObj.content) return; | |
const pagesRef = rootObj.content.Pages; | |
if (!pagesRef) return; | |
const pagesId = pagesRef.split(' ')[0]; | |
const pagesObj = this.getObject(pagesId); | |
if (!pagesObj || !pagesObj.content) return; | |
await this.extractPageFullLayouts(pagesObj.content.Kids || []); | |
} | |
/** | |
* Extrai layouts completos das páginas | |
*/ | |
async extractPageFullLayouts(pageRefs) { | |
for (const pageRef of pageRefs) { | |
const pageId = pageRef.split(' ')[0]; | |
const pageObj = this.getObject(pageId); | |
if (pageObj && pageObj.content) { | |
const pageLayout = await this.extractPageFullLayout(pageObj.content, pageObj.content.Resources || {}); | |
this.pages.push(pageLayout); | |
} | |
} | |
} | |
/** | |
* Extrai layout completo da página | |
*/ | |
async extractPageFullLayout(pageContent, resources) { | |
const pageLayout = { | |
number: this.pages.length + 1, | |
size: this.getPageSize(pageContent), | |
textElements: [], | |
graphics: [], | |
images: [], | |
transformations: [], | |
tables: [], | |
mediaBox: pageContent.MediaBox, | |
cropBox: pageContent.CropBox, | |
resources: resources | |
}; | |
if (resources && resources.Font) { | |
await this.loadPageFonts(resources.Font); | |
} | |
if (pageContent.Contents) { | |
const contentElements = await this.extractContentWithFullLayout(pageContent.Contents, resources); | |
pageLayout.textElements = contentElements.textElements || []; | |
pageLayout.graphics = contentElements.graphics || []; | |
pageLayout.images = contentElements.images || []; | |
pageLayout.transformations = contentElements.transformations || []; | |
// Detecta tabelas | |
pageLayout.tables = this.detectTables( | |
contentElements.textElements || [], | |
contentElements.graphics || [] | |
); | |
} | |
return pageLayout; | |
} | |
/** | |
* Extrai conteúdo com layout completo | |
*/ | |
async extractContentWithFullLayout(contentsRef, resources) { | |
const result = { | |
textElements: [], | |
graphics: [], | |
images: [], | |
transformations: [] | |
}; | |
try { | |
if (Array.isArray(contentsRef)) { | |
for (const ref of contentsRef) { | |
const contentData = await this.extractContentStreamWithLayout(ref, resources); | |
result.textElements.push(...contentData.textElements); | |
result.graphics.push(...contentData.graphics); | |
result.images.push(...contentData.images); | |
result.transformations.push(...contentData.transformations); | |
} | |
} else { | |
const contentData = await this.extractContentStreamWithLayout(contentsRef, resources); | |
result.textElements = contentData.textElements; | |
result.graphics = contentData.graphics; | |
result.images = contentData.images; | |
result.transformations = contentData.transformations; | |
} | |
} catch (error) { | |
console.warn('Erro ao extrair conteúdo com layout:', error.message); | |
} | |
return result; | |
} | |
/** | |
* Extrai informações de imagens do PDF | |
*/ | |
async extractImages() { | |
// Procura por objetos XObject do tipo imagem | |
for (const [objId, obj] of this.objects) { | |
if (obj.content && obj.content.Subtype === 'Image') { | |
this.images.set(objId, { | |
id: objId, | |
content: obj.content, | |
width: obj.content.Width, | |
height: obj.content.Height, | |
filter: obj.content.Filter, | |
colorSpace: obj.content.ColorSpace, | |
bitsPerComponent: obj.content.BitsPerComponent | |
}); | |
} | |
} | |
} | |
/** | |
* Exporta o PDF inteiro como HTML | |
*/ | |
exportToHTML(options = {}) { | |
const { | |
includeStyles = true, | |
includeMetadata = true, | |
preserveLayout = true, | |
includeImages = true, | |
includeTables = true | |
} = options; | |
let html = '<!DOCTYPE html>\n<html lang="pt-br">\n<head>\n'; | |
html += '<meta charset="UTF-8">\n'; | |
html += '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n'; | |
html += '<title>Documento PDF Convertido</title>\n'; | |
if (includeStyles) { | |
html += this.generateHTMLStyles(); | |
} | |
if (includeMetadata) { | |
const info = this.getDocumentInfo(); | |
if (info && info.Title) { | |
html += `<meta name="title" content="${this.escapeHtml(info.Title)}">\n`; | |
} | |
if (info && info.Author) { | |
html += `<meta name="author" content="${this.escapeHtml(info.Author)}">\n`; | |
} | |
} | |
html += '</head>\n<body>\n'; | |
if (includeMetadata) { | |
html += this.generateHTMLMetadata(); | |
} | |
// Adiciona conteúdo de todas as páginas | |
for (let i = 1; i <= this.getPageCount(); i++) { | |
html += this.exportPageToHTML(i, { | |
preserveLayout, | |
includeImages, | |
includeTables | |
}); | |
} | |
html += '</body>\n</html>'; | |
return html; | |
} | |
/** | |
* Exporta uma página específica como HTML | |
*/ | |
exportPageToHTML(pageNumber, options = {}) { | |
const { | |
preserveLayout = true, | |
includeImages = true, | |
includeTables = true | |
} = options; | |
const page = this.getPageLayout(pageNumber); | |
let html = `<div class="pdf-page" data-page="${pageNumber}" `; | |
if (preserveLayout) { | |
html += `style="width: ${page.size.width}px; height: ${page.size.height}px; position: relative;"`; | |
} | |
html += '>\n'; | |
// Adiciona tabelas primeiro (se incluídas) | |
if (includeTables && page.tables && page.tables.length > 0) { | |
html += this.exportTablesToHTML(pageNumber); | |
} | |
// Adiciona elementos de texto | |
html += this.exportTextElementsToHTML(pageNumber, preserveLayout); | |
// Adiciona imagens | |
if (includeImages && page.images && page.images.length > 0) { | |
html += this.exportImagesToHTML(pageNumber); | |
} | |
// Adiciona elementos gráficos | |
html += this.exportGraphicsToHTML(pageNumber); | |
html += '</div>\n'; | |
return html; | |
} | |
/** | |
* Gera estilos CSS para o HTML | |
*/ | |
generateHTMLStyles() { | |
return ` | |
<style> | |
body { | |
font-family: Arial, sans-serif; | |
margin: 0; | |
padding: 20px; | |
background-color: #f5f5f5; | |
} | |
.pdf-document { | |
max-width: 100%; | |
margin: 0 auto; | |
} | |
.pdf-metadata { | |
background-color: #fff; | |
padding: 20px; | |
margin-bottom: 20px; | |
border-radius: 8px; | |
box-shadow: 0 2px 4px rgba(0,0,0,0.1); | |
} | |
.pdf-page { | |
background-color: white; | |
margin: 20px auto; | |
box-shadow: 0 4px 8px rgba(0,0,0,0.1); | |
border: 1px solid #ddd; | |
} | |
.pdf-text { | |
position: absolute; | |
white-space: nowrap; | |
cursor: text; | |
} | |
.pdf-table { | |
border-collapse: collapse; | |
margin: 10px; | |
position: absolute; | |
} | |
.pdf-table-cell { | |
border: 1px solid #ddd; | |
padding: 4px 8px; | |
vertical-align: top; | |
} | |
.pdf-image { | |
position: absolute; | |
max-width: 100%; | |
height: auto; | |
} | |
.pdf-line { | |
position: absolute; | |
background-color: #000; | |
} | |
.pdf-rectangle { | |
position: absolute; | |
border: 1px solid #000; | |
} | |
.page-number { | |
text-align: center; | |
color: #666; | |
font-size: 12px; | |
margin-top: 10px; | |
} | |
@media print { | |
body { | |
background-color: white; | |
} | |
.pdf-page { | |
box-shadow: none; | |
border: none; | |
margin: 0; | |
page-break-after: always; | |
} | |
} | |
</style> | |
`; | |
} | |
/** | |
* Gera metadados HTML | |
*/ | |
generateHTMLMetadata() { | |
const info = this.getDocumentInfo(); | |
if (!info) return ''; | |
let html = '<div class="pdf-metadata">\n'; | |
html += '<h2>Metadados do Documento</h2>\n'; | |
if (info.Title) { | |
html += `<p><strong>Título:</strong> ${this.escapeHtml(info.Title)}</p>\n`; | |
} | |
if (info.Author) { | |
html += `<p><strong>Autor:</strong> ${this.escapeHtml(info.Author)}</p>\n`; | |
} | |
if (info.Subject) { | |
html += `<p><strong>Assunto:</strong> ${this.escapeHtml(info.Subject)}</p>\n`; | |
} | |
if (info.Creator) { | |
html += `<p><strong>Criador:</strong> ${this.escapeHtml(info.Creator)}</p>\n`; | |
} | |
if (info.Producer) { | |
html += `<p><strong>Produtor:</strong> ${this.escapeHtml(info.Producer)}</p>\n`; | |
} | |
if (info.CreationDate) { | |
html += `<p><strong>Data de Criação:</strong> ${this.escapeHtml(info.CreationDate)}</p>\n`; | |
} | |
html += `<p><strong>Número de Páginas:</strong> ${this.getPageCount()}</p>\n`; | |
html += '</div>\n'; | |
return html; | |
} | |
/** | |
* Exporta elementos de texto como HTML | |
*/ | |
exportTextElementsToHTML(pageNumber, preserveLayout) { | |
const page = this.getPageLayout(pageNumber); | |
let html = ''; | |
if (preserveLayout) { | |
// Texto com posicionamento preciso | |
const textElements = [...page.textElements].sort((a, b) => { | |
// Ordena por Y (de cima para baixo) e depois por X | |
if (Math.abs(b.position.y - a.position.y) > 5) { | |
return a.position.y - b.position.y; | |
} | |
return a.position.x - b.position.x; | |
}); | |
for (const element of textElements) { | |
const style = this.getTextElementStyle(element, page.size); | |
html += `<div class="pdf-text" style="${style}">`; | |
html += this.escapeHtml(element.text); | |
html += '</div>\n'; | |
} | |
} else { | |
// Texto como fluxo normal | |
const plainText = this.getPagePlainText(pageNumber); | |
const lines = plainText.split('\n'); | |
for (const line of lines) { | |
if (line.trim()) { | |
html += `<p>${this.escapeHtml(line)}</p>\n`; | |
} | |
} | |
} | |
return html; | |
} | |
/** | |
* Obtém estilo CSS para elemento de texto | |
*/ | |
getTextElementStyle(element, pageSize) { | |
const styles = []; | |
// Posicionamento | |
styles.push(`left: ${element.position.x}px`); | |
styles.push(`top: ${element.position.y}px`); | |
// Fonte | |
if (element.fontSize) { | |
styles.push(`font-size: ${element.fontSize}px`); | |
} | |
if (element.font) { | |
// Mapeia fontes PDF para CSS | |
const fontFamily = this.mapPDFFontToCSS(element.font); | |
styles.push(`font-family: ${fontFamily}`); | |
} | |
// Cor | |
if (element.color && element.color.r !== undefined) { | |
const color = `rgb(${element.color.r}, ${element.color.g}, ${element.color.b})`; | |
styles.push(`color: ${color}`); | |
} | |
return styles.join('; '); | |
} | |
/** | |
* Mapeia fontes PDF para CSS | |
*/ | |
mapPDFFontToCSS(pdfFont) { | |
const fontMap = { | |
'Helvetica': 'Helvetica, Arial, sans-serif', | |
'Times-Roman': 'Times New Roman, serif', | |
'Courier': 'Courier New, monospace', | |
'Arial': 'Arial, sans-serif', | |
'Times': 'Times New Roman, serif' | |
}; | |
return fontMap[pdfFont] || 'sans-serif'; | |
} | |
/** | |
* Exporta tabelas como HTML | |
*/ | |
exportTablesToHTML(pageNumber) { | |
const tables = this.getPageTables(pageNumber); | |
let html = ''; | |
tables.forEach((table, index) => { | |
const style = `left: ${table.bounds.x}px; top: ${table.bounds.y}px;`; | |
html += `<table class="pdf-table" style="${style}" data-table-index="${index}">\n`; | |
table.structure.forEach((row, rowIndex) => { | |
html += '<tr>\n'; | |
row.forEach((cell, cellIndex) => { | |
const cellStyle = cell.bounds ? | |
`width: ${cell.bounds.width}px;` : ''; | |
html += `<td class="pdf-table-cell" style="${cellStyle}">`; | |
html += this.escapeHtml(cell.text || ''); | |
html += '</td>\n'; | |
}); | |
html += '</tr>\n'; | |
}); | |
html += '</table>\n'; | |
}); | |
return html; | |
} | |
/** | |
* Exporta imagens como HTML | |
*/ | |
exportImagesToHTML(pageNumber) { | |
const page = this.getPageLayout(pageNumber); | |
let html = ''; | |
// Para simplificação, cria placeholders para imagens | |
// Em implementação real, seria necessário extrair os dados binários | |
page.images.forEach((image, index) => { | |
const style = image.position ? | |
`left: ${image.position.x}px; top: ${image.position.y}px;` : ''; | |
html += `<div class="pdf-image" style="${style}" data-image-index="${index}">`; | |
html += `[Imagem ${index + 1}]`; // Placeholder | |
html += '</div>\n'; | |
}); | |
return html; | |
} | |
/** | |
* Exporta elementos gráficos como HTML | |
*/ | |
exportGraphicsToHTML(pageNumber) { | |
const page = this.getPageLayout(pageNumber); | |
let html = ''; | |
page.graphics.forEach((graphic, index) => { | |
if (graphic.type === 'lineTo' && graphic.x !== undefined && graphic.y !== undefined) { | |
// Linhas simples (simplificadas) | |
html += `<div class="pdf-line" data-graphic-index="${index}" `; | |
html += `style="left: ${graphic.x}px; top: ${graphic.y}px; width: 100px; height: 1px;"></div>\n`; | |
} else if (graphic.type === 'rectangle' && graphic.width && graphic.height) { | |
// Retângulos | |
const style = `left: ${graphic.x}px; top: ${graphic.y}px; width: ${graphic.width}px; height: ${graphic.height}px;`; | |
html += `<div class="pdf-rectangle" style="${style}" data-graphic-index="${index}"></div>\n`; | |
} | |
}); | |
return html; | |
} | |
/** | |
* Exporta como HTML simplificado (sem posicionamento absoluto) | |
*/ | |
exportToSimpleHTML(options = {}) { | |
const { | |
includeMetadata = true, | |
includeTables = true | |
} = options; | |
let html = '<!DOCTYPE html>\n<html lang="pt-br">\n<head>\n'; | |
html += '<meta charset="UTF-8">\n'; | |
html += '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n'; | |
html += '<title>Documento PDF Convertido</title>\n'; | |
html += '<style>\n'; | |
html += ` | |
body { font-family: Arial, sans-serif; margin: 20px; line-height: 1.6; } | |
.page { margin-bottom: 40px; padding: 20px; border: 1px solid #ddd; } | |
.page-header { text-align: center; color: #666; margin-bottom: 20px; } | |
table { border-collapse: collapse; margin: 10px 0; } | |
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; } | |
th { background-color: #f2f2f2; } | |
`; | |
html += '</style>\n</head>\n<body>\n'; | |
if (includeMetadata) { | |
html += this.generateHTMLMetadata(); | |
} | |
html += '<div class="pdf-document">\n'; | |
for (let i = 1; i <= this.getPageCount(); i++) { | |
html += `<div class="page">\n`; | |
html += `<div class="page-header">Página ${i}</div>\n`; | |
// Texto da página | |
const plainText = this.getPagePlainText(i); | |
const lines = plainText.split('\n'); | |
lines.forEach(line => { | |
if (line.trim()) { | |
html += `<p>${this.escapeHtml(line)}</p>\n`; | |
} | |
}); | |
// Tabelas da página | |
if (includeTables) { | |
const tables = this.getPageTables(i); | |
tables.forEach((table, tableIndex) => { | |
html += '<table>\n'; | |
table.structure.forEach((row, rowIndex) => { | |
html += '<tr>\n'; | |
row.forEach(cell => { | |
const tag = rowIndex === 0 ? 'th' : 'td'; | |
html += `<${tag}>${this.escapeHtml(cell.text || '')}</${tag}>\n`; | |
}); | |
html += '</tr>\n'; | |
}); | |
html += '</table>\n'; | |
}); | |
} | |
html += '</div>\n'; | |
} | |
html += '</div>\n</body>\n</html>'; | |
return html; | |
} | |
/** | |
* Exporta como HTML com suporte a responsividade | |
*/ | |
exportToResponsiveHTML(options = {}) { | |
const { | |
maxWidth = 800, | |
includeStyles = true, | |
includeMetadata = true | |
} = options; | |
let html = '<!DOCTYPE html>\n<html lang="pt-br">\n<head>\n'; | |
html += '<meta charset="UTF-8">\n'; | |
html += '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n'; | |
html += '<title>Documento PDF Convertido</title>\n'; | |
if (includeStyles) { | |
html += this.generateResponsiveStyles(maxWidth); | |
} | |
html += '</head>\n<body>\n'; | |
if (includeMetadata) { | |
html += this.generateHTMLMetadata(); | |
} | |
html += '<div class="pdf-document">\n'; | |
for (let i = 1; i <= this.getPageCount(); i++) { | |
html += `<section class="pdf-page" data-page="${i}">\n`; | |
html += `<header class="page-header">Página ${i}</header>\n`; | |
// Conteúdo da página | |
const plainText = this.getPagePlainText(i); | |
const lines = plainText.split('\n'); | |
lines.forEach(line => { | |
if (line.trim()) { | |
html += `<p>${this.escapeHtml(line)}</p>\n`; | |
} | |
}); | |
// Tabelas | |
const tables = this.getPageTables(i); | |
tables.forEach(table => { | |
html += '<div class="table-container">\n'; | |
html += '<table>\n'; | |
table.structure.forEach((row, rowIndex) => { | |
const tag = rowIndex === 0 ? 'thead' : 'tbody'; | |
if (rowIndex === 0) html += '<thead>\n'; | |
else if (rowIndex === 1) html += '<tbody>\n'; | |
html += '<tr>\n'; | |
row.forEach(cell => { | |
const cellTag = rowIndex === 0 ? 'th' : 'td'; | |
html += `<${cellTag}>${this.escapeHtml(cell.text || '')}</${cellTag}>\n`; | |
}); | |
html += '</tr>\n'; | |
if (rowIndex === 0) html += '</thead>\n'; | |
}); | |
html += '</tbody>\n</table>\n</div>\n'; | |
}); | |
html += '</section>\n'; | |
} | |
html += '</div>\n</body>\n</html>'; | |
return html; | |
} | |
/** | |
* Gera estilos responsivos | |
*/ | |
generateResponsiveStyles(maxWidth) { | |
return ` | |
<style> | |
body { | |
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; | |
margin: 0; | |
padding: 20px; | |
background-color: #f8f9fa; | |
color: #333; | |
line-height: 1.6; | |
} | |
.pdf-document { | |
max-width: ${maxWidth}px; | |
margin: 0 auto; | |
background: white; | |
border-radius: 8px; | |
box-shadow: 0 2px 10px rgba(0,0,0,0.1); | |
overflow: hidden; | |
} | |
.pdf-metadata { | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
color: white; | |
padding: 30px; | |
margin-bottom: 0; | |
} | |
.pdf-metadata h2 { | |
margin-top: 0; | |
color: white; | |
} | |
.pdf-metadata p { | |
margin: 10px 0; | |
opacity: 0.9; | |
} | |
.pdf-page { | |
padding: 30px; | |
border-bottom: 1px solid #eee; | |
} | |
.pdf-page:last-child { | |
border-bottom: none; | |
} | |
.page-header { | |
text-align: center; | |
color: #666; | |
font-size: 1.2em; | |
margin-bottom: 20px; | |
padding-bottom: 10px; | |
border-bottom: 2px solid #eee; | |
} | |
p { | |
margin: 0 0 15px 0; | |
text-align: justify; | |
} | |
.table-container { | |
overflow-x: auto; | |
margin: 20px 0; | |
} | |
table { | |
width: 100%; | |
border-collapse: collapse; | |
margin: 10px 0; | |
background: white; | |
box-shadow: 0 1px 3px rgba(0,0,0,0.1); | |
} | |
th, td { | |
padding: 12px 15px; | |
text-align: left; | |
border: 1px solid #ddd; | |
} | |
th { | |
background: #f8f9fa; | |
font-weight: 600; | |
color: #555; | |
} | |
tbody tr:nth-child(even) { | |
background-color: #f8f9fa; | |
} | |
tbody tr:hover { | |
background-color: #e9ecef; | |
} | |
@media (max-width: 768px) { | |
body { | |
padding: 10px; | |
} | |
.pdf-document { | |
border-radius: 0; | |
} | |
.pdf-page { | |
padding: 20px 15px; | |
} | |
th, td { | |
padding: 8px 10px; | |
font-size: 0.9em; | |
} | |
.pdf-metadata { | |
padding: 20px 15px; | |
} | |
} | |
@media print { | |
body { | |
background: white; | |
padding: 0; | |
} | |
.pdf-document { | |
box-shadow: none; | |
border-radius: 0; | |
} | |
.pdf-page { | |
page-break-inside: avoid; | |
} | |
} | |
</style> | |
`; | |
} | |
/** | |
* Escapa caracteres HTML | |
*/ | |
escapeHtml(text) { | |
if (typeof text !== 'string') return ''; | |
return text | |
.replace(/&/g, '&') | |
.replace(/</g, '<') | |
.replace(/>/g, '>') | |
.replace(/"/g, '"') | |
.replace(/'/g, '''); | |
} | |
// Métodos de detecção de tabelas (mantidos do código anterior) | |
detectTables(textElements, graphicsElements) { | |
const tables = []; | |
// Detecção baseada em texto | |
const textBasedTables = this.detectTextBasedTables(textElements); | |
tables.push(...textBasedTables); | |
// Detecção baseada em gráficos | |
const graphicBasedTables = this.detectGraphicBasedTables(graphicsElements, textElements); | |
tables.push(...graphicBasedTables); | |
return tables; | |
} | |
detectTextBasedTables(textElements) { | |
const tables = []; | |
const lines = this.groupTextIntoLines(textElements); | |
const patterns = this.findTablePatterns(lines); | |
for (const pattern of patterns) { | |
const table = this.createTextBasedTable(pattern); | |
if (table) { | |
tables.push(table); | |
} | |
} | |
return tables; | |
} | |
detectGraphicBasedTables(graphicsElements, textElements) { | |
const tables = []; | |
const tableLines = this.findTableLines(graphicsElements); | |
const lineGroups = this.groupLinesIntoTables(tableLines); | |
for (const lineGroup of lineGroups) { | |
const table = this.analyzeTableStructure(lineGroup, textElements); | |
if (table) { | |
tables.push(table); | |
} | |
} | |
return tables; | |
} | |
// Métodos auxiliares para tabelas (mantidos do código anterior) | |
groupTextIntoLines(textElements) { | |
const lines = []; | |
const grouped = new Set(); | |
const sortedElements = [...textElements].sort((a, b) => b.position.y - a.position.y); | |
for (let i = 0; i < sortedElements.length; i++) { | |
if (grouped.has(i)) continue; | |
const currentLine = [sortedElements[i]]; | |
grouped.add(i); | |
const baseY = sortedElements[i].position.y; | |
for (let j = i + 1; j < sortedElements.length; j++) { | |
if (grouped.has(j)) continue; | |
const elementY = sortedElements[j].position.y; | |
if (Math.abs(elementY - baseY) < 5) { | |
currentLine.push(sortedElements[j]); | |
grouped.add(j); | |
} | |
} | |
currentLine.sort((a, b) => a.position.x - b.position.x); | |
lines.push(currentLine); | |
} | |
return lines; | |
} | |
findTablePatterns(lines) { | |
const patterns = []; | |
const minTableLines = 3; | |
for (let i = 0; i <= lines.length - minTableLines; i++) { | |
const potentialTable = lines.slice(i, i + minTableLines); | |
if (this.isTablePattern(potentialTable)) { | |
let endIndex = i + minTableLines; | |
while (endIndex < lines.length && | |
this.extendsTablePattern(potentialTable, lines[endIndex])) { | |
potentialTable.push(lines[endIndex]); | |
endIndex++; | |
} | |
patterns.push({ | |
startIndex: i, | |
endIndex: endIndex - 1, | |
lines: potentialTable | |
}); | |
i = endIndex - 1; | |
} | |
} | |
return patterns; | |
} | |
isTablePattern(lines) { | |
if (lines.length < 3) return false; | |
const alignments = lines.map(line => this.getLineAlignments(line)); | |
let similarAlignments = 0; | |
for (let i = 1; i < alignments.length; i++) { | |
if (this.alignmentsAreSimilar(alignments[0], alignments[i])) { | |
similarAlignments++; | |
} | |
} | |
return similarAlignments >= Math.floor(lines.length * 0.7); | |
} | |
extendsTablePattern(pattern, line) { | |
if (!line) return false; | |
const patternAlignments = this.getLineAlignments(pattern[0]); | |
const lineAlignments = this.getLineAlignments(line); | |
return this.alignmentsAreSimilar(patternAlignments, lineAlignments); | |
} | |
getLineAlignments(line) { | |
return line.map(element => Math.round(element.position.x / 10) * 10); | |
} | |
alignmentsAreSimilar(alignments1, alignments2) { | |
if (alignments1.length === 0 || alignments2.length === 0) return false; | |
const intersection = alignments1.filter(a1 => | |
alignments2.some(a2 => Math.abs(a1 - a2) < 15) | |
); | |
const minLen = Math.min(alignments1.length, alignments2.length); | |
return intersection.length >= Math.floor(minLen * 0.6); | |
} | |
createTextBasedTable(pattern) { | |
try { | |
const table = []; | |
const bounds = this.calculatePatternBounds(pattern.lines); | |
for (const line of pattern.lines) { | |
const row = this.createTableRow(line, bounds); | |
table.push(row); | |
} | |
return { | |
bounds: bounds, | |
structure: table, | |
rowCount: table.length, | |
columnCount: Math.max(...table.map(row => row.length)), | |
type: 'text-based' | |
}; | |
} catch (error) { | |
return null; | |
} | |
} | |
calculatePatternBounds(lines) { | |
if (lines.length === 0) return { x: 0, y: 0, width: 0, height: 0 }; | |
let minX = Infinity; | |
let minY = Infinity; | |
let maxX = -Infinity; | |
let maxY = -Infinity; | |
for (const line of lines) { | |
for (const element of line) { | |
minX = Math.min(minX, element.position.x); | |
minY = Math.min(minY, element.position.y); | |
maxX = Math.max(maxX, element.position.x + (element.text.length * 6)); | |
maxY = Math.max(maxY, element.position.y + (element.fontSize || 12)); | |
} | |
} | |
return { | |
x: minX, | |
y: minY, | |
width: maxX - minX, | |
height: maxY - minY | |
}; | |
} | |
createTableRow(line, tableBounds) { | |
const columns = this.groupLineIntoColumns(line); | |
return columns.map(col => ({ | |
text: col.map(el => el.text).join(' '), | |
elements: col | |
})); | |
} | |
groupLineIntoColumns(line) { | |
const columns = []; | |
const grouped = new Set(); | |
for (let i = 0; i < line.length; i++) { | |
if (grouped.has(i)) continue; | |
const currentColumn = [line[i]]; | |
grouped.add(i); | |
const baseX = line[i].position.x; | |
for (let j = i + 1; j < line.length; j++) { | |
if (grouped.has(j)) continue; | |
const elementX = line[j].position.x; | |
if (Math.abs(elementX - baseX) < 30) { | |
currentColumn.push(line[j]); | |
grouped.add(j); | |
} | |
} | |
columns.push(currentColumn); | |
} | |
return columns; | |
} | |
// Métodos de detecção gráfica de tabelas | |
findTableLines(graphicsElements) { | |
const tableLines = []; | |
for (const element of graphicsElements) { | |
if (element.type === 'rectangle') { | |
if (this.isTableLikeRectangle(element)) { | |
tableLines.push({ | |
type: 'rectangle', | |
bounds: { | |
x: element.x, | |
y: element.y, | |
width: element.width, | |
height: element.height | |
}, | |
element: element | |
}); | |
} | |
} | |
} | |
return tableLines; | |
} | |
isTableLikeRectangle(rect) { | |
const minWidth = 20; | |
const minHeight = 15; | |
const maxWidth = 1000; | |
const maxHeight = 200; | |
return rect.width >= minWidth && | |
rect.height >= minHeight && | |
rect.width <= maxWidth && | |
rect.height <= maxHeight; | |
} | |
groupLinesIntoTables(lines) { | |
if (lines.length === 0) return []; | |
const groups = []; | |
const usedLines = new Set(); | |
for (let i = 0; i < lines.length; i++) { | |
if (usedLines.has(i)) continue; | |
const currentGroup = [lines[i]]; | |
usedLines.add(i); | |
for (let j = i + 1; j < lines.length; j++) { | |
if (usedLines.has(j)) continue; | |
if (this.areLinesRelated(lines[i], lines[j])) { | |
currentGroup.push(lines[j]); | |
usedLines.add(j); | |
} | |
} | |
if (currentGroup.length >= 2) { | |
groups.push(currentGroup); | |
} | |
} | |
return groups; | |
} | |
areLinesRelated(line1, line2) { | |
const bounds1 = line1.bounds; | |
const bounds2 = line2.bounds; | |
const maxDistance = 50; | |
const distance = Math.sqrt( | |
Math.pow(bounds1.x - bounds2.x, 2) + | |
Math.pow(bounds1.y - bounds2.y, 2) | |
); | |
return distance <= maxDistance; | |
} | |
analyzeTableStructure(lineGroup, textElements) { | |
try { | |
const bounds = this.calculateTableBounds(lineGroup); | |
const tableTextElements = this.findTextInBounds(textElements, bounds); | |
const cells = this.groupTextIntoCells(lineGroup, tableTextElements); | |
const { rows, cols } = this.determineTableDimensions(cells); | |
const tableStructure = this.organizeTableStructure(cells, rows, cols); | |
return { | |
bounds: bounds, | |
cells: cells, | |
structure: tableStructure, | |
rowCount: rows, | |
columnCount: cols, | |
textElements: tableTextElements | |
}; | |
} catch (error) { | |
return null; | |
} | |
} | |
calculateTableBounds(lineGroup) { | |
if (lineGroup.length === 0) { | |
return { x: 0, y: 0, width: 0, height: 0 }; | |
} | |
let minX = Infinity; | |
let minY = Infinity; | |
let maxX = -Infinity; | |
let maxY = -Infinity; | |
for (const line of lineGroup) { | |
const bounds = line.bounds; | |
minX = Math.min(minX, bounds.x); | |
minY = Math.min(minY, bounds.y); | |
maxX = Math.max(maxX, bounds.x + bounds.width); | |
maxY = Math.max(maxY, bounds.y + bounds.height); | |
} | |
return { | |
x: minX, | |
y: minY, | |
width: maxX - minX, | |
height: maxY - minY | |
}; | |
} | |
findTextInBounds(textElements, bounds) { | |
return textElements.filter(element => { | |
const elementBounds = { | |
x: element.position.x, | |
y: element.position.y, | |
width: element.text.length * (element.fontSize || 12) * 0.6, | |
height: element.fontSize || 12 | |
}; | |
return this.rectanglesIntersect(bounds, elementBounds); | |
}); | |
} | |
rectanglesIntersect(rect1, rect2) { | |
return rect1.x < rect2.x + rect2.width && | |
rect1.x + rect1.width > rect2.x && | |
rect1.y < rect2.y + rect2.height && | |
rect1.y + rect1.height > rect2.y; | |
} | |
groupTextIntoCells(lineGroup, textElements) { | |
const cells = []; | |
for (const line of lineGroup) { | |
const cellText = this.findTextInBounds(textElements, line.bounds); | |
if (cellText.length > 0) { | |
cells.push({ | |
bounds: line.bounds, | |
textElements: cellText, | |
text: this.extractCellText(cellText) | |
}); | |
} | |
} | |
return cells; | |
} | |
extractCellText(textElements) { | |
if (textElements.length === 0) return ''; | |
const sortedElements = [...textElements].sort((a, b) => { | |
if (Math.abs(b.position.y - a.position.y) > 5) { | |
return b.position.y - a.position.y; | |
} | |
return a.position.x - b.position.x; | |
}); | |
return sortedElements.map(el => el.text).join(' '); | |
} | |
determineTableDimensions(cells) { | |
if (cells.length === 0) { | |
return { rows: 0, cols: 0 }; | |
} | |
const rows = this.groupCellsIntoRows(cells); | |
const cols = this.groupCellsIntoColumns(cells); | |
return { | |
rows: rows.length, | |
cols: cols.length | |
}; | |
} | |
groupCellsIntoRows(cells) { | |
const rows = []; | |
const grouped = new Set(); | |
const sortedCells = [...cells].sort((a, b) => a.bounds.y - b.bounds.y); | |
for (let i = 0; i < sortedCells.length; i++) { | |
if (grouped.has(i)) continue; | |
const currentRow = [sortedCells[i]]; | |
grouped.add(i); | |
const baseY = sortedCells[i].bounds.y; | |
for (let j = i + 1; j < sortedCells.length; j++) { | |
if (grouped.has(j)) continue; | |
const cellY = sortedCells[j].bounds.y; | |
if (Math.abs(cellY - baseY) < 10) { | |
currentRow.push(sortedCells[j]); | |
grouped.add(j); | |
} | |
} | |
currentRow.sort((a, b) => a.bounds.x - b.bounds.x); | |
rows.push(currentRow); | |
} | |
return rows; | |
} | |
groupCellsIntoColumns(cells) { | |
const cols = []; | |
const grouped = new Set(); | |
const sortedCells = [...cells].sort((a, b) => a.bounds.x - b.bounds.x); | |
for (let i = 0; i < sortedCells.length; i++) { | |
if (grouped.has(i)) continue; | |
const currentCol = [sortedCells[i]]; | |
grouped.add(i); | |
const baseX = sortedCells[i].bounds.x; | |
for (let j = i + 1; j < sortedCells.length; j++) { | |
if (grouped.has(j)) continue; | |
const cellX = sortedCells[j].bounds.x; | |
if (Math.abs(cellX - baseX) < 20) { | |
currentCol.push(sortedCells[j]); | |
grouped.add(j); | |
} | |
} | |
cols.push(currentCol); | |
} | |
return cols; | |
} | |
organizeTableStructure(cells, rowCount, columnCount) { | |
if (cells.length === 0) return []; | |
const rows = this.groupCellsIntoRows(cells); | |
const table = []; | |
for (let i = 0; i < rows.length; i++) { | |
const row = []; | |
const rowCells = rows[i]; | |
for (let j = 0; j < rowCells.length; j++) { | |
row.push({ | |
text: rowCells[j].text, | |
bounds: rowCells[j].bounds | |
}); | |
} | |
table.push(row); | |
} | |
return table; | |
} | |
// Métodos existentes (mantidos do código anterior) | |
isValidPDF() { | |
const header = this.readString(8); | |
this.position = 0; | |
return header.startsWith('%PDF-'); | |
} | |
skipHeader() { | |
const headerEnd = this.findString('\n', 0); | |
this.position = headerEnd + 1; | |
} | |
async parseXRef() { | |
const xrefPos = this.findXRefPosition(); | |
if (xrefPos === -1) { | |
throw new Error('xref table não encontrado'); | |
} | |
this.position = xrefPos; | |
this.readString(4); | |
while (this.position < this.buffer.length) { | |
const line = this.readLine().trim(); | |
if (line === 'trailer') { | |
break; | |
} | |
if (this.isNumeric(line.split(' ')[0])) { | |
const [objNum, count] = line.split(' ').map(Number); | |
for (let i = 0; i < count; i++) { | |
const entry = this.readLine().trim(); | |
const [offset, genNum, flag] = entry.split(' '); | |
this.xref[`${objNum + i}`] = { | |
offset: parseInt(offset), | |
genNum: parseInt(genNum), | |
inUse: flag === 'n' | |
}; | |
} | |
} | |
} | |
} | |
findXRefPosition() { | |
for (let i = this.buffer.length - 10; i >= 0; i--) { | |
if (this.readStringAt(i, 9) === 'startxref') { | |
let pos = i + 9; | |
while (pos < this.buffer.length && this.isWhitespace(this.buffer[pos])) { | |
pos++; | |
} | |
let numStr = ''; | |
while (pos < this.buffer.length && this.isNumericChar(this.buffer[pos])) { | |
numStr += String.fromCharCode(this.buffer[pos]); | |
pos++; | |
} | |
return parseInt(numStr); | |
} | |
} | |
return -1; | |
} | |
parseTrailer() { | |
const trailerStart = this.findString('trailer'); | |
if (trailerStart === -1) { | |
throw new Error('trailer não encontrado'); | |
} | |
this.position = trailerStart + 7; | |
this.skipWhitespace(); | |
const trailerDict = this.parseDictionary(); | |
this.trailer = trailerDict; | |
} | |
async parseObjects() { | |
for (const [objId, xrefEntry] of Object.entries(this.xref)) { | |
if (xrefEntry.inUse) { | |
try { | |
const obj = await this.parseObjectAt(xrefEntry.offset, objId); | |
this.objects.set(objId, obj); | |
} catch (error) { | |
console.warn(`Erro ao parsear objeto ${objId}: ${error.message}`); | |
} | |
} | |
} | |
} | |
async parseObjectAt(offset, objId) { | |
this.position = offset; | |
const objLine = this.readLine(); | |
const objNumMatch = objLine.match(/^(\d+)\s+(\d+)\s+obj/); | |
if (!objNumMatch) { | |
throw new Error('Formato de objeto inválido'); | |
} | |
const objNum = objNumMatch[1]; | |
const genNum = objNumMatch[2]; | |
const content = this.parseObjectContent(); | |
return { | |
id: objNum, | |
gen: genNum, | |
content: content, | |
offset: offset | |
}; | |
} | |
parseObjectContent() { | |
const startPos = this.position; | |
let content = ''; | |
let inString = false; | |
while (this.position < this.buffer.length) { | |
const char = String.fromCharCode(this.buffer[this.position]); | |
if (char === 'e' && this.readStringAt(this.position, 6) === 'endobj') { | |
break; | |
} | |
if (char === '(' && !inString) { | |
inString = true; | |
} else if (char === ')' && inString) { | |
inString = false; | |
} | |
content += char; | |
this.position++; | |
} | |
try { | |
return this.parsePDFValue(content.trim()); | |
} catch (error) { | |
return content.trim(); | |
} | |
} | |
parsePDFValue(value) { | |
value = value.trim(); | |
if (value.startsWith('[') && value.endsWith(']')) { | |
return this.parseArray(value); | |
} | |
if (value.startsWith('<<') && value.endsWith('>>')) { | |
return this.parseDictionaryFromString(value); | |
} | |
if (value.startsWith('(') && value.endsWith(')')) { | |
return this.parseString(value); | |
} | |
if (value.startsWith('<') && value.endsWith('>') && !value.startsWith('<<')) { | |
return this.parseHexString(value); | |
} | |
if (value.startsWith('/')) { | |
return value.substring(1); | |
} | |
if (this.isNumeric(value)) { | |
return parseFloat(value); | |
} | |
if (value === 'true' || value === 'false') { | |
return value === 'true'; | |
} | |
if (value === 'null') { | |
return null; | |
} | |
return value; | |
} | |
parseArray(arrayStr) { | |
const content = arrayStr.substring(1, arrayStr.length - 1).trim(); | |
const items = []; | |
let current = ''; | |
let depth = 0; | |
let inString = false; | |
let inHexString = false; | |
for (let i = 0; i < content.length; i++) { | |
const char = content[i]; | |
if (char === '(' && !inString && !inHexString) { | |
inString = true; | |
current += char; | |
} else if (char === ')' && inString) { | |
inString = false; | |
current += char; | |
} else if (char === '<' && !inString && !inHexString && content[i+1] !== '<') { | |
inHexString = true; | |
current += char; | |
} else if (char === '>' && inHexString) { | |
inHexString = false; | |
current += char; | |
} else if (!inString && !inHexString && char === '[') { | |
depth++; | |
current += char; | |
} else if (!inString && !inHexString && char === ']') { | |
depth--; | |
current += char; | |
} else if (!inString && !inHexString && char === ' ' && depth === 0) { | |
if (current.trim()) { | |
items.push(this.parsePDFValue(current.trim())); | |
} | |
current = ''; | |
} else { | |
current += char; | |
} | |
} | |
if (current.trim()) { | |
items.push(this.parsePDFValue(current.trim())); | |
} | |
return items; | |
} | |
parseDictionaryFromString(dictStr) { | |
const dict = {}; | |
const content = dictStr.substring(2, dictStr.length - 2).trim(); | |
const pairs = this.splitDictionaryPairs(content); | |
for (const pair of pairs) { | |
const parts = pair.trim().split(/\s+/); | |
if (parts.length >= 2) { | |
const key = parts[0].substring(1); | |
const valueStr = parts.slice(1).join(' '); | |
dict[key] = this.parsePDFValue(valueStr); | |
} | |
} | |
return dict; | |
} | |
splitDictionaryPairs(content) { | |
const pairs = []; | |
let current = ''; | |
let depth = 0; | |
let inString = false; | |
let inHexString = false; | |
for (let i = 0; i < content.length; i++) { | |
const char = content[i]; | |
if (char === '(' && !inString && !inHexString) { | |
inString = true; | |
current += char; | |
} else if (char === ')' && inString) { | |
inString = false; | |
current += char; | |
} else if (char === '<' && !inString && !inHexString && content[i+1] !== '<') { | |
inHexString = true; | |
current += char; | |
} else if (char === '>' && inHexString) { | |
inHexString = false; | |
current += char; | |
} else if (!inString && !inHexString && (char === '[' || char === '<<')) { | |
depth++; | |
current += char; | |
} else if (!inString && !inHexString && (char === ']' || (char === '>' && content[i+1] === '>'))) { | |
depth--; | |
current += char; | |
if (char === '>' && content[i+1] === '>') i++; | |
} else if (!inString && !inHexString && char === '/' && depth === 0) { | |
if (current.trim()) { | |
pairs.push(current.trim()); | |
} | |
current = char; | |
} else { | |
current += char; | |
} | |
} | |
if (current.trim()) { | |
pairs.push(current.trim()); | |
} | |
return pairs; | |
} | |
parseString(str) { | |
let content = str.substring(1, str.length - 1); | |
content = content.replace(/\\n/g, '\n') | |
.replace(/\\r/g, '\r') | |
.replace(/\\t/g, '\t') | |
.replace(/\\\\/g, '\\') | |
.replace(/\\\)/g, ')') | |
.replace(/\\\(/g, '(') | |
.replace(/\\([0-7]{1,3})/g, (match, octal) => { | |
return String.fromCharCode(parseInt(octal, 8)); | |
}); | |
return content; | |
} | |
parseHexString(hexStr) { | |
const hexContent = hexStr.substring(1, hexStr.length - 1); | |
let result = ''; | |
for (let i = 0; i < hexContent.length; i += 2) { | |
const hexPair = hexContent.substr(i, 2); | |
const charCode = parseInt(hexPair, 16); | |
if (!isNaN(charCode)) { | |
result += String.fromCharCode(charCode); | |
} | |
} | |
return result; | |
} | |
async extractContentStreamWithLayout(contentRef, resources) { | |
const contentId = contentRef.split(' ')[0]; | |
const contentObj = this.getObject(contentId); | |
if (!contentObj) return { textElements: [], graphics: [], images: [], transformations: [] }; | |
let contentStream = ''; | |
if (typeof contentObj.content === 'string') { | |
contentStream = contentObj.content; | |
} else if (contentObj.content && contentObj.content.stream) { | |
contentStream = contentObj.content.stream; | |
} else if (contentObj.stream) { | |
contentStream = contentObj.stream; | |
} else { | |
contentStream = await this.extractRawStream(contentObj); | |
} | |
return this.parseContentStreamWithLayout(contentStream, resources); | |
} | |
parseContentStreamWithLayout(contentStream, resources) { | |
const result = { | |
textElements: [], | |
graphics: [], | |
images: [], | |
transformations: [] | |
}; | |
const lines = contentStream.split('\n'); | |
let textState = { | |
inTextObject: false, | |
currentFont: null, | |
fontSize: 12, | |
charSpacing: 0, | |
wordSpacing: 0, | |
textLeading: 0, | |
textMatrix: [1, 0, 0, 1, 0, 0], | |
textLineMatrix: [1, 0, 0, 1, 0, 0], | |
position: { x: 0, y: 0 }, | |
color: { r: 0, g: 0, b: 0 } | |
}; | |
let graphicsState = { | |
currentTransformationMatrix: [1, 0, 0, 1, 0, 0], | |
lineWidth: 1, | |
lineCap: 0, | |
lineJoin: 0, | |
fillColor: { r: 0, g: 0, b: 0 }, | |
strokeColor: { r: 0, g: 0, b: 0 } | |
}; | |
for (const line of lines) { | |
const trimmedLine = line.trim(); | |
const commands = this.parseLineCommands(trimmedLine); | |
for (const command of commands) { | |
this.processPDFCommand(command, textState, graphicsState, result); | |
} | |
} | |
return result; | |
} | |
parseLineCommands(line) { | |
const commands = []; | |
let currentCommand = ''; | |
let inString = false; | |
let stringDelimiter = ''; | |
for (let i = 0; i < line.length; i++) { | |
const char = line[i]; | |
if (!inString && (char === '(' || char === '<')) { | |
inString = true; | |
stringDelimiter = char; | |
currentCommand += char; | |
} else if (inString && | |
((stringDelimiter === '(' && char === ')') || | |
(stringDelimiter === '<' && char === '>'))) { | |
inString = false; | |
currentCommand += char; | |
} else if (!inString && char === ' ') { | |
if (currentCommand.trim()) { | |
commands.push(currentCommand.trim()); | |
} | |
currentCommand = ''; | |
} else { | |
currentCommand += char; | |
} | |
} | |
if (currentCommand.trim()) { | |
commands.push(currentCommand.trim()); | |
} | |
return commands; | |
} | |
processPDFCommand(command, textState, graphicsState, result) { | |
if (command === 'BT') { | |
textState.inTextObject = true; | |
textState.textMatrix = [1, 0, 0, 1, 0, 0]; | |
textState.textLineMatrix = [1, 0, 0, 1, 0, 0]; | |
} else if (command === 'ET') { | |
textState.inTextObject = false; | |
} else if (textState.inTextObject) { | |
this.processTextCommand(command, textState, result); | |
} else { | |
this.processGraphicsCommand(command, graphicsState, result); | |
} | |
} | |
processTextCommand(command, textState, result) { | |
if (command === 'Tm') { | |
const matrix = this.parseMatrixCommand(command); | |
if (matrix) { | |
textState.textMatrix = matrix; | |
textState.textLineMatrix = [...matrix]; | |
textState.position = { x: matrix[4], y: matrix[5] }; | |
} | |
} else if (command === 'Td' || command === 'TD') { | |
const coords = this.parseCoordinateCommand(command); | |
if (coords) { | |
textState.textMatrix[4] += coords.x; | |
textState.textMatrix[5] += coords.y; | |
textState.position = { x: textState.textMatrix[4], y: textState.textMatrix[5] }; | |
if (command === 'TD') { | |
textState.textLeading = -coords.y; | |
} | |
} | |
} else if (command === 'T*') { | |
textState.textMatrix[5] -= textState.textLeading; | |
textState.position = { x: textState.textMatrix[4], y: textState.textMatrix[5] }; | |
} else if (command.endsWith('Tf')) { | |
const fontMatch = command.match(/\/([^\s]+)\s+(-?\d+(?:\.\d+)?)\s+Tf/); | |
if (fontMatch) { | |
textState.currentFont = fontMatch[1]; | |
textState.fontSize = parseFloat(fontMatch[2]); | |
} | |
} else if (command.endsWith('Tj') || command.endsWith('TJ') || command.startsWith('(') || command.startsWith('<')) { | |
this.extractTextElement(command, textState, result); | |
} | |
} | |
processGraphicsCommand(command, graphicsState, result) { | |
if (command === 'cm') { | |
const matrix = this.parseMatrixCommand(command); | |
if (matrix) { | |
graphicsState.currentTransformationMatrix = matrix; | |
result.transformations.push({ | |
type: 'transformation', | |
matrix: matrix, | |
position: { x: matrix[4], y: matrix[5] } | |
}); | |
} | |
} else if (command.endsWith('w')) { | |
const width = parseFloat(command); | |
if (!isNaN(width)) { | |
graphicsState.lineWidth = width; | |
} | |
} else if (command.endsWith('rg')) { | |
const colors = command.split(' ').slice(0, 3).map(parseFloat); | |
if (colors.length === 3 && colors.every(c => !isNaN(c))) { | |
graphicsState.fillColor = { | |
r: Math.round(colors[0] * 255), | |
g: Math.round(colors[1] * 255), | |
b: Math.round(colors[2] * 255) | |
}; | |
} | |
} else if (command.endsWith('RG')) { | |
const colors = command.split(' ').slice(0, 3).map(parseFloat); | |
if (colors.length === 3 && colors.every(c => !isNaN(c))) { | |
graphicsState.strokeColor = { | |
r: Math.round(colors[0] * 255), | |
g: Math.round(colors[1] * 255), | |
b: Math.round(colors[2] * 255) | |
}; | |
} | |
} else if (command.endsWith('m')) { | |
const coords = this.parseCoordinateCommand(command); | |
if (coords) { | |
result.graphics.push({ | |
type: 'moveTo', | |
x: coords.x, | |
y: coords.y, | |
color: graphicsState.strokeColor | |
}); | |
} | |
} else if (command.endsWith('l')) { | |
const coords = this.parseCoordinateCommand(command); | |
if (coords) { | |
result.graphics.push({ | |
type: 'lineTo', | |
x: coords.x, | |
y: coords.y, | |
color: graphicsState.strokeColor | |
}); | |
} | |
} else if (command.endsWith('re')) { | |
const coords = this.parseRectangleCommand(command); | |
if (coords) { | |
result.graphics.push({ | |
type: 'rectangle', | |
x: coords.x, | |
y: coords.y, | |
width: coords.width, | |
height: coords.height, | |
fillColor: graphicsState.fillColor, | |
strokeColor: graphicsState.strokeColor | |
}); | |
} | |
} else if (command === 'Do') { | |
const xobjectMatch = command.match(/\/([^\s]+)/); | |
if (xobjectMatch) { | |
result.images.push({ | |
type: 'image', | |
name: xobjectMatch[1], | |
position: { x: 0, y: 0 } | |
}); | |
} | |
} | |
} | |
extractTextElement(command, textState, result) { | |
let text = ''; | |
let textCommands = []; | |
const stringMatches = command.match(/\(([^)]*)\)/g); | |
if (stringMatches) { | |
for (const match of stringMatches) { | |
const stringContent = match.substring(1, match.length - 1); | |
text += this.decodePDFString(stringContent) + ' '; | |
textCommands.push({ | |
text: this.decodePDFString(stringContent), | |
type: 'string' | |
}); | |
} | |
} | |
const hexMatches = command.match(/<([0-9A-Fa-f\s]*)>/g); | |
if (hexMatches) { | |
for (const match of hexMatches) { | |
const hexContent = match.substring(1, match.length - 1); | |
text += this.decodeHexString(hexContent) + ' '; | |
textCommands.push({ | |
text: this.decodeHexString(hexContent), | |
type: 'hex' | |
}); | |
} | |
} | |
if (command.endsWith('TJ')) { | |
const arrayMatch = command.match(/\[(.*?)\]/); | |
if (arrayMatch) { | |
const arrayContent = arrayMatch[1]; | |
text = this.parseTJArray(arrayContent); | |
textCommands = [{ text: text, type: 'TJ' }]; | |
} | |
} | |
if (text.trim()) { | |
const textElement = { | |
text: text.trim(), | |
font: textState.currentFont, | |
fontSize: textState.fontSize, | |
position: { ...textState.position }, | |
matrix: [...textState.textMatrix], | |
color: { ...textState.color }, | |
charSpacing: textState.charSpacing, | |
wordSpacing: textState.wordSpacing | |
}; | |
result.textElements.push(textElement); | |
} | |
} | |
parseTJArray(arrayContent) { | |
let text = ''; | |
const items = arrayContent.split(/\s+(?![^(]*\))/); | |
for (const item of items) { | |
if (item.startsWith('(') && item.endsWith(')')) { | |
const stringContent = item.substring(1, item.length - 1); | |
text += this.decodePDFString(stringContent); | |
} else if (item.startsWith('<') && item.endsWith('>')) { | |
const hexContent = item.substring(1, item.length - 1); | |
text += this.decodeHexString(hexContent); | |
} else if (this.isNumeric(item)) { | |
const adjustment = parseFloat(item); | |
if (Math.abs(adjustment) > 100) { | |
text += ' '; | |
} | |
} | |
} | |
return text; | |
} | |
parseMatrixCommand(commandLine) { | |
const numbers = commandLine.split(' ').slice(0, 6).map(parseFloat); | |
if (numbers.length === 6 && numbers.every(n => !isNaN(n))) { | |
return numbers; | |
} | |
return null; | |
} | |
parseCoordinateCommand(commandLine) { | |
const parts = commandLine.split(' '); | |
if (parts.length >= 2) { | |
const x = parseFloat(parts[parts.length - 2]); | |
const y = parseFloat(parts[parts.length - 1]); | |
if (!isNaN(x) && !isNaN(y)) { | |
return { x, y }; | |
} | |
} | |
return null; | |
} | |
parseRectangleCommand(commandLine) { | |
const parts = commandLine.split(' '); | |
if (parts.length >= 4) { | |
const coords = parts.slice(0, 4).map(parseFloat); | |
if (coords.every(c => !isNaN(c))) { | |
return { | |
x: coords[0], | |
y: coords[1], | |
width: coords[2], | |
height: coords[3] | |
}; | |
} | |
} | |
return null; | |
} | |
decodePDFString(str) { | |
return str.replace(/\\n/g, '\n') | |
.replace(/\\r/g, '\r') | |
.replace(/\\t/g, '\t') | |
.replace(/\\\\/g, '\\') | |
.replace(/\\\)/g, ')') | |
.replace(/\\\(/g, '(') | |
.replace(/\\([0-7]{1,3})/g, (match, octal) => { | |
return String.fromCharCode(parseInt(octal, 8)); | |
}); | |
} | |
decodeHexString(hexStr) { | |
hexStr = hexStr.replace(/\s/g, ''); | |
let result = ''; | |
for (let i = 0; i < hexStr.length; i += 2) { | |
const hexPair = hexStr.substr(i, 2); | |
const charCode = parseInt(hexPair, 16); | |
if (!isNaN(charCode)) { | |
result += String.fromCharCode(charCode); | |
} | |
} | |
return result; | |
} | |
async extractRawStream(obj) { | |
if (!obj || !obj.offset) return ''; | |
const objData = this.buffer.slice(obj.offset); | |
const streamStart = this.findStringInBuffer(objData, 'stream'); | |
const streamEnd = this.findStringInBuffer(objData, 'endstream'); | |
if (streamStart === -1 || streamEnd === -1) return ''; | |
let contentStart = streamStart + 6; | |
while (contentStart < objData.length && | |
(objData[contentStart] === 0x0A || objData[contentStart] === 0x0D)) { | |
contentStart++; | |
} | |
const contentBytes = objData.slice(contentStart, streamEnd); | |
return this.decodeStreamContent(contentBytes, obj.content?.Filter); | |
} | |
decodeStreamContent(bytes, filter) { | |
if (!filter) { | |
return this.bytesToString(bytes); | |
} | |
if (filter.includes('FlateDecode') || filter === 'FlateDecode') { | |
try { | |
return this.bytesToString(bytes); | |
} catch (error) { | |
console.warn('Falha na descompressão FlateDecode'); | |
return this.bytesToString(bytes); | |
} | |
} | |
return this.bytesToString(bytes); | |
} | |
bytesToString(bytes) { | |
return Array.from(bytes) | |
.map(byte => String.fromCharCode(byte)) | |
.join(''); | |
} | |
getPageSize(pageContent) { | |
let box = pageContent.MediaBox || pageContent.CropBox; | |
if (box) { | |
if (Array.isArray(box)) { | |
return { | |
width: box[2] - box[0], | |
height: box[3] - box[1], | |
x: box[0], | |
y: box[1] | |
}; | |
} | |
} | |
return { width: 612, height: 792 }; | |
} | |
async loadPageFonts(fontDict) { | |
for (const [fontName, fontRef] of Object.entries(fontDict)) { | |
if (typeof fontRef === 'string') { | |
const fontId = fontRef.split(' ')[0]; | |
const fontObj = this.getObject(fontId); | |
if (fontObj) { | |
this.fonts.set(fontName, { | |
id: fontId, | |
content: fontObj.content || {}, | |
encoding: this.getFontEncoding(fontObj.content) | |
}); | |
} | |
} | |
} | |
} | |
getFontEncoding(fontContent) { | |
if (!fontContent) return 'StandardEncoding'; | |
if (fontContent.Encoding) { | |
return fontContent.Encoding; | |
} | |
if (fontContent.BaseFont) { | |
const baseFont = fontContent.BaseFont; | |
if (baseFont.includes('WinAnsi')) return 'WinAnsiEncoding'; | |
if (baseFont.includes('MacRoman')) return 'MacRomanEncoding'; | |
} | |
return 'StandardEncoding'; | |
} | |
readString(length) { | |
let str = ''; | |
const endPos = Math.min(this.position + length, this.buffer.length); | |
for (let i = this.position; i < endPos; i++) { | |
str += String.fromCharCode(this.buffer[i]); | |
} | |
this.position += length; | |
return str; | |
} | |
readStringAt(position, length) { | |
let str = ''; | |
const endPos = Math.min(position + length, this.buffer.length); | |
for (let i = position; i < endPos; i++) { | |
str += String.fromCharCode(this.buffer[i]); | |
} | |
return str; | |
} | |
readLine() { | |
let line = ''; | |
while (this.position < this.buffer.length) { | |
const char = this.buffer[this.position]; | |
if (char === '\n'.charCodeAt(0) || char === '\r'.charCodeAt(0)) { | |
this.position++; | |
if (char === '\r'.charCodeAt(0) && this.buffer[this.position] === '\n'.charCodeAt(0)) { | |
this.position++; | |
} | |
break; | |
} | |
line += String.fromCharCode(char); | |
this.position++; | |
} | |
return line; | |
} | |
skipWhitespace() { | |
while (this.position < this.buffer.length) { | |
const char = this.buffer[this.position]; | |
if (!this.isWhitespace(char)) { | |
break; | |
} | |
this.position++; | |
} | |
} | |
findString(str, startPos = this.position) { | |
const encoder = new TextEncoder(); | |
const searchBytes = encoder.encode(str); | |
for (let i = startPos; i <= this.buffer.length - searchBytes.length; i++) { | |
let found = true; | |
for (let j = 0; j < searchBytes.length; j++) { | |
if (this.buffer[i + j] !== searchBytes[j]) { | |
found = false; | |
break; | |
} | |
} | |
if (found) { | |
return i; | |
} | |
} | |
return -1; | |
} | |
findStringInBuffer(buffer, str) { | |
const encoder = new TextEncoder(); | |
const searchBytes = encoder.encode(str); | |
for (let i = 0; i <= buffer.length - searchBytes.length; i++) { | |
let found = true; | |
for (let j = 0; j < searchBytes.length; j++) { | |
if (buffer[i + j] !== searchBytes[j]) { | |
found = false; | |
break; | |
} | |
} | |
if (found) { | |
return i; | |
} | |
} | |
return -1; | |
} | |
isWhitespace(char) { | |
return char === 0x20 || char === 0x0A || char === 0x0D || char === 0x09 || char === 0x0C; | |
} | |
isNumericChar(char) { | |
return (char >= 0x30 && char <= 0x39) || char === 0x2D || char === 0x2E; | |
} | |
isNumeric(str) { | |
return /^-?\d+\.?\d*$/.test(str); | |
} | |
parseDictionary() { | |
const dict = {}; | |
this.skipWhitespace(); | |
if (this.readString(2) !== '<<') { | |
throw new Error('Dicionário deve começar com <<'); | |
} | |
while (this.position < this.buffer.length) { | |
this.skipWhitespace(); | |
if (this.readStringAt(this.position, 2) === '>>') { | |
this.position += 2; | |
break; | |
} | |
const key = this.parseName(); | |
if (key) { | |
this.skipWhitespace(); | |
const value = this.parseNextValue(); | |
dict[key] = value; | |
} | |
} | |
return dict; | |
} | |
parseName() { | |
if (this.buffer[this.position] !== '/'.charCodeAt(0)) { | |
return null; | |
} | |
this.position++; | |
let name = ''; | |
while (this.position < this.buffer.length) { | |
const char = this.buffer[this.position]; | |
if (this.isWhitespace(char) || char === '/'.charCodeAt(0) || | |
char === '('.charCodeAt(0) || char === '<'.charCodeAt(0)) { | |
break; | |
} | |
name += String.fromCharCode(char); | |
this.position++; | |
} | |
return name; | |
} | |
parseNextValue() { | |
this.skipWhitespace(); | |
const char = String.fromCharCode(this.buffer[this.position]); | |
if (char === '[') { | |
return this.parseArrayValue(); | |
} else if (char === '<' && this.buffer[this.position + 1] === '<'.charCodeAt(0)) { | |
return this.parseDictionary(); | |
} else if (char === '/') { | |
return this.parseName(); | |
} else if (char === '(') { | |
return this.parseStringValue(); | |
} else { | |
return this.parseSimpleValue(); | |
} | |
} | |
parseArrayValue() { | |
this.position++; | |
const array = []; | |
while (this.position < this.buffer.length) { | |
this.skipWhitespace(); | |
if (this.buffer[this.position] === ']'.charCodeAt(0)) { | |
this.position++; | |
break; | |
} | |
array.push(this.parseNextValue()); | |
} | |
return array; | |
} | |
parseStringValue() { | |
this.position++; | |
let str = ''; | |
let parenCount = 1; | |
while (this.position < this.buffer.length && parenCount > 0) { | |
const char = this.buffer[this.position]; | |
if (char === '('.charCodeAt(0)) { | |
parenCount++; | |
str += '('; | |
} else if (char === ')'.charCodeAt(0)) { | |
parenCount--; | |
if (parenCount > 0) { | |
str += ')'; | |
} | |
} else if (char === '\\'.charCodeAt(0)) { | |
this.position++; | |
if (this.position < this.buffer.length) { | |
const nextChar = this.buffer[this.position]; | |
switch (nextChar) { | |
case 'n'.charCodeAt(0): str += '\n'; break; | |
case 'r'.charCodeAt(0): str += '\r'; break; | |
case 't'.charCodeAt(0): str += '\t'; break; | |
default: str += String.fromCharCode(nextChar); | |
} | |
} | |
} else { | |
str += String.fromCharCode(char); | |
} | |
this.position++; | |
} | |
return str; | |
} | |
parseSimpleValue() { | |
let value = ''; | |
while (this.position < this.buffer.length) { | |
const char = this.buffer[this.position]; | |
if (this.isWhitespace(char) || char === '/'.charCodeAt(0) || | |
char === '('.charCodeAt(0) || char === '<'.charCodeAt(0) || | |
char === '['.charCodeAt(0) || char === ']'.charCodeAt(0)) { | |
break; | |
} | |
value += String.fromCharCode(char); | |
this.position++; | |
} | |
if (value === 'true') return true; | |
if (value === 'false') return false; | |
if (value === 'null') return null; | |
if (this.isNumeric(value)) return parseFloat(value); | |
return value; | |
} | |
// Métodos públicos para acesso aos dados | |
getObjects() { | |
return this.objects; | |
} | |
getObject(id) { | |
return this.objects.get(id.toString()); | |
} | |
getTrailer() { | |
return this.trailer; | |
} | |
getDocumentInfo() { | |
const infoRef = this.trailer.Info; | |
if (infoRef && typeof infoRef === 'string') { | |
const infoId = infoRef.split(' ')[0]; | |
const infoObj = this.getObject(infoId); | |
return infoObj ? infoObj.content : null; | |
} | |
return null; | |
} | |
getPageCount() { | |
return this.pages.length; | |
} | |
getPageLayout(pageNumber) { | |
if (pageNumber < 1 || pageNumber > this.pages.length) { | |
throw new Error(`Página ${pageNumber} não encontrada`); | |
} | |
return this.pages[pageNumber - 1]; | |
} | |
getPageTables(pageNumber) { | |
const page = this.getPageLayout(pageNumber); | |
return page.tables || []; | |
} | |
getTable(pageNumber, tableIndex) { | |
const tables = this.getPageTables(pageNumber); | |
if (tableIndex < 0 || tableIndex >= tables.length) { | |
throw new Error(`Tabela ${tableIndex} não encontrada na página ${pageNumber}`); | |
} | |
return tables[tableIndex]; | |
} | |
getTableAsCSV(pageNumber, tableIndex) { | |
const table = this.getTable(pageNumber, tableIndex); | |
let csv = ''; | |
for (const row of table.structure) { | |
const csvRow = row.map(cell => { | |
let cellText = cell.text || ''; | |
if (cellText.includes(',') || cellText.includes('"') || cellText.includes('\n')) { | |
cellText = '"' + cellText.replace(/"/g, '""') + '"'; | |
} | |
return cellText; | |
}).join(','); | |
csv += csvRow + '\n'; | |
} | |
return csv.trim(); | |
} | |
getPageTablesAsCSV(pageNumber) { | |
const tables = this.getPageTables(pageNumber); | |
return tables.map((table, index) => ({ | |
index: index, | |
csv: this.getTableAsCSV(pageNumber, index), | |
bounds: table.bounds | |
})); | |
} | |
getTableAsJSON(pageNumber, tableIndex) { | |
const table = this.getTable(pageNumber, tableIndex); | |
const headers = table.structure[0]?.map(cell => cell.text) || []; | |
const rows = table.structure.slice(1); | |
const jsonArray = rows.map(row => { | |
const obj = {}; | |
row.forEach((cell, index) => { | |
const header = headers[index] || `col_${index}`; | |
obj[header] = cell.text || ''; | |
}); | |
return obj; | |
}); | |
return jsonArray; | |
} | |
getTableStatistics(pageNumber) { | |
const tables = this.getPageTables(pageNumber); | |
return tables.map((table, index) => ({ | |
index: index, | |
rowCount: table.rowCount, | |
columnCount: table.columnCount, | |
bounds: table.bounds, | |
cellCount: table.cells?.length || 0 | |
})); | |
} | |
searchTables(pageNumber, searchTerm) { | |
const tables = this.getPageTables(pageNumber); | |
const results = []; | |
const lowerSearchTerm = searchTerm.toLowerCase(); | |
tables.forEach((table, index) => { | |
let found = false; | |
for (const row of table.structure) { | |
for (const cell of row) { | |
if (cell.text && cell.text.toLowerCase().includes(lowerSearchTerm)) { | |
found = true; | |
break; | |
} | |
} | |
if (found) break; | |
} | |
if (found) { | |
results.push({ | |
tableIndex: index, | |
table: table | |
}); | |
} | |
}); | |
return results; | |
} | |
getTableAsString(pageNumber, tableIndex) { | |
const table = this.getTable(pageNumber, tableIndex); | |
let result = ''; | |
for (const row of table.structure) { | |
const rowText = row.map(cell => cell.text || '').join('\t'); | |
result += rowText + '\n'; | |
} | |
return result.trim(); | |
} | |
hasTables(pageNumber) { | |
const tables = this.getPageTables(pageNumber); | |
return tables.length > 0; | |
} | |
getAllTables() { | |
const allTables = []; | |
for (let i = 1; i <= this.getPageCount(); i++) { | |
const pageTables = this.getPageTables(i); | |
pageTables.forEach((table, tableIndex) => { | |
allTables.push({ | |
page: i, | |
tableIndex: tableIndex, | |
table: table | |
}); | |
}); | |
} | |
return allTables; | |
} | |
exportAllTablesAsJSON() { | |
const allTables = this.getAllTables(); | |
const result = []; | |
allTables.forEach(({ page, tableIndex, table }) => { | |
result.push({ | |
page: page, | |
tableIndex: tableIndex, | |
bounds: table.bounds, | |
data: this.getTableAsJSON(page, tableIndex) | |
}); | |
}); | |
return result; | |
} | |
exportAllTablesAsCSV() { | |
let csvOutput = ''; | |
for (let i = 1; i <= this.getPageCount(); i++) { | |
const pageTables = this.getPageTablesAsCSV(i); | |
if (pageTables.length > 0) { | |
csvOutput += `=== Página ${i} ===\n`; | |
pageTables.forEach(table => { | |
csvOutput += table.csv + '\n\n'; | |
}); | |
} | |
} | |
return csvOutput.trim(); | |
} | |
getPageTextWithPosition(pageNumber) { | |
const page = this.getPageLayout(pageNumber); | |
return page.textElements.map(element => ({ | |
text: element.text, | |
x: element.position.x, | |
y: element.position.y, | |
font: element.font, | |
fontSize: element.fontSize | |
})); | |
} | |
getPagePlainText(pageNumber) { | |
const page = this.getPageLayout(pageNumber); | |
const textElements = [...page.textElements]; | |
textElements.sort((a, b) => { | |
if (Math.abs(b.position.y - a.position.y) > 5) { | |
return b.position.y - a.position.y; | |
} | |
return a.position.x - b.position.x; | |
}); | |
const lines = []; | |
let currentLine = []; | |
let lastY = null; | |
for (const element of textElements) { | |
if (lastY === null || Math.abs(element.position.y - lastY) > 5) { | |
if (currentLine.length > 0) { | |
lines.push(currentLine); | |
} | |
currentLine = [element]; | |
lastY = element.position.y; | |
} else { | |
currentLine.push(element); | |
} | |
} | |
if (currentLine.length > 0) { | |
lines.push(currentLine); | |
} | |
let result = ''; | |
for (const line of lines) { | |
line.sort((a, b) => a.position.x - b.position.x); | |
const lineText = line.map(el => el.text).join(' '); | |
result += lineText + '\n'; | |
} | |
return result.trim(); | |
} | |
getDocumentTextFormatted() { | |
let result = ''; | |
for (let i = 1; i <= this.getPageCount(); i++) { | |
result += `=== Página ${i} ===\n`; | |
result += this.getPagePlainText(i) + '\n\n'; | |
} | |
return result.trim(); | |
} | |
searchPageText(pageNumber, searchTerm) { | |
const page = this.getPageLayout(pageNumber); | |
const results = []; | |
const lowerSearchTerm = searchTerm.toLowerCase(); | |
page.textElements.forEach((element, index) => { | |
if (element.text.toLowerCase().includes(lowerSearchTerm)) { | |
results.push({ | |
element: element, | |
index: index, | |
page: pageNumber | |
}); | |
} | |
}); | |
return results; | |
} | |
getPageGraphics(pageNumber) { | |
const page = this.getPageLayout(pageNumber); | |
return page.graphics; | |
} | |
getPageTransformations(pageNumber) { | |
const page = this.getPageLayout(pageNumber); | |
return page.transformations; | |
} | |
getPageBoundingBox(pageNumber) { | |
const page = this.getPageLayout(pageNumber); | |
return page.size; | |
} | |
convertToStandardCoordinates(pageNumber, x, y) { | |
const page = this.getPageLayout(pageNumber); | |
const pageSize = page.size; | |
return { | |
x: x, | |
y: pageSize.height - y | |
}; | |
} | |
} | |
// Publish the class: CommonJS export when `module.exports` is available,
// otherwise attach it to `window` when running in a browser.
if (typeof module === 'undefined' || !module.exports) {
    if (typeof window !== 'undefined') {
        window.PDFParser = PDFParser;
    }
} else {
    module.exports = PDFParser;
}
// Exemplo de uso:
/*
async function exemploExportacaoHTML() {
    try {
        const buffer = await fs.readFile('documento.pdf');
        const parser = new PDFParser();
        await parser.loadPDF(buffer);

        // Exporta como HTML completo com layout
        const htmlCompleto = parser.exportToHTML({
            includeStyles: true,
            includeMetadata: true,
            preserveLayout: true,
            includeImages: true,
            includeTables: true
        });

        // Salva o HTML
        await fs.writeFile('documento.html', htmlCompleto);
        console.log('HTML exportado com sucesso!');

        // Exporta como HTML simplificado
        const htmlSimples = parser.exportToSimpleHTML({
            includeMetadata: true,
            includeTables: true
        });

        // Exporta como HTML responsivo
        const htmlResponsivo = parser.exportToResponsiveHTML({
            maxWidth: 800,
            includeStyles: true,
            includeMetadata: true
        });

        console.log('Todas as exportações concluídas!');
    } catch (error) {
        console.error('Erro:', error.message);
    }
}
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment