Skip to content

Instantly share code, notes, and snippets.

@celsowm
Created July 24, 2025 01:35
Show Gist options
  • Save celsowm/94f08b37446c383558781ff8470c2e44 to your computer and use it in GitHub Desktop.
Save celsowm/94f08b37446c383558781ff8470c2e44 to your computer and use it in GitHub Desktop.
Js PDF Parser
// PDFParser.js - Parser de PDF com exportação para HTML
class PDFParser {
constructor() {
this.buffer = null;
this.position = 0;
this.objects = new Map();
this.trailer = {};
this.xref = {};
this.pages = [];
this.fonts = new Map();
this.graphicsState = {};
this.images = new Map(); // Armazena informações de imagens
}
/**
* Carrega o PDF a partir de um ArrayBuffer
*/
async loadPDF(arrayBuffer) {
this.buffer = new Uint8Array(arrayBuffer);
this.position = 0;
try {
await this.parsePDF();
await this.extractPagesWithFullLayout();
await this.extractImages(); // Extrai informações de imagens
return this;
} catch (error) {
throw new Error(`Erro ao parsear PDF: ${error.message}`);
}
}
/**
* Parseia o PDF completo
*/
async parsePDF() {
if (!this.isValidPDF()) {
throw new Error('Arquivo não é um PDF válido');
}
this.skipHeader();
await this.parseXRef();
this.parseTrailer();
await this.parseObjects();
}
/**
* Extrai páginas com layout completo
*/
async extractPagesWithFullLayout() {
const rootRef = this.trailer.Root;
if (!rootRef) return;
const rootId = rootRef.split(' ')[0];
const rootObj = this.getObject(rootId);
if (!rootObj || !rootObj.content) return;
const pagesRef = rootObj.content.Pages;
if (!pagesRef) return;
const pagesId = pagesRef.split(' ')[0];
const pagesObj = this.getObject(pagesId);
if (!pagesObj || !pagesObj.content) return;
await this.extractPageFullLayouts(pagesObj.content.Kids || []);
}
/**
* Extrai layouts completos das páginas
*/
async extractPageFullLayouts(pageRefs) {
for (const pageRef of pageRefs) {
const pageId = pageRef.split(' ')[0];
const pageObj = this.getObject(pageId);
if (pageObj && pageObj.content) {
const pageLayout = await this.extractPageFullLayout(pageObj.content, pageObj.content.Resources || {});
this.pages.push(pageLayout);
}
}
}
/**
* Extrai layout completo da página
*/
async extractPageFullLayout(pageContent, resources) {
const pageLayout = {
number: this.pages.length + 1,
size: this.getPageSize(pageContent),
textElements: [],
graphics: [],
images: [],
transformations: [],
tables: [],
mediaBox: pageContent.MediaBox,
cropBox: pageContent.CropBox,
resources: resources
};
if (resources && resources.Font) {
await this.loadPageFonts(resources.Font);
}
if (pageContent.Contents) {
const contentElements = await this.extractContentWithFullLayout(pageContent.Contents, resources);
pageLayout.textElements = contentElements.textElements || [];
pageLayout.graphics = contentElements.graphics || [];
pageLayout.images = contentElements.images || [];
pageLayout.transformations = contentElements.transformations || [];
// Detecta tabelas
pageLayout.tables = this.detectTables(
contentElements.textElements || [],
contentElements.graphics || []
);
}
return pageLayout;
}
/**
* Extrai conteúdo com layout completo
*/
async extractContentWithFullLayout(contentsRef, resources) {
const result = {
textElements: [],
graphics: [],
images: [],
transformations: []
};
try {
if (Array.isArray(contentsRef)) {
for (const ref of contentsRef) {
const contentData = await this.extractContentStreamWithLayout(ref, resources);
result.textElements.push(...contentData.textElements);
result.graphics.push(...contentData.graphics);
result.images.push(...contentData.images);
result.transformations.push(...contentData.transformations);
}
} else {
const contentData = await this.extractContentStreamWithLayout(contentsRef, resources);
result.textElements = contentData.textElements;
result.graphics = contentData.graphics;
result.images = contentData.images;
result.transformations = contentData.transformations;
}
} catch (error) {
console.warn('Erro ao extrair conteúdo com layout:', error.message);
}
return result;
}
/**
* Extrai informações de imagens do PDF
*/
async extractImages() {
// Procura por objetos XObject do tipo imagem
for (const [objId, obj] of this.objects) {
if (obj.content && obj.content.Subtype === 'Image') {
this.images.set(objId, {
id: objId,
content: obj.content,
width: obj.content.Width,
height: obj.content.Height,
filter: obj.content.Filter,
colorSpace: obj.content.ColorSpace,
bitsPerComponent: obj.content.BitsPerComponent
});
}
}
}
/**
* Exporta o PDF inteiro como HTML
*/
exportToHTML(options = {}) {
const {
includeStyles = true,
includeMetadata = true,
preserveLayout = true,
includeImages = true,
includeTables = true
} = options;
let html = '<!DOCTYPE html>\n<html lang="pt-br">\n<head>\n';
html += '<meta charset="UTF-8">\n';
html += '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n';
html += '<title>Documento PDF Convertido</title>\n';
if (includeStyles) {
html += this.generateHTMLStyles();
}
if (includeMetadata) {
const info = this.getDocumentInfo();
if (info && info.Title) {
html += `<meta name="title" content="${this.escapeHtml(info.Title)}">\n`;
}
if (info && info.Author) {
html += `<meta name="author" content="${this.escapeHtml(info.Author)}">\n`;
}
}
html += '</head>\n<body>\n';
if (includeMetadata) {
html += this.generateHTMLMetadata();
}
// Adiciona conteúdo de todas as páginas
for (let i = 1; i <= this.getPageCount(); i++) {
html += this.exportPageToHTML(i, {
preserveLayout,
includeImages,
includeTables
});
}
html += '</body>\n</html>';
return html;
}
/**
* Exporta uma página específica como HTML
*/
exportPageToHTML(pageNumber, options = {}) {
const {
preserveLayout = true,
includeImages = true,
includeTables = true
} = options;
const page = this.getPageLayout(pageNumber);
let html = `<div class="pdf-page" data-page="${pageNumber}" `;
if (preserveLayout) {
html += `style="width: ${page.size.width}px; height: ${page.size.height}px; position: relative;"`;
}
html += '>\n';
// Adiciona tabelas primeiro (se incluídas)
if (includeTables && page.tables && page.tables.length > 0) {
html += this.exportTablesToHTML(pageNumber);
}
// Adiciona elementos de texto
html += this.exportTextElementsToHTML(pageNumber, preserveLayout);
// Adiciona imagens
if (includeImages && page.images && page.images.length > 0) {
html += this.exportImagesToHTML(pageNumber);
}
// Adiciona elementos gráficos
html += this.exportGraphicsToHTML(pageNumber);
html += '</div>\n';
return html;
}
/**
 * Returns the `<style>` block used by the layout-preserving HTML export.
 *
 * The stylesheet covers the page container, absolutely positioned text,
 * tables, images, line/rectangle graphics, and print rules.
 *
 * @returns {string} A `<style>` element, with a leading and trailing newline.
 */
generateHTMLStyles() {
return `
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}
.pdf-document {
max-width: 100%;
margin: 0 auto;
}
.pdf-metadata {
background-color: #fff;
padding: 20px;
margin-bottom: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.pdf-page {
background-color: white;
margin: 20px auto;
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
border: 1px solid #ddd;
}
.pdf-text {
position: absolute;
white-space: nowrap;
cursor: text;
}
.pdf-table {
border-collapse: collapse;
margin: 10px;
position: absolute;
}
.pdf-table-cell {
border: 1px solid #ddd;
padding: 4px 8px;
vertical-align: top;
}
.pdf-image {
position: absolute;
max-width: 100%;
height: auto;
}
.pdf-line {
position: absolute;
background-color: #000;
}
.pdf-rectangle {
position: absolute;
border: 1px solid #000;
}
.page-number {
text-align: center;
color: #666;
font-size: 12px;
margin-top: 10px;
}
@media print {
body {
background-color: white;
}
.pdf-page {
box-shadow: none;
border: none;
margin: 0;
page-break-after: always;
}
}
</style>
`;
}
/**
* Gera metadados HTML
*/
generateHTMLMetadata() {
const info = this.getDocumentInfo();
if (!info) return '';
let html = '<div class="pdf-metadata">\n';
html += '<h2>Metadados do Documento</h2>\n';
if (info.Title) {
html += `<p><strong>Título:</strong> ${this.escapeHtml(info.Title)}</p>\n`;
}
if (info.Author) {
html += `<p><strong>Autor:</strong> ${this.escapeHtml(info.Author)}</p>\n`;
}
if (info.Subject) {
html += `<p><strong>Assunto:</strong> ${this.escapeHtml(info.Subject)}</p>\n`;
}
if (info.Creator) {
html += `<p><strong>Criador:</strong> ${this.escapeHtml(info.Creator)}</p>\n`;
}
if (info.Producer) {
html += `<p><strong>Produtor:</strong> ${this.escapeHtml(info.Producer)}</p>\n`;
}
if (info.CreationDate) {
html += `<p><strong>Data de Criação:</strong> ${this.escapeHtml(info.CreationDate)}</p>\n`;
}
html += `<p><strong>Número de Páginas:</strong> ${this.getPageCount()}</p>\n`;
html += '</div>\n';
return html;
}
/**
* Exporta elementos de texto como HTML
*/
exportTextElementsToHTML(pageNumber, preserveLayout) {
const page = this.getPageLayout(pageNumber);
let html = '';
if (preserveLayout) {
// Texto com posicionamento preciso
const textElements = [...page.textElements].sort((a, b) => {
// Ordena por Y (de cima para baixo) e depois por X
if (Math.abs(b.position.y - a.position.y) > 5) {
return a.position.y - b.position.y;
}
return a.position.x - b.position.x;
});
for (const element of textElements) {
const style = this.getTextElementStyle(element, page.size);
html += `<div class="pdf-text" style="${style}">`;
html += this.escapeHtml(element.text);
html += '</div>\n';
}
} else {
// Texto como fluxo normal
const plainText = this.getPagePlainText(pageNumber);
const lines = plainText.split('\n');
for (const line of lines) {
if (line.trim()) {
html += `<p>${this.escapeHtml(line)}</p>\n`;
}
}
}
return html;
}
/**
* Obtém estilo CSS para elemento de texto
*/
getTextElementStyle(element, pageSize) {
const styles = [];
// Posicionamento
styles.push(`left: ${element.position.x}px`);
styles.push(`top: ${element.position.y}px`);
// Fonte
if (element.fontSize) {
styles.push(`font-size: ${element.fontSize}px`);
}
if (element.font) {
// Mapeia fontes PDF para CSS
const fontFamily = this.mapPDFFontToCSS(element.font);
styles.push(`font-family: ${fontFamily}`);
}
// Cor
if (element.color && element.color.r !== undefined) {
const color = `rgb(${element.color.r}, ${element.color.g}, ${element.color.b})`;
styles.push(`color: ${color}`);
}
return styles.join('; ');
}
/**
* Mapeia fontes PDF para CSS
*/
mapPDFFontToCSS(pdfFont) {
const fontMap = {
'Helvetica': 'Helvetica, Arial, sans-serif',
'Times-Roman': 'Times New Roman, serif',
'Courier': 'Courier New, monospace',
'Arial': 'Arial, sans-serif',
'Times': 'Times New Roman, serif'
};
return fontMap[pdfFont] || 'sans-serif';
}
/**
* Exporta tabelas como HTML
*/
exportTablesToHTML(pageNumber) {
const tables = this.getPageTables(pageNumber);
let html = '';
tables.forEach((table, index) => {
const style = `left: ${table.bounds.x}px; top: ${table.bounds.y}px;`;
html += `<table class="pdf-table" style="${style}" data-table-index="${index}">\n`;
table.structure.forEach((row, rowIndex) => {
html += '<tr>\n';
row.forEach((cell, cellIndex) => {
const cellStyle = cell.bounds ?
`width: ${cell.bounds.width}px;` : '';
html += `<td class="pdf-table-cell" style="${cellStyle}">`;
html += this.escapeHtml(cell.text || '');
html += '</td>\n';
});
html += '</tr>\n';
});
html += '</table>\n';
});
return html;
}
/**
* Exporta imagens como HTML
*/
exportImagesToHTML(pageNumber) {
const page = this.getPageLayout(pageNumber);
let html = '';
// Para simplificação, cria placeholders para imagens
// Em implementação real, seria necessário extrair os dados binários
page.images.forEach((image, index) => {
const style = image.position ?
`left: ${image.position.x}px; top: ${image.position.y}px;` : '';
html += `<div class="pdf-image" style="${style}" data-image-index="${index}">`;
html += `[Imagem ${index + 1}]`; // Placeholder
html += '</div>\n';
});
return html;
}
/**
* Exporta elementos gráficos como HTML
*/
exportGraphicsToHTML(pageNumber) {
const page = this.getPageLayout(pageNumber);
let html = '';
page.graphics.forEach((graphic, index) => {
if (graphic.type === 'lineTo' && graphic.x !== undefined && graphic.y !== undefined) {
// Linhas simples (simplificadas)
html += `<div class="pdf-line" data-graphic-index="${index}" `;
html += `style="left: ${graphic.x}px; top: ${graphic.y}px; width: 100px; height: 1px;"></div>\n`;
} else if (graphic.type === 'rectangle' && graphic.width && graphic.height) {
// Retângulos
const style = `left: ${graphic.x}px; top: ${graphic.y}px; width: ${graphic.width}px; height: ${graphic.height}px;`;
html += `<div class="pdf-rectangle" style="${style}" data-graphic-index="${index}"></div>\n`;
}
});
return html;
}
/**
* Exporta como HTML simplificado (sem posicionamento absoluto)
*/
exportToSimpleHTML(options = {}) {
const {
includeMetadata = true,
includeTables = true
} = options;
let html = '<!DOCTYPE html>\n<html lang="pt-br">\n<head>\n';
html += '<meta charset="UTF-8">\n';
html += '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n';
html += '<title>Documento PDF Convertido</title>\n';
html += '<style>\n';
html += `
body { font-family: Arial, sans-serif; margin: 20px; line-height: 1.6; }
.page { margin-bottom: 40px; padding: 20px; border: 1px solid #ddd; }
.page-header { text-align: center; color: #666; margin-bottom: 20px; }
table { border-collapse: collapse; margin: 10px 0; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background-color: #f2f2f2; }
`;
html += '</style>\n</head>\n<body>\n';
if (includeMetadata) {
html += this.generateHTMLMetadata();
}
html += '<div class="pdf-document">\n';
for (let i = 1; i <= this.getPageCount(); i++) {
html += `<div class="page">\n`;
html += `<div class="page-header">Página ${i}</div>\n`;
// Texto da página
const plainText = this.getPagePlainText(i);
const lines = plainText.split('\n');
lines.forEach(line => {
if (line.trim()) {
html += `<p>${this.escapeHtml(line)}</p>\n`;
}
});
// Tabelas da página
if (includeTables) {
const tables = this.getPageTables(i);
tables.forEach((table, tableIndex) => {
html += '<table>\n';
table.structure.forEach((row, rowIndex) => {
html += '<tr>\n';
row.forEach(cell => {
const tag = rowIndex === 0 ? 'th' : 'td';
html += `<${tag}>${this.escapeHtml(cell.text || '')}</${tag}>\n`;
});
html += '</tr>\n';
});
html += '</table>\n';
});
}
html += '</div>\n';
}
html += '</div>\n</body>\n</html>';
return html;
}
/**
* Exporta como HTML com suporte a responsividade
*/
exportToResponsiveHTML(options = {}) {
const {
maxWidth = 800,
includeStyles = true,
includeMetadata = true
} = options;
let html = '<!DOCTYPE html>\n<html lang="pt-br">\n<head>\n';
html += '<meta charset="UTF-8">\n';
html += '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n';
html += '<title>Documento PDF Convertido</title>\n';
if (includeStyles) {
html += this.generateResponsiveStyles(maxWidth);
}
html += '</head>\n<body>\n';
if (includeMetadata) {
html += this.generateHTMLMetadata();
}
html += '<div class="pdf-document">\n';
for (let i = 1; i <= this.getPageCount(); i++) {
html += `<section class="pdf-page" data-page="${i}">\n`;
html += `<header class="page-header">Página ${i}</header>\n`;
// Conteúdo da página
const plainText = this.getPagePlainText(i);
const lines = plainText.split('\n');
lines.forEach(line => {
if (line.trim()) {
html += `<p>${this.escapeHtml(line)}</p>\n`;
}
});
// Tabelas
const tables = this.getPageTables(i);
tables.forEach(table => {
html += '<div class="table-container">\n';
html += '<table>\n';
table.structure.forEach((row, rowIndex) => {
const tag = rowIndex === 0 ? 'thead' : 'tbody';
if (rowIndex === 0) html += '<thead>\n';
else if (rowIndex === 1) html += '<tbody>\n';
html += '<tr>\n';
row.forEach(cell => {
const cellTag = rowIndex === 0 ? 'th' : 'td';
html += `<${cellTag}>${this.escapeHtml(cell.text || '')}</${cellTag}>\n`;
});
html += '</tr>\n';
if (rowIndex === 0) html += '</thead>\n';
});
html += '</tbody>\n</table>\n</div>\n';
});
html += '</section>\n';
}
html += '</div>\n</body>\n</html>';
return html;
}
/**
 * Returns the `<style>` block used by the responsive HTML export.
 *
 * @param {number} maxWidth - Interpolated as the max-width, in px, of
 *   `.pdf-document`.
 * @returns {string} A `<style>` element, with a leading and trailing newline.
 */
generateResponsiveStyles(maxWidth) {
return `
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
margin: 0;
padding: 20px;
background-color: #f8f9fa;
color: #333;
line-height: 1.6;
}
.pdf-document {
max-width: ${maxWidth}px;
margin: 0 auto;
background: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
overflow: hidden;
}
.pdf-metadata {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 30px;
margin-bottom: 0;
}
.pdf-metadata h2 {
margin-top: 0;
color: white;
}
.pdf-metadata p {
margin: 10px 0;
opacity: 0.9;
}
.pdf-page {
padding: 30px;
border-bottom: 1px solid #eee;
}
.pdf-page:last-child {
border-bottom: none;
}
.page-header {
text-align: center;
color: #666;
font-size: 1.2em;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #eee;
}
p {
margin: 0 0 15px 0;
text-align: justify;
}
.table-container {
overflow-x: auto;
margin: 20px 0;
}
table {
width: 100%;
border-collapse: collapse;
margin: 10px 0;
background: white;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
th, td {
padding: 12px 15px;
text-align: left;
border: 1px solid #ddd;
}
th {
background: #f8f9fa;
font-weight: 600;
color: #555;
}
tbody tr:nth-child(even) {
background-color: #f8f9fa;
}
tbody tr:hover {
background-color: #e9ecef;
}
@media (max-width: 768px) {
body {
padding: 10px;
}
.pdf-document {
border-radius: 0;
}
.pdf-page {
padding: 20px 15px;
}
th, td {
padding: 8px 10px;
font-size: 0.9em;
}
.pdf-metadata {
padding: 20px 15px;
}
}
@media print {
body {
background: white;
padding: 0;
}
.pdf-document {
box-shadow: none;
border-radius: 0;
}
.pdf-page {
page-break-inside: avoid;
}
}
</style>
`;
}
/**
* Escapa caracteres HTML
*/
escapeHtml(text) {
if (typeof text !== 'string') return '';
return text
.replace(/&/g, '&amp;')
.replace(/</g, '<')
.replace(/>/g, '>')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#039;');
}
// Métodos de detecção de tabelas (mantidos do código anterior)
detectTables(textElements, graphicsElements) {
const tables = [];
// Detecção baseada em texto
const textBasedTables = this.detectTextBasedTables(textElements);
tables.push(...textBasedTables);
// Detecção baseada em gráficos
const graphicBasedTables = this.detectGraphicBasedTables(graphicsElements, textElements);
tables.push(...graphicBasedTables);
return tables;
}
detectTextBasedTables(textElements) {
const tables = [];
const lines = this.groupTextIntoLines(textElements);
const patterns = this.findTablePatterns(lines);
for (const pattern of patterns) {
const table = this.createTextBasedTable(pattern);
if (table) {
tables.push(table);
}
}
return tables;
}
detectGraphicBasedTables(graphicsElements, textElements) {
const tables = [];
const tableLines = this.findTableLines(graphicsElements);
const lineGroups = this.groupLinesIntoTables(tableLines);
for (const lineGroup of lineGroups) {
const table = this.analyzeTableStructure(lineGroup, textElements);
if (table) {
tables.push(table);
}
}
return tables;
}
// Métodos auxiliares para tabelas (mantidos do código anterior)
groupTextIntoLines(textElements) {
const lines = [];
const grouped = new Set();
const sortedElements = [...textElements].sort((a, b) => b.position.y - a.position.y);
for (let i = 0; i < sortedElements.length; i++) {
if (grouped.has(i)) continue;
const currentLine = [sortedElements[i]];
grouped.add(i);
const baseY = sortedElements[i].position.y;
for (let j = i + 1; j < sortedElements.length; j++) {
if (grouped.has(j)) continue;
const elementY = sortedElements[j].position.y;
if (Math.abs(elementY - baseY) < 5) {
currentLine.push(sortedElements[j]);
grouped.add(j);
}
}
currentLine.sort((a, b) => a.position.x - b.position.x);
lines.push(currentLine);
}
return lines;
}
findTablePatterns(lines) {
const patterns = [];
const minTableLines = 3;
for (let i = 0; i <= lines.length - minTableLines; i++) {
const potentialTable = lines.slice(i, i + minTableLines);
if (this.isTablePattern(potentialTable)) {
let endIndex = i + minTableLines;
while (endIndex < lines.length &&
this.extendsTablePattern(potentialTable, lines[endIndex])) {
potentialTable.push(lines[endIndex]);
endIndex++;
}
patterns.push({
startIndex: i,
endIndex: endIndex - 1,
lines: potentialTable
});
i = endIndex - 1;
}
}
return patterns;
}
isTablePattern(lines) {
if (lines.length < 3) return false;
const alignments = lines.map(line => this.getLineAlignments(line));
let similarAlignments = 0;
for (let i = 1; i < alignments.length; i++) {
if (this.alignmentsAreSimilar(alignments[0], alignments[i])) {
similarAlignments++;
}
}
return similarAlignments >= Math.floor(lines.length * 0.7);
}
extendsTablePattern(pattern, line) {
if (!line) return false;
const patternAlignments = this.getLineAlignments(pattern[0]);
const lineAlignments = this.getLineAlignments(line);
return this.alignmentsAreSimilar(patternAlignments, lineAlignments);
}
getLineAlignments(line) {
return line.map(element => Math.round(element.position.x / 10) * 10);
}
alignmentsAreSimilar(alignments1, alignments2) {
if (alignments1.length === 0 || alignments2.length === 0) return false;
const intersection = alignments1.filter(a1 =>
alignments2.some(a2 => Math.abs(a1 - a2) < 15)
);
const minLen = Math.min(alignments1.length, alignments2.length);
return intersection.length >= Math.floor(minLen * 0.6);
}
createTextBasedTable(pattern) {
try {
const table = [];
const bounds = this.calculatePatternBounds(pattern.lines);
for (const line of pattern.lines) {
const row = this.createTableRow(line, bounds);
table.push(row);
}
return {
bounds: bounds,
structure: table,
rowCount: table.length,
columnCount: Math.max(...table.map(row => row.length)),
type: 'text-based'
};
} catch (error) {
return null;
}
}
calculatePatternBounds(lines) {
if (lines.length === 0) return { x: 0, y: 0, width: 0, height: 0 };
let minX = Infinity;
let minY = Infinity;
let maxX = -Infinity;
let maxY = -Infinity;
for (const line of lines) {
for (const element of line) {
minX = Math.min(minX, element.position.x);
minY = Math.min(minY, element.position.y);
maxX = Math.max(maxX, element.position.x + (element.text.length * 6));
maxY = Math.max(maxY, element.position.y + (element.fontSize || 12));
}
}
return {
x: minX,
y: minY,
width: maxX - minX,
height: maxY - minY
};
}
createTableRow(line, tableBounds) {
const columns = this.groupLineIntoColumns(line);
return columns.map(col => ({
text: col.map(el => el.text).join(' '),
elements: col
}));
}
groupLineIntoColumns(line) {
const columns = [];
const grouped = new Set();
for (let i = 0; i < line.length; i++) {
if (grouped.has(i)) continue;
const currentColumn = [line[i]];
grouped.add(i);
const baseX = line[i].position.x;
for (let j = i + 1; j < line.length; j++) {
if (grouped.has(j)) continue;
const elementX = line[j].position.x;
if (Math.abs(elementX - baseX) < 30) {
currentColumn.push(line[j]);
grouped.add(j);
}
}
columns.push(currentColumn);
}
return columns;
}
// Métodos de detecção gráfica de tabelas
findTableLines(graphicsElements) {
const tableLines = [];
for (const element of graphicsElements) {
if (element.type === 'rectangle') {
if (this.isTableLikeRectangle(element)) {
tableLines.push({
type: 'rectangle',
bounds: {
x: element.x,
y: element.y,
width: element.width,
height: element.height
},
element: element
});
}
}
}
return tableLines;
}
isTableLikeRectangle(rect) {
const minWidth = 20;
const minHeight = 15;
const maxWidth = 1000;
const maxHeight = 200;
return rect.width >= minWidth &&
rect.height >= minHeight &&
rect.width <= maxWidth &&
rect.height <= maxHeight;
}
groupLinesIntoTables(lines) {
if (lines.length === 0) return [];
const groups = [];
const usedLines = new Set();
for (let i = 0; i < lines.length; i++) {
if (usedLines.has(i)) continue;
const currentGroup = [lines[i]];
usedLines.add(i);
for (let j = i + 1; j < lines.length; j++) {
if (usedLines.has(j)) continue;
if (this.areLinesRelated(lines[i], lines[j])) {
currentGroup.push(lines[j]);
usedLines.add(j);
}
}
if (currentGroup.length >= 2) {
groups.push(currentGroup);
}
}
return groups;
}
areLinesRelated(line1, line2) {
const bounds1 = line1.bounds;
const bounds2 = line2.bounds;
const maxDistance = 50;
const distance = Math.sqrt(
Math.pow(bounds1.x - bounds2.x, 2) +
Math.pow(bounds1.y - bounds2.y, 2)
);
return distance <= maxDistance;
}
analyzeTableStructure(lineGroup, textElements) {
try {
const bounds = this.calculateTableBounds(lineGroup);
const tableTextElements = this.findTextInBounds(textElements, bounds);
const cells = this.groupTextIntoCells(lineGroup, tableTextElements);
const { rows, cols } = this.determineTableDimensions(cells);
const tableStructure = this.organizeTableStructure(cells, rows, cols);
return {
bounds: bounds,
cells: cells,
structure: tableStructure,
rowCount: rows,
columnCount: cols,
textElements: tableTextElements
};
} catch (error) {
return null;
}
}
calculateTableBounds(lineGroup) {
if (lineGroup.length === 0) {
return { x: 0, y: 0, width: 0, height: 0 };
}
let minX = Infinity;
let minY = Infinity;
let maxX = -Infinity;
let maxY = -Infinity;
for (const line of lineGroup) {
const bounds = line.bounds;
minX = Math.min(minX, bounds.x);
minY = Math.min(minY, bounds.y);
maxX = Math.max(maxX, bounds.x + bounds.width);
maxY = Math.max(maxY, bounds.y + bounds.height);
}
return {
x: minX,
y: minY,
width: maxX - minX,
height: maxY - minY
};
}
findTextInBounds(textElements, bounds) {
return textElements.filter(element => {
const elementBounds = {
x: element.position.x,
y: element.position.y,
width: element.text.length * (element.fontSize || 12) * 0.6,
height: element.fontSize || 12
};
return this.rectanglesIntersect(bounds, elementBounds);
});
}
rectanglesIntersect(rect1, rect2) {
return rect1.x < rect2.x + rect2.width &&
rect1.x + rect1.width > rect2.x &&
rect1.y < rect2.y + rect2.height &&
rect1.y + rect1.height > rect2.y;
}
groupTextIntoCells(lineGroup, textElements) {
const cells = [];
for (const line of lineGroup) {
const cellText = this.findTextInBounds(textElements, line.bounds);
if (cellText.length > 0) {
cells.push({
bounds: line.bounds,
textElements: cellText,
text: this.extractCellText(cellText)
});
}
}
return cells;
}
extractCellText(textElements) {
if (textElements.length === 0) return '';
const sortedElements = [...textElements].sort((a, b) => {
if (Math.abs(b.position.y - a.position.y) > 5) {
return b.position.y - a.position.y;
}
return a.position.x - b.position.x;
});
return sortedElements.map(el => el.text).join(' ');
}
determineTableDimensions(cells) {
if (cells.length === 0) {
return { rows: 0, cols: 0 };
}
const rows = this.groupCellsIntoRows(cells);
const cols = this.groupCellsIntoColumns(cells);
return {
rows: rows.length,
cols: cols.length
};
}
groupCellsIntoRows(cells) {
const rows = [];
const grouped = new Set();
const sortedCells = [...cells].sort((a, b) => a.bounds.y - b.bounds.y);
for (let i = 0; i < sortedCells.length; i++) {
if (grouped.has(i)) continue;
const currentRow = [sortedCells[i]];
grouped.add(i);
const baseY = sortedCells[i].bounds.y;
for (let j = i + 1; j < sortedCells.length; j++) {
if (grouped.has(j)) continue;
const cellY = sortedCells[j].bounds.y;
if (Math.abs(cellY - baseY) < 10) {
currentRow.push(sortedCells[j]);
grouped.add(j);
}
}
currentRow.sort((a, b) => a.bounds.x - b.bounds.x);
rows.push(currentRow);
}
return rows;
}
groupCellsIntoColumns(cells) {
const cols = [];
const grouped = new Set();
const sortedCells = [...cells].sort((a, b) => a.bounds.x - b.bounds.x);
for (let i = 0; i < sortedCells.length; i++) {
if (grouped.has(i)) continue;
const currentCol = [sortedCells[i]];
grouped.add(i);
const baseX = sortedCells[i].bounds.x;
for (let j = i + 1; j < sortedCells.length; j++) {
if (grouped.has(j)) continue;
const cellX = sortedCells[j].bounds.x;
if (Math.abs(cellX - baseX) < 20) {
currentCol.push(sortedCells[j]);
grouped.add(j);
}
}
cols.push(currentCol);
}
return cols;
}
organizeTableStructure(cells, rowCount, columnCount) {
if (cells.length === 0) return [];
const rows = this.groupCellsIntoRows(cells);
const table = [];
for (let i = 0; i < rows.length; i++) {
const row = [];
const rowCells = rows[i];
for (let j = 0; j < rowCells.length; j++) {
row.push({
text: rowCells[j].text,
bounds: rowCells[j].bounds
});
}
table.push(row);
}
return table;
}
// Métodos existentes (mantidos do código anterior)
isValidPDF() {
const header = this.readString(8);
this.position = 0;
return header.startsWith('%PDF-');
}
skipHeader() {
const headerEnd = this.findString('\n', 0);
this.position = headerEnd + 1;
}
async parseXRef() {
const xrefPos = this.findXRefPosition();
if (xrefPos === -1) {
throw new Error('xref table não encontrado');
}
this.position = xrefPos;
this.readString(4);
while (this.position < this.buffer.length) {
const line = this.readLine().trim();
if (line === 'trailer') {
break;
}
if (this.isNumeric(line.split(' ')[0])) {
const [objNum, count] = line.split(' ').map(Number);
for (let i = 0; i < count; i++) {
const entry = this.readLine().trim();
const [offset, genNum, flag] = entry.split(' ');
this.xref[`${objNum + i}`] = {
offset: parseInt(offset),
genNum: parseInt(genNum),
inUse: flag === 'n'
};
}
}
}
}
findXRefPosition() {
for (let i = this.buffer.length - 10; i >= 0; i--) {
if (this.readStringAt(i, 9) === 'startxref') {
let pos = i + 9;
while (pos < this.buffer.length && this.isWhitespace(this.buffer[pos])) {
pos++;
}
let numStr = '';
while (pos < this.buffer.length && this.isNumericChar(this.buffer[pos])) {
numStr += String.fromCharCode(this.buffer[pos]);
pos++;
}
return parseInt(numStr);
}
}
return -1;
}
parseTrailer() {
const trailerStart = this.findString('trailer');
if (trailerStart === -1) {
throw new Error('trailer não encontrado');
}
this.position = trailerStart + 7;
this.skipWhitespace();
const trailerDict = this.parseDictionary();
this.trailer = trailerDict;
}
async parseObjects() {
for (const [objId, xrefEntry] of Object.entries(this.xref)) {
if (xrefEntry.inUse) {
try {
const obj = await this.parseObjectAt(xrefEntry.offset, objId);
this.objects.set(objId, obj);
} catch (error) {
console.warn(`Erro ao parsear objeto ${objId}: ${error.message}`);
}
}
}
}
async parseObjectAt(offset, objId) {
this.position = offset;
const objLine = this.readLine();
const objNumMatch = objLine.match(/^(\d+)\s+(\d+)\s+obj/);
if (!objNumMatch) {
throw new Error('Formato de objeto inválido');
}
const objNum = objNumMatch[1];
const genNum = objNumMatch[2];
const content = this.parseObjectContent();
return {
id: objNum,
gen: genNum,
content: content,
offset: offset
};
}
parseObjectContent() {
const startPos = this.position;
let content = '';
let inString = false;
while (this.position < this.buffer.length) {
const char = String.fromCharCode(this.buffer[this.position]);
if (char === 'e' && this.readStringAt(this.position, 6) === 'endobj') {
break;
}
if (char === '(' && !inString) {
inString = true;
} else if (char === ')' && inString) {
inString = false;
}
content += char;
this.position++;
}
try {
return this.parsePDFValue(content.trim());
} catch (error) {
return content.trim();
}
}
parsePDFValue(value) {
value = value.trim();
if (value.startsWith('[') && value.endsWith(']')) {
return this.parseArray(value);
}
if (value.startsWith('<<') && value.endsWith('>>')) {
return this.parseDictionaryFromString(value);
}
if (value.startsWith('(') && value.endsWith(')')) {
return this.parseString(value);
}
if (value.startsWith('<') && value.endsWith('>') && !value.startsWith('<<')) {
return this.parseHexString(value);
}
if (value.startsWith('/')) {
return value.substring(1);
}
if (this.isNumeric(value)) {
return parseFloat(value);
}
if (value === 'true' || value === 'false') {
return value === 'true';
}
if (value === 'null') {
return null;
}
return value;
}
parseArray(arrayStr) {
const content = arrayStr.substring(1, arrayStr.length - 1).trim();
const items = [];
let current = '';
let depth = 0;
let inString = false;
let inHexString = false;
for (let i = 0; i < content.length; i++) {
const char = content[i];
if (char === '(' && !inString && !inHexString) {
inString = true;
current += char;
} else if (char === ')' && inString) {
inString = false;
current += char;
} else if (char === '<' && !inString && !inHexString && content[i+1] !== '<') {
inHexString = true;
current += char;
} else if (char === '>' && inHexString) {
inHexString = false;
current += char;
} else if (!inString && !inHexString && char === '[') {
depth++;
current += char;
} else if (!inString && !inHexString && char === ']') {
depth--;
current += char;
} else if (!inString && !inHexString && char === ' ' && depth === 0) {
if (current.trim()) {
items.push(this.parsePDFValue(current.trim()));
}
current = '';
} else {
current += char;
}
}
if (current.trim()) {
items.push(this.parsePDFValue(current.trim()));
}
return items;
}
parseDictionaryFromString(dictStr) {
const dict = {};
const content = dictStr.substring(2, dictStr.length - 2).trim();
const pairs = this.splitDictionaryPairs(content);
for (const pair of pairs) {
const parts = pair.trim().split(/\s+/);
if (parts.length >= 2) {
const key = parts[0].substring(1);
const valueStr = parts.slice(1).join(' ');
dict[key] = this.parsePDFValue(valueStr);
}
}
return dict;
}
splitDictionaryPairs(content) {
const pairs = [];
let current = '';
let depth = 0;
let inString = false;
let inHexString = false;
for (let i = 0; i < content.length; i++) {
const char = content[i];
if (char === '(' && !inString && !inHexString) {
inString = true;
current += char;
} else if (char === ')' && inString) {
inString = false;
current += char;
} else if (char === '<' && !inString && !inHexString && content[i+1] !== '<') {
inHexString = true;
current += char;
} else if (char === '>' && inHexString) {
inHexString = false;
current += char;
} else if (!inString && !inHexString && (char === '[' || char === '<<')) {
depth++;
current += char;
} else if (!inString && !inHexString && (char === ']' || (char === '>' && content[i+1] === '>'))) {
depth--;
current += char;
if (char === '>' && content[i+1] === '>') i++;
} else if (!inString && !inHexString && char === '/' && depth === 0) {
if (current.trim()) {
pairs.push(current.trim());
}
current = char;
} else {
current += char;
}
}
if (current.trim()) {
pairs.push(current.trim());
}
return pairs;
}
parseString(str) {
let content = str.substring(1, str.length - 1);
content = content.replace(/\\n/g, '\n')
.replace(/\\r/g, '\r')
.replace(/\\t/g, '\t')
.replace(/\\\\/g, '\\')
.replace(/\\\)/g, ')')
.replace(/\\\(/g, '(')
.replace(/\\([0-7]{1,3})/g, (match, octal) => {
return String.fromCharCode(parseInt(octal, 8));
});
return content;
}
parseHexString(hexStr) {
const hexContent = hexStr.substring(1, hexStr.length - 1);
let result = '';
for (let i = 0; i < hexContent.length; i += 2) {
const hexPair = hexContent.substr(i, 2);
const charCode = parseInt(hexPair, 16);
if (!isNaN(charCode)) {
result += String.fromCharCode(charCode);
}
}
return result;
}
async extractContentStreamWithLayout(contentRef, resources) {
const contentId = contentRef.split(' ')[0];
const contentObj = this.getObject(contentId);
if (!contentObj) return { textElements: [], graphics: [], images: [], transformations: [] };
let contentStream = '';
if (typeof contentObj.content === 'string') {
contentStream = contentObj.content;
} else if (contentObj.content && contentObj.content.stream) {
contentStream = contentObj.content.stream;
} else if (contentObj.stream) {
contentStream = contentObj.stream;
} else {
contentStream = await this.extractRawStream(contentObj);
}
return this.parseContentStreamWithLayout(contentStream, resources);
}
parseContentStreamWithLayout(contentStream, resources) {
const result = {
textElements: [],
graphics: [],
images: [],
transformations: []
};
const lines = contentStream.split('\n');
let textState = {
inTextObject: false,
currentFont: null,
fontSize: 12,
charSpacing: 0,
wordSpacing: 0,
textLeading: 0,
textMatrix: [1, 0, 0, 1, 0, 0],
textLineMatrix: [1, 0, 0, 1, 0, 0],
position: { x: 0, y: 0 },
color: { r: 0, g: 0, b: 0 }
};
let graphicsState = {
currentTransformationMatrix: [1, 0, 0, 1, 0, 0],
lineWidth: 1,
lineCap: 0,
lineJoin: 0,
fillColor: { r: 0, g: 0, b: 0 },
strokeColor: { r: 0, g: 0, b: 0 }
};
for (const line of lines) {
const trimmedLine = line.trim();
const commands = this.parseLineCommands(trimmedLine);
for (const command of commands) {
this.processPDFCommand(command, textState, graphicsState, result);
}
}
return result;
}
parseLineCommands(line) {
const commands = [];
let currentCommand = '';
let inString = false;
let stringDelimiter = '';
for (let i = 0; i < line.length; i++) {
const char = line[i];
if (!inString && (char === '(' || char === '<')) {
inString = true;
stringDelimiter = char;
currentCommand += char;
} else if (inString &&
((stringDelimiter === '(' && char === ')') ||
(stringDelimiter === '<' && char === '>'))) {
inString = false;
currentCommand += char;
} else if (!inString && char === ' ') {
if (currentCommand.trim()) {
commands.push(currentCommand.trim());
}
currentCommand = '';
} else {
currentCommand += char;
}
}
if (currentCommand.trim()) {
commands.push(currentCommand.trim());
}
return commands;
}
processPDFCommand(command, textState, graphicsState, result) {
if (command === 'BT') {
textState.inTextObject = true;
textState.textMatrix = [1, 0, 0, 1, 0, 0];
textState.textLineMatrix = [1, 0, 0, 1, 0, 0];
} else if (command === 'ET') {
textState.inTextObject = false;
} else if (textState.inTextObject) {
this.processTextCommand(command, textState, result);
} else {
this.processGraphicsCommand(command, graphicsState, result);
}
}
processTextCommand(command, textState, result) {
if (command === 'Tm') {
const matrix = this.parseMatrixCommand(command);
if (matrix) {
textState.textMatrix = matrix;
textState.textLineMatrix = [...matrix];
textState.position = { x: matrix[4], y: matrix[5] };
}
} else if (command === 'Td' || command === 'TD') {
const coords = this.parseCoordinateCommand(command);
if (coords) {
textState.textMatrix[4] += coords.x;
textState.textMatrix[5] += coords.y;
textState.position = { x: textState.textMatrix[4], y: textState.textMatrix[5] };
if (command === 'TD') {
textState.textLeading = -coords.y;
}
}
} else if (command === 'T*') {
textState.textMatrix[5] -= textState.textLeading;
textState.position = { x: textState.textMatrix[4], y: textState.textMatrix[5] };
} else if (command.endsWith('Tf')) {
const fontMatch = command.match(/\/([^\s]+)\s+(-?\d+(?:\.\d+)?)\s+Tf/);
if (fontMatch) {
textState.currentFont = fontMatch[1];
textState.fontSize = parseFloat(fontMatch[2]);
}
} else if (command.endsWith('Tj') || command.endsWith('TJ') || command.startsWith('(') || command.startsWith('<')) {
this.extractTextElement(command, textState, result);
}
}
processGraphicsCommand(command, graphicsState, result) {
if (command === 'cm') {
const matrix = this.parseMatrixCommand(command);
if (matrix) {
graphicsState.currentTransformationMatrix = matrix;
result.transformations.push({
type: 'transformation',
matrix: matrix,
position: { x: matrix[4], y: matrix[5] }
});
}
} else if (command.endsWith('w')) {
const width = parseFloat(command);
if (!isNaN(width)) {
graphicsState.lineWidth = width;
}
} else if (command.endsWith('rg')) {
const colors = command.split(' ').slice(0, 3).map(parseFloat);
if (colors.length === 3 && colors.every(c => !isNaN(c))) {
graphicsState.fillColor = {
r: Math.round(colors[0] * 255),
g: Math.round(colors[1] * 255),
b: Math.round(colors[2] * 255)
};
}
} else if (command.endsWith('RG')) {
const colors = command.split(' ').slice(0, 3).map(parseFloat);
if (colors.length === 3 && colors.every(c => !isNaN(c))) {
graphicsState.strokeColor = {
r: Math.round(colors[0] * 255),
g: Math.round(colors[1] * 255),
b: Math.round(colors[2] * 255)
};
}
} else if (command.endsWith('m')) {
const coords = this.parseCoordinateCommand(command);
if (coords) {
result.graphics.push({
type: 'moveTo',
x: coords.x,
y: coords.y,
color: graphicsState.strokeColor
});
}
} else if (command.endsWith('l')) {
const coords = this.parseCoordinateCommand(command);
if (coords) {
result.graphics.push({
type: 'lineTo',
x: coords.x,
y: coords.y,
color: graphicsState.strokeColor
});
}
} else if (command.endsWith('re')) {
const coords = this.parseRectangleCommand(command);
if (coords) {
result.graphics.push({
type: 'rectangle',
x: coords.x,
y: coords.y,
width: coords.width,
height: coords.height,
fillColor: graphicsState.fillColor,
strokeColor: graphicsState.strokeColor
});
}
} else if (command === 'Do') {
const xobjectMatch = command.match(/\/([^\s]+)/);
if (xobjectMatch) {
result.images.push({
type: 'image',
name: xobjectMatch[1],
position: { x: 0, y: 0 }
});
}
}
}
extractTextElement(command, textState, result) {
let text = '';
let textCommands = [];
const stringMatches = command.match(/\(([^)]*)\)/g);
if (stringMatches) {
for (const match of stringMatches) {
const stringContent = match.substring(1, match.length - 1);
text += this.decodePDFString(stringContent) + ' ';
textCommands.push({
text: this.decodePDFString(stringContent),
type: 'string'
});
}
}
const hexMatches = command.match(/<([0-9A-Fa-f\s]*)>/g);
if (hexMatches) {
for (const match of hexMatches) {
const hexContent = match.substring(1, match.length - 1);
text += this.decodeHexString(hexContent) + ' ';
textCommands.push({
text: this.decodeHexString(hexContent),
type: 'hex'
});
}
}
if (command.endsWith('TJ')) {
const arrayMatch = command.match(/\[(.*?)\]/);
if (arrayMatch) {
const arrayContent = arrayMatch[1];
text = this.parseTJArray(arrayContent);
textCommands = [{ text: text, type: 'TJ' }];
}
}
if (text.trim()) {
const textElement = {
text: text.trim(),
font: textState.currentFont,
fontSize: textState.fontSize,
position: { ...textState.position },
matrix: [...textState.textMatrix],
color: { ...textState.color },
charSpacing: textState.charSpacing,
wordSpacing: textState.wordSpacing
};
result.textElements.push(textElement);
}
}
parseTJArray(arrayContent) {
let text = '';
const items = arrayContent.split(/\s+(?![^(]*\))/);
for (const item of items) {
if (item.startsWith('(') && item.endsWith(')')) {
const stringContent = item.substring(1, item.length - 1);
text += this.decodePDFString(stringContent);
} else if (item.startsWith('<') && item.endsWith('>')) {
const hexContent = item.substring(1, item.length - 1);
text += this.decodeHexString(hexContent);
} else if (this.isNumeric(item)) {
const adjustment = parseFloat(item);
if (Math.abs(adjustment) > 100) {
text += ' ';
}
}
}
return text;
}
parseMatrixCommand(commandLine) {
const numbers = commandLine.split(' ').slice(0, 6).map(parseFloat);
if (numbers.length === 6 && numbers.every(n => !isNaN(n))) {
return numbers;
}
return null;
}
parseCoordinateCommand(commandLine) {
const parts = commandLine.split(' ');
if (parts.length >= 2) {
const x = parseFloat(parts[parts.length - 2]);
const y = parseFloat(parts[parts.length - 1]);
if (!isNaN(x) && !isNaN(y)) {
return { x, y };
}
}
return null;
}
parseRectangleCommand(commandLine) {
const parts = commandLine.split(' ');
if (parts.length >= 4) {
const coords = parts.slice(0, 4).map(parseFloat);
if (coords.every(c => !isNaN(c))) {
return {
x: coords[0],
y: coords[1],
width: coords[2],
height: coords[3]
};
}
}
return null;
}
decodePDFString(str) {
return str.replace(/\\n/g, '\n')
.replace(/\\r/g, '\r')
.replace(/\\t/g, '\t')
.replace(/\\\\/g, '\\')
.replace(/\\\)/g, ')')
.replace(/\\\(/g, '(')
.replace(/\\([0-7]{1,3})/g, (match, octal) => {
return String.fromCharCode(parseInt(octal, 8));
});
}
decodeHexString(hexStr) {
hexStr = hexStr.replace(/\s/g, '');
let result = '';
for (let i = 0; i < hexStr.length; i += 2) {
const hexPair = hexStr.substr(i, 2);
const charCode = parseInt(hexPair, 16);
if (!isNaN(charCode)) {
result += String.fromCharCode(charCode);
}
}
return result;
}
async extractRawStream(obj) {
if (!obj || !obj.offset) return '';
const objData = this.buffer.slice(obj.offset);
const streamStart = this.findStringInBuffer(objData, 'stream');
const streamEnd = this.findStringInBuffer(objData, 'endstream');
if (streamStart === -1 || streamEnd === -1) return '';
let contentStart = streamStart + 6;
while (contentStart < objData.length &&
(objData[contentStart] === 0x0A || objData[contentStart] === 0x0D)) {
contentStart++;
}
const contentBytes = objData.slice(contentStart, streamEnd);
return this.decodeStreamContent(contentBytes, obj.content?.Filter);
}
decodeStreamContent(bytes, filter) {
if (!filter) {
return this.bytesToString(bytes);
}
if (filter.includes('FlateDecode') || filter === 'FlateDecode') {
try {
return this.bytesToString(bytes);
} catch (error) {
console.warn('Falha na descompressão FlateDecode');
return this.bytesToString(bytes);
}
}
return this.bytesToString(bytes);
}
bytesToString(bytes) {
return Array.from(bytes)
.map(byte => String.fromCharCode(byte))
.join('');
}
getPageSize(pageContent) {
let box = pageContent.MediaBox || pageContent.CropBox;
if (box) {
if (Array.isArray(box)) {
return {
width: box[2] - box[0],
height: box[3] - box[1],
x: box[0],
y: box[1]
};
}
}
return { width: 612, height: 792 };
}
async loadPageFonts(fontDict) {
for (const [fontName, fontRef] of Object.entries(fontDict)) {
if (typeof fontRef === 'string') {
const fontId = fontRef.split(' ')[0];
const fontObj = this.getObject(fontId);
if (fontObj) {
this.fonts.set(fontName, {
id: fontId,
content: fontObj.content || {},
encoding: this.getFontEncoding(fontObj.content)
});
}
}
}
}
getFontEncoding(fontContent) {
if (!fontContent) return 'StandardEncoding';
if (fontContent.Encoding) {
return fontContent.Encoding;
}
if (fontContent.BaseFont) {
const baseFont = fontContent.BaseFont;
if (baseFont.includes('WinAnsi')) return 'WinAnsiEncoding';
if (baseFont.includes('MacRoman')) return 'MacRomanEncoding';
}
return 'StandardEncoding';
}
readString(length) {
let str = '';
const endPos = Math.min(this.position + length, this.buffer.length);
for (let i = this.position; i < endPos; i++) {
str += String.fromCharCode(this.buffer[i]);
}
this.position += length;
return str;
}
readStringAt(position, length) {
let str = '';
const endPos = Math.min(position + length, this.buffer.length);
for (let i = position; i < endPos; i++) {
str += String.fromCharCode(this.buffer[i]);
}
return str;
}
readLine() {
let line = '';
while (this.position < this.buffer.length) {
const char = this.buffer[this.position];
if (char === '\n'.charCodeAt(0) || char === '\r'.charCodeAt(0)) {
this.position++;
if (char === '\r'.charCodeAt(0) && this.buffer[this.position] === '\n'.charCodeAt(0)) {
this.position++;
}
break;
}
line += String.fromCharCode(char);
this.position++;
}
return line;
}
skipWhitespace() {
while (this.position < this.buffer.length) {
const char = this.buffer[this.position];
if (!this.isWhitespace(char)) {
break;
}
this.position++;
}
}
findString(str, startPos = this.position) {
const encoder = new TextEncoder();
const searchBytes = encoder.encode(str);
for (let i = startPos; i <= this.buffer.length - searchBytes.length; i++) {
let found = true;
for (let j = 0; j < searchBytes.length; j++) {
if (this.buffer[i + j] !== searchBytes[j]) {
found = false;
break;
}
}
if (found) {
return i;
}
}
return -1;
}
findStringInBuffer(buffer, str) {
const encoder = new TextEncoder();
const searchBytes = encoder.encode(str);
for (let i = 0; i <= buffer.length - searchBytes.length; i++) {
let found = true;
for (let j = 0; j < searchBytes.length; j++) {
if (buffer[i + j] !== searchBytes[j]) {
found = false;
break;
}
}
if (found) {
return i;
}
}
return -1;
}
isWhitespace(char) {
return char === 0x20 || char === 0x0A || char === 0x0D || char === 0x09 || char === 0x0C;
}
isNumericChar(char) {
return (char >= 0x30 && char <= 0x39) || char === 0x2D || char === 0x2E;
}
isNumeric(str) {
return /^-?\d+\.?\d*$/.test(str);
}
parseDictionary() {
const dict = {};
this.skipWhitespace();
if (this.readString(2) !== '<<') {
throw new Error('Dicionário deve começar com <<');
}
while (this.position < this.buffer.length) {
this.skipWhitespace();
if (this.readStringAt(this.position, 2) === '>>') {
this.position += 2;
break;
}
const key = this.parseName();
if (key) {
this.skipWhitespace();
const value = this.parseNextValue();
dict[key] = value;
}
}
return dict;
}
parseName() {
if (this.buffer[this.position] !== '/'.charCodeAt(0)) {
return null;
}
this.position++;
let name = '';
while (this.position < this.buffer.length) {
const char = this.buffer[this.position];
if (this.isWhitespace(char) || char === '/'.charCodeAt(0) ||
char === '('.charCodeAt(0) || char === '<'.charCodeAt(0)) {
break;
}
name += String.fromCharCode(char);
this.position++;
}
return name;
}
parseNextValue() {
this.skipWhitespace();
const char = String.fromCharCode(this.buffer[this.position]);
if (char === '[') {
return this.parseArrayValue();
} else if (char === '<' && this.buffer[this.position + 1] === '<'.charCodeAt(0)) {
return this.parseDictionary();
} else if (char === '/') {
return this.parseName();
} else if (char === '(') {
return this.parseStringValue();
} else {
return this.parseSimpleValue();
}
}
parseArrayValue() {
this.position++;
const array = [];
while (this.position < this.buffer.length) {
this.skipWhitespace();
if (this.buffer[this.position] === ']'.charCodeAt(0)) {
this.position++;
break;
}
array.push(this.parseNextValue());
}
return array;
}
parseStringValue() {
this.position++;
let str = '';
let parenCount = 1;
while (this.position < this.buffer.length && parenCount > 0) {
const char = this.buffer[this.position];
if (char === '('.charCodeAt(0)) {
parenCount++;
str += '(';
} else if (char === ')'.charCodeAt(0)) {
parenCount--;
if (parenCount > 0) {
str += ')';
}
} else if (char === '\\'.charCodeAt(0)) {
this.position++;
if (this.position < this.buffer.length) {
const nextChar = this.buffer[this.position];
switch (nextChar) {
case 'n'.charCodeAt(0): str += '\n'; break;
case 'r'.charCodeAt(0): str += '\r'; break;
case 't'.charCodeAt(0): str += '\t'; break;
default: str += String.fromCharCode(nextChar);
}
}
} else {
str += String.fromCharCode(char);
}
this.position++;
}
return str;
}
parseSimpleValue() {
let value = '';
while (this.position < this.buffer.length) {
const char = this.buffer[this.position];
if (this.isWhitespace(char) || char === '/'.charCodeAt(0) ||
char === '('.charCodeAt(0) || char === '<'.charCodeAt(0) ||
char === '['.charCodeAt(0) || char === ']'.charCodeAt(0)) {
break;
}
value += String.fromCharCode(char);
this.position++;
}
if (value === 'true') return true;
if (value === 'false') return false;
if (value === 'null') return null;
if (this.isNumeric(value)) return parseFloat(value);
return value;
}
// Public methods for accessing the parsed data
getObjects() {
return this.objects;
}
getObject(id) {
return this.objects.get(id.toString());
}
getTrailer() {
return this.trailer;
}
getDocumentInfo() {
const infoRef = this.trailer.Info;
if (infoRef && typeof infoRef === 'string') {
const infoId = infoRef.split(' ')[0];
const infoObj = this.getObject(infoId);
return infoObj ? infoObj.content : null;
}
return null;
}
getPageCount() {
return this.pages.length;
}
getPageLayout(pageNumber) {
if (pageNumber < 1 || pageNumber > this.pages.length) {
throw new Error(`Página ${pageNumber} não encontrada`);
}
return this.pages[pageNumber - 1];
}
getPageTables(pageNumber) {
const page = this.getPageLayout(pageNumber);
return page.tables || [];
}
getTable(pageNumber, tableIndex) {
const tables = this.getPageTables(pageNumber);
if (tableIndex < 0 || tableIndex >= tables.length) {
throw new Error(`Tabela ${tableIndex} não encontrada na página ${pageNumber}`);
}
return tables[tableIndex];
}
getTableAsCSV(pageNumber, tableIndex) {
const table = this.getTable(pageNumber, tableIndex);
let csv = '';
for (const row of table.structure) {
const csvRow = row.map(cell => {
let cellText = cell.text || '';
if (cellText.includes(',') || cellText.includes('"') || cellText.includes('\n')) {
cellText = '"' + cellText.replace(/"/g, '""') + '"';
}
return cellText;
}).join(',');
csv += csvRow + '\n';
}
return csv.trim();
}
getPageTablesAsCSV(pageNumber) {
const tables = this.getPageTables(pageNumber);
return tables.map((table, index) => ({
index: index,
csv: this.getTableAsCSV(pageNumber, index),
bounds: table.bounds
}));
}
getTableAsJSON(pageNumber, tableIndex) {
const table = this.getTable(pageNumber, tableIndex);
const headers = table.structure[0]?.map(cell => cell.text) || [];
const rows = table.structure.slice(1);
const jsonArray = rows.map(row => {
const obj = {};
row.forEach((cell, index) => {
const header = headers[index] || `col_${index}`;
obj[header] = cell.text || '';
});
return obj;
});
return jsonArray;
}
getTableStatistics(pageNumber) {
const tables = this.getPageTables(pageNumber);
return tables.map((table, index) => ({
index: index,
rowCount: table.rowCount,
columnCount: table.columnCount,
bounds: table.bounds,
cellCount: table.cells?.length || 0
}));
}
searchTables(pageNumber, searchTerm) {
const tables = this.getPageTables(pageNumber);
const results = [];
const lowerSearchTerm = searchTerm.toLowerCase();
tables.forEach((table, index) => {
let found = false;
for (const row of table.structure) {
for (const cell of row) {
if (cell.text && cell.text.toLowerCase().includes(lowerSearchTerm)) {
found = true;
break;
}
}
if (found) break;
}
if (found) {
results.push({
tableIndex: index,
table: table
});
}
});
return results;
}
getTableAsString(pageNumber, tableIndex) {
const table = this.getTable(pageNumber, tableIndex);
let result = '';
for (const row of table.structure) {
const rowText = row.map(cell => cell.text || '').join('\t');
result += rowText + '\n';
}
return result.trim();
}
hasTables(pageNumber) {
const tables = this.getPageTables(pageNumber);
return tables.length > 0;
}
getAllTables() {
const allTables = [];
for (let i = 1; i <= this.getPageCount(); i++) {
const pageTables = this.getPageTables(i);
pageTables.forEach((table, tableIndex) => {
allTables.push({
page: i,
tableIndex: tableIndex,
table: table
});
});
}
return allTables;
}
exportAllTablesAsJSON() {
const allTables = this.getAllTables();
const result = [];
allTables.forEach(({ page, tableIndex, table }) => {
result.push({
page: page,
tableIndex: tableIndex,
bounds: table.bounds,
data: this.getTableAsJSON(page, tableIndex)
});
});
return result;
}
exportAllTablesAsCSV() {
let csvOutput = '';
for (let i = 1; i <= this.getPageCount(); i++) {
const pageTables = this.getPageTablesAsCSV(i);
if (pageTables.length > 0) {
csvOutput += `=== Página ${i} ===\n`;
pageTables.forEach(table => {
csvOutput += table.csv + '\n\n';
});
}
}
return csvOutput.trim();
}
getPageTextWithPosition(pageNumber) {
const page = this.getPageLayout(pageNumber);
return page.textElements.map(element => ({
text: element.text,
x: element.position.x,
y: element.position.y,
font: element.font,
fontSize: element.fontSize
}));
}
getPagePlainText(pageNumber) {
const page = this.getPageLayout(pageNumber);
const textElements = [...page.textElements];
textElements.sort((a, b) => {
if (Math.abs(b.position.y - a.position.y) > 5) {
return b.position.y - a.position.y;
}
return a.position.x - b.position.x;
});
const lines = [];
let currentLine = [];
let lastY = null;
for (const element of textElements) {
if (lastY === null || Math.abs(element.position.y - lastY) > 5) {
if (currentLine.length > 0) {
lines.push(currentLine);
}
currentLine = [element];
lastY = element.position.y;
} else {
currentLine.push(element);
}
}
if (currentLine.length > 0) {
lines.push(currentLine);
}
let result = '';
for (const line of lines) {
line.sort((a, b) => a.position.x - b.position.x);
const lineText = line.map(el => el.text).join(' ');
result += lineText + '\n';
}
return result.trim();
}
getDocumentTextFormatted() {
let result = '';
for (let i = 1; i <= this.getPageCount(); i++) {
result += `=== Página ${i} ===\n`;
result += this.getPagePlainText(i) + '\n\n';
}
return result.trim();
}
searchPageText(pageNumber, searchTerm) {
const page = this.getPageLayout(pageNumber);
const results = [];
const lowerSearchTerm = searchTerm.toLowerCase();
page.textElements.forEach((element, index) => {
if (element.text.toLowerCase().includes(lowerSearchTerm)) {
results.push({
element: element,
index: index,
page: pageNumber
});
}
});
return results;
}
getPageGraphics(pageNumber) {
const page = this.getPageLayout(pageNumber);
return page.graphics;
}
getPageTransformations(pageNumber) {
const page = this.getPageLayout(pageNumber);
return page.transformations;
}
getPageBoundingBox(pageNumber) {
const page = this.getPageLayout(pageNumber);
return page.size;
}
convertToStandardCoordinates(pageNumber, x, y) {
const page = this.getPageLayout(pageNumber);
const pageSize = page.size;
return {
x: x,
y: pageSize.height - y
};
}
}
// Export the class.
// CommonJS (Node): expose PDFParser as the module's export.
if (typeof module !== 'undefined' && module.exports) {
module.exports = PDFParser;
// Otherwise, when a `window` global exists, attach PDFParser to it.
} else if (typeof window !== 'undefined') {
window.PDFParser = PDFParser;
}
// Usage example:
/*
async function exemploExportacaoHTML() {
try {
const buffer = await fs.readFile('documento.pdf');
const parser = new PDFParser();
await parser.loadPDF(buffer);
// Exporta como HTML completo com layout
const htmlCompleto = parser.exportToHTML({
includeStyles: true,
includeMetadata: true,
preserveLayout: true,
includeImages: true,
includeTables: true
});
// Salva o HTML
await fs.writeFile('documento.html', htmlCompleto);
console.log('HTML exportado com sucesso!');
// Exporta como HTML simplificado
const htmlSimples = parser.exportToSimpleHTML({
includeMetadata: true,
includeTables: true
});
// Exporta como HTML responsivo
const htmlResponsivo = parser.exportToResponsiveHTML({
maxWidth: 800,
includeStyles: true,
includeMetadata: true
});
console.log('Todas as exportações concluídas!');
} catch (error) {
console.error('Erro:', error.message);
}
}
*/
// (Web page footer captured with the gist, not part of the script:)
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment