Last active
June 30, 2025 02:15
-
-
Save celsowm/bc7aea119016e611f3249082eb731a5c to your computer and use it in GitHub Desktop.
html2pdf.c
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <ctype.h> // For isspace | |
#include <libxml/HTMLparser.h> | |
#include <libxml/tree.h> | |
#include <hpdf.h> | |
// Global PDF layout settings.
// Each macro expands to a single float literal so it can be used safely
// inside arithmetic expressions.
#define PAGE_WIDTH 595.28f        // A4 paper width in points (210mm)
#define PAGE_HEIGHT 841.89f       // A4 paper height in points (297mm)
#define MARGIN_TOP 50.0f
#define MARGIN_BOTTOM 50.0f
#define MARGIN_LEFT 50.0f
#define MARGIN_RIGHT 50.0f
#define LINE_SPACING 1.2f         // Multiplier for line height
#define LIST_INDENT 20.0f         // Extra left indent per list nesting level
#define BLOCKQUOTE_INDENT 25.0f   // Extra left indent while the blockquote flag is set
// --- ESTRUTURAS DE DADOS REATORADAS --- | |
// (REFATORADO) O estado da fonte agora usa flags para negrito/itálico, | |
// tornando o aninhamento de estilos (ex: <b><i>) mais fácil de gerenciar. | |
typedef struct { | |
HPDF_Font font; | |
float size; | |
HPDF_RGBColor color; | |
HPDF_BOOL underline; | |
HPDF_BOOL is_bold; | |
HPDF_BOOL is_italic; | |
} FontState; | |
// A estrutura principal do estado do documento. Sem grandes mudanças. | |
typedef struct { | |
HPDF_Doc pdf; | |
HPDF_Page page; | |
float current_y; | |
float current_x; | |
int list_level; | |
int ol_counter[10]; // Suporta listas ordenadas aninhadas em até 10 níveis | |
HPDF_BOOL is_preformatted; | |
// Fontes pré-carregadas | |
HPDF_Font font_regular; | |
HPDF_Font font_bold; | |
HPDF_Font font_italic; | |
HPDF_Font font_bold_italic; | |
HPDF_Font font_mono; | |
// Estado de renderização atual | |
FontState current_style; | |
} PdfState; | |
// Font families a block style can request; mapped to a concrete
// HPDF_Font by get_font_by_type().
typedef enum FontType {
    FONT_TYPE_REGULAR = 0,
    FONT_TYPE_BOLD,
    FONT_TYPE_ITALIC,
    FONT_TYPE_MONO
} FontType;
// (NOVO) Estrutura para definir o estilo de uma tag de bloco (h1, p, etc). | |
// Esta é a chave para a abordagem "data-driven". | |
typedef struct { | |
const char *tag_name; | |
FontType font_type; | |
float font_size; | |
float space_before; | |
float space_after; | |
} BlockTagStyle; | |
// (NOVO) Array de estilos para tags de bloco. | |
// Para adicionar uma nova tag ou alterar um estilo, basta editar aqui. | |
// Muito mais dinâmico e menos repetitivo. | |
static const BlockTagStyle g_block_styles[] = { | |
{"p", FONT_TYPE_REGULAR, 12.0f, 0.0f, 10.0f}, | |
{"h1", FONT_TYPE_BOLD, 24.0f, 10.0f, 12.0f}, | |
{"h2", FONT_TYPE_BOLD, 18.0f, 8.0f, 10.0f}, | |
{"h3", FONT_TYPE_BOLD, 14.0f, 6.0f, 8.0f }, | |
{"h4", FONT_TYPE_BOLD, 12.0f, 5.0f, 6.0f }, | |
{"h5", FONT_TYPE_REGULAR, 11.0f, 5.0f, 5.0f }, | |
{"h6", FONT_TYPE_ITALIC, 10.0f, 5.0f, 5.0f }, | |
{NULL, FONT_TYPE_REGULAR, 0, 0, 0 } // Terminador | |
}; | |
// --- FUNCTION DECLARATIONS ---
// Prototypes for the two DOM walkers defined further down in this file.
void draw_inline_nodes(xmlNode *node, PdfState *state);
void traverse_and_draw(xmlNode *node, PdfState *state);
// --- HELPER FUNCTIONS ---

// libharu error callback: prints the error and detail codes to stderr and
// terminates the process with exit status 1.
void haru_error_handler(HPDF_STATUS error_no, HPDF_STATUS detail_no, void *user_data) {
    const unsigned int code = (unsigned int)error_no;
    const unsigned int detail = (unsigned int)detail_no;

    (void)user_data;  // not used by this handler
    fprintf(stderr, "ERROR: libharu code=0x%04X, detail=0x%04X\n", code, detail);
    exit(1);
}
// Maps a FontType value to the matching pre-loaded font in `state`.
// Any value other than BOLD, ITALIC or MONO yields the regular font.
HPDF_Font get_font_by_type(PdfState *state, FontType type) {
    if (type == FONT_TYPE_BOLD) {
        return state->font_bold;
    }
    if (type == FONT_TYPE_ITALIC) {
        return state->font_italic;
    }
    if (type == FONT_TYPE_MONO) {
        return state->font_mono;
    }
    return state->font_regular;
}
// Re-derives current_style.font from the is_bold / is_italic flags, picking
// the regular, bold, italic or bold-italic font. This is what lets nested
// <b> and <i> combine cleanly. Whatever font was set before is overwritten.
void update_font_from_style(PdfState *state) {
    FontState *style = &state->current_style;
    HPDF_Font chosen = state->font_regular;

    if (style->is_bold) {
        chosen = style->is_italic ? state->font_bold_italic : state->font_bold;
    } else if (style->is_italic) {
        chosen = state->font_italic;
    }
    style->font = chosen;
}
// (NOVO) Procura o estilo de uma tag no nosso array de estilos. | |
const BlockTagStyle* find_block_style_by_tag(const char *tag) { | |
for (int i = 0; g_block_styles[i].tag_name != NULL; ++i) { | |
if (strcmp(tag, g_block_styles[i].tag_name) == 0) { | |
return &g_block_styles[i]; | |
} | |
} | |
return NULL; // Não encontrado | |
} | |
// --- PAGE AND STATE MANAGEMENT ---

// Appends a new A4 portrait page to the document, makes it the current page
// and moves the pen to the top margin, indented for the current list level.
void add_new_page(PdfState *state) {
    HPDF_Page fresh = HPDF_AddPage(state->pdf);

    state->page = fresh;
    HPDF_Page_SetSize(fresh, HPDF_PAGE_SIZE_A4, HPDF_PORTRAIT);
    state->current_x = MARGIN_LEFT + (state->list_level * LIST_INDENT);
    state->current_y = PAGE_HEIGHT - MARGIN_TOP;
}
// Starts a new page when drawing `space_needed` more points below the pen
// would cross the bottom margin; otherwise does nothing.
void check_and_add_page(PdfState *state, float space_needed) {
    const int crosses_bottom = (state->current_y - space_needed < MARGIN_BOTTOM);

    if (crosses_bottom) {
        add_new_page(state);
    }
}
// Prepares the pen for a new block: moves down by `space_before`, breaks the
// page if one line of `font_size` text no longer fits, then resets the
// horizontal position to the left margin plus list and blockquote indents.
void begin_new_block(PdfState *state, float space_before, float font_size) {
    float left = MARGIN_LEFT + (state->list_level * LIST_INDENT);

    if (state->is_preformatted) {
        left += BLOCKQUOTE_INDENT;
    }

    state->current_y -= space_before;
    check_and_add_page(state, font_size * LINE_SPACING);
    // Assigned last: a page break above resets current_x without the blockquote indent.
    state->current_x = left;
}
// --- CONTENT DRAWING ---

// Draws `text` word by word at the current pen position using
// state->current_style, wrapping to the next line (and page) when a word
// would cross the right margin. Runs of spaces, tabs and newlines separate
// words and are collapsed; one space width is advanced after every word.
// NULL text is ignored.
void append_text(PdfState *state, const char *text) {
    static const char separators[] = " \t\n";

    if (!text) return;
    char *buffer = strdup(text);  // mutable copy: words are NUL-terminated in place
    if (!buffer) return;

    const FontState *style = &state->current_style;
    const float line_height = style->size * LINE_SPACING;
    const float right_edge = PAGE_WIDTH - MARGIN_RIGHT;
    // Same left edge begin_new_block() uses, so wrapped lines inside a
    // blockquote keep their indent.
    const float line_start = MARGIN_LEFT + (state->list_level * LIST_INDENT)
                           + (state->is_preformatted ? BLOCKQUOTE_INDENT : 0.0f);

    HPDF_Page_SetFontAndSize(state->page, style->font, style->size);
    HPDF_Page_SetRGBFill(state->page, style->color.r, style->color.g, style->color.b);
    // Loop-invariant: the font does not change while this text is drawn.
    const float space_width = HPDF_Page_TextWidth(state->page, " ");

    // Hand-rolled tokenizer (strspn/strcspn) instead of strtok, which keeps
    // hidden static state.
    char *cursor = buffer + strspn(buffer, separators);
    while (*cursor != '\0') {
        char *word = cursor;
        char *next = word + strcspn(word, separators);
        if (*next != '\0') {
            *next++ = '\0';
            next += strspn(next, separators);
        }

        const float word_width = HPDF_Page_TextWidth(state->page, word);

        // Wrap only if something is already on this line; a word wider than
        // the whole line would otherwise just leave an empty line behind.
        if (state->current_x > line_start && state->current_x + word_width > right_edge) {
            state->current_y -= line_height;
            check_and_add_page(state, line_height);
            state->current_x = line_start;
            // A new page starts without our font and colour, so re-apply them.
            HPDF_Page_SetFontAndSize(state->page, style->font, style->size);
            HPDF_Page_SetRGBFill(state->page, style->color.r, style->color.g, style->color.b);
        }

        HPDF_Page_BeginText(state->page);
        HPDF_Page_MoveTextPos(state->page, state->current_x, state->current_y);
        HPDF_Page_ShowText(state->page, word);
        HPDF_Page_EndText(state->page);

        if (style->underline) {
            // Set stroke colour and width explicitly; otherwise the underline
            // inherits whatever draw_hr or the blockquote bar last set.
            HPDF_Page_SetRGBStroke(state->page, style->color.r, style->color.g, style->color.b);
            HPDF_Page_SetLineWidth(state->page, style->size / 16.0f);
            HPDF_Page_MoveTo(state->page, state->current_x, state->current_y - 2);
            HPDF_Page_LineTo(state->page, state->current_x + word_width, state->current_y - 2);
            HPDF_Page_Stroke(state->page);
        }

        state->current_x += word_width + space_width;
        cursor = next;
    }

    free(buffer);
}
// Draws `text` line by line, without wrapping, using the font and size in
// state->current_style and a black fill. Lines are split on '\n'; a trailing
// '\r' is stripped. Blank lines inside the text are preserved as vertical
// space. One leading newline and the empty remainder after a final newline
// are ignored. NULL text is ignored.
void draw_preformatted_text(PdfState *state, const char *text) {
    if (!text) return;
    char *full_text = strdup(text);  // mutable copy: lines are NUL-terminated in place
    if (!full_text) return;

    const float line_height = state->current_style.size * LINE_SPACING;

    begin_new_block(state, 5, state->current_style.size);
    // Keep the block's left edge: a page break resets current_x without the
    // blockquote indent.
    const float left = state->current_x;

    HPDF_Page styled_page = NULL;  // page on which font and fill are already set
    char *line = full_text;
    if (*line == '\n') {
        line++;
    }

    while (line != NULL && *line != '\0') {
        char *eol = strchr(line, '\n');
        if (eol) {
            *eol = '\0';
        }
        size_t len = strlen(line);
        if (len > 0 && line[len - 1] == '\r') {
            line[len - 1] = '\0';
        }

        check_and_add_page(state, line_height);
        if (state->page != styled_page) {
            // A new page starts without our font, so apply it again, as
            // append_text does after a page break.
            HPDF_Page_SetFontAndSize(state->page, state->current_style.font, state->current_style.size);
            HPDF_Page_SetRGBFill(state->page, 0, 0, 0);
            styled_page = state->page;
        }

        if (*line != '\0') {
            HPDF_Page_BeginText(state->page);
            HPDF_Page_MoveTextPos(state->page, left, state->current_y);
            HPDF_Page_ShowText(state->page, line);
            HPDF_Page_EndText(state->page);
        }
        // Blank lines draw nothing but still advance the pen.
        state->current_y -= line_height;

        line = eol ? eol + 1 : NULL;
    }

    state->current_y -= 10;
    free(full_text);
}
// Draws a 1pt grey horizontal rule from the left margin to the right margin,
// with 10 points of space above and below it.
void draw_hr(PdfState *state) {
    const float gap = 10.0f;

    begin_new_block(state, gap, 10);

    // Read the page and pen position only after begin_new_block, which may
    // have started a new page.
    HPDF_Page page = state->page;
    const float y = state->current_y;

    HPDF_Page_SetLineWidth(page, 1.0);
    HPDF_Page_SetRGBStroke(page, 0.5, 0.5, 0.5);
    HPDF_Page_MoveTo(page, MARGIN_LEFT, y);
    HPDF_Page_LineTo(page, PAGE_WIDTH - MARGIN_RIGHT, y);
    HPDF_Page_Stroke(page);

    state->current_y -= gap;
}
// Case-insensitive substring test (ASCII); returns non-zero when `needle`
// occurs anywhere in `haystack`.
static int contains_ignore_case(const char *haystack, const char *needle) {
    const size_t needle_len = strlen(needle);

    for (const char *start = haystack; *start != '\0'; ++start) {
        size_t i = 0;
        while (i < needle_len && start[i] != '\0' &&
               tolower((unsigned char)start[i]) == tolower((unsigned char)needle[i])) {
            ++i;
        }
        if (i == needle_len) {
            return 1;
        }
    }
    return 0;
}

// Draws the image named by the node's `src` attribute as its own block.
// The loader is chosen by a ".jpg"/".jpeg"/".png" substring of the path,
// matched case-insensitively. The image is scaled down, keeping its aspect
// ratio, to fit the available width and the printable page height. Nodes
// without `src`, unsupported types and images that fail to load are skipped
// with a warning on stderr.
void draw_image(PdfState *state, xmlNode *node) {
    xmlChar *src_attr = xmlGetProp(node, (const xmlChar *)"src");
    if (!src_attr) return;

    const char *path = (const char *)src_attr;
    HPDF_Image image = NULL;

    if (contains_ignore_case(path, ".jpg") || contains_ignore_case(path, ".jpeg")) {
        image = HPDF_LoadJpegImageFromFile(state->pdf, path);
    } else if (contains_ignore_case(path, ".png")) {
        image = HPDF_LoadPngImageFromFile(state->pdf, path);
    } else {
        fprintf(stderr, "Warning: Unsupported image type for %s\n", path);
        xmlFree(src_attr);
        return;
    }

    if (!image) {
        fprintf(stderr, "Warning: Could not load image from %s\n", path);
        xmlFree(src_attr);
        return;
    }

    float img_w = HPDF_Image_GetWidth(image);
    float img_h = HPDF_Image_GetHeight(image);

    if (img_w > 0.0f && img_h > 0.0f) {
        // Same left edge begin_new_block() would use.
        const float left = MARGIN_LEFT + (state->list_level * LIST_INDENT)
                         + (state->is_preformatted ? BLOCKQUOTE_INDENT : 0.0f);
        const float available_width = PAGE_WIDTH - MARGIN_RIGHT - left;
        const float available_height = PAGE_HEIGHT - MARGIN_TOP - MARGIN_BOTTOM;

        if (img_w > available_width) {
            const float ratio = available_width / img_w;
            img_w = available_width;
            img_h *= ratio;
        }
        // An image taller than the printable area can never fit on a page,
        // so shrink it instead of drawing past the bottom margin.
        if (img_h > available_height) {
            const float ratio = available_height / img_h;
            img_h = available_height;
            img_w *= ratio;
        }

        // One page check against the real image height; checking twice can
        // emit an extra blank page.
        state->current_y -= 10;
        check_and_add_page(state, img_h);
        state->current_x = left;

        state->current_y -= img_h;
        HPDF_Page_DrawImage(state->page, image, state->current_x, state->current_y, img_w, img_h);
        state->current_y -= 10;
    } else {
        fprintf(stderr, "Warning: Could not load image from %s\n", path);
    }

    xmlFree(src_attr);
}
// --- TRAVESSIA DO DOM (FUNÇÕES PRINCIPAIS REFATORADAS) --- | |
// (REFATORADO) Lida com elementos inline. A lógica agora é mais limpa. | |
void draw_inline_nodes(xmlNode *node, PdfState *state) { | |
for (xmlNode *cur_node = node; cur_node; cur_node = cur_node->next) { | |
if (cur_node->type == XML_TEXT_NODE) { | |
char *content = (char *)xmlNodeGetContent(cur_node); | |
if (content) { | |
for(char *p = content; *p; ++p) if (*p == '\n' || *p == '\r') *p = ' '; | |
append_text(state, content); | |
xmlFree(content); | |
} | |
} else if (cur_node->type == XML_ELEMENT_NODE) { | |
const char *tag = (const char *)cur_node->name; | |
FontState previous_style = state->current_style; // Salva o estilo | |
// Modifica o estado com base na tag | |
if (strcmp(tag, "b") == 0 || strcmp(tag, "strong") == 0) { | |
state->current_style.is_bold = HPDF_TRUE; | |
} else if (strcmp(tag, "i") == 0 || strcmp(tag, "em") == 0) { | |
state->current_style.is_italic = HPDF_TRUE; | |
} else if (strcmp(tag, "u") == 0) { | |
state->current_style.underline = HPDF_TRUE; | |
} else if (strcmp(tag, "code") == 0) { | |
state->current_style.font = state->font_mono; // Mono é um caso especial | |
state->current_style.color = (HPDF_RGBColor){0.8, 0.1, 0.1}; | |
} else if (strcmp(tag, "a") == 0) { | |
state->current_style.color = (HPDF_RGBColor){0.0, 0.0, 0.9}; | |
state->current_style.underline = HPDF_TRUE; | |
} | |
// Atualiza a fonte (ex: negrito + itálico = negrito-itálico) | |
update_font_from_style(state); | |
// Tratamento especial para links, para criar a área clicável | |
if (strcmp(tag, "a") == 0) { | |
xmlChar* href = xmlGetProp(cur_node, (const xmlChar*)"href"); | |
if (href) { | |
float start_x = state->current_x; | |
float start_y = state->current_y; | |
draw_inline_nodes(cur_node->children, state); // Desenha o texto do link | |
float end_x = state->current_x; | |
float end_y = state->current_y; | |
// A anotação precisa de um retângulo. Se o texto quebrar a linha, isso pode não ser perfeito. | |
HPDF_Rect rect = {start_x, end_y, end_x, start_y + 2}; | |
HPDF_Page_CreateURILinkAnnot(state->page, rect, (const char *)href); | |
xmlFree(href); | |
} else { | |
draw_inline_nodes(cur_node->children, state); // Link sem href | |
} | |
} else { | |
draw_inline_nodes(cur_node->children, state); // Recorre para outras tags | |
} | |
state->current_style = previous_style; // Restaura o estilo | |
} | |
} | |
} | |
// Generic renderer for table-driven block tags (h1, p, ...).
// Starts a new block, resets the current style to the one described by
// `style`, draws the node's inline children, then adds the trailing spacing.
void process_generic_block(xmlNode *node, PdfState *state, const BlockTagStyle *style) {
    // Position the pen for this block element.
    begin_new_block(state, style->space_before, style->font_size);

    // Reset the style and apply the block's properties.
    memset(&state->current_style, 0, sizeof(FontState));
    state->current_style.font = get_font_by_type(state, style->font_type);
    state->current_style.size = style->font_size;
    state->current_style.color = (HPDF_RGBColor){0, 0, 0};
    // Keep the flags in sync with the chosen font. update_font_from_style()
    // rebuilds the font from these flags, so an inline tag inside a bold
    // heading would otherwise drop the heading's weight.
    state->current_style.is_bold = (style->font_type == FONT_TYPE_BOLD) ? HPDF_TRUE : HPDF_FALSE;
    state->current_style.is_italic = (style->font_type == FONT_TYPE_ITALIC) ? HPDF_TRUE : HPDF_FALSE;

    // Draw the children (text, <b>, <i>, ...).
    draw_inline_nodes(node->children, state);

    // Close the block: half a line of the block's font plus its space_after.
    state->current_y -= (style->font_size * LINE_SPACING * 0.5);
    state->current_y -= style->space_after;
}
// Returns the ordered-list counter for the current nesting level, or NULL
// when the level lies outside ol_counter. Lists nested deeper than the array
// therefore fall back to bullets instead of writing past its end.
static int *current_ol_counter(PdfState *state) {
    const int depth = (int)(sizeof state->ol_counter / sizeof state->ol_counter[0]);

    if (state->list_level < 0 || state->list_level >= depth) {
        return NULL;
    }
    return &state->ol_counter[state->list_level];
}

// Main block-level traversal. Walks the sibling chain starting at `node`,
// skipping non-element nodes, and renders each element:
//   - tags found in g_block_styles go through process_generic_block();
//   - <hr>, <img>, <ul>/<ol>, <li>, <blockquote> and <pre> have dedicated handling;
//   - any other element (e.g. <div>) is transparent: only its children are processed.
// NOTE: <li> content is drawn with draw_inline_nodes(), so a list nested
// inside an <li> is rendered as inline text, not as a nested list.
void traverse_and_draw(xmlNode *node, PdfState *state) {
    for (xmlNode *cur_node = node; cur_node; cur_node = cur_node->next) {
        if (cur_node->type != XML_ELEMENT_NODE) {
            continue;
        }
        const char *tag = (const char *)cur_node->name;

        // Table-driven block tags (h1, p, ...) first.
        const BlockTagStyle *style = find_block_style_by_tag(tag);
        if (style) {
            process_generic_block(cur_node, state, style);
        }
        // Special cases.
        else if (strcmp(tag, "hr") == 0) {
            draw_hr(state);
        }
        else if (strcmp(tag, "img") == 0) {
            draw_image(state, cur_node);
        }
        else if (strcmp(tag, "ul") == 0 || strcmp(tag, "ol") == 0) {
            const int is_ordered = (strcmp(tag, "ol") == 0);

            begin_new_block(state, 5, 12);
            state->list_level++;

            // A counter above zero marks the level as ordered; zero means bullets.
            int *counter = current_ol_counter(state);
            if (counter) *counter = is_ordered ? 1 : 0;

            traverse_and_draw(cur_node->children, state);  // recurse into the <li> items

            if (counter) *counter = 0;
            state->list_level--;
            state->current_y -= 5;
        }
        else if (strcmp(tag, "li") == 0) {
            begin_new_block(state, 2, 12);
            state->current_x -= LIST_INDENT;  // step back to draw the marker

            char prefix[32];
            int *counter = current_ol_counter(state);
            if (counter && *counter > 0) {
                snprintf(prefix, sizeof prefix, "%d.", (*counter)++);
            } else {
                // Bullet as the single byte 0x95. The fonts are loaded with
                // WinAnsiEncoding, where a UTF-8 bullet would be drawn as
                // three unrelated glyphs.
                snprintf(prefix, sizeof prefix, "%s", "\x95");
            }

            // Draw the marker in bold, 12pt.
            state->current_style.font = state->font_bold;
            state->current_style.size = 12.0f;
            append_text(state, prefix);

            // Draw the item content. begin_new_block resets current_x to the
            // list indent and the style is rebuilt from scratch, so nothing
            // from the marker needs restoring.
            begin_new_block(state, 0, 12);  // no extra space: already added above
            memset(&state->current_style, 0, sizeof(FontState));
            state->current_style.font = state->font_regular;
            state->current_style.size = 12.0f;
            state->current_style.color = (HPDF_RGBColor){0, 0, 0};
            draw_inline_nodes(cur_node->children, state);

            state->current_y -= 5;  // small gap after the item
        }
        else if (strcmp(tag, "blockquote") == 0) {
            begin_new_block(state, 8, 12);
            const float start_y_quote = state->current_y;
            HPDF_Page start_page = state->page;

            // The flag is reused to indent; save it so a nested blockquote
            // does not clear the indent of the one enclosing it.
            const HPDF_BOOL outer_flag = state->is_preformatted;
            state->is_preformatted = HPDF_TRUE;
            traverse_and_draw(cur_node->children, state);
            state->is_preformatted = outer_flag;

            // Vertical bar on the current page. If the quote crossed a page
            // break, start at the top margin instead of using a y coordinate
            // that belongs to the previous page.
            const float bar_x = MARGIN_LEFT + (state->list_level * LIST_INDENT) + 10;
            const float bar_top = (state->page == start_page) ? start_y_quote
                                                              : PAGE_HEIGHT - MARGIN_TOP;
            HPDF_Page_SetRGBStroke(state->page, 0.8, 0.8, 0.8);
            HPDF_Page_SetLineWidth(state->page, 2.0);
            HPDF_Page_MoveTo(state->page, bar_x, bar_top);
            HPDF_Page_LineTo(state->page, bar_x, state->current_y + 12);
            HPDF_Page_Stroke(state->page);
            state->current_y -= 8;
        }
        else if (strcmp(tag, "pre") == 0) {
            // Accept both <pre>text</pre> and <pre><code>text</code></pre>.
            xmlNode *code_node = cur_node->children;
            if (code_node && code_node->type == XML_ELEMENT_NODE && code_node->name &&
                strcmp((const char *)code_node->name, "code") == 0) {
                code_node = code_node->children;
            }
            if (code_node && code_node->type == XML_TEXT_NODE) {
                char *content = (char *)xmlNodeGetContent(code_node);
                if (content) {
                    state->current_style.font = state->font_mono;
                    state->current_style.size = 9.0f;
                    state->current_style.color = (HPDF_RGBColor){0, 0, 0};
                    draw_preformatted_text(state, content);
                    xmlFree(content);
                }
            }
        }
        else {
            // Unknown block tags (e.g. <div>): just process their children.
            traverse_and_draw(cur_node->children, state);
        }
    }
}
int main(int argc, char *argv[]) { | |
if (argc != 3) { | |
fprintf(stderr, "Usage: %s <input.html> <output.pdf>\n", argv[0]); | |
return 1; | |
} | |
const char *html_file = argv[1]; | |
const char *pdf_file = argv[2]; | |
htmlDocPtr doc = htmlReadFile(html_file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET); | |
if (doc == NULL) { | |
fprintf(stderr, "Error: could not parse file %s\n", html_file); | |
return 1; | |
} | |
xmlNode *root_element = xmlDocGetRootElement(doc); | |
xmlNode *body_node = NULL; | |
for (xmlNode* cur = root_element; cur; cur = cur->next) { | |
if (cur->type == XML_ELEMENT_NODE && strcmp((const char*)cur->name, "body") == 0) { | |
body_node = cur; | |
break; | |
} | |
body_node = xmlFindChild(cur, (const xmlChar*)"body", NULL); | |
if (body_node) break; | |
} | |
if (body_node == NULL) { | |
fprintf(stderr, "Error: Could not find <body> tag in HTML file.\n"); | |
xmlFreeDoc(doc); | |
return 1; | |
} | |
PdfState state; | |
memset(&state, 0, sizeof(PdfState)); | |
state.pdf = HPDF_New(haru_error_handler, NULL); | |
if (!state.pdf) { | |
fprintf(stderr, "Error: could not create PDF object.\n"); | |
xmlFreeDoc(doc); | |
return 1; | |
} | |
HPDF_SetCompressionMode(state.pdf, HPDF_COMP_ALL); | |
state.font_regular = HPDF_GetFont(state.pdf, "Helvetica", "WinAnsiEncoding"); | |
state.font_bold = HPDF_GetFont(state.pdf, "Helvetica-Bold", "WinAnsiEncoding"); | |
state.font_italic = HPDF_GetFont(state.pdf, "Helvetica-Oblique", "WinAnsiEncoding"); | |
state.font_bold_italic = HPDF_GetFont(state.pdf, "Helvetica-BoldOblique", "WinAnsiEncoding"); | |
state.font_mono = HPDF_GetFont(state.pdf, "Courier", "WinAnsiEncoding"); | |
add_new_page(&state); | |
traverse_and_draw(body_node->children, &state); | |
HPDF_SaveToFile(state.pdf, pdf_file); | |
HPDF_Free(state.pdf); | |
xmlFreeDoc(doc); | |
xmlCleanupParser(); | |
printf("Successfully converted %s to %s\n", html_file, pdf_file); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment