Skip to content

Instantly share code, notes, and snippets.

@celsowm
Last active June 30, 2025 02:15
Show Gist options
  • Save celsowm/bc7aea119016e611f3249082eb731a5c to your computer and use it in GitHub Desktop.
Save celsowm/bc7aea119016e611f3249082eb731a5c to your computer and use it in GitHub Desktop.
html2pdf.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h> // For isspace
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <hpdf.h>
// Global PDF layout settings. Page dimensions are expressed in PDF points;
// the margins and indents below are combined with them in the same unit.
#define PAGE_WIDTH 595.28f // A4 paper width in points (210mm)
#define PAGE_HEIGHT 841.89f // A4 paper height in points (297mm)
#define MARGIN_TOP 50.0f
#define MARGIN_BOTTOM 50.0f
#define MARGIN_LEFT 50.0f
#define MARGIN_RIGHT 50.0f
#define LINE_SPACING 1.2f // Multiplier for line height
#define LIST_INDENT 20.0f // Added to the left margin once per list nesting level
#define BLOCKQUOTE_INDENT 25.0f // Extra left indent applied while PdfState.is_preformatted is set
// --- REFACTORED DATA STRUCTURES ---
// (REFACTORED) The font state uses flags for bold/italic, which makes
// nesting of styles (e.g. <b><i>) easier to manage.
typedef struct {
HPDF_Font font; // Font handle passed to HPDF_Page_SetFontAndSize when drawing
float size; // Font size; also multiplied by LINE_SPACING to get the line height
HPDF_RGBColor color; // Fill colour used for text
HPDF_BOOL underline; // When set, append_text strokes a line under each word
HPDF_BOOL is_bold; // Read by update_font_from_style to pick the font
HPDF_BOOL is_italic; // Read by update_font_from_style to pick the font
} FontState;
// The main document state structure, shared by every drawing function.
typedef struct {
HPDF_Doc pdf; // libharu document handle
HPDF_Page page; // Page currently being drawn on
float current_y; // Vertical cursor; decreases as content is emitted
float current_x; // Horizontal cursor for the next word
int list_level; // Current <ul>/<ol> nesting depth (0 = outside any list)
int ol_counter[10]; // Supports nested ordered lists up to 10 levels deep
HPDF_BOOL is_preformatted; // Set while inside <blockquote>; adds BLOCKQUOTE_INDENT to new blocks
// Preloaded fonts
HPDF_Font font_regular;
HPDF_Font font_bold;
HPDF_Font font_italic;
HPDF_Font font_bold_italic;
HPDF_Font font_mono;
// Current rendering state
FontState current_style;
} PdfState;
// (NEW) Enum of font kinds, used to make block style configuration easier.
// get_font_by_type maps each value to one of the fonts preloaded in PdfState.
typedef enum {
FONT_TYPE_REGULAR,
FONT_TYPE_BOLD,
FONT_TYPE_ITALIC,
FONT_TYPE_MONO
} FontType;
// (NEW) Describes the style of one block-level tag (h1, p, etc).
// This is the key to the "data-driven" approach.
typedef struct {
const char *tag_name; // Tag name compared with strcmp; NULL marks the end of a style table
FontType font_type; // Font kind used for the block's text
float font_size; // Font size used for the block's text
float space_before; // Vertical space subtracted from current_y before the block
float space_after; // Vertical space subtracted from current_y after the block
} BlockTagStyle;
// (NOVO) Array de estilos para tags de bloco.
// Para adicionar uma nova tag ou alterar um estilo, basta editar aqui.
// Muito mais dinâmico e menos repetitivo.
static const BlockTagStyle g_block_styles[] = {
{"p", FONT_TYPE_REGULAR, 12.0f, 0.0f, 10.0f},
{"h1", FONT_TYPE_BOLD, 24.0f, 10.0f, 12.0f},
{"h2", FONT_TYPE_BOLD, 18.0f, 8.0f, 10.0f},
{"h3", FONT_TYPE_BOLD, 14.0f, 6.0f, 8.0f },
{"h4", FONT_TYPE_BOLD, 12.0f, 5.0f, 6.0f },
{"h5", FONT_TYPE_REGULAR, 11.0f, 5.0f, 5.0f },
{"h6", FONT_TYPE_ITALIC, 10.0f, 5.0f, 5.0f },
{NULL, FONT_TYPE_REGULAR, 0, 0, 0 } // Terminador
};
// --- FUNCTION DECLARATIONS ---
// Forward declarations for the two DOM walkers, which are defined later in the file.
void traverse_and_draw(xmlNode *node, PdfState *state);
void draw_inline_nodes(xmlNode *node, PdfState *state);
// --- HELPER FUNCTIONS ---
// libharu error callback: prints the error and detail codes to stderr and
// terminates the process with exit status 1. user_data is not used.
void haru_error_handler(HPDF_STATUS error_no, HPDF_STATUS detail_no, void *user_data) {
    const unsigned int code = (unsigned int)error_no;
    const unsigned int detail = (unsigned int)detail_no;

    (void)user_data;
    fprintf(stderr, "ERROR: libharu code=0x%04X, detail=0x%04X\n", code, detail);
    exit(1);
}
// (NEW) Maps a FontType value to the matching preloaded font in `state`.
// Any value other than bold, italic or mono yields the regular font.
HPDF_Font get_font_by_type(PdfState *state, FontType type) {
    if (type == FONT_TYPE_BOLD) {
        return state->font_bold;
    }
    if (type == FONT_TYPE_ITALIC) {
        return state->font_italic;
    }
    if (type == FONT_TYPE_MONO) {
        return state->font_mono;
    }
    return state->font_regular;
}
// (NEW - refactored) Recomputes current_style.font from the is_bold and
// is_italic flags, so nested <b>/<i> combine into the bold-italic font.
void update_font_from_style(PdfState *state) {
    FontState *style = &state->current_style;
    HPDF_Font chosen = state->font_regular;

    if (style->is_bold) {
        chosen = style->is_italic ? state->font_bold_italic : state->font_bold;
    } else if (style->is_italic) {
        chosen = state->font_italic;
    }
    style->font = chosen;
}
// (NOVO) Procura o estilo de uma tag no nosso array de estilos.
const BlockTagStyle* find_block_style_by_tag(const char *tag) {
for (int i = 0; g_block_styles[i].tag_name != NULL; ++i) {
if (strcmp(tag, g_block_styles[i].tag_name) == 0) {
return &g_block_styles[i];
}
}
return NULL; // Não encontrado
}
// --- PAGE AND STATE MANAGEMENT ---
// Appends a new A4 portrait page to the document, makes it the current page
// and moves the cursor to the top-left of the content area.
// The horizontal cursor uses the same formula as begin_new_block (list
// indentation plus the blockquote indent), so content that flows across a
// page break inside a blockquote keeps its indentation.
void add_new_page(PdfState *state) {
    state->page = HPDF_AddPage(state->pdf);
    HPDF_Page_SetSize(state->page, HPDF_PAGE_SIZE_A4, HPDF_PORTRAIT);
    state->current_y = PAGE_HEIGHT - MARGIN_TOP;
    state->current_x = MARGIN_LEFT + (state->list_level * LIST_INDENT)
                     + (state->is_preformatted ? BLOCKQUOTE_INDENT : 0);
}
// Starts a new page when consuming `space_needed` below the current vertical
// cursor would cross the bottom margin.
void check_and_add_page(PdfState *state, float space_needed) {
    const int crosses_bottom_margin = (state->current_y - space_needed < MARGIN_BOTTOM);

    if (crosses_bottom_margin) {
        add_new_page(state);
    }
}
// Prepares the cursor for a new block: applies `space_before`, breaks the page
// if one line of `font_size` text would not fit, then resets the horizontal
// cursor to the left edge for the current list level (plus the blockquote
// indent while is_preformatted is set).
void begin_new_block(PdfState *state, float space_before, float font_size) {
    float left_edge = MARGIN_LEFT + (state->list_level * LIST_INDENT);

    state->current_y -= space_before;
    check_and_add_page(state, font_size * LINE_SPACING);

    if (state->is_preformatted) {
        left_edge += BLOCKQUOTE_INDENT;
    }
    state->current_x = left_edge;
}
// --- CONTENT DRAWING ---
// Draws `text` word by word at the current cursor using state->current_style,
// wrapping to a new line (and a new page when needed) at the right margin.
// Words are split on spaces, tabs and newlines; the cursor advances by one
// space width after every word. A NULL `text` is ignored.
void append_text(PdfState *state, const char *text) {
    if (!text) return;

    char *line_copy = strdup(text);
    if (!line_copy) return;

    const FontState *style = &state->current_style;
    const float line_height = style->size * LINE_SPACING;
    // Same left edge as begin_new_block, so wrapped lines inside a blockquote
    // stay indented instead of snapping back to the list margin.
    const float line_start_x = MARGIN_LEFT + (state->list_level * LIST_INDENT)
                             + (state->is_preformatted ? BLOCKQUOTE_INDENT : 0);

    HPDF_Page_SetFontAndSize(state->page, style->font, style->size);
    HPDF_Page_SetRGBFill(state->page, style->color.r, style->color.g, style->color.b);

    // The font does not change inside this call, so measure the space once.
    const float space_width = HPDF_Page_TextWidth(state->page, " ");

    for (char *word = strtok(line_copy, " \t\n"); word != NULL; word = strtok(NULL, " \t\n")) {
        const float word_width = HPDF_Page_TextWidth(state->page, word);
        const int overflows = state->current_x + word_width > PAGE_WIDTH - MARGIN_RIGHT;

        // Only wrap when something is already on the line; an over-long word at
        // the line start would otherwise just leave an empty line behind.
        if (overflows && state->current_x > line_start_x) {
            state->current_y -= line_height;
            check_and_add_page(state, line_height);
            // Set x after the page check because add_new_page resets it.
            state->current_x = line_start_x;
            // A freshly added page has no font or fill colour selected yet.
            HPDF_Page_SetFontAndSize(state->page, style->font, style->size);
            HPDF_Page_SetRGBFill(state->page, style->color.r, style->color.g, style->color.b);
        }

        HPDF_Page_BeginText(state->page);
        HPDF_Page_MoveTextPos(state->page, state->current_x, state->current_y);
        HPDF_Page_ShowText(state->page, word);
        HPDF_Page_EndText(state->page);

        if (style->underline) {
            // Select stroke colour and width explicitly; otherwise the underline
            // inherits whatever draw_hr or the blockquote bar last set.
            HPDF_Page_SetRGBStroke(state->page, style->color.r, style->color.g, style->color.b);
            HPDF_Page_SetLineWidth(state->page, style->size * 0.05f);
            HPDF_Page_MoveTo(state->page, state->current_x, state->current_y - 2);
            HPDF_Page_LineTo(state->page, state->current_x + word_width, state->current_y - 2);
            HPDF_Page_Stroke(state->page);
        }

        state->current_x += word_width + space_width;
    }

    free(line_copy);
}
// Draws `text` as a preformatted block: one output line per input line, in
// black, using the font and size already stored in state->current_style.
// Blank lines are preserved, a trailing '\r' is dropped from each line, and a
// single leading newline (as commonly found right after <pre>) is skipped.
// A NULL `text` is ignored.
void draw_preformatted_text(PdfState *state, const char *text) {
    if (!text) return;

    char *full_text = strdup(text);
    if (!full_text) return;

    const float line_height = state->current_style.size * LINE_SPACING;
    begin_new_block(state, 5, state->current_style.size);

    char *line = full_text;
    if (line[0] == '\r' && line[1] == '\n') {
        line += 2;
    } else if (line[0] == '\n') {
        line += 1;
    }

    // Page on which font and colour have been selected. A page added by
    // check_and_add_page starts with no font, and showing text on it would
    // trigger the libharu error handler.
    HPDF_Page styled_page = NULL;

    while (line != NULL) {
        // Split manually: strtok would merge consecutive newlines and thereby
        // drop blank lines.
        char *next = strchr(line, '\n');
        if (next) {
            *next++ = '\0';
        } else if (*line == '\0') {
            break; // nothing after the final newline
        }

        size_t len = strlen(line);
        if (len > 0 && line[len - 1] == '\r') {
            line[--len] = '\0';
        }

        check_and_add_page(state, line_height);
        if (state->page != styled_page) {
            HPDF_Page_SetFontAndSize(state->page, state->current_style.font, state->current_style.size);
            HPDF_Page_SetRGBFill(state->page, 0, 0, 0);
            styled_page = state->page;
        }

        if (len > 0) {
            HPDF_Page_BeginText(state->page);
            HPDF_Page_MoveTextPos(state->page, state->current_x, state->current_y);
            HPDF_Page_ShowText(state->page, line);
            HPDF_Page_EndText(state->page);
        }
        state->current_y -= line_height;
        line = next;
    }

    state->current_y -= 10;
    free(full_text);
}
// Draws a grey, 1-unit-wide horizontal rule from the left margin to the right
// margin, with 10 units of space before and after it.
void draw_hr(PdfState *state) {
    const float rule_left = MARGIN_LEFT;
    const float rule_right = PAGE_WIDTH - MARGIN_RIGHT;

    begin_new_block(state, 10, 10);

    // Read page and y only now: begin_new_block may have started a new page.
    HPDF_Page page = state->page;
    const float rule_y = state->current_y;

    HPDF_Page_SetLineWidth(page, 1.0);
    HPDF_Page_SetRGBStroke(page, 0.5, 0.5, 0.5);
    HPDF_Page_MoveTo(page, rule_left, rule_y);
    HPDF_Page_LineTo(page, rule_right, rule_y);
    HPDF_Page_Stroke(page);

    state->current_y -= 10;
}
// Returns 1 when `path` ends with `ext` (compared case-insensitively), else 0.
static int image_path_has_extension(const char *path, const char *ext) {
    const size_t path_len = strlen(path);
    const size_t ext_len = strlen(ext);

    if (path_len < ext_len) return 0;

    const char *tail = path + (path_len - ext_len);
    for (size_t i = 0; i < ext_len; ++i) {
        if (tolower((unsigned char)tail[i]) != tolower((unsigned char)ext[i])) {
            return 0;
        }
    }
    return 1;
}

// Draws the image referenced by the `src` attribute of `node` as its own block.
// Only paths ending in .jpg/.jpeg/.png (any letter case) are loaded. The image
// is scaled down proportionally to fit the available width and the page's
// content height, and moves to a new page when it does not fit in the
// remaining space. A missing `src`, an unsupported type or a failed load
// draws nothing.
void draw_image(PdfState *state, xmlNode *node) {
    xmlChar *src_attr = xmlGetProp(node, (const xmlChar *)"src");
    if (!src_attr) return;

    const char *path = (const char *)src_attr;
    HPDF_Image image = NULL;

    // Match the extension at the end of the path; a substring search would also
    // accept names such as "shots.jpg_files/pic.png" as JPEG.
    if (image_path_has_extension(path, ".jpg") || image_path_has_extension(path, ".jpeg")) {
        image = HPDF_LoadJpegImageFromFile(state->pdf, path);
    } else if (image_path_has_extension(path, ".png")) {
        image = HPDF_LoadPngImageFromFile(state->pdf, path);
    } else {
        fprintf(stderr, "Warning: Unsupported image type for %s\n", path);
        xmlFree(src_attr);
        return;
    }

    if (!image) {
        fprintf(stderr, "Warning: Could not load image from %s\n", path);
        xmlFree(src_attr);
        return;
    }

    const float left_edge = MARGIN_LEFT + (state->list_level * LIST_INDENT)
                          + (state->is_preformatted ? BLOCKQUOTE_INDENT : 0);
    const float available_width = PAGE_WIDTH - MARGIN_RIGHT - left_edge;
    const float available_height = PAGE_HEIGHT - MARGIN_TOP - MARGIN_BOTTOM;
    float img_w = HPDF_Image_GetWidth(image);
    float img_h = HPDF_Image_GetHeight(image);

    if (available_width <= 0 || img_w <= 0 || img_h <= 0) {
        fprintf(stderr, "Warning: Could not load image from %s\n", path);
        xmlFree(src_attr);
        return;
    }

    if (img_w > available_width) {
        const float ratio = available_width / img_w;
        img_w = available_width;
        img_h *= ratio;
    }
    // An image taller than the content area can never fit on any page; shrink
    // it instead of emitting a blank page and drawing past the bottom margin.
    if (img_h > available_height) {
        const float ratio = available_height / img_h;
        img_h = available_height;
        img_w *= ratio;
    }

    state->current_y -= 10; // space before the image
    check_and_add_page(state, img_h);
    state->current_x = left_edge;

    state->current_y -= img_h;
    HPDF_Page_DrawImage(state->page, image, state->current_x, state->current_y, img_w, img_h);
    state->current_y -= 10; // space after the image

    xmlFree(src_attr);
}
// --- DOM TRAVERSAL ---
// Renders `node` and all of its following siblings as inline content.
// Text nodes are drawn with the current style (newlines become spaces).
// Element nodes adjust the style for their subtree and restore it afterwards:
//   <b>/<strong> bold, <i>/<em> italic, <u> underline,
//   <code> monospace in red, <a> blue underlined plus a clickable URI
//   annotation when an href attribute is present.
// Any other element is transparent: its children are drawn with the
// inherited style.
void draw_inline_nodes(xmlNode *node, PdfState *state) {
    for (xmlNode *cur_node = node; cur_node; cur_node = cur_node->next) {
        if (cur_node->type == XML_TEXT_NODE) {
            char *content = (char *)xmlNodeGetContent(cur_node);
            if (content) {
                for (char *p = content; *p; ++p) {
                    if (*p == '\n' || *p == '\r') *p = ' ';
                }
                append_text(state, content);
                xmlFree(content);
            }
            continue;
        }
        if (cur_node->type != XML_ELEMENT_NODE) {
            continue;
        }

        const char *tag = (const char *)cur_node->name;
        const FontState previous_style = state->current_style; // saved for restore
        const int is_link = (strcmp(tag, "a") == 0);
        int weight_changed = 0;

        if (strcmp(tag, "b") == 0 || strcmp(tag, "strong") == 0) {
            state->current_style.is_bold = HPDF_TRUE;
            weight_changed = 1;
        } else if (strcmp(tag, "i") == 0 || strcmp(tag, "em") == 0) {
            state->current_style.is_italic = HPDF_TRUE;
            weight_changed = 1;
        } else if (strcmp(tag, "u") == 0) {
            state->current_style.underline = HPDF_TRUE;
        } else if (strcmp(tag, "code") == 0) {
            state->current_style.font = state->font_mono; // mono is a special case
            state->current_style.color = (HPDF_RGBColor){0.8, 0.1, 0.1};
        } else if (is_link) {
            state->current_style.color = (HPDF_RGBColor){0.0, 0.0, 0.9};
            state->current_style.underline = HPDF_TRUE;
        }

        // Re-derive the font only when bold/italic actually changed. Doing it
        // for every tag would overwrite the mono font chosen for <code> and the
        // block font (e.g. a bold heading) on <a>, <u> or unknown tags. No
        // bold/italic monospace face is loaded, so text inside <code> stays mono.
        if (weight_changed && state->current_style.font != state->font_mono) {
            update_font_from_style(state);
        }

        xmlChar *href = is_link ? xmlGetProp(cur_node, (const xmlChar *)"href") : NULL;
        if (href) {
            const HPDF_Page start_page = state->page;
            const float start_x = state->current_x;
            const float start_y = state->current_y;
            const float size = state->current_style.size;

            draw_inline_nodes(cur_node->children, state); // draw the link text

            const float end_x = state->current_x;
            const float end_y = state->current_y;

            // Clickable area: from slightly below the last baseline to one font
            // size above the first baseline. NOTE: a link that wraps gets a
            // single rectangle spanning the full text width; a link that crosses
            // a page break is only clickable on the page where it ends.
            HPDF_Rect rect;
            rect.bottom = end_y - 2;
            if (state->page == start_page && end_y == start_y) {
                rect.left = start_x;
                rect.right = end_x;
                rect.top = start_y + size;
            } else {
                rect.left = MARGIN_LEFT;
                rect.right = PAGE_WIDTH - MARGIN_RIGHT;
                rect.top = (state->page == start_page) ? start_y + size
                                                       : PAGE_HEIGHT - MARGIN_TOP + size;
            }
            // Skip degenerate rectangles (e.g. a link without any text).
            if (rect.right > rect.left && rect.top > rect.bottom) {
                HPDF_Page_CreateURILinkAnnot(state->page, rect, (const char *)href);
            }
            xmlFree(href);
        } else {
            // Non-link tags and links without href: just recurse.
            draw_inline_nodes(cur_node->children, state);
        }

        state->current_style = previous_style; // restore the style
    }
}
// (NEW) Generic handler for block tags listed in the style table (h1, p, ...).
// Starts a new block, resets the text style to the one described by `style`,
// draws the node's children as inline content and then adds the trailing
// spacing (half a line plus style->space_after).
void process_generic_block(xmlNode *node, PdfState *state, const BlockTagStyle *style) {
    // Position the cursor for this block element.
    begin_new_block(state, style->space_before, style->font_size);

    // Reset the style and apply the block's properties.
    memset(&state->current_style, 0, sizeof(FontState));
    state->current_style.font = get_font_by_type(state, style->font_type);
    state->current_style.size = style->font_size;
    state->current_style.color = (HPDF_RGBColor){0, 0, 0};
    // Keep the bold/italic flags in sync with the chosen font. The inline
    // walker re-derives the font from these flags, so leaving them cleared
    // would make nested inline tags drop a heading's weight.
    state->current_style.is_bold = (style->font_type == FONT_TYPE_BOLD) ? HPDF_TRUE : HPDF_FALSE;
    state->current_style.is_italic = (style->font_type == FONT_TYPE_ITALIC) ? HPDF_TRUE : HPDF_FALSE;

    // Draw the children (text, <b>, <i>, ...).
    draw_inline_nodes(node->children, state);

    // Close the block with its spacing.
    state->current_y -= (style->font_size * LINE_SPACING * 0.5);
    state->current_y -= style->space_after;
}
// (REFATORADO) Função principal de travessia. Muito mais limpa e organizada.
void traverse_and_draw(xmlNode *node, PdfState *state) {
for (xmlNode *cur_node = node; cur_node; cur_node = cur_node->next) {
if (cur_node->type != XML_ELEMENT_NODE) {
continue;
}
const char *tag = (const char *)cur_node->name;
// Tenta encontrar um estilo pré-definido para a tag
const BlockTagStyle *style = find_block_style_by_tag(tag);
if (style) {
// Se encontrou (h1, p, etc.), usa a função de processamento genérica
process_generic_block(cur_node, state, style);
}
// Se não, trata os casos especiais
else if (strcmp(tag, "hr") == 0) { draw_hr(state); }
else if (strcmp(tag, "img") == 0) { draw_image(state, cur_node); }
else if (strcmp(tag, "ul") == 0 || strcmp(tag, "ol") == 0) {
begin_new_block(state, 5, 12);
state->list_level++;
if (strcmp(tag, "ol") == 0) state->ol_counter[state->list_level] = 1;
traverse_and_draw(cur_node->children, state); // Recorre para os <li>
if (strcmp(tag, "ol") == 0) state->ol_counter[state->list_level] = 0;
state->list_level--;
state->current_y -= 5;
}
else if (strcmp(tag, "li") == 0) {
begin_new_block(state, 2, 12);
state->current_x -= LIST_INDENT; // Recua para desenhar o marcador
char prefix[32];
if (state->ol_counter[state->list_level] > 0) {
sprintf(prefix, "%d.", state->ol_counter[state->list_level]++);
} else {
strcpy(prefix, "•"); // Unicode para bullet
}
// Desenha o marcador/número
FontState temp_style = state->current_style;
state->current_style.font = state->font_bold; // Marcadores em negrito
state->current_style.size = 12.0f;
append_text(state, prefix);
state->current_style = temp_style; // Restaura o estilo para o conteúdo
state->current_x += 5; // Espaço após o marcador
// Processa o conteúdo do <li>
begin_new_block(state, 0, 12); // Zera o espaço antes, pois já foi adicionado
memset(&state->current_style, 0, sizeof(FontState));
state->current_style.font = state->font_regular;
state->current_style.size = 12.0f;
state->current_style.color = (HPDF_RGBColor){0,0,0};
draw_inline_nodes(cur_node->children, state);
state->current_y -= 5; // Pequeno espaço após o item
}
else if (strcmp(tag, "blockquote") == 0) {
begin_new_block(state, 8, 12);
float start_y_quote = state->current_y;
state->is_preformatted = HPDF_TRUE; // Usa a flag para indentar
traverse_and_draw(cur_node->children, state);
state->is_preformatted = HPDF_FALSE;
// Desenha a linha vertical
HPDF_Page_SetRGBStroke(state->page, 0.8, 0.8, 0.8);
HPDF_Page_SetLineWidth(state->page, 2.0);
HPDF_Page_MoveTo(state->page, MARGIN_LEFT + (state->list_level * LIST_INDENT) + 10, start_y_quote);
HPDF_Page_LineTo(state->page, MARGIN_LEFT + (state->list_level * LIST_INDENT) + 10, state->current_y + 12);
HPDF_Page_Stroke(state->page);
state->current_y -= 8;
}
else if(strcmp(tag, "pre") == 0) {
xmlNode *code_node = cur_node->children;
if (code_node && strcmp((const char*)code_node->name, "code") == 0) {
code_node = code_node->children;
}
if (code_node && code_node->type == XML_TEXT_NODE) {
char *content = (char *)xmlNodeGetContent(code_node);
state->current_style.font = state->font_mono;
state->current_style.size = 9.0f;
state->current_style.color = (HPDF_RGBColor){0,0,0};
draw_preformatted_text(state, content);
xmlFree(content);
}
}
else {
// Para outras tags de bloco desconhecidas (ex: <div>), apenas processa seus filhos
traverse_and_draw(cur_node->children, state);
}
}
}
// Returns the first <body> element among `root` and its siblings, or among
// their direct children; NULL when there is none.
static xmlNode *find_body_element(xmlNode *root) {
    for (xmlNode *cur = root; cur; cur = cur->next) {
        if (cur->type != XML_ELEMENT_NODE) {
            continue;
        }
        if (strcmp((const char *)cur->name, "body") == 0) {
            return cur;
        }
        for (xmlNode *child = cur->children; child; child = child->next) {
            if (child->type == XML_ELEMENT_NODE && strcmp((const char *)child->name, "body") == 0) {
                return child;
            }
        }
    }
    return NULL;
}

// Entry point: html2pdf <input.html> <output.pdf>.
// Parses the HTML file, renders the children of its <body> into an A4 PDF
// using the built-in Helvetica/Courier fonts and saves the result.
// Returns 0 on success and 1 on a usage, parse or save error. libharu errors
// are reported by haru_error_handler, which terminates the process.
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <input.html> <output.pdf>\n", argv[0]);
        return 1;
    }
    const char *html_file = argv[1];
    const char *pdf_file = argv[2];

    htmlDocPtr doc = htmlReadFile(html_file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOBLANKS | HTML_PARSE_NONET);
    if (doc == NULL) {
        fprintf(stderr, "Error: could not parse file %s\n", html_file);
        return 1;
    }

    // Locate <body> by walking the tree directly (xmlDocGetRootElement may
    // return NULL for an empty document; find_body_element handles that).
    xmlNode *body_node = find_body_element(xmlDocGetRootElement(doc));
    if (body_node == NULL) {
        fprintf(stderr, "Error: Could not find <body> tag in HTML file.\n");
        xmlFreeDoc(doc);
        return 1;
    }

    PdfState state;
    memset(&state, 0, sizeof(PdfState));
    state.pdf = HPDF_New(haru_error_handler, NULL);
    if (!state.pdf) {
        fprintf(stderr, "Error: could not create PDF object.\n");
        xmlFreeDoc(doc);
        return 1;
    }
    HPDF_SetCompressionMode(state.pdf, HPDF_COMP_ALL);

    state.font_regular = HPDF_GetFont(state.pdf, "Helvetica", "WinAnsiEncoding");
    state.font_bold = HPDF_GetFont(state.pdf, "Helvetica-Bold", "WinAnsiEncoding");
    state.font_italic = HPDF_GetFont(state.pdf, "Helvetica-Oblique", "WinAnsiEncoding");
    state.font_bold_italic = HPDF_GetFont(state.pdf, "Helvetica-BoldOblique", "WinAnsiEncoding");
    state.font_mono = HPDF_GetFont(state.pdf, "Courier", "WinAnsiEncoding");

    add_new_page(&state);
    traverse_and_draw(body_node->children, &state);

    // Do not claim success when the file could not be written.
    const int saved = (HPDF_SaveToFile(state.pdf, pdf_file) == HPDF_OK);

    HPDF_Free(state.pdf);
    xmlFreeDoc(doc);
    xmlCleanupParser();

    if (!saved) {
        fprintf(stderr, "Error: could not write PDF file %s\n", pdf_file);
        return 1;
    }
    printf("Successfully converted %s to %s\n", html_file, pdf_file);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment