Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save documentprocessing/d81010195968427eeba161c2f3eedd2a to your computer and use it in GitHub Desktop.
Save documentprocessing/d81010195968427eeba161c2f3eedd2a to your computer and use it in GitHub Desktop.
HTML to DOCX Conversion using Java
import com.github.davidmoten.pandoc.Pandoc;
import java.io.*;
/**
 * Converts an HTML business report to MS Word (DOCX) by driving Pandoc with a
 * corporate reference template.
 *
 * <p>Usage: {@code java HtmlToDocxConverter [input.html] [output.docx] [template.docx]}.
 * Arguments are positional; every argument that is omitted falls back to the
 * default path defined in this class, so running without arguments behaves like
 * the original hard-coded example.
 *
 * <p>What the conversion asks Pandoc to do:
 * <ul>
 *   <li>read HTML and write DOCX;</li>
 *   <li>use the template passed via {@code --reference-doc} as the style source;</li>
 *   <li>run the external {@code pandoc-secnos} filter.</li>
 * </ul>
 *
 * <p>Post-processing of the generated DOCX (for example with docx4j or Apache POI)
 * is <em>not</em> implemented here; {@code main} only marks the place for it.
 *
 * <p>On failure the error is reported on {@code System.err}, an output file that
 * was opened but not completed is removed, and the JVM exits with status 1.
 */
public class HtmlToDocxConverter {

    /** Input used when no first argument is given. */
    private static final String DEFAULT_HTML_INPUT = "quarterly_report.html";

    /** Output used when no second argument is given. */
    private static final String DEFAULT_DOCX_OUTPUT = "Q3_Report.docx";

    /** Reference template used when no third argument is given. */
    private static final String DEFAULT_TEMPLATE = "templates/corporate_template.docx";

    /**
     * Command-line entry point.
     *
     * @param args optional positional arguments: HTML input path, DOCX output
     *             path, reference template path
     */
    public static void main(String[] args) {
        String htmlInput = argOrDefault(args, 0, DEFAULT_HTML_INPUT);
        String docxOutput = argOrDefault(args, 1, DEFAULT_DOCX_OUTPUT);
        String companyTemplate = argOrDefault(args, 2, DEFAULT_TEMPLATE);

        try {
            convert(htmlInput, docxOutput, companyTemplate);
            System.out.println("DOCX generated: " + docxOutput);
            // Post-processing hook (e.g., inject live Excel data):
            // docx4j or Apache POI can modify the DOCX here.
        } catch (Exception e) {
            // Top-level boundary: report and signal failure to the caller.
            // Previously the method returned normally here, so the process
            // exited with status 0 even though nothing usable was produced.
            System.err.println("Conversion failed: " + e.getMessage());
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Runs the HTML to DOCX conversion.
     *
     * <p>If the output file was opened but the conversion (or closing the
     * stream) fails, the incomplete file is deleted before the exception is
     * propagated, so a failed run does not leave an empty or truncated DOCX
     * behind. An existing output file is left alone when it was never opened.
     *
     * @param htmlInput   path of the HTML file to read
     * @param docxOutput  path of the DOCX file to create (overwritten if present)
     * @param templatePath path handed to Pandoc as {@code --reference-doc}
     * @throws Exception any failure raised while opening the files or by the
     *                   Pandoc wrapper; its checked exception types are not
     *                   visible from this file, hence the broad declaration
     */
    private static void convert(String htmlInput, String docxOutput, String templatePath)
            throws Exception {
        // NOTE(review): the unit of memoryLimit is not visible here
        // (presumably megabytes) - confirm against the wrapper's documentation.
        Pandoc pandoc = Pandoc.create()
                .memoryLimit(2048);

        File outputFile = new File(docxOutput);

        // The input is opened first so that a missing input never touches the output.
        try (InputStream html = new FileInputStream(htmlInput)) {
            boolean outputOpened = false;
            try (OutputStream docx = new FileOutputStream(outputFile)) {
                outputOpened = true;
                pandoc.from("html")
                        .to("docx")
                        // Template whose styles Pandoc applies to the generated DOCX.
                        .option("--reference-doc=" + templatePath)
                        // NOTE(review): Pandoc documents --embed-resources for HTML
                        // output; it likely has no effect for docx - confirm and drop.
                        .option("--embed-resources")
                        // NOTE(review): Pandoc sets standalone mode automatically for
                        // docx output, so this is probably redundant.
                        .option("--standalone")
                        // NOTE(review): --columns controls wrapping of generated text
                        // source; it likely does not influence docx layout - confirm.
                        .option("--columns=80")
                        // External filter; must be installed where Pandoc can find it.
                        // NOTE(review): pandoc-secnos targets section cross-references;
                        // plain heading numbering is --number-sections - confirm intent.
                        .filter("pandoc-secnos")
                        .execute(html, docx);
            } catch (Exception e) {
                // The output stream is already closed at this point, so the
                // partial file can be removed on every platform.
                if (outputOpened && outputFile.exists() && !outputFile.delete()) {
                    System.err.println("Could not remove incomplete output: " + docxOutput);
                }
                throw e;
            }
        }
    }

    /**
     * Returns the positional argument at {@code index}, or {@code fallback}
     * when fewer arguments were supplied.
     */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment