Created
June 21, 2025 08:03
-
-
Save documentprocessing/d81010195968427eeba161c2f3eedd2a to your computer and use it in GitHub Desktop.
HTML to DOCX Conversion using Java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.github.davidmoten.pandoc.Pandoc; | |
import java.io.*; | |
/** | |
* Converts HTML business reports to MS Word (DOCX) with corporate styling. | |
* Features: | |
* - Preserves tables, images, and CSS classes from HTML | |
* - Applies custom DOCX template (e.g., company-branded styles) | |
* - Handles embedded base64 images | |
* - Post-processes with Office-Open-XML (OOXML) hooks | |
*/ | |
public class HtmlToDocxConverter { | |
public static void main(String[] args) { | |
// 1. Initialize Pandoc with performance tuning | |
Pandoc pandoc = Pandoc.create() | |
.memoryLimit(2048); // Increase memory for complex docs | |
// 2. Define I/O paths | |
String htmlInput = "quarterly_report.html"; | |
String docxOutput = "Q3_Report.docx"; | |
String companyTemplate = "templates/corporate_template.docx"; | |
try (InputStream html = new FileInputStream(htmlInput); | |
OutputStream docx = new FileOutputStream(docxOutput)) { | |
// 3. Execute conversion with business doc features | |
pandoc.from("html") | |
.to("docx") | |
.option("--reference-doc=" + companyTemplate) // Branded template | |
.option("--embed-resources") // Inline images/CSS | |
.option("--standalone") // Full document | |
.option("--columns=80") // Control line wraps | |
.filter("pandoc-secnos") // Add section numbers | |
.execute(html, docx); | |
System.out.println("DOCX generated: " + docxOutput); | |
// 4. Post-processing (e.g., inject live Excel data) | |
// Docx4j or Apache POI can modify the DOCX here | |
} catch (Exception e) { | |
System.err.println("Conversion failed: " + e.getMessage()); | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment