Last active
August 29, 2015 14:19
-
-
Save alexandervantrijffel/22becfb924c4aa7159a3 to your computer and use it in GitHub Desktop.
HtmlToWordPdfConverter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Reads mydoc.html, passes its content as a single text block to
/// <c>Generator.Generate</c> with the output title "OUT-PUT", and asserts that
/// both OUT-PUT.docx and OUT-PUT.pdf exist afterwards.
/// </summary>
[Fact]
public void GenerateWordDocument_should_generate()
{
    // Arrange: load the whole source document into one block.
    const string sourceHtmlFile = "mydoc.html";
    string htmlContent;
    using (var reader = File.OpenText(sourceHtmlFile))
    {
        htmlContent = reader.ReadToEnd();
    }
    var blocks = new List<string> { htmlContent };

    // Act
    new Generator().Generate(sourceHtmlFile, "OUT-PUT", blocks);

    // Assert: both output formats were produced.
    File.Exists("OUT-PUT.docx").Should().BeTrue();
    File.Exists("OUT-PUT.pdf").Should().BeTrue();
}
/// <summary>
/// Orchestrates the conversion: writes an intermediate HTML file (plus extracted
/// image files) via <c>HtmlGenerator</c>, lets <c>HtmlToWordConverter</c> turn it
/// into .docx and .pdf, and finally deletes the intermediate files.
/// </summary>
/// <remarks>
/// NOTE(review): HtmlGenerator saves its files using a relative file name, whereas
/// HtmlToWordConverter resolves its paths relative to the executing assembly. The
/// two only agree when the current directory is the assembly directory - confirm
/// this holds for all callers.
/// </remarks>
public class Generator
{
    /// <summary>
    /// Generates "{outputFileTitle}.docx" and "{outputFileTitle}.pdf" from the given HTML blocks.
    /// </summary>
    /// <param name="fileName">Name of the source document; passed through to HtmlGenerator.</param>
    /// <param name="outputFileTitle">Base name (without extension) of the intermediate and output files.</param>
    /// <param name="documentBlockTexts">HTML fragments that are appended, in order, to the document body.</param>
    public void Generate(string fileName, string outputFileTitle, IEnumerable<string> documentBlockTexts)
    {
        // Use the caller's title instead of a hard-coded "OUT-PUT".
        var htmlGenerator = new HtmlGenerator(fileName, outputFileTitle, documentBlockTexts);
        try
        {
            htmlGenerator.Generate();
            new HtmlToWordConverter(outputFileTitle).Generate();
        }
        finally
        {
            // Clean up the intermediate files even when generation or conversion failed.
            // GeneratedFilePath is only assigned once the HTML file has been saved.
            if (htmlGenerator.GeneratedFilePath != null)
            {
                File.Delete(htmlGenerator.GeneratedFilePath);
            }

            foreach (var path in htmlGenerator.GeneratedImages)
            {
                File.Delete(path);
            }
        }
    }
}
/// <summary>
/// Converts "{outputFileTitle}.html", located next to the executing assembly, into
/// "{outputFileTitle}.docx" and "{outputFileTitle}.pdf" by automating Microsoft Word.
/// </summary>
public class HtmlToWordConverter
{
    private readonly string _outputFileTitle;

    /// <summary>Creates a converter for the given file title.</summary>
    /// <param name="outputFileTitle">Base name (without extension) of the input HTML file and of both output files.</param>
    public HtmlToWordConverter(string outputFileTitle)
    {
        _outputFileTitle = outputFileTitle;
    }

    /// <summary>
    /// Opens the HTML file in Word, embeds all linked inline pictures into the
    /// document, and saves it as .docx and as .pdf. The document is closed and
    /// Word is shut down afterwards, also when an error occurs.
    /// </summary>
    public void Generate()
    {
        var htmlPath = GetFilePathRelativeToAssembly(string.Format("{0}.html", _outputFileTitle));
        var docxPath = GetFilePathRelativeToAssembly(string.Format("{0}.docx", _outputFileTitle));
        var pdfPath = GetFilePathRelativeToAssembly(string.Format("{0}.pdf", _outputFileTitle));

        var app = new Application();
        try
        {
            var document = app.Documents.Open(htmlPath, ReadOnly: false);
            try
            {
                foreach (var shape in document.InlineShapes)
                {
                    var inlineShape = shape as InlineShape;
                    if (inlineShape == null || inlineShape.LinkFormat == null)
                    {
                        continue;
                    }

                    // Store the picture inside the document instead of linking to the
                    // image file, so the output does not depend on that file.
                    inlineShape.LinkFormat.SavePictureWithDocument = true;
                }

                document.SaveAs2(docxPath, WdSaveFormat.wdFormatXMLDocument);
                document.ExportAsFixedFormat(pdfPath, WdExportFormat.wdExportFormatPDF);
            }
            finally
            {
                // Everything needed has been written explicitly above; close without
                // saving so Word never asks what to do with pending changes.
                document.Close(SaveChanges: false);
            }
        }
        finally
        {
            // Quit without a save prompt; a prompt would block the automated
            // Word instance and leave the process running.
            app.Quit(SaveChanges: false);
        }
    }

    /// <summary>
    /// Combines <paramref name="relativePath"/> with the directory of the executing assembly.
    /// </summary>
    /// <param name="relativePath">File name or path relative to the assembly directory.</param>
    /// <returns>The combined path.</returns>
    private static string GetFilePathRelativeToAssembly(string relativePath)
    {
        // CodeBase is a URI; unescape it (e.g. %20) before treating it as a file path.
        var codeBaseUrl = new Uri(Assembly.GetExecutingAssembly().CodeBase);
        var codeBasePath = Uri.UnescapeDataString(codeBaseUrl.AbsolutePath);
        var dirPath = Path.GetDirectoryName(codeBasePath);
        return Path.Combine(dirPath, relativePath);
    }
}
/// <summary>
/// Builds a single HTML file from a sequence of HTML fragments. Images embedded as
/// base64 data URIs are written to separate files and the corresponding
/// <c>img</c> elements are re-pointed at those files.
/// </summary>
public class HtmlGenerator
{
    private const string DataUriPrefix = "data:";

    private readonly string _fileName;
    private readonly string _outputFileTitle;
    private readonly IEnumerable<string> documentBlockTexts;

    // Shared across all text blocks so that every extracted image gets a unique file name.
    private int _imageCounter;

    /// <summary>File names of the image files written by <see cref="Generate"/>.</summary>
    public IList<string> GeneratedImages { get; set; }

    /// <summary>File name of the HTML file written by <see cref="Generate"/>; null until it has been saved.</summary>
    public string GeneratedFilePath { get; set; }

    /// <summary>Creates a generator for the given blocks.</summary>
    /// <param name="fileName">Name of the source document (currently only kept for diagnostics).</param>
    /// <param name="outputFileTitle">Base name (without extension) for the HTML file and the extracted images.</param>
    /// <param name="documentBlockTexts">HTML fragments that are appended, in order, to the document body.</param>
    public HtmlGenerator(string fileName, string outputFileTitle, IEnumerable<string> documentBlockTexts)
    {
        _fileName = fileName;
        _outputFileTitle = outputFileTitle;
        this.documentBlockTexts = documentBlockTexts;
        GeneratedImages = new List<string>();
    }

    /// <summary>
    /// Writes "{outputFileTitle}.html" containing all text blocks and records the
    /// written files in <see cref="GeneratedFilePath"/> and <see cref="GeneratedImages"/>.
    /// </summary>
    /// <returns>This instance, so the generated paths can be read from the result.</returns>
    /// <exception cref="FormatException">A block contains HTML parse errors or an unsupported data URI.</exception>
    public HtmlGenerator Generate()
    {
        var newDoc = new HtmlDocument();
        var html = newDoc.CreateElement("html");
        html.AppendChild(newDoc.CreateElement("head"));
        var body = newDoc.CreateElement("body");
        html.AppendChild(body);
        newDoc.DocumentNode.AppendChild(html);

        foreach (var text in documentBlockTexts)
        {
            body.AppendChildren(GetDocumentAsNodeCollection(text));
        }

        var outputFileName = string.Format("{0}.html", _outputFileTitle);
        newDoc.Save(outputFileName);
        GeneratedFilePath = outputFileName;
        return this;
    }

    /// <summary>
    /// Parses one HTML fragment, extracts its embedded images to files, and returns
    /// the fragment's top-level nodes.
    /// </summary>
    private HtmlNodeCollection GetDocumentAsNodeCollection(string html)
    {
        var doc = new HtmlDocument { OptionFixNestedTags = true };
        doc.LoadHtml(html);
        if (doc.ParseErrors.Any())
        {
            var details = doc.ParseErrors.Select(
                p => string.Format("{0} (line {1}, position {2})", p.Reason, p.Line, p.LinePosition));
            throw new FormatException("HTML parse errors: " + string.Join("; ", details));
        }

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var imageNodes = doc.DocumentNode.SelectNodes("//img[@src]");
        if (imageNodes != null)
        {
            foreach (var imageNode in imageNodes)
            {
                var savedAs = SaveEmbeddedImage(imageNode.Attributes["src"].Value);
                if (savedAs != null)
                {
                    imageNode.SetAttributeValue("src", savedAs);
                }
            }
        }

        return doc.DocumentNode.ChildNodes;
    }

    /// <summary>
    /// Writes the image contained in a base64 data URI to a file.
    /// </summary>
    /// <param name="src">Value of an <c>img</c> element's <c>src</c> attribute.</param>
    /// <returns>
    /// The name of the written file, or null when <paramref name="src"/> is not a
    /// data URI (an ordinary URL or path, which is left untouched).
    /// </returns>
    /// <exception cref="FormatException">The data URI is malformed or not base64 encoded.</exception>
    private string SaveEmbeddedImage(string src)
    {
        if (!src.StartsWith(DataUriPrefix, StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        // Expected layout: data:<mime type>[;<parameter>...];base64,<payload>
        var commaIndex = src.IndexOf(',');
        if (commaIndex < 0)
        {
            throw new FormatException("Malformed data URI in img src: missing ',' before the image data");
        }

        var header = src.Substring(DataUriPrefix.Length, commaIndex - DataUriPrefix.Length).Split(';');
        var encoding = header[header.Length - 1].Trim();
        if (header.Length < 2 || !string.Equals(encoding, "base64", StringComparison.OrdinalIgnoreCase))
        {
            throw new FormatException("Expected base64, as image data, not: " + encoding);
        }

        var extension = MimeTypeMap.MimeTypeMap.GetExtension(header[0].Trim());
        var bytes = Convert.FromBase64String(src.Substring(commaIndex + 1));

        var outputFile = string.Format("{0}{1}{2}", _outputFileTitle, ++_imageCounter, extension);
        File.WriteAllBytes(outputFile, bytes);
        GeneratedImages.Add(outputFile);
        return outputFile;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment