Last active
March 19, 2021 01:05
-
-
Save reZach/bc0cbadd95584fa205b5b443a554c8bc to your computer and use it in GitHub Desktop.
C# Merging Runs in a Microsoft Word.docx file, and converting it to HTML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Cleans up adjacent runs that can be merged, e.g. turns:
/* | |
<w:p w:rsidR="00D242F1" w:rsidP="00D242F1" w:rsidRDefault="005F6285" w14:paraId="66169407" w14:textId="101467E0" | |
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"> | |
<w:pPr> | |
<w:spacing w:after="0" w:line="240" w:lineRule="auto" /> | |
<w:jc w:val="center" /> | |
<w:rPr> | |
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" /> | |
</w:rPr> | |
</w:pPr> | |
<w:r> | |
<w:rPr> | |
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" /> | |
</w:rPr> | |
<w:t>hey, I</w:t> | |
</w:r> | |
<w:r> | |
<w:rPr> | |
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" /> | |
</w:rPr> | |
<w:t> am split and don't need to be!</w:t> | |
</w:r> | |
</w:p> | |
into | |
<w:p w:rsidR="00D242F1" w:rsidP="00D242F1" w:rsidRDefault="005F6285" w14:paraId="66169407" w14:textId="101467E0" | |
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"> | |
<w:pPr> | |
<w:spacing w:after="0" w:line="240" w:lineRule="auto" /> | |
<w:jc w:val="center" /> | |
<w:rPr> | |
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" /> | |
</w:rPr> | |
</w:pPr> | |
<w:r> | |
<w:rPr> | |
<w:rFonts w:ascii="Garamond" w:hAnsi="Garamond" /> | |
</w:rPr> | |
<w:t>hey, I am split and don't need to be!</w:t> | |
</w:r> | |
</w:p> | |
*/ | |
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Net;
using System.Text;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
namespace openxml | |
{ | |
/// <summary>
/// Demo that merges adjacent, identically formatted runs in a Word (.docx)
/// document and converts the document body to a simple HTML string.
/// </summary>
public class Program
{
    // File processed when no path is supplied on the command line.
    private const string DefaultFileName = "Official.docx";

    // Markup emitted for a paragraph that contains no runs.
    private const string EmptyParagraphHtml = "<p style='margin:0px;height:16px;'></p>";

    /// <summary>
    /// Entry point. Processes the file named by the first argument, or
    /// <c>Official.docx</c> when no argument is given.
    /// </summary>
    /// <param name="args">Optional: path of the .docx file to process.</param>
    public static void Main(string[] args)
    {
        string filename = args != null && args.Length > 0 ? args[0] : DefaultFileName;
        FillDocument(filename);
    }

    /// <summary>
    /// Opens <paramref name="filename"/> for editing, merges its runs (which
    /// saves the document) and converts it to HTML.
    /// </summary>
    /// <param name="filename">Path of the .docx file to open read/write.</param>
    public static void FillDocument(string filename)
    {
        using (WordprocessingDocument wordDoc = WordprocessingDocument.Open(filename, true))
        {
            MergeRuns(wordDoc);

            // NOTE(review): the generated HTML is discarded here because this
            // method returns void; callers that need the markup should call
            // ConvertDocxToHTML directly.
            ConvertDocxToHTML(wordDoc);
        }
    }

    /// <summary>
    /// Renders the top-level paragraphs and tables of the document body as an
    /// HTML page. Other body elements are skipped.
    /// </summary>
    /// <param name="document">The opened Word document.</param>
    /// <returns>The HTML page as a string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
    public static string ConvertDocxToHTML(WordprocessingDocument document)
    {
        if (document == null)
        {
            throw new ArgumentNullException(nameof(document));
        }

        StringBuilder html = new StringBuilder("\n<!DOCTYPE html>\n<html lang='en-US'>\n<head></head>\n<body>");

        // Ask for the body explicitly instead of assuming it is the first child.
        Body body = document.MainDocumentPart?.Document?.Body;
        if (body != null)
        {
            foreach (OpenXmlElement element in body.ChildElements)
            {
                if (element is Paragraph paragraph)
                {
                    html.Append(RenderParagraph(paragraph));
                }
                else if (element is Table table)
                {
                    html.Append(RenderTable(table));
                }
            }
        }

        html.Append("</body></html>");
        return html.ToString();
    }

    /// <summary>
    /// Merges runs in every top-level paragraph and in every paragraph found
    /// inside top-level tables, then saves the main document part.
    /// </summary>
    /// <param name="document">The Word document, opened for editing.</param>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
    public static void MergeRuns(WordprocessingDocument document)
    {
        if (document == null)
        {
            throw new ArgumentNullException(nameof(document));
        }

        Document mainDocument = document.MainDocumentPart?.Document;
        Body body = mainDocument?.Body;
        if (body == null)
        {
            return;
        }

        // Snapshot the children so the loop never walks a list being edited.
        foreach (OpenXmlElement element in body.Elements().ToList())
        {
            if (element is Paragraph paragraph)
            {
                MergeRunsParagraph(paragraph);
            }
            else if (element is Table table)
            {
                // Every paragraph of every cell, not just the first per cell.
                foreach (Paragraph cellParagraph in table.Descendants<Paragraph>().ToList())
                {
                    MergeRunsParagraph(cellParagraph);
                }
            }
        }

        // Save updates
        mainDocument.Save();
    }

    /// <summary>
    /// Merges each group of directly adjacent runs in the paragraph that share
    /// identical run properties. Non-run children, and runs holding anything
    /// other than text and breaks, end the current group.
    /// </summary>
    /// <param name="paragraph">The paragraph whose runs are merged in place.</param>
    /// <exception cref="ArgumentNullException"><paramref name="paragraph"/> is null.</exception>
    public static void MergeRunsParagraph(Paragraph paragraph)
    {
        if (paragraph == null)
        {
            throw new ArgumentNullException(nameof(paragraph));
        }

        List<Run> group = new List<Run>();
        string groupKey = null;

        // Iterate over a snapshot: merging edits the paragraph's child list.
        foreach (OpenXmlElement child in paragraph.Elements().ToList())
        {
            string key = child is Run candidate && IsMergeableRun(candidate)
                ? GetRunPropertiesKey(candidate)
                : null;

            bool endsGroup = key == null
                || (group.Count > 0 && !string.Equals(key, groupKey, StringComparison.Ordinal));
            if (endsGroup)
            {
                // Always reset, even for a single run, so that runs separated
                // by another element are never treated as consecutive.
                MergeGroup(group, paragraph);
            }

            if (key != null)
            {
                group.Add((Run)child);
                groupKey = key;
            }
        }

        MergeGroup(group, paragraph);
    }

    /// <summary>
    /// Replaces the given runs with one run carrying the shared properties,
    /// the concatenated text and any breaks, placed where the first run was.
    /// Nothing is changed unless every run has identical run properties and
    /// contains only properties, text and breaks.
    /// </summary>
    /// <param name="paragraphRuns">Consecutive runs to merge; fewer than two is a no-op.</param>
    /// <param name="root">The paragraph that owns the runs.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public static void MergeRunsParagraphRuns(List<Run> paragraphRuns, Paragraph root)
    {
        if (paragraphRuns == null)
        {
            throw new ArgumentNullException(nameof(paragraphRuns));
        }

        if (root == null)
        {
            throw new ArgumentNullException(nameof(root));
        }

        if (paragraphRuns.Count < 2)
        {
            return;
        }

        // Deep-compare the formatting; merging runs that differ (e.g. one is
        // bold) would silently change how the text looks.
        Run firstRun = paragraphRuns[0];
        string formattingKey = GetRunPropertiesKey(firstRun);
        foreach (Run run in paragraphRuns)
        {
            if (!IsMergeableRun(run)
                || !string.Equals(GetRunPropertiesKey(run), formattingKey, StringComparison.Ordinal))
            {
                return;
            }
        }

        Run mergedRun = new Run();
        RunProperties sharedProperties = firstRun.GetFirstChild<RunProperties>();
        if (sharedProperties != null)
        {
            mergedRun.AppendChild(sharedProperties.CloneNode(true));
        }

        // Concatenate text; keep breaks as real break elements so that no
        // HTML markup is written into the Word document itself.
        StringBuilder pendingText = new StringBuilder();
        foreach (Run run in paragraphRuns)
        {
            foreach (OpenXmlElement runChild in run.ChildElements)
            {
                if (runChild is Text text)
                {
                    pendingText.Append(text.Text);
                }
                else if (runChild is Break)
                {
                    FlushText(mergedRun, pendingText);
                    mergedRun.AppendChild(runChild.CloneNode(true));
                }
            }
        }

        FlushText(mergedRun, pendingText);

        // Put the merged run where the first original run was so the order of
        // the paragraph's content is preserved.
        if (ReferenceEquals(firstRun.Parent, root))
        {
            root.InsertBefore(mergedRun, firstRun);
        }
        else
        {
            root.AppendChild(mergedRun);
        }

        foreach (Run run in paragraphRuns)
        {
            if (run.Parent != null)
            {
                run.Remove();
            }
        }
    }

    /// <summary>
    /// Renders a paragraph as an HTML <c>p</c> element. All runs that are
    /// direct children are rendered; text is HTML-encoded and breaks become
    /// <c>&lt;br /&gt;</c>.
    /// </summary>
    /// <param name="paragraph">The paragraph to render.</param>
    /// <returns>The paragraph markup, or a fixed empty paragraph when it has no runs.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="paragraph"/> is null.</exception>
    public static string RenderParagraph(Paragraph paragraph)
    {
        if (paragraph == null)
        {
            throw new ArgumentNullException(nameof(paragraph));
        }

        ParagraphProperties paragraphProperties = paragraph.GetFirstChild<ParagraphProperties>();
        List<Run> runs = paragraph.Elements<Run>().ToList();

        // If we have no text in the paragraph,
        // render an empty paragraph
        if (runs.Count == 0)
        {
            return EmptyParagraphHtml;
        }

        List<string> styles = new List<string>();

        // Paragraph properties
        string textAlign = paragraphProperties?.Justification?.Val ?? string.Empty;
        if (!string.IsNullOrEmpty(textAlign))
        {
            styles.Add($"text-align:{textAlign}");
        }

        string marginTop = paragraphProperties?.SpacingBetweenLines?.Before ?? string.Empty;
        string marginBottom = paragraphProperties?.SpacingBetweenLines?.After ?? string.Empty;
        if (string.IsNullOrEmpty(marginTop))
        {
            styles.Add("margin-top:0px");
        }

        if (string.Equals(marginBottom, "0", StringComparison.OrdinalIgnoreCase))
        {
            styles.Add("margin-bottom:0px");
        }

        // The first run's font becomes the paragraph font; runs that use a
        // different font get their own span in RenderRun.
        string paragraphFont = GetAsciiFont(runs[0].GetFirstChild<RunProperties>());
        if (!string.IsNullOrEmpty(paragraphFont))
        {
            styles.Add($"font-family:'{WebUtility.HtmlEncode(paragraphFont)}'");
        }

        string style = string.Join(';', styles);
        string content = string.Concat(runs.Select(run => RenderRun(run, paragraphFont)));
        return $"<p style=\"{style}\">\n{content}\n</p>";
    }

    /// <summary>
    /// Renders a table as an HTML <c>table</c>. Cell widths are percentages
    /// derived from the table grid; cells whose width is unknown get no width
    /// style.
    /// </summary>
    /// <param name="table">The table to render.</param>
    /// <returns>The table markup.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="table"/> is null.</exception>
    public static string RenderTable(Table table)
    {
        if (table == null)
        {
            throw new ArgumentNullException(nameof(table));
        }

        // One entry per grid column (0 when missing or unparsable) so that the
        // list index always equals the column index.
        TableGrid tableGrid = table.GetFirstChild<TableGrid>();
        List<decimal> columnWidths = tableGrid == null
            ? new List<decimal>()
            : tableGrid.Elements<GridColumn>().Select(column => ParseWidth(column.Width)).ToList();

        // Use the declared table width only when it is an absolute (dxa)
        // measurement; percentage/auto widths are not comparable with the
        // grid column widths. Otherwise fall back to the sum of the columns.
        decimal totalWidth = 0m;
        TableWidth declaredWidth = table.GetFirstChild<TableProperties>()?.TableWidth;
        if (declaredWidth != null)
        {
            EnumValue<TableWidthUnitValues> unit = declaredWidth.Type;
            bool isAbsolute = unit is null || !unit.HasValue || unit.Value == TableWidthUnitValues.Dxa;
            if (isAbsolute)
            {
                totalWidth = ParseWidth(declaredWidth.Width);
            }
        }

        if (totalWidth <= 0m)
        {
            totalWidth = columnWidths.Sum();
        }

        List<int?> cellWidths = new List<int?>(columnWidths.Count);
        foreach (decimal columnWidth in columnWidths)
        {
            if (columnWidth > 0m && totalWidth > 0m)
            {
                cellWidths.Add((int)Math.Floor((columnWidth / totalWidth) * 100m));
            }
            else
            {
                cellWidths.Add(null);
            }
        }

        List<string> tableRows = new List<string>();
        foreach (TableRow tableRow in table.Elements<TableRow>())
        {
            StringBuilder row = new StringBuilder("<tr>");
            int cellIndex = 0;
            foreach (TableCell tableCell in tableRow.Elements<TableCell>())
            {
                int? width = cellIndex < cellWidths.Count ? cellWidths[cellIndex] : null;
                row.Append(width.HasValue ? $"<td style=\"width:{width.Value}%\">" : "<td>");
                foreach (Paragraph cellParagraph in tableCell.Elements<Paragraph>())
                {
                    row.Append(RenderParagraph(cellParagraph));
                }

                row.Append("</td>");
                cellIndex++;
            }

            row.Append("</tr>");
            tableRows.Add(row.ToString());
        }

        string rows = string.Join('\n', tableRows);
        return $"<table>\n<tbody>\n{rows}\n</tbody>\n</table>";
    }

    /// <summary>Wraps <paramref name="source"/> in an opening and closing <paramref name="tag"/>.</summary>
    private static string AddTag(string tag, string source)
    {
        return $"<{tag}>{source}</{tag}>";
    }

    /// <summary>Merges the collected group when it holds more than one run, then empties it.</summary>
    private static void MergeGroup(List<Run> group, Paragraph paragraph)
    {
        if (group.Count > 1)
        {
            MergeRunsParagraphRuns(group, paragraph);
        }

        group.Clear();
    }

    /// <summary>
    /// True when the run holds only run properties, text and breaks, i.e.
    /// nothing that a merge would lose.
    /// </summary>
    private static bool IsMergeableRun(Run run)
    {
        return run != null
            && run.ChildElements.All(child => child is RunProperties || child is Text || child is Break);
    }

    /// <summary>Serialised run properties, used to compare formatting; empty when the run has none.</summary>
    private static string GetRunPropertiesKey(Run run)
    {
        return run.GetFirstChild<RunProperties>()?.OuterXml ?? string.Empty;
    }

    /// <summary>Appends the buffered text to the run as one whitespace-preserving text element and clears the buffer.</summary>
    private static void FlushText(Run target, StringBuilder buffer)
    {
        if (buffer.Length == 0)
        {
            return;
        }

        // xml:space="preserve" keeps leading/trailing spaces of the text.
        target.AppendChild(new Text(buffer.ToString()) { Space = SpaceProcessingModeValues.Preserve });
        buffer.Clear();
    }

    /// <summary>
    /// Renders one run: encoded text plus breaks, wrapped in a font span when
    /// its font differs from the paragraph font, and in <c>strong</c> when bold.
    /// </summary>
    private static string RenderRun(Run run, string paragraphFont)
    {
        StringBuilder content = new StringBuilder();
        foreach (OpenXmlElement runChild in run.ChildElements)
        {
            if (runChild is Text text)
            {
                content.Append(WebUtility.HtmlEncode(text.Text));
            }
            else if (runChild is Break)
            {
                content.Append("<br />");
            }
        }

        if (content.Length == 0)
        {
            return string.Empty;
        }

        string html = content.ToString();
        RunProperties runProperties = run.GetFirstChild<RunProperties>();

        string font = GetAsciiFont(runProperties);
        if (!string.IsNullOrEmpty(font)
            && !string.Equals(font, paragraphFont, StringComparison.OrdinalIgnoreCase))
        {
            html = $"<span style=\"font-family:'{WebUtility.HtmlEncode(font)}'\">{html}</span>";
        }

        if (IsBold(runProperties))
        {
            html = AddTag("strong", html);
        }

        return html;
    }

    /// <summary>True when the properties switch bold on; an explicit false value counts as not bold.</summary>
    private static bool IsBold(RunProperties runProperties)
    {
        Bold bold = runProperties?.Bold;
        if (bold is null)
        {
            return false;
        }

        OnOffValue value = bold.Val;
        return value is null || !value.HasValue || value.Value;
    }

    /// <summary>The ASCII font name of the run properties, or null when none is set.</summary>
    private static string GetAsciiFont(RunProperties runProperties)
    {
        return runProperties?.RunFonts?.Ascii?.Value;
    }

    /// <summary>Parses a width attribute culture-invariantly; returns 0 when missing, invalid or not positive.</summary>
    private static decimal ParseWidth(StringValue rawWidth)
    {
        string text = rawWidth?.Value;
        bool parsed = decimal.TryParse(text, NumberStyles.Number, CultureInfo.InvariantCulture, out decimal width);
        return parsed && width > 0m ? width : 0m;
    }
}
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment