Last active
May 25, 2022 15:18
-
-
Save jackreichert/4052029 to your computer and use it in GitHub Desktop.
Convert Docx XML to HTML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<w:document xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" | |
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" | |
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" | |
xmlns:w10="urn:schemas-microsoft-com:office:word" | |
xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" | |
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" | |
xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" | |
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" | |
xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" | |
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" mc:Ignorable="w14"> | |
<w:background w:color="FFFFFF"/> | |
<w:body> | |
<w:p> | |
<w:pPr> | |
<w:pStyle w:val="Body A"/> | |
</w:pPr> | |
</w:p> | |
<w:p> | |
<w:pPr> | |
<w:pStyle w:val="Title"/> | |
<w:rPr> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
</w:rPr> | |
</w:pPr> | |
<w:r> | |
<w:rPr> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
<w:rtl w:val="0"/> | |
<w:lang w:val="it-IT"/> | |
</w:rPr> | |
<w:t>Hello World</w:t> | |
</w:r> | |
</w:p> | |
<w:p> | |
<w:pPr> | |
<w:pStyle w:val="Default"/> | |
<w:spacing w:line="280" w:lineRule="atLeast"/> | |
<w:rPr> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
</w:rPr> | |
</w:pPr> | |
</w:p> | |
<w:p> | |
<w:pPr> | |
<w:pStyle w:val="Default"/> | |
<w:spacing w:line="280" w:lineRule="atLeast"/> | |
</w:pPr> | |
<w:r> | |
<w:rPr> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
<w:rtl w:val="0"/> | |
<w:lang w:val="en-US"/> | |
</w:rPr> | |
<w:t xml:space="preserve">This is a </w:t> | |
</w:r> | |
<w:r> | |
<w:rPr> | |
<w:b w:val="1"/> | |
<w:bCs w:val="1"/> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
<w:rtl w:val="0"/> | |
<w:lang w:val="en-US"/> | |
</w:rPr> | |
<w:t>very short</w:t> | |
</w:r> | |
<w:r> | |
<w:rPr> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
<w:rtl w:val="0"/> | |
<w:lang w:val="en-US"/> | |
</w:rPr> | |
<w:t xml:space="preserve"> paragraph. It only contains </w:t> | |
</w:r> | |
<w:r> | |
<w:rPr> | |
<w:i w:val="1"/> | |
<w:iCs w:val="1"/> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
<w:rtl w:val="0"/> | |
<w:lang w:val="en-US"/> | |
</w:rPr> | |
<w:t>three</w:t> | |
</w:r> | |
<w:r> | |
<w:rPr> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
<w:rtl w:val="0"/> | |
<w:lang w:val="en-US"/> | |
</w:rPr> | |
<w:t xml:space="preserve"> sentences. This is the </w:t> | |
</w:r> | |
<w:r> | |
<w:rPr> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
<w:u w:val="single"/> | |
<w:rtl w:val="0"/> | |
<w:lang w:val="en-US"/> | |
</w:rPr> | |
<w:t>third sentence</w:t> | |
</w:r> | |
<w:r> | |
<w:rPr> | |
<w:sz w:val="24"/> | |
<w:szCs w:val="24"/> | |
<w:rtl w:val="0"/> | |
<w:lang w:val="en-US"/> | |
</w:rPr> | |
<w:t>.</w:t> | |
</w:r> | |
</w:p> | |
<w:sectPr> | |
<w:headerReference w:type="default" r:id="rId4"/> | |
<w:footerReference w:type="default" r:id="rId5"/> | |
<w:pgSz w:w="12240" w:h="15840" w:orient="portrait"/> | |
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="864"/> | |
<w:bidi w:val="0"/> | |
</w:sectPr> | |
</w:body> | |
</w:document> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Convert the body of an unpacked .docx (word/document.xml) into simple HTML.
//
// Input:  $targetDir - directory the .docx archive was extracted to; it must be
//                      set by the surrounding code before this script runs.
// Output: $text      - HTML built from the document's paragraphs (<p>, <h1>-<h6>,
//                      <strong>, <em>, <u>, <br>), containing ASCII characters only.
//         $goodHTML  - the same markup after a round trip through DOMDocument,
//                      or '' when the document produced no markup at all.

// set location of docx text content file
$xmlFile = $targetDir."/word/document.xml";
$reader = new XMLReader;
// stop here when the file cannot be opened instead of reading from an unopened reader
if (!$reader->open($xmlFile)) {
    throw new RuntimeException('Unable to open docx XML file: '.$xmlFile);
}

// set up variables for formatting
$text = '';
$formatting = array('bold' => 'closed', 'italic' => 'closed', 'underline' => 'closed', 'header' => 0);

// HTML tag used for each inline format, listed outermost first so tags always nest properly
$inlineTags = array('bold' => 'strong', 'italic' => 'em', 'underline' => 'u');
// WordprocessingML run property that switches each inline format on
$toggleElements = array('bold' => 'b', 'italic' => 'i', 'underline' => 'u');

// Tells whether the run markup contains <w:$element> and does not switch it off.
// Accepts both the bare form (<w:b/>) and the attribute form (<w:b w:val="1"/>);
// the lookahead keeps <w:b from matching <w:bCs> or <w:br>.
$runHasToggle = function ($runXml, $element) {
    if (!preg_match('/<w:'.$element.'(?=[\s\/>])([^>]*)>/', $runXml, $found)) {
        return false;
    }
    // the property is present but explicitly disabled (w:val of 0, false, off or none)
    return !preg_match('/\sw:val="(?:0|false|off|none)"/', $found[1]);
};

// Escapes run text for HTML. Non-ASCII characters become numeric character
// references rather than being transliterated to ASCII, so no character is lost
// and $text stays pure ASCII whatever encoding the HTML parser assumes later.
$escapeText = function ($value) {
    $escaped = htmlspecialchars($value, ENT_QUOTES, 'UTF-8');
    return preg_replace_callback('/[^\x00-\x7F]/u', function ($match) {
        // UCS-4BE is the code point as a 32-bit big-endian integer
        $codepoint = unpack('N', iconv('UTF-8', 'UCS-4BE', $match[0]));
        return '&#'.$codepoint[1].';';
    }, $escaped);
};

// loop through docx xml dom
while ($reader->read()) {
    // look for new paragraphs
    if ($reader->nodeType == XMLReader::ELEMENT && $reader->name === 'w:p') {
        // set up new instance of XMLReader for parsing paragraph independently
        $paragraph = new XMLReader;
        $p = $reader->readOuterXML();
        $paragraph->xml($p);

        // search for heading; take the level straight from the style name so that
        // "Heading1" and "Heading 1" both work and a missing match simply yields 0
        $formatting['header'] = preg_match('/<w:pStyle\s+w:val="Heading\s*([1-6])"/', $p, $matches)
            ? (int) $matches[1]
            : 0;

        // open h-tag or paragraph
        $blockTag = ($formatting['header'] > 0) ? 'h'.$formatting['header'] : 'p';
        $text .= '<'.$blockTag.'>';

        // loop through paragraph dom
        while ($paragraph->read()) {
            // look for runs
            if ($paragraph->nodeType == XMLReader::ELEMENT && $paragraph->name === 'w:r') {
                $node = trim($paragraph->readInnerXML());

                // work out which inline formats this run asks for
                $wanted = array();
                $changed = false;
                foreach ($toggleElements as $key => $element) {
                    $wanted[$key] = $runHasToggle($node, $element) ? 'opened' : 'closed';
                    if ($wanted[$key] !== $formatting[$key]) {
                        $changed = true;
                    }
                }

                // when the formatting differs from the previous run, close what is open
                // (innermost first) BEFORE this run's text and reopen what is needed, so
                // unformatted text never ends up inside the previous run's tags
                if ($changed) {
                    foreach (array_reverse($inlineTags, true) as $key => $tag) {
                        if ($formatting[$key] === 'opened') {
                            $text .= '</'.$tag.'>';
                        }
                    }
                    foreach ($inlineTags as $key => $tag) {
                        if ($wanted[$key] === 'opened') {
                            $text .= '<'.$tag.'>';
                        }
                        $formatting[$key] = $wanted[$key];
                    }
                }

                // add <br> tags
                if (preg_match('/<w:br[\s\/>]/', $node)) {
                    $text .= '<br>';
                }

                // build text string of doc
                $text .= $escapeText($paragraph->expand()->textContent);
            }
        }

        // close formatting that is still open so it cannot leak into the next block
        foreach (array_reverse($inlineTags, true) as $key => $tag) {
            if ($formatting[$key] === 'opened') {
                $text .= '</'.$tag.'>';
                $formatting[$key] = 'closed';
            }
        }

        $text .= '</'.$blockTag.'>';
        $paragraph->close();
    }
}
$reader->close();

// fix invalid html
$doc = new DOMDocument();
$doc->encoding = 'UTF-8';
$goodHTML = '';
// an empty string cannot be parsed as HTML, so only parse when there is markup
if ($text !== '') {
    // collect libxml's parser complaints internally instead of hiding every error with @
    $previousErrorMode = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($text);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
    if ($loaded) {
        $goodHTML = simplexml_import_dom($doc)->asXML();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Jackreichert,
This is a nice piece of work.
How can I make this run on my local server? I am getting this error: "Warning: XMLReader::open(): Unable to open source data".
Thanks,