Last active
August 29, 2015 14:01
-
-
Save chriscorwin/2daa1a6f3da2553d6a02 to your computer and use it in GitHub Desktop.
First found at http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php, saved here so I can get to it later.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Strip Microsoft Word formatting from a pasted HTML fragment.
 *
 * Users given an HTML-enabled editor still paste content straight from Word,
 * which drags in mso-* styles, <o:p> markers, namespaced XML tags, font faces
 * and other markup that defeats the site's own styling. This function applies
 * an ordered series of regular-expression replacements that remove that
 * markup and normalise what is left:
 *
 *   - Word-specific style declarations and class/style/lang/face attributes
 *     are removed;
 *   - attribute-less <span>/<font> wrappers are unwrapped, empty elements are
 *     dropped;
 *   - heading tags are removed, each closing heading tag becoming a <br>;
 *   - <p> and remaining <font> elements are rewritten as <div> elements.
 *
 * The rules are order-dependent (later rules rely on attributes having been
 * stripped by earlier ones), so they are kept in a single ordered table.
 *
 * @param {string} str HTML fragment to clean. Non-string values are coerced
 *     with String(); null and undefined yield an empty string.
 * @returns {string} The cleaned HTML fragment.
 */
function CleanWordHTML(str) {
  if (str === null || str === undefined) {
    return "";
  }

  // An element whose content is empty or whitespace only. Applied three
  // times so that up to three levels of nested empty elements collapse.
  var emptyElement = /<([^\s>]+)[^>]*>\s*<\/\1>/g;

  // Ordered [pattern, replacement] pairs. [\s\S] is used instead of "." where
  // element content is matched so that content spanning line breaks is handled.
  var rules = [
    // Word's <o:p> paragraph markers: drop empty ones, blank out the rest.
    [/<o:p>\s*<\/o:p>/g, ""],
    [/<o:p>[\s\S]*?<\/o:p>/g, " "],

    // Word-specific and unwanted inline style declarations.
    [/\s*mso-[^:]+:[^;"]+;?/gi, ""],
    [/\s*MARGIN: 0cm 0cm 0pt\s*;/gi, ""],
    [/\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\""],
    [/\s*TEXT-INDENT: 0cm\s*;/gi, ""],
    [/\s*TEXT-INDENT: 0cm\s*"/gi, "\""],
    [/\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\""],
    [/\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\""],
    [/\s*FONT-VARIANT: [^\s;]+;?"/gi, "\""],
    [/\s*tab-stops:[^;"]*;?/gi, ""],
    [/\s*tab-stops:[^"]*/gi, ""],
    [/\s*face="[^"]*"/gi, ""],
    [/\s*face=[^ >]*/gi, ""],
    [/\s*FONT-FAMILY:[^;"]*;?/gi, ""],

    // class, style and lang attributes.
    [/<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3"],
    [/<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3"],
    [/\s*style="\s*"/gi, ""],

    // Spans holding a single blank, and empty spans.
    // NOTE(review): the blank is matched as either a plain space or U+00A0,
    // on the assumption that the original pattern targeted a non-breaking
    // space - confirm against real Word output.
    [/<SPAN\s*[^>]*>\s*[ \u00A0]\s*<\/SPAN>/gi, " "],
    [/<SPAN\s*[^>]*><\/SPAN>/gi, ""],
    [/<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3"],

    // Unwrap <span>/<font> elements that no longer carry attributes.
    [/<SPAN\s*>([\s\S]*?)<\/SPAN>/gi, "$1"],
    [/<FONT\s*>([\s\S]*?)<\/FONT>/gi, "$1"],

    // XML declarations and namespaced tags such as <o:p> or <w:...>.
    [/<\\?\?xml[^>]*>/gi, ""],
    [/<\/?\w+:[^>]*>/gi, ""],

    // Headings: drop empty ones, remove opening tags, and turn closing tags
    // into line breaks (remove the last rule to take out those breaks).
    [/<H\d>\s*<\/H\d>/gi, ""],
    [/<H[1-6][^>]*>/gi, ""],
    [/<\/H\d>/gi, "<br>"],

    // Inline formatting elements that wrap only a single blank. The closing
    // tag must match the opening one, hence the \1 backreference.
    [/<(U|I|STRIKE)>[ \u00A0]<\/\1>/g, " "],
    [/<(B|b)>[ \u00A0]<\/\1>/g, ""],

    [emptyElement, ""],
    [emptyElement, ""],
    [emptyElement, ""],

    // Rewrite paragraphs and any remaining font elements as divs. The \b
    // keeps tags that merely start with the same letters (<pre>, <param>)
    // from being treated as <p>/<font>.
    [/<P\b([^>]*>[\s\S]*?)<\/P>/gi, "<div$1</div>"],
    [/<FONT\b([^>]*>[\s\S]*?)<\/FONT>/gi, "<div$1</div>"],

    // Remove a numeric size attribute from inside a tag only, so the word
    // "size" in ordinary text is left alone.
    [/(<\w[^>]*?)\s+size\s*=\s*["']?[+-]?\d+["']?/gi, "$1"]
  ];

  var html = String(str);
  for (var i = 0; i < rules.length; i++) {
    html = html.replace(rules[i][0], rules[i][1]);
  }
  return html;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment