Created
March 27, 2013 22:20
-
-
Save fakedarren/5258633 to your computer and use it in GitHub Desktop.
Just a few regexes I wrote / stole when creating a WYSIWYG text editor a few years ago. God give me strength. Related: http://stackoverflow.com/a/1732454/299237
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
UI.TextEditor.implement({

	/**
	 * Normalises the markup held in `this.textarea.value` and writes the
	 * result back to the same property.
	 *
	 * Steps, in order: collapse blank lines, unwrap WebKit "Apple-style-span"
	 * wrappers, lowercase tag and attribute names, double-quote attribute
	 * values, turn <u> into an underline span, strip presentational attributes
	 * and every class, turn <div> blocks into paragraphs, remove <script>
	 * elements, run `cleanPasteFromWord()` when Word markup was seen, then tidy
	 * <br> / <p> usage and put block elements on their own lines.
	 *
	 * Takes no arguments and returns nothing; the textarea value is the only
	 * input and output. This is regex-based tidying, not an HTML parser, so
	 * malformed or deeply nested markup is handled on a best-effort basis.
	 */
	cleanHTML: function() {
		// Signature of markup produced by MS Word. No `g` flag, so test() keeps no state.
		var wordMarkup = /(class=\"?Mso|style=\"[^\"]*\bmso\-|w:WordDocument)/;

		var html = this.textarea.value;

		// Look for Word markup on the untouched input first: the steps below
		// lowercase tag names (hiding <w:WordDocument>) and strip every class
		// attribute (hiding class="Mso..."), so testing afterwards misses them.
		var isWordPaste = wordMarkup.test(html);

		// Remove double new lines
		html = html.replace(/\n\n+/g, "\n");

		// Stupid apple-style-spans etc
		html = html.replace(/<br class\="webkit-block-placeholder">/gi, "<br />");
		// Lazy capture so two sibling spans on one line are unwrapped one by one
		// instead of being treated as a single span that swallows the text between them.
		html = html.replace(/<span class="Apple-style-span">(.*?)<\/span>/gi, '$1');
		html = html.replace(/ class="Apple-style-span"/gi, '');

		// Replace uppercase element names with lowercase
		html = html.replace(/<[^> ]*/g, function(match) {
			return match.toLowerCase();
		});

		// Lowercase attribute names and make sure every value is double quoted.
		// Each attribute is matched together with its value, so text inside a
		// quoted value is skipped rather than being mistaken for another attribute.
		html = html.replace(/<[^>]*>/g, function(tag) {
			return tag.replace(/(\s+)([^\s"'=<>\/]+)(\s*=\s*)("[^"]*"|'[^']*'|[^\s"'>]+)?/g, function(attribute, gap, name, equals, value) {
				var quoted = value || '';
				var opener = quoted.charAt(0);
				if (opener === "'") {
					// Single quoted: switch to double quotes so the double-quote-only
					// patterns further down also apply to this attribute.
					quoted = '"' + quoted.slice(1, -1).replace(/"/g, '&quot;') + '"';
				} else if (quoted !== '' && opener !== '"') {
					// Bare value: put quotes around it
					quoted = '"' + quoted + '"';
				}
				return gap + name.toLowerCase() + equals + quoted;
			});
		});

		// Second look now that attributes are normalised (e.g. STYLE='mso-...' has
		// become style="mso-..."), still before the class attributes are removed.
		isWordPaste = isWordPaste || wordMarkup.test(html);

		// Convert inline styles to <strong> / <em> tags etc
		//html = html.replace(/<span style="font-weight: bold;">(.*)<\/span>/gi, '<strong>$1</strong>');
		//html = html.replace(/<b\b[^>]*>(.*?)<\/b[^>]*>/gi, '<strong>$1</strong>');
		//html = html.replace(/<span style="font-style: italic;">(.*)<\/span>/gi, '<em>$1</em>');
		//html = html.replace(/<i\b[^>]*>(.*?)<\/i[^>]*>/gi, '<em>$1</em>');
		// The closing tag only allows whitespace after "u" so that </ul> is not taken for </u>
		html = html.replace(/<u\b[^>]*>(.*?)<\/u\s*>/gi, '<span style="text-decoration: underline;">$1</span>');

		// strip stupid attributes and all classes
		html = html.replace(/ (border|valign|align|width|language|height|class)=\"([^\"]*)\"/gi, "");

		// More complex semantics
		// <li><div>x</div></li> becomes <li>x</li>
		html = html.replace(/<li>\s*<div>(.+?)<\/div><\/li>/g, '<li>$1</li>');
		// Text before the first <div> becomes a paragraph of its own
		html = html.replace(/^([\w\s]+.*?)<div>/i, '<p>$1</p><div>');
		// Remaining single-line <div> blocks become paragraphs
		html = html.replace(/<div>(.+?)<\/div>/ig, '<p>$1</p>');
		// A list that opens a paragraph is moved out in front of it
		html = html.replace(/<p>[\s\n]*(<(?:ul|ol)>.*?<\/(?:ul|ol)>)(.*?)<\/p>/ig, '$1<p>$2</p>');
		// Trailing text after the end of a list is wrapped in a paragraph
		html = html.replace(/<\/(ol|ul)>\s*(?!<(?:p|ol|ul|img).*?>)((?:<[^>]*>)?\w.*)$/g, '</$1><p>$2</p>');

		// Any <script> tags, with or without attributes on the opening tag
		html = html.replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, "");

		this.textarea.value = html;

		if (isWordPaste) {
			// Reads and rewrites this.textarea.value, hence the store above and the re-read below
			this.cleanPasteFromWord();
		}

		html = this.textarea.value;

		// Convert <br> to <br />
		html = html.replace(/(<br>)/g, "<br />");

		// Sort out leading / trailing / useless / invalid <br />s
		html = html.replace(/<br ?\/?>$/gi, '');
		html = html.replace(/^<br ?\/?>/gi, '');
		html = html.replace(/><br ?\/?>/gi, '>');
		html = html.replace(/<br ?\/?>\s*<\/(h1|h2|h3|h4|h5|h6|li|p)/gi, '</$1');

		// Clean up paragraphs
		html = html.replace(/<p>\s*<br ?\/?>\s*<\/p>/gi, '<p>\u00a0</p>');
		// \s already covers plain and non-breaking spaces; a single quantified class
		// cannot backtrack exponentially on a long run of spaces.
		html = html.replace(/<p>\s*<\/p>/gi, '<p>\u00a0</p>');
		html = html.replace(/\s*<br ?\/?>\s*<\/p>/gi, '</p>');
		html = html.replace(/<p>(?:\s*)<p>/g, '<p>');
		html = html.replace(/<\/p>\s*<\/p>/g, '</p>');
		// NOTE(review): \W also matches \u00a0, so the placeholder paragraphs
		// created just above are removed again here - confirm that is intended.
		html = html.replace(/<p>\W*<\/p>/g, '');
		html = html.replace(/<br[^>]*><\/p>/g, '</p>');
		html = html.replace(/<p>\s*(<img[^>]+>)\s*<\/p>/ig, '$1\n');

		// Format sourcecode: one block element per line, list items indented with a tab
		html = html.replace(/<p([^>]*)>(.*?)<\/p>(?!\n)/g, '<p$1>$2</p>\n');
		html = html.replace(/<\/(ul|ol|p)>(?!\n)/g, '</$1>\n');
		html = html.replace(/><li>/g, '>\n\t<li>');
		html = html.replace(/([^\n])<\/(ol|ul)>/g, '$1\n</$2>');
		html = html.replace(/([^\n])<img/ig, '$1\n<img');
		// Without the `m` flag this only matches a value that is entirely whitespace
		html = html.replace(/^\s*$/g, '');
		html = html.trim();

		this.textarea.value = html;
	},

	/**
	 * Strips MS Word specific markup from `this.textarea.value` in place:
	 * comments, VML / Office namespace elements, img / font / meta / link /
	 * style / div tags, id / name / lang / event-handler style attributes,
	 * every <span> wrapper, mso classes, all style attributes and table of
	 * contents links.
	 *
	 * Takes no arguments and returns nothing. Called by `cleanHTML()` when Word
	 * markup is detected.
	 */
	cleanPasteFromWord: function() {
		var html = this.textarea.value;

		// Remove MS word comments
		html = html.replace(/<!--[\s\S]+?-->/gi, "");

		// Dodgy tags and VML (tags only, their text content is kept)
		html = html.replace(/<\/?(img|font|meta|link|style|div|v:\w+)[^>]*>/gi, "");

		// MS namespace elements
		html = html.replace(/<\/?o:[^>]*>/gi, "");

		// XML namespace declarations
		html = html.replace(/<\\?\?xml[^>]*>/gi, "");

		// on.., class, style and other attributes with and without quotes (different browsers)
		html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=\"([^\"]*)\"/gi, "");
		html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=(\w+)/gi, "");

		// <s> into <strike> for strikethrough
		html = html.replace(/<(\/?)s>/gi, "<$1strike>");

		// Only <span> elements left should be MS ones so can remove.
		// Dropping the tags themselves (rather than matching open/close pairs) also
		// copes with nested spans and spans whose text runs over several lines, and
		// leaves no attribute-less <span> behind to clean up afterwards.
		html = html.replace(/<\/?span\b[^>]*>/gi, "");

		// Strip stupid mso classes
		html = html.replace(/ class=\"(mso[^\"]*)\"/gi, "");
		html = html.replace(/ class=(mso\w+)/gi, "");

		// Eliminate all remaining style attributes
		html = html.replace(/ style=\"[^\"]*\"/gi, "");

		// TOC links: the part before "#" may be empty (href="#_Toc...") and must
		// stay inside this one attribute value, so quotes are excluded from it.
		html = html.replace(/ href="[^"#]*#_Toc[^\"]*\"/gi, "");
		// Anchors left without attributes are unwrapped, even when their text spans lines
		html = html.replace(/<a>([\s\S]*?)<\/a>/gi, "$1");

		this.textarea.value = html;
	}

});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment