fakedarren · March 27, 2013 22:20
diff --git a/regex.js b/regex.js
 UI.TextEditor.implement({

 	/**
 	 * Cleans up the HTML held in `this.textarea.value` and writes the result
 	 * back to the same property.
 	 *
 	 * Works in two passes. The first normalises browser-generated markup:
 	 * WebKit placeholders, upper-case tag and attribute names, unquoted
 	 * attributes, presentational attributes, div wrappers and script
 	 * elements. The second tidies line breaks and paragraphs and puts
 	 * block-level closing tags on their own lines. Between the two passes
 	 * `this.cleanPasteFromWord()` is called when Microsoft Word markers are
 	 * found in the content.
 	 *
 	 * Takes no arguments and returns nothing.
 	 */
 	cleanHTML: function() {

 		var html = this.textarea.value;

 		// Look for Word markers on the untouched input as well as on the
 		// normalised markup: the steps below lower-case tag names and strip
 		// class attributes, so "w:WordDocument" and class="Mso..." are
 		// usually gone by the time the first pass has finished.
 		var wordMarkers = /(class=\"?Mso|style=\"[^\"]*\bmso\-|w:WordDocument)/;
 		var isWordPaste = wordMarkers.test(html);

 		// Remove double new lines
 		html = html.replace(/\n\n+/g, "\n");

 		// Stupid apple-style-spans etc
 		html = html.replace(/<br class\="webkit-block-placeholder">/gi, "<br />");
 		html = html.replace(/<span class="Apple-style-span">(.*)<\/span>/gi, '$1');
 		html = html.replace(/ class="Apple-style-span"/gi, '');

 		// Replace uppercase element names with lowercase
 		html = html.replace(/<[^> ]*/g, function(match) {
 			return match.toLowerCase();
 		});
 		// Replace uppercase attribute names with lowercase. Quote characters
 		// are excluded from the name so that a space inside a quoted value
 		// (e.g. alt="Hello World") does not get the rest of that value
 		// lower-cased along with the next attribute name.
 		html = html.replace(/<[^>]*>/g, function(match) {
 			return match.replace(/ [^="']+=/g, function(match2) {
 				return match2.toLowerCase();
 			});
 		});
 		// Put quotes around unquoted attributes
 		html = html.replace(/<[^>]*>/g, function(match) {
 			return match.replace(/( [^=]+=)([^"][^ >]*)/g, "$1\"$2\"");
 		});

 		// Convert inline styles to <strong> / <em> tags etc
 		//html = html.replace(/<span style="font-weight: bold;">(.*)<\/span>/gi, '<strong>$1</strong>');
 		//html = html.replace(/<b\b[^>]*>(.*?)<\/b[^>]*>/gi, '<strong>$1</strong>');
 		//html = html.replace(/<span style="font-style: italic;">(.*)<\/span>/gi, '<em>$1</em>');
 		//html = html.replace(/<i\b[^>]*>(.*?)<\/i[^>]*>/gi, '<em>$1</em>');
 		html = html.replace(/<u\b[^>]*>(.*?)<\/u[^>]*>/gi, '<span style="text-decoration: underline;">$1</span>');

 		// strip stupid attributes and all classes
 		html = html.replace(/ (border|valign|align|width|language|height|class)=\"([^\"]*)\"/gi, "");

 		// More complex semantics
 		html = html.replace(/<li>\s*<div>(.+?)<\/div><\/li>/g, '<li>$1</li>');
 		html = html.replace(/^([\w\s]+.*?)<div>/i, '<p>$1</p><div>');
 		html = html.replace(/<div>(.+?)<\/div>/ig, '<p>$1</p>');
 		html = html.replace(/<p>[\s\n]*(<(?:ul|ol)>.*?<\/(?:ul|ol)>)(.*?)<\/p>/ig, '$1<p>$2</p>');
 		html = html.replace(/<\/(ol|ul)>\s*(?!<(?:p|ol|ul|img).*?>)((?:<[^>]*>)?\w.*)$/g, '</$1><p>$2</p>');

 		// Any <script> tags, with or without attributes ([^>]* rather than
 		// [^>]+, which required at least one character after "<script").
 		html = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "");

 		// cleanPasteFromWord() reads and writes the textarea value itself, so
 		// the first-pass result has to be stored before it is called.
 		this.textarea.value = html;

 		if (isWordPaste || wordMarkers.test(html)) {
 			this.cleanPasteFromWord();
 		}

 		html = this.textarea.value;

 		// Convert <br> to <br />
 		html = html.replace(/(<br>)/g, "<br />");

 		// Sort out leading / trailing / useless / invalid <br />s
 		html = html.replace(/<br ?\/?>$/gi, '');
 		html = html.replace(/^<br ?\/?>/gi, '');
 		html = html.replace(/><br ?\/?>/gi, '>');
 		html = html.replace(/<br ?\/?>\s*<\/(h1|h2|h3|h4|h5|h6|li|p)/gi, '</$1');

 		// Clean up paragraphs
 		html = html.replace(/<p>\s*<br ?\/?>\s*<\/p>/gi, '<p>\u00a0</p>');
 		html = html.replace(/<p>(&nbsp;|\s)*<\/p>/gi, '<p>\u00a0</p>');
 		html = html.replace(/\s*<br ?\/?>\s*<\/p>/gi, '</p>');
 		html = html.replace(/<p>(?:\s*)<p>/g, '<p>');
 		html = html.replace(/<\/p>\s*<\/p>/g, '</p>');
 		html = html.replace(/<p>\W*<\/p>/g, '');
 		html = html.replace(/<br[^>]*><\/p>/g, '</p>');
 		html = html.replace(/<p>\s*(<img[^>]+>)\s*<\/p>/ig, '$1\n');

 		// Format sourcecode
 		html = html.replace(/<p([^>]*)>(.*?)<\/p>(?!\n)/g, '<p$1>$2</p>\n');
 		html = html.replace(/<\/(ul|ol|p)>(?!\n)/g, '</$1>\n');
 		html = html.replace(/><li>/g, '>\n\t<li>');
 		html = html.replace(/([^\n])<\/(ol|ul)>/g, '$1\n</$2>');
 		html = html.replace(/([^\n])<img/ig, '$1\n<img');
 		html = html.replace(/^\s*$/g, '');
 		html = html.trim();

 		this.textarea.value = html;

 	},

 	/**
 	 * Strips Microsoft Word specific markup from `this.textarea.value` and
 	 * writes the result back to the same property.
 	 *
 	 * Removes comments, img/font/meta/link/style/div and VML tags, `o:`
 	 * namespace elements, XML declarations, a fixed list of attributes
 	 * (quoted and unquoted), span wrappers, mso classes, style attributes and
 	 * table-of-contents hrefs, and renames <s> to <strike>.
 	 *
 	 * Takes no arguments and returns nothing.
 	 */
 	cleanPasteFromWord: function() {

 		var html = this.textarea.value;

 		// Remove MS word comments
 		html = html.replace(/<!--[\s\S]+?-->/gi, "");
 		// Dodgy tags and VML
 		html = html.replace(/<\/?(img|font|meta|link|style|div|v:\w+)[^>]*>/gi, "");
 		// MS namespace elements
 		html = html.replace(/<\/?o:[^>]*>/gi, "");
 		// XML namespace declarations
 		html = html.replace(/<\\?\?xml[^>]*>/gi, "");
 		// on.., class, style and other attributes with and without quotes (different browsers)
 		html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=\"([^\"]*)\"/gi, "");
 		html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=(\w+)/gi, "");
 		// <s> into <strike> for strikethrough
 		html = html.replace(/<(\/?)s>/gi, "<$1strike>");
 		// Only <span> elements left should be MS ones so can remove
 		html = html.replace(/<span\b[^>]*>(.*?)<\/span[^>]*>/gi, '$1');
 		// Strip stupid mso classes
 		html = html.replace(/ class=\"(mso[^\"]*)\"/gi, "");
 		html = html.replace(/ class=(mso\w+)/gi, "");
 		// Eliminate all remaining style attributes
 		html = html.replace(/ style=\"[^\"]*\"/gi, "");
 		// TOC links: drop the href, then unwrap anchors left without attributes
 		html = html.replace(/ href="[^#]+#_Toc[^\"]*\"/gi, "");
 		html = html.replace(/<a>(.*?)<\/a>/gi, "$1");
 		// All blank spans
 		html = html.replace(/<span>(.*)<\/span>/gi, "$1");

 		this.textarea.value = html;

 	}

 });
	UI.TextEditor.implement({

		/**
		 * Cleans up the HTML held in `this.textarea.value` and writes the
		 * result back to the same property.
		 *
		 * Works in two passes. The first normalises browser-generated markup:
		 * WebKit placeholders, upper-case tag and attribute names, unquoted
		 * attributes, presentational attributes, div wrappers and script
		 * elements. The second tidies line breaks and paragraphs and puts
		 * block-level closing tags on their own lines. Between the two passes
		 * `this.cleanPasteFromWord()` is called when Microsoft Word markers
		 * are found in the content.
		 *
		 * Takes no arguments and returns nothing.
		 */
		cleanHTML: function() {

			var html = this.textarea.value;

			// Look for Word markers on the untouched input as well as on the
			// normalised markup: the steps below lower-case tag names and
			// strip class attributes, so "w:WordDocument" and class="Mso..."
			// are usually gone by the time the first pass has finished.
			var wordMarkers = /(class=\"?Mso|style=\"[^\"]*\bmso\-|w:WordDocument)/;
			var isWordPaste = wordMarkers.test(html);

			// Remove double new lines
			html = html.replace(/\n\n+/g, "\n");

			// Stupid apple-style-spans etc
			html = html.replace(/<br class\="webkit-block-placeholder">/gi, "<br />");
			html = html.replace(/<span class="Apple-style-span">(.*)<\/span>/gi, '$1');
			html = html.replace(/ class="Apple-style-span"/gi, '');

			// Replace uppercase element names with lowercase
			html = html.replace(/<[^> ]*/g, function(match) {
				return match.toLowerCase();
			});
			// Replace uppercase attribute names with lowercase. Quote
			// characters are excluded from the name so that a space inside a
			// quoted value (e.g. alt="Hello World") does not get the rest of
			// that value lower-cased along with the next attribute name.
			html = html.replace(/<[^>]*>/g, function(match) {
				return match.replace(/ [^="']+=/g, function(match2) {
					return match2.toLowerCase();
				});
			});
			// Put quotes around unquoted attributes
			html = html.replace(/<[^>]*>/g, function(match) {
				return match.replace(/( [^=]+=)([^"][^ >]*)/g, "$1\"$2\"");
			});

			// Convert inline styles to <strong> / <em> tags etc
			//html = html.replace(/<span style="font-weight: bold;">(.*)<\/span>/gi, '<strong>$1</strong>');
			//html = html.replace(/<b\b[^>]*>(.*?)<\/b[^>]*>/gi, '<strong>$1</strong>');
			//html = html.replace(/<span style="font-style: italic;">(.*)<\/span>/gi, '<em>$1</em>');
			//html = html.replace(/<i\b[^>]*>(.*?)<\/i[^>]*>/gi, '<em>$1</em>');
			html = html.replace(/<u\b[^>]*>(.*?)<\/u[^>]*>/gi, '<span style="text-decoration: underline;">$1</span>');

			// strip stupid attributes and all classes
			html = html.replace(/ (border|valign|align|width|language|height|class)=\"([^\"]*)\"/gi, "");

			// More complex semantics
			html = html.replace(/<li>\s*<div>(.+?)<\/div><\/li>/g, '<li>$1</li>');
			html = html.replace(/^([\w\s]+.*?)<div>/i, '<p>$1</p><div>');
			html = html.replace(/<div>(.+?)<\/div>/ig, '<p>$1</p>');
			html = html.replace(/<p>[\s\n]*(<(?:ul|ol)>.*?<\/(?:ul|ol)>)(.*?)<\/p>/ig, '$1<p>$2</p>');
			html = html.replace(/<\/(ol|ul)>\s*(?!<(?:p|ol|ul|img).*?>)((?:<[^>]*>)?\w.*)$/g, '</$1><p>$2</p>');

			// Any <script> tags, with or without attributes ([^>]* rather
			// than [^>]+, which required at least one character after
			// "<script").
			html = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "");

			// cleanPasteFromWord() reads and writes the textarea value itself,
			// so the first-pass result has to be stored before it is called.
			this.textarea.value = html;

			if (isWordPaste || wordMarkers.test(html)) {
				this.cleanPasteFromWord();
			}

			html = this.textarea.value;

			// Convert <br> to <br />
			html = html.replace(/(<br>)/g, "<br />");

			// Sort out leading / trailing / useless / invalid <br />s
			html = html.replace(/<br ?\/?>$/gi, '');
			html = html.replace(/^<br ?\/?>/gi, '');
			html = html.replace(/><br ?\/?>/gi, '>');
			html = html.replace(/<br ?\/?>\s*<\/(h1|h2|h3|h4|h5|h6|li|p)/gi, '</$1');

			// Clean up paragraphs
			html = html.replace(/<p>\s*<br ?\/?>\s*<\/p>/gi, '<p>\u00a0</p>');
			html = html.replace(/<p>(&nbsp;|\s)*<\/p>/gi, '<p>\u00a0</p>');
			html = html.replace(/\s*<br ?\/?>\s*<\/p>/gi, '</p>');
			html = html.replace(/<p>(?:\s*)<p>/g, '<p>');
			html = html.replace(/<\/p>\s*<\/p>/g, '</p>');
			html = html.replace(/<p>\W*<\/p>/g, '');
			html = html.replace(/<br[^>]*><\/p>/g, '</p>');
			html = html.replace(/<p>\s*(<img[^>]+>)\s*<\/p>/ig, '$1\n');

			// Format sourcecode
			html = html.replace(/<p([^>]*)>(.*?)<\/p>(?!\n)/g, '<p$1>$2</p>\n');
			html = html.replace(/<\/(ul|ol|p)>(?!\n)/g, '</$1>\n');
			html = html.replace(/><li>/g, '>\n\t<li>');
			html = html.replace(/([^\n])<\/(ol|ul)>/g, '$1\n</$2>');
			html = html.replace(/([^\n])<img/ig, '$1\n<img');
			html = html.replace(/^\s*$/g, '');
			html = html.trim();

			this.textarea.value = html;

		},

		/**
		 * Strips Microsoft Word specific markup from `this.textarea.value`
		 * and writes the result back to the same property.
		 *
		 * Removes comments, img/font/meta/link/style/div and VML tags, `o:`
		 * namespace elements, XML declarations, a fixed list of attributes
		 * (quoted and unquoted), span wrappers, mso classes, style attributes
		 * and table-of-contents hrefs, and renames <s> to <strike>.
		 *
		 * Takes no arguments and returns nothing.
		 */
		cleanPasteFromWord: function() {

			var html = this.textarea.value;

			// Remove MS word comments
			html = html.replace(/<!--[\s\S]+?-->/gi, "");
			// Dodgy tags and VML
			html = html.replace(/<\/?(img|font|meta|link|style|div|v:\w+)[^>]*>/gi, "");
			// MS namespace elements
			html = html.replace(/<\/?o:[^>]*>/gi, "");
			// XML namespace declarations
			html = html.replace(/<\\?\?xml[^>]*>/gi, "");
			// on.., class, style and other attributes with and without quotes (different browsers)
			html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=\"([^\"]*)\"/gi, "");
			html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=(\w+)/gi, "");
			// <s> into <strike> for strikethrough
			html = html.replace(/<(\/?)s>/gi, "<$1strike>");
			// Only <span> elements left should be MS ones so can remove
			html = html.replace(/<span\b[^>]*>(.*?)<\/span[^>]*>/gi, '$1');
			// Strip stupid mso classes
			html = html.replace(/ class=\"(mso[^\"]*)\"/gi, "");
			html = html.replace(/ class=(mso\w+)/gi, "");
			// Eliminate all remaining style attributes
			html = html.replace(/ style=\"[^\"]*\"/gi, "");
			// TOC links: drop the href, then unwrap anchors left without attributes
			html = html.replace(/ href="[^#]+#_Toc[^\"]*\"/gi, "");
			html = html.replace(/<a>(.*?)<\/a>/gi, "$1");
			// All blank spans
			html = html.replace(/<span>(.*)<\/span>/gi, "$1");

			this.textarea.value = html;

		}

	});