Created
March 5, 2018 06:42
-
-
Save TooBug/da708e7b50cc5a8ddc121aa393d8f307 to your computer and use it in GitHub Desktop.
clean word tags
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html> | |
<head> | |
<meta charset="utf-8" /> | |
<title>cleanWord</title> | |
<style> | |
html,body{ | |
height:100%; | |
margin:0; | |
padding:0; | |
} | |
#source,#sourceCode,#result,#preview{ | |
display:inline-block; | |
margin:5px; | |
border:1px solid #ccc; | |
width:22%; | |
height:80%; | |
vertical-align: top; | |
overflow: auto; | |
} | |
</style> | |
</head> | |
<body> | |
<div> | |
<label><input type="checkbox" id="deep" checked />深度清理</label> | |
</div> | |
<div id="source" contenteditable></div> | |
<textarea id="sourceCode"></textarea> | |
<textarea id="result"></textarea> | |
<div id="preview"></div> | |
<script src="http://code.jquery.com/jquery-2.1.3.min.js"></script> | |
<script> | |
/*global $*/ | |
/*jshint strict:false*/ | |
/**
 * Re-run the cleaner on whatever is currently inside #source and refresh
 * the three output panes: #sourceCode (raw HTML), #result (cleaned HTML as
 * text) and #preview (cleaned HTML rendered).
 *
 * The work is deferred with a zero-delay timer so that, when this is used
 * as a paste handler, it runs after the browser has finished handling the
 * event and the pasted content is actually present in #source.
 */
function doClean(){
    var refresh = function(){
        var rawHtml = $('#source').html();
        $('#sourceCode').val(rawHtml);
        var cleaned = cleanHTML(rawHtml);
        $('#result').val(cleaned);
        $('#preview').html(cleaned);
    };
    setTimeout(refresh,0);
}
// Re-clean whenever content is pasted into the editable pane or the
// deep-clean checkbox is toggled.
$('#source').on('paste', doClean);
$('#deep').on('change', doClean);
/**
 * Strip Microsoft Word specific markup from a pasted HTML fragment.
 *
 * The cleaning level is read from the #deep checkbox:
 *   - checked   -> level 2 (deep): every <span> is dropped and all attributes
 *                  are removed except rowspan/colspan on <td> and everything
 *                  on <img>.
 *   - unchecked -> level 1 (simple): a small per-tag whitelist of attributes
 *                  and inline style properties is kept.
 *
 * @param {string} sHtml Raw HTML (null/undefined is treated as '').
 * @returns {string} The cleaned HTML.
 */
function cleanHTML(sHtml)
{
    // Hard-coded to false, so the VML branch below never runs; it is kept
    // only for parity with the non-IE branch.
    var isIE = false;
    var cleanPaste = $('#deep').prop('checked')?2:1;
    var imgPlaceholder = 'http://lorempixel.com/200/200/animals/1/placeholder/';
    var i;

    /**
     * Remove one pair of surrounding quotes from a raw attribute value.
     * Valueless attributes (e.g. `checked`) arrive as undefined because the
     * value capture group is optional; they are treated as an empty string
     * instead of throwing.
     */
    function unquote(raw){
        if(raw === undefined || raw === null)return '';
        return raw.match(/^(["']?)(.*)\1/)[2];
    }

    /**
     * Keep only the whitelisted CSS properties of an inline style value.
     * Returns the rebuilt ` name="..."` attribute text, or '' when nothing
     * survives. Quotes (literal or serialized as &quot;) are normalized to
     * single quotes first, so the `;` of an entity cannot be mistaken for a
     * declaration separator.
     */
    function filterStyle(name,value,allowed){
        var kept = value.replace(/&quot;|"/ig,"'").replace(/\s*([^:]+)\s*:\s*(.*?)(;|$)/ig,function(all,prop,val){
            return allowed.test(prop)?(prop+':'+val+';'):'';
        }).replace(/^\s+|\s+$/g,'');
        return kept?(' '+name+'="'+kept+'"'):'';
    }

    /** Replacement callback: unwrap a tag, keeping only its content. */
    function cleanEmptyTag(all,tag,content){
        return content;
    }

    sHtml = (sHtml === undefined || sHtml === null) ? '' : String(sHtml);

    // Block-level cleanup: HTML comments, conditional comments and <style> blocks.
    sHtml = sHtml.replace(/<!--[\s\S]*?-->|<!(--)?\[[\s\S]+?\](--)?>|<style(\s+[^>]*?)?>[\s\S]*?<\/style>/ig, '');
    sHtml = sHtml.replace(/\r?\n/ig, '');

    // Keep a placeholder for images coming from Word.
    if(isIE){
        sHtml = sHtml.replace(/<v:shapetype(\s+[^>]*)?>[\s\S]*<\/v:shapetype>/ig,'');
        sHtml = sHtml.replace(/<v:shape(\s+[^>]+)?>[\s\S]*?<v:imagedata(\s+[^>]+)?>\s*<\/v:imagedata>[\s\S]*?<\/v:shape>/ig,function(all,attr1,attr2){
            // Optional groups are undefined when the tag carries no attributes.
            var hasSrc = /\s+src\s*=\s*("[^"]+"|'[^']+'|[^>\s]+)/i.test(attr2 || '');
            if(!hasSrc)return '';
            var sImg ='<img src="'+imgPlaceholder+'" _xhe_temp="true" class="wordImage"';
            var style = (attr1 || '').match(/\s+style\s*=\s*("[^"]+"|'[^']+'|[^>\s]+)/i);
            if(style){
                sImg += ' style="' + unquote(style[1]) + '"';
            }
            sImg += ' />';
            return sImg;
        });
    }
    else{
        sHtml = sHtml.replace(/<img( [^<>]*(v:shapes|msohtmlclip)[^<>]*)\/?>/ig,function(all,attr){
            var match,str = '<img src="'+imgPlaceholder+'" _xhe_temp="true" class="wordImage"';
            match = attr.match(/ width\s*=\s*"([^"]+)"/i);
            if(match)str += ' width="'+match[1]+'"';
            match = attr.match(/ height\s*=\s*"([^"]+)"/i);
            if(match)str += ' height="'+match[1]+'"';
            return str + ' />';
        });
    }

    // Walk every tag: drop unwanted tags entirely, filter attributes on the rest.
    sHtml=sHtml.replace(/(<(\/?)([\w\-:]+))((?:\s+[\w\-:]+(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^>\s]+))?)*)\s*(\/?>)/g,function(all,left,end,tag,attr,right){
        tag=tag.toLowerCase();
        // Local-file <link>, namespaced tags (o:p, v:shape, ...) and, in deep mode, every span.
        if((tag.match(/^(link)$/)&&attr.match(/file:\/\//i))||tag.match(/:/)||(tag==='span'&&cleanPaste===2))return '';
        if(!end){
            attr=attr.replace(/\s([\w\-:]+)(?:\s*=\s*("[^"]*"|'[^']*'|[^>\s]+))?/ig,function(all,n,v){
                n=n.toLowerCase();
                if(/:/.test(n))return '';
                v=unquote(v);
                if(cleanPaste===1){// simple cleanup
                    switch(tag){
                        case 'p':
                            if(n === 'style')return filterStyle(n,v,/^(text-align)$/i);
                            break;
                        case 'span':
                            if(n === 'style')return filterStyle(n,v,/^(color|background|font-size|font-family)$/i);
                            break;
                        case 'table':
                            if(n.match(/^(cellspacing|cellpadding|border|width)$/i))return all;
                            break;
                        case 'td':
                            if(n.match(/^(rowspan|colspan)$/i))return all;
                            if(n === 'style')return filterStyle(n,v,/^(width|height)$/i);
                            break;
                        case 'a':
                            if(n.match(/^(href)$/i))return all;
                            break;
                        case 'font':
                        case 'img':
                            return all;
                    }
                }
                else if(cleanPaste===2){
                    switch(tag){
                        case 'td':
                            if(n.match(/^(rowspan|colspan)$/i))return all;
                            break;
                        case 'img':
                            return all;
                    }
                }
                return '';
            });
        }
        return left+attr+right;
    });

    // Tags with empty content; three passes so that nesting exposed by an
    // earlier pass is removed by a later one.
    for(i=0;i<3;i++)sHtml = sHtml.replace( /<([^\s>]+)(\s+[^>]*)?>\s*<\/\1>/g,'');

    // Meaningless attribute-less <span>/<a> wrappers: unwrap, keeping the content.
    for(i=0;i<3;i++)sHtml = sHtml.replace(/<(span|a)>(((?!<\1(\s+[^>]*?)?>)[\s\S]|<\1(\s+[^>]*?)?>((?!<\1(\s+[^>]*?)?>)[\s\S]|<\1(\s+[^>]*?)?>((?!<\1(\s+[^>]*?)?>)[\s\S])*?<\/\1>)*?<\/\1>)*?)<\/\1>/ig,cleanEmptyTag);// level 3
    for(i=0;i<3;i++)sHtml = sHtml.replace(/<(span|a)>(((?!<\1(\s+[^>]*?)?>)[\s\S]|<\1(\s+[^>]*?)?>((?!<\1(\s+[^>]*?)?>)[\s\S])*?<\/\1>)*?)<\/\1>/ig,cleanEmptyTag);// level 2
    for(i=0;i<3;i++)sHtml = sHtml.replace(/<(span|a)>(((?!<\1(\s+[^>]*?)?>)[\s\S])*?)<\/\1>/ig,cleanEmptyTag);// innermost level

    // Merge directly nested <font> tags into one.
    for(i=0;i<3;i++)sHtml = sHtml.replace(/<font(\s+[^>]+)><font(\s+[^>]+)>/ig,function(all,attr1,attr2){
        return '<font'+attr1+attr2+'>';
    });

    // Remove whitespace and other stray text in the gaps between table tags;
    // only text directly following an opening <td> is real cell content.
    sHtml=sHtml.replace(/(<(\/?)(tr|td)(?:\s+[^>]+)?>)[^<>]+/ig,function(all,left,end,tag){
        if(!end&&/^td$/i.test(tag))return all;
        else return left;
    });
    return sHtml;
}
</script> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment