Created
September 2, 2014 02:08
-
-
Save precious-ming/3efd4316e8ba4af8e9d6 to your computer and use it in GitHub Desktop.
HTML标签过滤(正则)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Regular-expression based helpers for working with HTML fragments.
 *
 * <p>Covers escaping HTML special characters, stripping all or selected tags,
 * stripping inline event handlers and {@code javascript:} URLs, collecting
 * {@code <img>} tags, and rewriting a tag into a custom marker built from one of
 * its attribute values.
 *
 * <p>NOTE(review): regular expressions cannot fully parse HTML. The sanitizing
 * helpers here are best effort and should not be the only defence against
 * script injection for untrusted input.
 */
public class HtmlRegexpUtil {

    /** Matches anything that starts with {@code <} and ends with {@code >}. */
    private final static String regxpForHtml = "<([^>]*)>";

    /** Matches an IMG tag. Not referenced by any method of this class. */
    private final static String regxpForImgTag = "<\\s*img\\s+([^>]*)\\s*>";

    /** Matches the SRC attribute of an IMG tag. Not referenced by any method of this class. */
    private final static String regxpForImaTagSrcAttrib = "src=\"([^\"]+)\"";

    /** Compiled form of {@link #regxpForHtml}; compiled once because it never changes. */
    private static final Pattern HTML_TAG_PATTERN = Pattern.compile(regxpForHtml);

    /**
     * Matches an inline event-handler attribute name (including its {@code =}) or a
     * {@code javascript:} scheme that appears somewhere after a {@code <}.
     * Group 1 is the text from the {@code <} up to the offending token, so replacing a
     * match with {@code $1} deletes just the token.
     *
     * <p>The lookbehind keeps {@code on...=} from matching in the middle of another
     * attribute name such as {@code action=} or {@code data-onfoo=}; {@code \s*} allows any
     * whitespace (not only spaces) between the attribute name and {@code =}.
     */
    private static final Pattern EVENT_PATTERN = Pattern.compile(
            "(<[^<]*)((?<![\\w-])on\\w*\\s*=|javascript:)",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Matches a complete IMG tag that carries a quoted SRC attribute; group 1 is the SRC value. */
    private static final Pattern IMG_PATTERN = Pattern.compile(
            "<img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>",
            Pattern.CASE_INSENSITIVE);

    /** Tags removed by {@link #makeSafe(String)} when the caller supplies no list. */
    private static final String DEFAULT_UNSAFE_TAGS =
            "html|body|head|title|style|video|canvas|script|iframe|frameset|frame|object|embed|xml|input|button|textarea|select|pre|option|plaintext|form";

    public HtmlRegexpUtil() {
    }

    /**
     * Escapes the HTML special characters {@code <}, {@code >}, {@code "} and {@code &}
     * so the text is displayed literally instead of being interpreted as markup.
     *
     * @param input text to escape; may be {@code null}
     * @return the escaped text, or {@code input} itself when it is {@code null}, empty,
     *         or contains none of the four characters
     */
    public String replaceTag(String input) {
        if (!hasSpecialChars(input)) {
            return input;
        }
        StringBuilder filtered = new StringBuilder(input.length());
        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            switch (c) {
                case '<':
                    filtered.append("&lt;");
                    break;
                case '>':
                    filtered.append("&gt;");
                    break;
                case '"':
                    filtered.append("&quot;");
                    break;
                case '&':
                    filtered.append("&amp;");
                    break;
                default:
                    filtered.append(c);
            }
        }
        return filtered.toString();
    }

    /**
     * Tells whether the text contains any of {@code <}, {@code >}, {@code "} or {@code &}.
     *
     * @param input text to inspect; may be {@code null}
     * @return {@code true} if at least one of the four characters is present;
     *         {@code false} otherwise, including for {@code null} or empty input
     */
    public static boolean hasSpecialChars(String input) {
        if (input == null) {
            return false;
        }
        for (int i = 0; i < input.length(); i++) {
            switch (input.charAt(i)) {
                case '>':
                case '<':
                case '"':
                case '&':
                    return true;
                default:
                    break;
            }
        }
        return false;
    }

    /**
     * Removes every substring that starts with {@code <} and ends with the next {@code >}.
     *
     * @param str text to filter; may be {@code null}
     * @return the text with all such substrings removed, or {@code null} if {@code str} is {@code null}
     */
    public static String filterHtml(String str) {
        if (str == null) {
            return null;
        }
        return HTML_TAG_PATTERN.matcher(str).replaceAll("");
    }

    /**
     * Removes the opening and closing tags (not their content) of the given elements,
     * ignoring case.
     *
     * @param str  text to filter; may be {@code null}
     * @param tags tag names to remove, written as a regular-expression alternation
     *             such as {@code "script|iframe"}; inserted into the pattern unescaped
     * @return the filtered text; {@code null} if {@code str} is {@code null};
     *         {@code str} unchanged if {@code tags} is {@code null}
     */
    public static String removeHTMLTags(String str, String tags) {
        if (str == null) {
            return null;
        }
        if (tags == null) {
            return str;
        }
        // The negative lookahead requires the tag name to end here, so that "p" does not
        // also strip <pre> and "head" does not also strip <header>.
        String regx = "(</?)(" + tags + ")(?![\\w-])([^>]*>)";
        Pattern pattern = Pattern.compile(regx, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
        // Repeat until nothing matches: removing one tag can splice the surrounding text
        // into a new one, e.g. "<scr<script>ipt>" becomes "<script>".
        Matcher matcher = pattern.matcher(str);
        while (matcher.find()) {
            str = matcher.replaceAll("");
            matcher = pattern.matcher(str);
        }
        return str;
    }

    /**
     * Removes inline event-handler attribute names (such as {@code onclick=}) and
     * {@code javascript:} schemes that occur after a {@code <}. Only the attribute name and
     * {@code =} (or the scheme) are deleted; the attribute value is left in place.
     *
     * @param content text to filter; may be {@code null}
     * @return the filtered text, or {@code null} if {@code content} is {@code null}
     */
    public static String removeEvents(String content) {
        if (content == null) {
            return null;
        }
        String current = content;
        // Repeat until nothing matches, to defeat crafted input such as "onclick=onclick=XXX"
        // where one removal exposes another handler.
        Matcher matcher = EVENT_PATTERN.matcher(current);
        while (matcher.find()) {
            current = matcher.replaceAll("$1");
            matcher = EVENT_PATTERN.matcher(current);
        }
        return current;
    }

    /**
     * Collects every {@code <img>} tag that has a quoted {@code src} attribute, ignoring case.
     *
     * <p>Pattern used (as a Java string literal; replace {@code \\} with {@code \} for other
     * languages): {@code <img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>}
     *
     * @param content text that may contain img tags; may be {@code null}
     * @return the complete text of each matching tag in order of appearance;
     *         an empty list if there are none or {@code content} is {@code null}
     */
    public static List<String> findImgs(String content) {
        List<String> imgs = new ArrayList<String>();
        if (content == null) {
            return imgs;
        }
        Matcher m = IMG_PATTERN.matcher(content);
        while (m.find()) {
            imgs.add(m.group());
        }
        return imgs;
    }

    /**
     * Replaces each occurrence of a tag with {@code startTag + attributeValue + endTag},
     * where {@code attributeValue} is the double-quoted value of {@code tagAttrib} inside
     * that tag. A matching tag that lacks the attribute is removed.
     *
     * <p>Example: with {@code beforeTag="img"}, {@code tagAttrib="src"},
     * {@code startTag="[img]"} and {@code endTag="[/img]"}, the tag
     * {@code <img src="a.jpg">} becomes {@code [img]a.jpg[/img]}.
     *
     * @param str       text to process; may be {@code null}
     * @param beforeTag name of the tag to replace; inserted into the pattern unescaped
     * @param tagAttrib name of the attribute whose value is kept; inserted into the pattern unescaped
     * @param startTag  text placed before the attribute value
     * @param endTag    text placed after the attribute value
     * @return the processed text, or {@code null} if {@code str} is {@code null}
     */
    public static String replaceHtmlTag(String str, String beforeTag,
            String tagAttrib, String startTag, String endTag) {
        if (str == null) {
            return null;
        }
        Pattern patternForTag = Pattern.compile("<\\s*" + beforeTag + "\\s+([^>]*)\\s*>");
        Pattern patternForAttrib = Pattern.compile(tagAttrib + "=\"([^\"]+)\"");
        Matcher matcherForTag = patternForTag.matcher(str);
        StringBuffer sb = new StringBuffer();
        while (matcherForTag.find()) {
            Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag.group(1));
            String replacement = "";
            if (matcherForAttrib.find()) {
                replacement = startTag + matcherForAttrib.group(1) + endTag;
            }
            // Quote the replacement: attribute values (typically URLs) may contain '$' or '\',
            // which appendReplacement would otherwise treat as group references or escapes.
            matcherForTag.appendReplacement(sb, Matcher.quoteReplacement(replacement));
        }
        matcherForTag.appendTail(sb);
        return sb.toString();
    }

    /**
     * Strips a default set of structural, scripting, embedding and form tags, and then
     * inline event handlers and {@code javascript:} schemes.
     *
     * @param content text to sanitize; may be {@code null}
     * @return the sanitized text, or {@code null} if {@code content} is {@code null}
     */
    public static String makeSafe(String content) {
        return makeSafe(content, DEFAULT_UNSAFE_TAGS);
    }

    /**
     * Strips the given tags, and then inline event handlers and {@code javascript:} schemes.
     *
     * @param content text to sanitize; may be {@code null}
     * @param tags    tag names to remove as a regular-expression alternation;
     *                {@code null} selects the default list of {@link #makeSafe(String)}
     * @return the sanitized text, or {@code null} if {@code content} is {@code null}
     */
    public static String makeSafe(String content, String tags) {
        if (tags == null) {
            return makeSafe(content);
        }
        // Run both passes until the text stops changing: deleting "javascript:" can splice
        // a forbidden tag back together (e.g. "<scrjavascript:ipt>" becomes "<script>").
        String current = content;
        String previous;
        do {
            previous = current;
            current = removeEvents(removeHTMLTags(previous, tags));
        } while (current != null && !current.equals(previous));
        return current;
    }

    /**
     * Small demo: appends the marker {@code @@} after every closing paragraph tag of a
     * sample HTML fragment and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String str = "<p align=center><img src='http://apic.lawtv.net.cn/res/cont/20140124/14//res04_attpic_brief(2).jpg'></p>"
                + "+<p><br> 网站被关闭,记者被逮捕,广告被叫停,报纸被起诉,报刊被查处,看起来好像都是坏消息,其实真正说起来应该是好消息。 </p>"
                + "<p> 因为2013年所有与传媒有关的新闻,显现的都是一种规范与秩序的主旋律。于是,才有了全国新闻战线关于学习马克思主义新闻观的全员培训,才有了全体采编人员关于记者证换领所必须经过的集中培训考核之后的“国考”。 </p>"
                + "<p> 为了规范与秩序,就需要全体媒体人知道,我们的底线是什么,我们的边线在哪里,我们的高压线有什么样的威力。通过学习,通过考核,通过管理,让规范的媒体享受到规范的殊荣,让有秩序的媒体发展得更加强大。同时,通过重点查处,通过重拳出击,通过法律说法,有偿新闻被遏制,恶俗广告被制止,虚假新闻被控制,非法网站被关闭,违法犯罪被惩罚…… </p>"
                + "<p> 在网络媒体如火如荼的今天,在新兴媒体风起云涌的当下,在传统媒体急剧转型的时刻,2013年的传媒可谓规范与秩序相得益彰,传统与现代比学赶帮,从而共同谱写了一曲有声有色、有为有守的年度传媒篇。</p>";
        str = str.replaceAll("</p>", "</p>@@");
        System.out.println(str);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment