Created
August 5, 2010 17:31
-
-
Save spullara/510063 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package bagcheck.util; | |
| import java.util.Stack; | |
| import java.util.regex.Matcher; | |
| import java.util.regex.Pattern; | |
| /** | |
| * Clean up Text | |
| * <p/> | |
| * User: sam | |
| * Date: Aug 5, 2010 | |
| * Time: 9:25:02 AM | |
| */ | |
public class Text {
  /** Finds either a complete {@code <...>} tag (group 1) or a lone {@code <} plus one following character (group 2). */
  private static final Pattern MATCH_TAGS = Pattern.compile("<(/?[^><]+?)>|<([^><]+?)");
  /** Attribute-free tags (opening or closing form) that are passed through. */
  private static final Pattern ALLOWED_TAGS = Pattern.compile("/?(br|p|a|b|i|ol|li|ul|blockquote)");
  /** The only accepted attribute form: an anchor with a single double-quoted href (group 1). */
  private static final Pattern ALLOWED_A_TAG = Pattern.compile("a\\s+href=\"([^\"]+?)\"");
  /** Accepted link targets: http/https URLs made of letters, digits, '-', '_', '.', and '/'. */
  private static final Pattern ALLOWED_HREF = Pattern.compile("https?://[a-zA-Z0-9-_./]+");

  /**
   * Sanitizes user-supplied markup down to a small whitelist of tags.
   *
   * <p>Every {@code &} is escaped to {@code &amp;}. Tags matching the whitelist are kept;
   * anchors are kept only when they consist of a single {@code href} that matches the
   * allowed URL pattern, and are re-emitted in normalized form with {@code rel="nofollow"}.
   * All other tags are removed. Any {@code <} or {@code >} that is not part of an emitted
   * tag is escaped, so removing a disallowed tag can never splice surrounding text into a
   * new tag. A closing tag is emitted only when it matches the most recently opened tag;
   * tags still open at the end of the input are closed. Finally each newline becomes
   * {@code <br>}.
   *
   * @param input the raw text to sanitize; must not be {@code null}
   * @return the sanitized markup
   * @throws NullPointerException if {@code input} is {@code null}
   */
  public static String strip(String input) {
    Stack<String> openTags = new Stack<String>();
    // Escape ampersands first so the entities written below are the only ones in the output.
    String text = input.replace("&", "&amp;");
    // The output is assembled by hand instead of via Matcher.appendReplacement, which would
    // interpret '$' and '\' coming from the input as replacement syntax and could throw.
    StringBuilder out = new StringBuilder(text.length() + 16);
    Matcher tagMatcher = MATCH_TAGS.matcher(text);
    int copiedUpTo = 0;
    while (tagMatcher.find()) {
      // Text between matches may still hold a '<' or '>' that could not start a match.
      appendEscaped(out, text, copiedUpTo, tagMatcher.start());
      copiedUpTo = tagMatcher.end();
      String tag = tagMatcher.group(1);
      if (tag == null) {
        // Lone '<' that never closes (e.g. "1 < 10"): keep it as visible text.
        appendEscaped(out, text, tagMatcher.start(), tagMatcher.end());
      } else if (ALLOWED_TAGS.matcher(tag).matches()) {
        if (tag.startsWith("/")) {
          String name = tag.substring(1);
          // Peek before popping so a stray closer does not discard an unrelated open tag.
          if (!openTags.isEmpty() && openTags.peek().equals(name)) {
            openTags.pop();
            out.append("</").append(name).append('>');
          }
        } else {
          // <br> has no closing tag, so it is never tracked as open.
          if (!tag.equals("br")) {
            openTags.push(tag);
          }
          out.append('<').append(tag).append('>');
        }
      } else {
        Matcher anchor = ALLOWED_A_TAG.matcher(tag);
        if (anchor.matches() && ALLOWED_HREF.matcher(anchor.group(1)).matches()) {
          // Re-emit in normalized form: the whitespace after 'a' may be a newline, which
          // the final newline-to-<br> pass would otherwise rewrite inside the tag.
          out.append("<a href=\"").append(anchor.group(1)).append("\" rel=\"nofollow\">");
          openTags.push("a");
        }
        // Anything else is a disallowed tag and is dropped.
      }
    }
    appendEscaped(out, text, copiedUpTo, text.length());
    // Close whatever is still open so the fragment cannot affect markup that follows it.
    while (!openTags.isEmpty()) {
      out.append("</").append(openTags.pop()).append('>');
    }
    return out.toString().replace("\n", "<br>");
  }

  /** Appends {@code text[from, to)} to {@code out}, escaping angle brackets as entities. */
  private static void appendEscaped(StringBuilder out, String text, int from, int to) {
    for (int i = from; i < to; i++) {
      char c = text.charAt(i);
      if (c == '<') {
        out.append("&lt;");
      } else if (c == '>') {
        out.append("&gt;");
      } else {
        out.append(c);
      }
    }
  }

  /** Small demo: prints the sanitized form of an anchor whose text contains a bare '<'. */
  public static void main(String[] args) {
    System.out.println(strip("<a href=\"http://lessthan10.com\">1 < 10</a>"));
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment