Created
February 10, 2012 08:50
-
-
Save adutra/1787869 to your computer and use it in GitHub Desktop.
HtmlCleaner ITagInfoProvider implementation that handles whitespace gracefully
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.htmlcleaner; | |
/**
 * Static helpers for classifying text as HTML white space.
 */
public class HtmlUtils {

    /**
     * The characters this class treats as HTML white space:
     * <ul>
     *   <li>ASCII space (U+0020)</li>
     *   <li>ASCII tab (U+0009)</li>
     *   <li>carriage return (U+000D)</li>
     *   <li>line feed (U+000A)</li>
     *   <li>ASCII form feed (U+000C)</li>
     *   <li>zero-width space (U+200B)</li>
     * </ul>
     *
     * @see "http://www.w3.org/TR/html401/struct/text.html#h-9.1"
     */
    private static final String WHITESPACE = " \t\r\n\u000C\u200B";

    /**
     * Tells whether the given text consists solely of HTML white space characters.
     *
     * @param text the text to inspect; may be {@code null}
     * @return {@code true} if {@code text} is {@code null}, empty, or contains only
     *         the white space characters listed above; {@code false} as soon as any
     *         other character is found
     */
    public static boolean isHtmlWhitespace(String text) {
        // Absent or empty content carries no significant characters.
        if (text == null || text.length() == 0) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            // One character outside the white space set is enough to reject.
            if (WHITESPACE.indexOf(text.charAt(i)) < 0) {
                return false;
            }
        }
        return true;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.htmlcleaner; | |
public class WhitespaceTolerantTagInfo extends TagInfo { | |
public WhitespaceTolerantTagInfo(String name, int contentType, int belongsTo, boolean deprecated, boolean unique, boolean ignorePermitted) { | |
super(name, contentType, belongsTo, deprecated, unique, ignorePermitted); | |
} | |
@Override | |
boolean allowsItem(BaseToken token) { | |
if(token instanceof ContentNode) { | |
if(HtmlUtils.isHtmlWhitespace(((ContentNode) token).getContent().toString())){ | |
return true; | |
} | |
} | |
return super.allowsItem(token); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.htmlcleaner; | |
public class WhitespaceTolerantTagInfoProvider extends DefaultTagProvider implements ITagInfoProvider { | |
/** | |
* | |
*/ | |
private static final long serialVersionUID = 6349860503307965029L; | |
@Override | |
public TagInfo put(String key, TagInfo value) { | |
WhitespaceTolerantTagInfo info = new WhitespaceTolerantTagInfo(value.getName(), value.getContentType(), value.getBelongsTo(), value.isDeprecated(), value.isUnique(), value.isIgnorePermitted()); | |
info.setChildTags(value.getChildTags()); | |
info.setContinueAfterTags(value.getContinueAfterTags()); | |
info.setCopyTags(value.getCopyTags()); | |
info.setFatalTag(value.getFatalTag()); | |
info.setHigherTags(value.getHigherTags()); | |
info.setMustCloseTags(value.getMustCloseTags()); | |
info.setPermittedTags(value.getPermittedTags()); | |
info.setRequiredParent(value.getRequiredParent()); | |
return super.put(key, info); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment