Skip to content

Instantly share code, notes, and snippets.

@adutra
Created February 10, 2012 08:50
Show Gist options
  • Save adutra/1787869 to your computer and use it in GitHub Desktop.
Save adutra/1787869 to your computer and use it in GitHub Desktop.
HtmlCleaner ITagInfoProvider implementation that handles whitespace gracefully
package org.htmlcleaner;
/**
 * String helpers for HTML processing.
 */
public class HtmlUtils {

    /**
     * The characters this class treats as HTML white space:
     * <ul>
     * <li>ASCII space (U+0020)</li>
     * <li>ASCII tab (U+0009)</li>
     * <li>carriage return (U+000D)</li>
     * <li>line feed (U+000A)</li>
     * <li>ASCII form feed (U+000C)</li>
     * <li>zero-width space (U+200B)</li>
     * </ul>
     * Kept as a String so that membership can be tested with {@link String#indexOf(int)}.
     *
     * @see "http://www.w3.org/TR/html401/struct/text.html#h-9.1"
     */
    private static final String WHITESPACE = " \t\r\n\u000C\u200B";

    /**
     * Tells whether the given text consists solely of HTML white space characters.
     *
     * @param text the text to inspect; may be {@code null}
     * @return {@code true} if {@code text} is {@code null}, empty, or contains only
     *         characters listed in {@link #WHITESPACE}; {@code false} as soon as any
     *         other character is found
     */
    public static boolean isHtmlWhitespace(String text) {
        // A missing text is treated the same way as an empty one.
        if (text == null) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            if (WHITESPACE.indexOf(text.charAt(i)) < 0) {
                return false;
            }
        }
        // Also reached for the empty string, which has no offending character.
        return true;
    }
}
package org.htmlcleaner;
/**
 * A {@link TagInfo} that always accepts content nodes made up only of HTML white space,
 * and otherwise applies the rules of its superclass.
 */
public class WhitespaceTolerantTagInfo extends TagInfo {

    /**
     * Creates the tag info by passing every argument unchanged to the {@link TagInfo} constructor.
     */
    public WhitespaceTolerantTagInfo(String name, int contentType, int belongsTo, boolean deprecated, boolean unique, boolean ignorePermitted) {
        super(name, contentType, belongsTo, deprecated, unique, ignorePermitted);
    }

    /**
     * Accepts the token when it is a {@link ContentNode} whose content is white space only
     * (as decided by {@link HtmlUtils#isHtmlWhitespace(String)}); any other token is judged
     * by the superclass implementation.
     *
     * @param token the token being checked
     * @return {@code true} for white-space-only content nodes, otherwise the superclass result
     */
    @Override
    boolean allowsItem(BaseToken token) {
        boolean blankContent = token instanceof ContentNode
                && HtmlUtils.isHtmlWhitespace(((ContentNode) token).getContent().toString());
        return blankContent || super.allowsItem(token);
    }
}
package org.htmlcleaner;
/**
 * Tag provider that stores a {@link WhitespaceTolerantTagInfo} copy of every
 * {@link TagInfo} handed to {@link #put(String, TagInfo)}.
 */
public class WhitespaceTolerantTagInfoProvider extends DefaultTagProvider implements ITagInfoProvider {

    /** Serialization version identifier. */
    private static final long serialVersionUID = 6349860503307965029L;

    /**
     * Registers a whitespace-tolerant copy of {@code value} under {@code key}.
     *
     * @param key the key passed on to the superclass {@code put}
     * @param value the tag info whose settings are copied
     * @return whatever the superclass {@code put} returns
     */
    @Override
    public TagInfo put(String key, TagInfo value) {
        return super.put(key, toWhitespaceTolerant(value));
    }

    /** Builds a {@link WhitespaceTolerantTagInfo} carrying the same settings as {@code source}. */
    private static WhitespaceTolerantTagInfo toWhitespaceTolerant(TagInfo source) {
        WhitespaceTolerantTagInfo copy = new WhitespaceTolerantTagInfo(
                source.getName(),
                source.getContentType(),
                source.getBelongsTo(),
                source.isDeprecated(),
                source.isUnique(),
                source.isIgnorePermitted());
        // The constructor does not take the tag relationships, so copy them one by one.
        copy.setChildTags(source.getChildTags());
        copy.setContinueAfterTags(source.getContinueAfterTags());
        copy.setCopyTags(source.getCopyTags());
        copy.setFatalTag(source.getFatalTag());
        copy.setHigherTags(source.getHigherTags());
        copy.setMustCloseTags(source.getMustCloseTags());
        copy.setPermittedTags(source.getPermittedTags());
        copy.setRequiredParent(source.getRequiredParent());
        return copy;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment