Created
June 14, 2010 13:14
-
-
Save DanielJomphe/437660 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Applies a very default Google Caja HTML cajoling pipeline. | |
* <p/> | |
* There's three things we might not like about this naive, default implementation. | |
* <ol> | |
* <li>It rewrites URLs by prefixing them and suffixing them with stuff.</li> | |
* <li>It uses Google Caja's default whitelists. They accept all valid markup that's not deemed insecure. We might prefer to provide more restrictive whitelists.</li> | |
* <li>It will be slow in an app where there's many users concurrently using it. This is not the case for our first client's needs, so we don't care for now.</li> | |
* </ol> | |
*/ | |
public class MyCaja { | |
/** | |
* Applies a whitelist to all html&css contents in the provided <code>weirdHtml</code>, and completely removes any kind of javascript found in it. | |
* @param weirdHtml The markup to sanitize. | |
* @param charsetName The charset to use throughout. If ever you'd like the output to be in a different charset than this one, take a look at this method's code to see what parameter you may add. | |
* @return The sanitized version of <code>weirdHtml</code>. | |
* @throws MyCajaException if <code>sanitize/<code> couldn't do its job for any reason. | |
* @see MyCaja | |
*/ | |
public static String sanitize(String weirdHtml, String charsetName) throws MyCajaException { | |
final EchoingMessageQueue mq = new EchoingMessageQueue(new PrintWriter(System.out), new MessageContext(), false); | |
BuildInfo.getInstance().addBuildInfo(mq); | |
String htmlAndJs; | |
try { | |
htmlAndJs = cajoleHtml(weirdHtml, charsetName, mq); | |
} catch (final Exception e) { | |
throw new MyCajaException(); | |
} | |
/* The objective in our way of using Caja was to write as less custom code as possible; KISS. | |
* I feel Caja is still a moving target, so the less we custom-write, the less we'll have to maintain. | |
* Due to this decision, what we get out of htmlHandler.apply(...) is both html and a very small javascript function. | |
* I don't believe this superfluous javascript handling comes with too big of a performance hit for this client's needs, but if it does, | |
* we might want to try to inline HtmlHandler's class here and remove the superfluous javascript handling. | |
*/ | |
final String html = htmlAndJs.split("<script[^>]*>")[0]; | |
return html; | |
} | |
private static String cajoleHtml(String weirdHtml, String charsetName, final EchoingMessageQueue mq) throws IOException, UnsupportedContentTypeException { | |
final InputSource inputSource = InputSource.UNKNOWN; | |
final String contentType = ContentType.HTML.mimeType; | |
final FetchedData in = FetchedData.fromReader(new StringReader(weirdHtml), inputSource, contentType, charsetName); | |
// http://daniel.com: if there was a constructor that accepts an UriPolicy, that would be great. We don't need to rewrite URLs. | |
final HtmlHandler htmlHandler = new HtmlHandler(BuildInfo.getInstance(), "http://daniel.com", UriFetcher.NULL_NETWORK); | |
final ByteArrayOutputStream out = new ByteArrayOutputStream(); | |
htmlHandler.apply( | |
inputSource.getUri(), | |
null,//Transform.CAJOLE, | |
null,//Lists.newArrayList(Directive.CAJITA), | |
new NullContentHandlerArgs(), | |
contentType, contentType, | |
new StrictContentTypeCheck(), | |
in, out, | |
mq); | |
return out.toString(charsetName); | |
} | |
public static void main(String[] args) throws IOException, MyCajaException { | |
System.out.println(sanitize("", "ISO-8859-1")); | |
System.out.println(sanitize(FileUtils.readFileToString(new File("C:/input.html")), "ISO-8859-1")); | |
} | |
/** | |
* HtmlHandler only uses it to check arg MODULE_CALLBACK, which we don't care about. Each time this code is maintained, please make sure HtmlHandler didn't start using more args. | |
*/ | |
protected static class NullContentHandlerArgs extends ContentHandlerArgs { | |
@Override | |
public String get(String name) { | |
return null; | |
} | |
} | |
public static class MyCajaException extends Exception { | |
private static final long serialVersionUID = 0L; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment