Created
July 8, 2011 15:10
-
-
Save diosmosis/1072047 to your computer and use it in GitHub Desktop.
HTML Sanitizer written in Ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'set' | |
# A SanitizationLevel holds information regarding an XML node's start tag. | |
# Levels are used to incrementally build sanitized HTML while dealing with | |
# incomplete nodes (ie, nodes that are never closed). | |
class SanitizationLevel | |
# Constructor. | |
def initialize(tag_name, sanitized, start_pos, end_pos, tagsize) | |
@tag_name = tag_name | |
@sanitized = sanitized | |
@start_pos = start_pos | |
@end_pos = end_pos | |
@tagsize = tagsize | |
@closed = false | |
end | |
# Gets the tag name of the node this level is associated with. | |
# This property cannot be set. | |
def tag_name | |
@tag_name | |
end | |
# Gets the current sanitized string for all levels below this level. This | |
# property will only be modified after a node's children have been processed. | |
# This property cannot be set. | |
def sanitized | |
@sanitized | |
end | |
# Gets the starting index of this level's node in the *pre-escaped* HTML. | |
def start_pos | |
@start_pos | |
end | |
# Gets the current end position of this level's node in the *pre-escaped* | |
# HTML. | |
def end_pos | |
@end_pos | |
end | |
# Sets the current end position of this level's node in the *pre-escaped* | |
# HTML. | |
def end_pos=(value) | |
@end_pos = value | |
end | |
# Gets the size of the start tag of this level's node. | |
# This property cannot be set. | |
def tagsize | |
@tagsize | |
end | |
# A helper function that will extract the 'extra' information of this level's | |
# node start given the pre-escaped HTML. The 'extra' information is | |
# essentially the attributes of an XML node. | |
def extra(input) | |
input[@start_pos + 4 + @tag_name.size..@start_pos + @tagsize - 5] | |
end | |
end | |
# Regex that matches XML node attributes. The following groups are created by | |
# a match: | |
# 0. The attribute key. | |
# 1. The attribute value (or nil if there is none). | |
ATTRIBUTE_REGEX = /([a-zA-Z0-9_]+)(?:\s*=\s*((?:[a-zA-Z0-9_]+)|(?:"[^"]+")|(?:'[^']+'))?)?/ | |
# Regex that matches XML node starts & node ends in escaped HTML. The following | |
# groups are created by a match: | |
# 0. the entire match string | |
# 1. the '/' in a node end, or nil f not present. used to tell if this is the | |
# start or end of a node. | |
# 2. the tag id, ie, 'html', 'body', 'input', etc. | |
# 3. everything after the tag id and before the ending > | |
# NOTE: Regexes are fragile. As such, this regex can parse node starts that use | |
# '>' in attribute values, but cannot deal with tags that spanmultiple | |
# lines. | |
NODE_TAG_REGEX = /(<(\/?)([a-zA-Z0-9_]+)((?:.(?!<)|(?:"[^"]*")|(?:'[^']*'))*)>)/ | |
# Escapes '&', '"' and '\'' characters in a string and returns the escaped | |
# string. This is executed on chunks of text that are known not to be | |
# XML tags. | |
def escape_nontags(s) | |
return s.gsub(/(?:&(?![gl]t;))|["']/, {'&' => '&', '"' => '"', "'" => '''}) | |
end | |
# Replaces escaped HTML ('<', '>', '&', '"', ''') with | |
# the unescaped equivalents. Used when a complete <pre> node is found. | |
# The supplied string is modified in-place. | |
def unescape(s) | |
s.gsub!(/&(?:(?:[lg]t)|(?:amp)|(?:quot)|(?:#39));/, | |
{'<' => '<', '>' => '>', '&' => '&', '"' => '"', ''' => "'"}) | |
end | |
# Sanitizes the attributes section of an XML node start by including attributes | |
# that are in the supplied whitelist set. | |
# The sanitized version is returned. | |
def sanitize_attributes(attributes, whitelist) | |
sanitized = '' | |
attributes.scan ATTRIBUTE_REGEX do |key, value| | |
if whitelist.include? key.downcase | |
sanitized << ' ' << key | |
sanitized << '=' << value if value | |
end | |
end | |
return sanitized | |
end | |
# The stack of sanitization levels. There will always be one level representing | |
# the document root. | |
# This class is really just a wrapper around an array. | |
class SanitizationStack | |
# Constructor. Creates the root santization level. | |
def initialize() | |
@levels = [SanitizationLevel.new('', '', -1, 0, 0)] | |
@last_closed_start = 0 | |
end | |
# Adds a sanitization level using the found tag's name, start position (in | |
# the *pre-escaped* HTML), and the size of the tag found. | |
# The sanitized string is initialized to an empty string and the level's | |
# end position is initialized to the end of the start tag. | |
def push(tag_name, start_pos, tagsize) | |
@levels << SanitizationLevel.new(tag_name, '', start_pos, start_pos + tagsize, tagsize) | |
end | |
# Returns the top level in the stack. | |
def top | |
@levels.last | |
end | |
# Searches the stack from top to bottom looking for a SanitizationLevel with | |
# the given tag name. The index of the found level is returned. Nil is | |
# returned if nothing is found. | |
def most_recent_tag_start(tag_name) | |
@levels.rindex {|x| x.tag_name == tag_name} | |
end | |
# Flattens and sanitizes a node using its associated SanitizationLevel. | |
# Every level after 'from' that has not been sanitized is assumed to | |
# be an unmatched node start and left escaped. | |
def close_node(input, from, current, attribute_whitelist) | |
result = '' | |
parent_level = @levels[from - 1] | |
this_level = @levels[from] | |
# if there's any text before 'this' node and the current known end of the | |
# parent node, escape and append it | |
if this_level.start_pos != 0 | |
in_between = input[parent_level.end_pos..this_level.start_pos-1] | |
result << escape_nontags(in_between) | |
end | |
# sanitize the attributes of 'this' node. | |
attributes = sanitize_attributes(this_level.extra(input), attribute_whitelist) | |
# output 'this' node's starting tag (unescaped) | |
result << '<' << this_level.tag_name << attributes << '>' | |
result << this_level.sanitized | |
# flatten every child of 'this' node. we treat those nodes as unmatched | |
# starts, leaving them escaped. | |
(from + [email protected]).each do |i| | |
level = @levels[i] | |
level_parent = @levels[i-1] | |
# escape and append any text between level's start and its parent's end | |
result << escape_nontags(input[level_parent.end_pos..level.start_pos-1]) | |
# escape and append the incomplete level's tag | |
result << escape_nontags(input[level.start_pos..level.start_pos + level.tagsize - 1]) | |
result << level.sanitized | |
end | |
# append the text between the last parsed tag's end position and the | |
# current position, and this level's closing tag | |
result << escape_nontags(input[@levels.last.end_pos..current-1]) \ | |
<< '</' << this_level.tag_name << '>' | |
# remove every level including and after 'from' | |
@levels[[email protected]] = [] | |
# save the new top's end_pos. this must be remembered for 'finish', but is | |
# overwritten by 'sanitize' | |
@last_closed_start = parent_level.end_pos | |
return result | |
end | |
# Finishes up the sanitization process. Deals with any remaining unmatched | |
# nodes and any text after the last node start found. | |
# The sanitized text is returned. | |
def finish(input) | |
result = '' | |
# if there is more than one level in the stack, there are levels that have | |
# not been matched. the text they cover must be escaped and appended. | |
if @levels.size > 1 | |
result << escape_nontags(input[0..@last_closed_start - 1]) | |
end | |
# append the root level's sanitized text, then escape and append escape | |
# everything after it | |
top = @levels.last | |
result << top.sanitized << escape_nontags(input[top.end_pos..input.size-1]) | |
return result | |
end | |
end | |
# Sanitizes HTML input using a Hash that maps allowed node names with sets of | |
# allowed attribute names. | |
# | |
# This is a whitelist sanitization function, so the HTML input is escaped | |
# before it is processed. This way, any dangerous node types you have not | |
# considered will be dealt with. | |
# | |
# In addition to sanitization, this function will do some mild formatting | |
# and is able to recognize and santize malformed HTML. | |
# | |
# Implementation Notes: | |
# * The sanitized string is built incrementally, thus allowing the use of | |
# a stack instead of a tree structure. This strategy is faster than | |
# creating an entire tree and more powerful than a simple regex search. | |
# * The regex used to parse XML node tags is not able to parse multi-line | |
# tags. *Tags must therefore exist on one line.* | |
def sanitize_html(input, whitelist) | |
levelstack = SanitizationStack.new | |
# escape ALL < and > characters. xml nodes with tag types that exist in the | |
# whitelist will be unescaped below. | |
input.gsub!(/[<>]/, {'<' => '<', '>' => '>'}) | |
# process everything that looks like an XML node tag | |
input.scan NODE_TAG_REGEX do |tag, slash, tag_name, extra| | |
next unless whitelist.has_key? tag_name.downcase | |
current = Regexp.last_match.offset(0).first | |
has_last_slash = (extra and extra.length > 0 and extra[-1] == '/') | |
is_start_node = (slash.empty? and not has_last_slash) | |
if is_start_node | |
# tag is a node start (ie, <abc>) | |
levelstack.push(tag_name, current, tag.size) | |
elsif has_last_slash | |
# tag is a whole node (ie, <abc/>) | |
top = levelstack.top | |
# append everything between the end of the last found tag and this tag to | |
# the parent's sanitized string | |
top.sanitized << input[top.end_pos..current-1] if current != 0 | |
# append the tag with sanitized attributes to the parent's sanitized | |
# string | |
top.sanitized << '<' << tag_name << sanitize_attributes(extra, whitelist[tag_name]) << '/>' | |
# update the ending position of the parent of the node | |
top.end_pos = current + tag.size | |
elsif not extra or extra.empty? | |
# tag is a node end (ie, </abc>) | |
# find the index of the sanitization level that this closing tag | |
# references. if none can be found, this is an *unmatched node end*, | |
# so we ignore it. | |
start_i = levelstack.most_recent_tag_start(tag_name) | |
if start_i | |
# flatten and remove the sub-tree that starts from start_i | |
flattened = levelstack.close_node(input, start_i, current, whitelist[tag_name]) | |
# if we're closing a <pre> node, unescape everything the node contains. | |
if tag_name == 'pre' | |
unescape(flattened) | |
end | |
# append the flattened string to the parent's sanitized string and | |
# update its end position. | |
top = levelstack.top | |
top.sanitized << flattened | |
top.end_pos = current + tag.size | |
end | |
end | |
end | |
# finish up: add everything after last found tag and make sure to deal with | |
# any lingering unmatched node starts | |
return levelstack.finish(input) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment