Created
September 23, 2010 15:39
-
-
Save steida/593816 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Collections.Generic; | |
using System.Linq; | |
using HtmlAgilityPack; | |
namespace TeamVision.Helpers | |
{ | |
/*
 * remove blacklisted
 * sanitize whitelisted attributes
 * ignore textnodes
 * unwrap others
 * map tags
 * normalize nested hierarchies
 * normalize headings
 * wrap all blockless textnodes to paragraphs
 *
 * todo: test
 *
 * sample input:
 *
 * <h1>Ahoj</h1>
 * <h5>do h2</h5>
 * <script>alert('xxs')</script>
 * <p><div>ho<b>v</b>no</div></p>
 * <strong>na b</strong><em>na i</em>
 * <cite>cituju</cite>
 * <a style='display: none' href=google.com>foo</a>
 * <scrip>alert('xxs')</scrip>
 * <!--fuck-->
 *
 * expected output:
 *
 * <h1>Ahoj</h1>
 * <h2>do h2</h2>
 * <p>ho<b>v</b>no</p>
 * <b>na b</b><i>na i</i>
 * cituju
 * <a href="google.com">foo</a>
 * alert('xxs')
 *
 */
/// <summary>
/// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
/// Whitelisted elements are kept (with only their allowed attributes),
/// blacklisted elements are dropped together with their content, every other
/// element is replaced by its children, and non-text, non-element nodes
/// (for example comments) are removed.
/// </summary>
public static class HtmlSanitizer
{
    // Allowed element names mapped to the attribute names each may keep.
    // A null value means the element may not keep any attribute.
    static readonly IDictionary<string, string[]> whitelist = new Dictionary<string, string[]>
    {
        { "h1", null },
        { "h2", null },
        { "h3", null },
        { "h4", null },
        { "h5", null },
        { "h6", null },
        { "a", new[] { "href" } },
        { "img", new[] { "src", "title" } },
        { "strong", null },
        { "b", null },
        { "em", null },
        { "i", null },
        { "p", null },
        { "blockquote", null },
        { "ul", null },
        { "ol", null },
        { "li", null },
        { "div", null },
        { "sub", null },
        { "sup", null },
        { "table", null },
        { "tr", null },
        { "td", null },
        { "th", null }
    };

    // Elements that are removed together with everything inside them.
    static readonly string[] blacklist = new string[]
    {
        "script"
    };

    // Whitelisted elements that are renamed to another tag.
    static readonly IDictionary<string, string> map = new Dictionary<string, string>
    {
        { "strong", "b" },
        { "em", "i" },
        { "div", "p" },
        { "h4", "h3" },
        { "h5", "h3" },
        { "h6", "h3" }
    };

    // Elements that count as a block container for text nodes.
    static readonly string[] blocks = new string[]
    {
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "p",
        "blockquote",
        "li",
        "td",
        "th"
    };

    // Whitelisted attributes whose value is a URL and must pass IsSafeUrl.
    static readonly string[] urlAttributes = new string[]
    {
        "href",
        "src"
    };

    // URL schemes accepted in URL attributes (compared in lower case).
    static readonly string[] safeSchemes = new string[]
    {
        "http",
        "https",
        "mailto",
        "ftp"
    };

    // Characters that end the part of a URL in which a scheme could appear.
    static readonly char[] schemeTerminators = new char[] { '/', '?', '#' };

    /// <summary>
    /// Sanitizes an HTML fragment and returns the cleaned, trimmed markup.
    /// </summary>
    /// <param name="str">HTML to sanitize; may be null or empty.</param>
    /// <returns>The sanitized markup, or an empty string for null or empty input.</returns>
    public static string Sanitize(this string str)
    {
        // LoadHtml rejects null, and there is nothing to sanitize in an empty string.
        if (string.IsNullOrEmpty(str))
        {
            return string.Empty;
        }

        var doc = new HtmlDocument();
        doc.OptionFixNestedTags = true;
        doc.LoadHtml(str);
        Sanitize(doc);
        return doc.DocumentNode.WriteTo().Trim();
    }

    // Runs the three sanitizing passes, in order, on a loaded document.
    static void Sanitize(HtmlDocument doc)
    {
        ProcessListed(doc);
        UnwrapDoubleNested(doc);
        WrapBlocklessTextNodes(doc);
    }

    // Applies the whitelist / blacklist / map rules to every node of the document.
    static void ProcessListed(HtmlDocument doc)
    {
        // Collect all nodes so that each node is listed after its parent.
        var ordered = new List<HtmlNode>();
        var pending = new Stack<HtmlNode>();
        pending.Push(doc.DocumentNode);
        while (pending.Count > 0)
        {
            foreach (var child in pending.Pop().ChildNodes)
            {
                ordered.Add(child);
                pending.Push(child);
            }
        }

        // Walk the list backwards so every node is handled after all of its
        // descendants. Unwrap inserts deep *clones* of a node's children into
        // the document; handling the parent first would leave those clones
        // unsanitized (e.g. a <script> nested in a non-whitelisted element
        // would survive).
        for (var i = ordered.Count - 1; i >= 0; i--)
        {
            ProcessNode(ordered[i]);
        }
    }

    // Applies the listing rules to a single node.
    static void ProcessNode(HtmlNode node)
    {
        string mappedName;
        switch (node.NodeType)
        {
            case HtmlNodeType.Element:
                if (whitelist.ContainsKey(node.Name))
                {
                    node.SanitizeAttributes();
                    if (map.TryGetValue(node.Name, out mappedName))
                    {
                        node.ChangeTag(mappedName);
                    }
                }
                else if (blacklist.Contains(node.Name))
                {
                    node.Remove();
                }
                else
                {
                    node.Unwrap();
                }
                break;
            case HtmlNodeType.Text:
                // Text is kept as it is.
                break;
            default:
                // Anything that is neither text nor an element is dropped.
                node.Remove();
                break;
        }
    }

    // Repeatedly unwraps elements that sit directly inside an element of the
    // same name until no such pair is left.
    static void UnwrapDoubleNested(HtmlDocument doc)
    {
        while (true)
        {
            // The node list is rebuilt on every round because Unwrap replaces
            // the children of the unwrapped node with clones.
            var nested = GetAll(doc).FirstOrDefault(n =>
                n.ParentNode != null &&
                n.ParentNode.Name == n.Name
            );
            if (nested == null)
            {
                break;
            }
            nested.Unwrap();
        }
    }

    // Wraps every text node that has no block-level ancestor into a <p>.
    // NOTE: the text node itself is wrapped, so for text inside an inline
    // element the new <p> ends up inside that inline element.
    static void WrapBlocklessTextNodes(HtmlDocument doc)
    {
        foreach (var node in GetAll(doc))
        {
            if (node.NodeType != HtmlNodeType.Text)
            {
                continue;
            }

            // Whitespace between tags must not be turned into empty paragraphs
            // (that would also put <p> directly inside table / tr / ul / ol).
            if (node.InnerText.Trim().Length == 0)
            {
                continue;
            }

            if (node.HasBlockParent())
            {
                continue;
            }

            var p = node.OwnerDocument.CreateElement("p");
            node.ParentNode.ReplaceChild(p, node);
            p.AppendChild(node);
        }
    }

    // Returns a snapshot of all nodes below the document node.
    static IEnumerable<HtmlNode> GetAll(HtmlDocument doc)
    {
        return doc.DocumentNode.ChildNodes.Flatten(ch => ch.ChildNodes).ToArray();
    }

    /// <summary>
    /// Removes every attribute the whitelist does not allow for this element.
    /// URL attributes (href, src) are additionally removed when their value
    /// is not a safe URL. Elements that are not whitelisted, or whose
    /// whitelist entry is null, lose all attributes.
    /// </summary>
    /// <param name="node">Element whose attributes are filtered in place.</param>
    public static void SanitizeAttributes(this HtmlNode node)
    {
        if (!node.HasAttributes)
        {
            return;
        }

        // TryGetValue leaves "allowed" null for elements missing from the whitelist.
        string[] allowed;
        whitelist.TryGetValue(node.Name, out allowed);

        // Iterate backwards because attributes are removed while looping.
        for (var i = node.Attributes.Count - 1; i >= 0; i--)
        {
            var attribute = node.Attributes[i];
            var keep = allowed != null
                && allowed.Contains(attribute.Name)
                && (!urlAttributes.Contains(attribute.Name) || IsSafeUrl(attribute.Value));
            if (!keep)
            {
                node.Attributes.Remove(attribute);
            }
        }
    }

    // Tells whether a URL attribute value is relative or uses an allowed scheme.
    // Blocks javascript:, data:, vbscript: and any other scheme not in safeSchemes.
    static bool IsSafeUrl(string value)
    {
        if (value == null)
        {
            return true;
        }

        var url = value.Trim();

        // A scheme can only appear before the first '/', '?' or '#'.
        var end = url.IndexOfAny(schemeTerminators);
        var head = end < 0 ? url : url.Substring(0, end);

        // A character reference in the scheme position could hide a colon or
        // part of a scheme name (e.g. "javascript&colon;..."); reject it
        // instead of trying to decode it.
        if (head.IndexOf('&') >= 0)
        {
            return false;
        }

        var colon = head.IndexOf(':');
        if (colon < 0)
        {
            // No scheme at all: a relative URL.
            return true;
        }

        return safeSchemes.Contains(head.Substring(0, colon).ToLowerInvariant());
    }

    /// <summary>
    /// Replaces the node with a new element named <paramref name="tagName"/>
    /// that takes over the node's children. Attributes are not copied.
    /// </summary>
    /// <param name="node">Element to replace; must have a parent.</param>
    /// <param name="tagName">Name of the replacement element.</param>
    public static void ChangeTag(this HtmlNode node, string tagName)
    {
        // A new element is created because moving children directly
        // (node.ParentNode.InsertBefore(node.FirstChild, node)) fails with
        // System.InvalidProgramException: Unexpected error.
        var replacement = node.OwnerDocument.CreateElement(tagName);

        // Iterate a snapshot so re-parenting cannot disturb the enumeration.
        foreach (var child in node.ChildNodes.ToArray())
        {
            replacement.AppendChild(child);
        }
        node.ParentNode.ReplaceChild(replacement, node);
    }

    /// <summary>
    /// Replaces the node with its children. The inserted children are deep
    /// clones; the original child nodes stay attached to the removed node.
    /// </summary>
    /// <param name="node">Node to unwrap; must have a parent.</param>
    public static void Unwrap(this HtmlNode node)
    {
        // Clones are inserted because moving the children directly
        // (node.ParentNode.InsertBefore(node.FirstChild, node)) fails with
        // System.InvalidProgramException: Unexpected error.
        var clone = node.Clone();
        foreach (var child in clone.ChildNodes)
        {
            node.ParentNode.InsertBefore(child, node);
        }
        node.Remove();
    }

    /// <summary>
    /// Tells whether any ancestor of the node is one of the block elements.
    /// </summary>
    /// <param name="node">Node whose ancestors are inspected.</param>
    /// <returns>True when a block ancestor exists, otherwise false.</returns>
    public static bool HasBlockParent(this HtmlNode node)
    {
        for (var parent = node.ParentNode; parent != null; parent = parent.ParentNode)
        {
            if (blocks.Contains(parent.Name))
            {
                return true;
            }
        }
        return false;
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment