-
-
Save rarous/593822 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Web; | |
using Matters.Helpers; | |
using HtmlAgilityPack; | |
namespace Matters.Helpers
{
    /*
     * Processing steps:
     * remove blacklisted elements
     * sanitize attributes of whitelisted elements
     * ignore text nodes
     * unwrap all other elements
     * map tags
     * normalize nested hierarchies
     * normalize headings
     */

    /// <summary>
    /// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
    /// Whitelisted elements are kept (with only their whitelisted attributes),
    /// blacklisted elements are dropped together with their content, and every
    /// other element is replaced by its own children.
    /// </summary>
    public static class HtmlSanitizer
    {
        // Allowed element names mapped to the attribute names that may stay on them.
        // A null value means that no attribute is kept.
        static readonly IDictionary<string, string[]> whitelist = new Dictionary<string, string[]>
        {
            { "h1", null },
            { "h2", null },
            { "h3", null },
            { "h4", null },
            { "h5", null },
            { "h6", null },
            { "a", new[] { "href" } },
            { "img", new[] { "src", "title" } },
            { "strong", null },
            { "b", null },
            { "em", null },
            { "i", null },
            { "p", null },
            { "blockquote", null },
            { "ul", null },
            { "ol", null },
            { "li", null },
            { "div", null },
            { "sub", null },
            { "sup", null },
            { "table", null },
            { "tr", null },
            { "td", null },
            { "th", null }
        };

        // Elements that are removed together with everything inside them.
        // "style" is listed because unwrapping it would leave its stylesheet
        // source behind as text in the output.
        static readonly string[] blacklist = new string[] {
            "script",
            "style"
        };

        // Whitelisted element name -> element name it is rewritten to.
        static readonly IDictionary<string, string> map = new Dictionary<string, string>
        {
            { "strong", "b" },
            { "em", "i" },
            { "div", "p" },
            { "h3", "h2" },
            { "h4", "h2" },
            { "h5", "h2" },
            { "h6", "h2" }
        };

        // Whitelisted attributes whose value is a URL and therefore needs a scheme check.
        static readonly string[] urlAttributes = new[] { "href", "src" };

        // URL schemes that may appear in a URL attribute. Values without a scheme are allowed too.
        static readonly string[] safeSchemes = new[] { "http", "https", "ftp", "mailto" };

        // Characters that decide how the beginning of a URL value is interpreted.
        static readonly char[] urlDelimiters = new[] { ':', '/', '?', '#', '&' };

        /// <summary>
        /// Sanitizes an HTML fragment.
        /// </summary>
        /// <param name="str">HTML markup to sanitize; may be null or empty.</param>
        /// <returns>
        /// The sanitized markup with surrounding whitespace trimmed, or an empty
        /// string when <paramref name="str"/> is null or empty.
        /// </returns>
        public static string Sanitize(this string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return string.Empty;
            }

            var doc = new HtmlDocument();
            doc.OptionFixNestedTags = true;
            doc.LoadHtml(str);
            Sanitize(doc);
            return doc.DocumentNode.WriteTo().Trim();
        }

        /// <summary>
        /// Runs all sanitizing passes over <paramref name="doc"/> in place.
        /// </summary>
        static void Sanitize(HtmlDocument doc)
        {
            ProcessListed(doc);
            // Must run after ProcessListed: tag mapping can create new same-name nesting.
            UnwrapDoubleNested(doc);
        }

        /// <summary>
        /// Applies the whitelist, blacklist and tag map to every node of the document.
        /// Text nodes are left alone; any other non-element node is removed.
        /// </summary>
        static void ProcessListed(HtmlDocument doc)
        {
            foreach (var node in GetAll(doc))
            {
                switch (node.NodeType)
                {
                    case HtmlNodeType.Element:
                        if (whitelist.ContainsKey(node.Name))
                        {
                            node.SanitizeAttributes();

                            string mappedName;
                            if (map.TryGetValue(node.Name, out mappedName))
                            {
                                node.ChangeTag(mappedName);
                            }
                        }
                        else if (blacklist.Contains(node.Name))
                        {
                            node.Remove();
                        }
                        else
                        {
                            node.Unwrap();
                        }
                        break;

                    case HtmlNodeType.Text:
                        break;

                    default:
                        node.Remove();
                        break;
                }
            }
        }

        /// <summary>
        /// Unwraps every element whose direct parent has the same name,
        /// e.g. a "b" element directly inside another "b" element.
        /// </summary>
        static void UnwrapDoubleNested(HtmlDocument doc)
        {
            // A single pass over the snapshot is enough: unwrapping a node hands its
            // children to a parent with the very same name, so whether any other node
            // is "nested in a same-named parent" does not change while we go.
            foreach (var node in GetAll(doc))
            {
                if (node.ParentNode != null && node.ParentNode.Name == node.Name)
                {
                    node.Unwrap();
                }
            }
        }

        /// <summary>
        /// Returns a snapshot (array) of all nodes below the document node, so the
        /// tree can be modified while the result is being enumerated.
        /// </summary>
        static IEnumerable<HtmlNode> GetAll(HtmlDocument doc)
        {
            return doc.DocumentNode.ChildNodes.Flatten(ch => ch.ChildNodes).ToArray();
        }

        /// <summary>
        /// Removes every attribute of <paramref name="node"/> that is not whitelisted
        /// for its element name. URL attributes (href, src) are additionally removed
        /// when their value uses a scheme outside the safe list.
        /// </summary>
        /// <param name="node">
        /// Node to clean. When its name is not in the whitelist, all attributes are removed.
        /// </param>
        public static void SanitizeAttributes(this HtmlNode node)
        {
            if (!node.HasAttributes)
            {
                return;
            }

            // Stays null when the element is unknown or has no allowed attributes.
            string[] allowed;
            whitelist.TryGetValue(node.Name, out allowed);

            // Walk backwards so removals do not shift the indexes still to be visited.
            for (var i = node.Attributes.Count - 1; i >= 0; i--)
            {
                var attribute = node.Attributes[i];
                var keep = allowed != null
                    && allowed.Contains(attribute.Name)
                    && (!urlAttributes.Contains(attribute.Name) || IsSafeUrl(attribute.Value));

                if (!keep)
                {
                    node.Attributes.Remove(attribute);
                }
            }
        }

        /// <summary>
        /// Decides whether a URL attribute value may be kept: it must either have
        /// no scheme at all or use one of the schemes in the safe list.
        /// </summary>
        /// <remarks>
        /// Best-effort check, not a full URL parser.
        /// NOTE(review): confirm the safe scheme list matches what the application needs.
        /// </remarks>
        static bool IsSafeUrl(string value)
        {
            // Compare against what results after character references are decoded,
            // with whitespace and control characters dropped so they cannot hide a scheme.
            var decoded = HttpUtility.HtmlDecode(value) ?? string.Empty;
            var compact = new string(decoded.Where(c => c > ' ').ToArray());

            var index = compact.IndexOfAny(urlDelimiters);
            if (index < 0)
            {
                return true;
            }

            switch (compact[index])
            {
                case ':':
                    // Everything in front of the first ':' is the scheme.
                    return safeSchemes.Contains(compact.Substring(0, index).ToLowerInvariant());

                case '&':
                    // Something that still looks like a character reference sits where
                    // a scheme would be; reject rather than guess how it will be read.
                    return false;

                default:
                    // '/', '?' or '#' comes first: a relative reference without a scheme.
                    return true;
            }
        }

        /// <summary>
        /// Replaces <paramref name="node"/> in its parent with a new element named
        /// <paramref name="tagName"/> that takes over the node's children.
        /// Attributes of the original node are not carried over.
        /// </summary>
        /// <param name="node">Node to replace; must have a parent.</param>
        /// <param name="tagName">Name of the replacement element.</param>
        /// <exception cref="InvalidOperationException">The node has no parent.</exception>
        public static void ChangeTag(this HtmlNode node, string tagName)
        {
            var parent = node.ParentNode;
            if (parent == null)
            {
                throw new InvalidOperationException("Cannot change the tag of a node that has no parent.");
            }

            var replacement = node.OwnerDocument.CreateElement(tagName);

            // Iterate over a copy so that re-parenting cannot disturb the enumeration.
            foreach (var child in node.ChildNodes.ToArray())
            {
                replacement.AppendChild(child);
            }

            parent.ReplaceChild(replacement, node);
        }

        /// <summary>
        /// Replaces <paramref name="node"/> with its own children, keeping their order.
        /// </summary>
        /// <param name="node">Node to unwrap; must have a parent.</param>
        /// <exception cref="InvalidOperationException">The node has no parent.</exception>
        public static void Unwrap(this HtmlNode node)
        {
            var parent = node.ParentNode;
            if (parent == null)
            {
                throw new InvalidOperationException("Cannot unwrap a node that has no parent.");
            }

            // The children themselves are moved (not clones of them), so HtmlNode
            // references collected earlier by GetAll keep pointing at nodes that are
            // part of the document and still get processed by the running pass.
            // Each child is inserted exactly once, taken from a copy of the child list.
            foreach (var child in node.ChildNodes.ToArray())
            {
                parent.InsertBefore(child, node);
            }

            node.Remove();
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment