Skip to content

Instantly share code, notes, and snippets.

@Dynyx
Created June 4, 2012 13:28
Show Gist options
  • Save Dynyx/2868380 to your computer and use it in GitHub Desktop.
Save Dynyx/2868380 to your computer and use it in GitHub Desktop.
User input sanitization
// Uses Tidy.Net
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Security.Application;
using TidyNet;
namespace MyApp
{
public class HtmlUtility
{
/// <summary>
/// Matches one opening or closing tag and exposes its parts as named groups:
/// tag_start (the opening bracket, including the slash of a closing tag), tag (the element name),
/// attribute (one capture per name=value pair) and tag_end (the closing bracket, optionally
/// preceded by a slash).
/// </summary>
/// <remarks>
/// An attribute is only recognised as a \w+ name followed by "=" and a double-quoted,
/// single-quoted or unquoted value. If anything else sits between the element name and the
/// closing bracket (for example a hyphenated attribute name, or an attribute without a value),
/// the expression does not match that tag at all.
/// NOTE(review): the whitespace loop "(\s+ ...)+\s*" nests quantifiers over the same characters,
/// which looks prone to heavy backtracking on a long whitespace run that is not followed by a
/// closing bracket - confirm inputs are normalised before they reach this expression.
/// </remarks>
private static readonly Regex HtmlTagExpression = new Regex(@"
(?'tag_start'</?)
(?'tag'\w+)((\s+
(?'attribute'(\w+)(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+)))?)+\s*|\s*)
(?'tag_end'/?>)",
RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled);
/// <summary>
/// Splits a single name=value attribute into the named groups attribute (the \w+ name) and
/// value (the text between the quotes, without the quotes themselves).
/// </summary>
/// <remarks>
/// Only double-quoted and single-quoted values are matched; an unquoted value produces no match.
/// </remarks>
private static readonly Regex HtmlAttributeExpression = new Regex(@"
(?'attribute'\w+)
(\s*=\s*)
(""(?'value'.*?)""|'(?'value'.*?)')",
RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.Compiled);
/// <summary>
/// Whitelist used by the sanitizer: maps each permitted element name to the list of attribute
/// names that may be kept on that element.
/// </summary>
/// <remarks>
/// The dictionary is created without a comparer, so element names are looked up case-sensitively,
/// and the attribute lists are compared case-sensitively as well; every entry is lower-case.
/// NOTE(review): "style" is permitted on every element - confirm that the code emitting the
/// attribute constrains the CSS it lets through.
/// </remarks>
private static readonly Dictionary<string, List<string>> ValidHtmlTags = new Dictionary<string, List<string>>
{
// block-level text containers
{"p", new List<string> {"style", "class", "align"}},
{"div", new List<string> {"style", "class", "align"}},
{"span", new List<string> {"style", "class"}},
{"br", new List<string> {"style", "class"}},
{"hr", new List<string> {"style", "class"}},
{"label", new List<string> {"style", "class"}},
// headings
{"h1", new List<string> {"style", "class"}},
{"h2", new List<string> {"style", "class"}},
{"h3", new List<string> {"style", "class"}},
{"h4", new List<string> {"style", "class"}},
{"h5", new List<string> {"style", "class"}},
{"h6", new List<string> {"style", "class"}},
// inline formatting
{"font", new List<string> {"style", "class", "color", "face", "size"}},
{"strong", new List<string> {"style", "class"}},
{"b", new List<string> {"style", "class"}},
{"em", new List<string> {"style", "class"}},
{"i", new List<string> {"style", "class"}},
{"u", new List<string> {"style", "class"}},
{"strike", new List<string> {"style", "class"}},
// lists, quotes and code
{"ol", new List<string> {"style", "class"}},
{"ul", new List<string> {"style", "class"}},
{"li", new List<string> {"style", "class"}},
{"blockquote", new List<string> {"style", "class"}},
{"code", new List<string> {"style", "class"}},
// links and images: the only entries that allow URL-carrying attributes (href, src)
{"a", new List<string> {"style", "class", "href", "title", "target"}},
{"img", new List<string> {"style", "class", "src", "height", "width", "alt", "title", "hspace", "vspace", "border"}},
// tables
{"table", new List<string> {"style", "class"}},
{"thead", new List<string> {"style", "class"}},
{"tbody", new List<string> {"style", "class"}},
{"tfoot", new List<string> {"style", "class"}},
{"th", new List<string> {"style", "class", "scope"}},
{"tr", new List<string> {"style", "class"}},
{"td", new List<string> {"style", "class", "colspan"}},
// quotations, abbreviations and edit marks
{"q", new List<string> {"style", "class", "cite"}},
{"cite", new List<string> {"style", "class"}},
{"abbr", new List<string> {"style", "class"}},
{"acronym", new List<string> {"style", "class"}},
{"del", new List<string> {"style", "class"}},
{"ins", new List<string> {"style", "class"}}
};
/// <summary>
/// Sanitizes untrusted markup. The input is first normalised by TidyHtml; every tag matched by
/// HtmlTagExpression is then either HTML-encoded (element not in ValidHtmlTags) or rebuilt with
/// only the attributes MatchHtmlAttribute accepts. Opening "a" tags get rel="nofollow" appended.
/// </summary>
/// <remarks>
/// Angle brackets found outside the matched tags are encoded as well. HtmlTagExpression does not
/// match every tag-like construct (for example a tag carrying a hyphenated attribute name), and
/// anything it skips would otherwise be copied into the result as live markup.
/// </remarks>
/// <param name="input">The untrusted markup; may be null or empty.</param>
/// <returns>
/// The sanitized markup; an empty string for null or empty input; or the HTML-encoded input when
/// tidying yields no content.
/// </returns>
public static string RemoveInvalidHtmlTags(string input)
{
    // Nothing to sanitize. This also keeps a null reference away from the UTF-8 encoder,
    // which would throw on it.
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    var html = TidyHtml(input);
    if (string.IsNullOrEmpty(html))
    {
        return AntiXss.HtmlEncode(input);
    }

    var result = new StringBuilder(html.Length);
    var position = 0;
    foreach (Match match in HtmlTagExpression.Matches(html))
    {
        // Text between two recognised tags: neutralise any bracket left in it.
        AppendWithEscapedBrackets(result, html, position, match.Index);
        result.Append(SanitizeTag(match));
        position = match.Index + match.Length;
    }
    AppendWithEscapedBrackets(result, html, position, html.Length);
    return result.ToString();
}

/// <summary>
/// Rewrites one tag matched by HtmlTagExpression: encodes it when the element is not whitelisted,
/// otherwise rebuilds it from its whitelisted attributes.
/// </summary>
private static string SanitizeTag(Match match)
{
    var tagStart = match.Groups["tag_start"];
    var tagEnd = match.Groups["tag_end"];
    var tag = match.Groups["tag"].Value;
    var opening = tagStart.Success ? tagStart.Value : "<";
    var closing = tagEnd.Success ? tagEnd.Value : ">";

    if (!ValidHtmlTags.ContainsKey(tag))
    {
        // Attributes of a rejected tag are dropped; only the bare tag is shown as text.
        return AntiXss.HtmlEncode(opening + tag + closing);
    }

    var builder = new StringBuilder(match.Length);
    builder.Append(opening);
    builder.Append(tag);
    foreach (Capture attribute in match.Groups["attribute"].Captures)
    {
        builder.Append(MatchHtmlAttribute(tag, attribute));
    }

    // add nofollow to all hyperlinks (opening tags only)
    if (tagStart.Success && tagStart.Value == "<" && tag.Equals("a", StringComparison.OrdinalIgnoreCase))
    {
        builder.Append(" rel=\"nofollow\"");
    }

    builder.Append(closing);
    return builder.ToString();
}

/// <summary>
/// Appends source[start..end) to target, writing "&amp;lt;" and "&amp;gt;" in place of angle brackets.
/// </summary>
private static void AppendWithEscapedBrackets(StringBuilder target, string source, int start, int end)
{
    for (var index = start; index < end; index++)
    {
        var current = source[index];
        switch (current)
        {
            case '<':
                target.Append("&lt;");
                break;
            case '>':
                target.Append("&gt;");
                break;
            default:
                target.Append(current);
                break;
        }
    }
}
/// <summary>
/// Rebuilds a single attribute of a whitelisted tag, or drops it.
/// </summary>
/// <remarks>
/// "src" and "href" are kept only when the value parses as an absolute http or https URI; the
/// normalised URI is emitted. A substring test for "http://" is not sufficient here, because a
/// value such as javascript:alert('http://x') contains that text too, and URL-encoding a complete
/// URL would also escape its ":" and "/" separators.
/// NOTE(review): "style" values are emitted without inspecting the CSS they contain - confirm
/// that this is acceptable for the browsers being targeted.
/// </remarks>
/// <param name="tag">Element name, used to look up the allowed attributes in ValidHtmlTags.</param>
/// <param name="capture">One name=value capture taken from HtmlTagExpression.</param>
/// <returns>
/// The attribute rendered as name="value" with a leading space, or an empty string when the
/// attribute cannot be parsed, is not whitelisted for the tag, or carries a rejected URL.
/// </returns>
private static string MatchHtmlAttribute(string tag, Capture capture)
{
    var match = HtmlAttributeExpression.Match(capture.Value);
    if (!match.Success)
    {
        // Unquoted values are not matched by HtmlAttributeExpression and are dropped.
        return string.Empty;
    }

    var attribute = match.Groups["attribute"].Value;
    var value = match.Groups["value"].Value;

    List<string> allowedAttributes;
    if (!ValidHtmlTags.TryGetValue(tag, out allowedAttributes) || !allowedAttributes.Contains(attribute))
    {
        return string.Empty;
    }

    if (attribute == "src" || attribute == "href")
    {
        Uri uri;
        var isWebUri = Uri.TryCreate(value, UriKind.Absolute, out uri)
            && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps);
        if (!isWebUri)
        {
            return string.Empty;
        }

        value = uri.AbsoluteUri;
    }

    return string.Format(" {0}=\"{1}\"", attribute, EscapeAttributeValue(value));
}

/// <summary>
/// Makes a value safe to place between double quotes: a single-quoted source value may itself
/// contain a double quote, which would otherwise end the attribute early.
/// </summary>
private static string EscapeAttributeValue(string value)
{
    // Ampersands are left untouched so that entities already present in the value are not
    // encoded a second time (presumably Tidy has escaped them - QuoteAmpersand is enabled).
    return value
        .Replace("\"", "&quot;")
        .Replace("<", "&lt;")
        .Replace(">", "&gt;");
}
/// <summary>
/// Runs the text through Tidy, configured for strict XHTML with UTF-8 input and output, and
/// passes the generated document to RemoveTidyAdditions.
/// </summary>
/// <param name="text">The markup to normalise; may be null or empty.</param>
/// <returns>
/// Whatever RemoveTidyAdditions extracts from Tidy's output, or an empty string when
/// <paramref name="text"/> is null or empty.
/// </returns>
private static string TidyHtml(string text)
{
    // Encoding.UTF8.GetBytes throws on null, and there is nothing to tidy in an empty string.
    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }

    var doc = new Tidy();
    doc.Options.DocType = DocType.Strict;
    doc.Options.Xhtml = true;
    doc.Options.CharEncoding = CharEncoding.UTF8;
    doc.Options.LogicalEmphasis = true;
    doc.Options.MakeClean = false;
    doc.Options.SmartIndent = false;
    doc.Options.IndentContent = false;
    doc.Options.TidyMark = false;
    doc.Options.DropFontTags = false;
    doc.Options.QuoteAmpersand = true;
    doc.Options.DropEmptyParas = true;

    // Parse requires a message collection; its contents are not inspected here.
    var messages = new TidyMessageCollection();

    using (var input = new MemoryStream(Encoding.UTF8.GetBytes(text)))
    using (var output = new MemoryStream())
    {
        doc.Parse(input, output, messages);
        return RemoveTidyAdditions(Encoding.UTF8.GetString(output.ToArray()));
    }
}
/// <summary>
/// Extracts the markup between the first "body" opening tag (written without attributes) and the
/// following "body" closing tag, and flattens it onto a single line.
/// </summary>
/// <remarks>
/// Each run of carriage returns, line feeds and tabs is replaced by one space rather than removed:
/// deleting a line break outright would fuse the two tokens it separated (two words of text, or
/// two attributes of a tag). Leading and trailing whitespace is trimmed from the result.
/// </remarks>
/// <param name="text">A complete document as produced by Tidy; may be null or empty.</param>
/// <returns>
/// The flattened body content, or an empty string when the text is null or empty or no body
/// element is found.
/// </returns>
private static string RemoveTidyAdditions(string text)
{
    const string bodyOpen = "<body>";
    const string bodyClose = "</body>";

    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }

    // Ordinal search: these are markup tokens, not culture-sensitive text.
    var start = text.IndexOf(bodyOpen, StringComparison.Ordinal);
    if (start == -1)
    {
        return string.Empty;
    }

    var contentStart = start + bodyOpen.Length;
    var end = text.IndexOf(bodyClose, contentStart, StringComparison.Ordinal);
    if (end == -1)
    {
        return string.Empty;
    }

    var body = text.Substring(contentStart, end - contentStart);
    return Regex.Replace(body, "[\r\n\t]+", " ").Trim();
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment