Created
September 25, 2017 13:35
-
-
Save UweKeim/2187791205263ea86fb472b216548cf0 to your computer and use it in GitHub Desktop.
Small, incomplete "port" of WordPress "wptexturize" function to .NET/C#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace ZetaProducer.RuntimeBusinessLogic.Rendering.Helper
{
    using AngleSharp.Dom;
    using AngleSharp.Parser.Html;
    using System;
    using System.Linq;
    using System.Net;
    using System.Text.RegularExpressions;

    /// <summary>
    /// Small, incomplete port of the WordPress function "wptexturize":
    /// replaces plain ASCII punctuation inside HTML text with typographic
    /// characters (quotes, em dash, ellipsis).
    /// </summary>
    public static class Texturizer
    {
        /// <summary>
        /// Modelled after the WordPress function "wptexturize".
        /// https://developer.wordpress.org/reference/functions/wptexturize
        /// https://codex.wordpress.org/Function_Reference/wptexturize
        /// Replaces certain characters.
        ///
        /// It is nowhere near as comprehensive as the WordPress function.
        ///
        /// Source code of "wptexturize":
        /// https://github.com/WordPress/WordPress/blob/master/wp-includes/formatting.php#L51
        /// </summary>
        /// <param name="text">HTML (fragment or document) to process.</param>
        /// <returns>
        /// The inner HTML of the parsed document's body after the replacements.
        /// <paramref name="text"/> is returned unchanged when it is null, empty or
        /// whitespace-only, or when the parsed document has no body element.
        /// </returns>
        public static string Texturize(string text)
        {
            if (string.IsNullOrWhiteSpace(text)) return text;

            var parser = new HtmlParser();
            var document = parser.Parse(text);

            // FirstOrDefault instead of First: a document without a <body>
            // must not raise an InvalidOperationException.
            var body = document.DocumentElement.GetElementsByTagName(@"body").FirstOrDefault();
            if (body == null) return text;

            // Only the body subtree is visited; head content is never touched.
            var it = document.CreateNodeIterator(body);

            INode node;
            while ((node = it.Next()) != null)
            {
                var before = node.NodeValue; // TODO: Is 'NodeValue' the correct property?
                if (string.IsNullOrEmpty(before)) continue;

                // The nodes that carry a value are named "#text"/"#comment", so the
                // exclusion list has to be checked against the enclosing elements,
                // not against the node itself.
                if (IsInsideNoTexturizeTag(node)) continue;

                var after = ProcessOneNode(before);
                if (after != before) node.NodeValue = after; // TODO: Is 'NodeValue' the correct property?
            }

            return body.InnerHtml;
        }

        /// <summary>
        /// Checks whether any ancestor of <paramref name="node"/> is one of the
        /// elements listed in <see cref="NoTexturizeTags"/>.
        /// </summary>
        private static bool IsInsideNoTexturizeTag(INode node)
        {
            for (var ancestor = node.Parent; ancestor != null; ancestor = ancestor.Parent)
            {
                // Ordinal, case-insensitive comparison: culture-sensitive lowering
                // (e.g. Turkish "I" -> dotless "i") would miss "SCRIPT"/"STYLE".
                if (NoTexturizeTags.Contains(ancestor.NodeName, StringComparer.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Applies the typographic replacement rules to the value of one node.
        /// </summary>
        /// <param name="content">Non-empty node value.</param>
        /// <returns>The value with the replacements applied.</returns>
        private static string ProcessOneNode(string content)
        {
            var result = content;

            // A hyphen standing alone between whitespace (or at the edges) becomes an em dash.
            result = Regex.Replace(result, @"(^|\s+)-(\s+|$)", $@"$1{EmDash}$2", RegexOptions.Singleline);

            // Exactly three dots become an ellipsis, whatever surrounds them
            // ("Wait... what" as well as "Wait...what"); longer runs of dots are left alone.
            result = Regex.Replace(result, @"(?<!\.)\.\.\.(?!\.)", Ellipsis, RegexOptions.Singleline);

            // Double quotes: opening before a word, closing after a word.
            result = Regex.Replace(result, @"(^|\s+)""(\w+)", $@"$1{OpeningDoubleQuote}$2", RegexOptions.Singleline);
            result = Regex.Replace(result, @"(\w+)""(\s+|$|[!.,;])", $@"$1{ClosingDoubleQuote}$2", RegexOptions.Singleline);

            // Single quotes follow the same rules as double quotes, including
            // the punctuation that may follow a closing quote.
            result = Regex.Replace(result, @"(^|\s+)'(\w+)", $@"$1{OpeningSingleQuote}$2", RegexOptions.Singleline);
            result = Regex.Replace(result, @"(\w+)'(\s+|$|[!.,;])", $@"$1{ClosingSingleQuote}$2", RegexOptions.Singleline);

            // TODO: Port more (all?) rules from WordPress.

            // https://github.com/AngleSharp/AngleSharp/issues/361#issuecomment-230155588
            // NOTE(review): this decodes the whole node value, not just the inserted
            // characters; text that legitimately contains entity-like sequences
            // (e.g. a literal "&lt;") is presumably decoded a second time - confirm.
            return WebUtility.HtmlDecode(result);
        }

        /// <summary>
        /// Elements whose content is left untouched.
        /// </summary>
        private static readonly string[] NoTexturizeTags =
        {
            @"pre",
            @"code",
            @"kbd",
            @"style",
            @"script",
            @"tt"
        };

        private const string OpeningDoubleQuote = @"„";
        private const string ClosingDoubleQuote = @"“";
        private const string Apostrophe = @"'";
        private const string OpeningSingleQuote = @"‚";
        private const string ClosingSingleQuote = @"‘";
        private const string EmDash = @"—";
        private const string EnDash = @"–";
        private const string Ellipsis = @"…";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment