Created
November 30, 2018 20:57
-
-
Save minaairsupport/b14b3aeb59e64ebf41dcf5a0af0fe161 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Converts HTML markup into plain text by removing tags, turning
/// <c>&lt;br&gt;</c> elements into line breaks and decoding character entities.
/// </summary>
public static class HtmlToPlainConvertor
{
    // The patterns never change, so the Regex instances are built once and reused
    // instead of being re-created on every call.

    // One or more whitespace characters (including line breaks) sitting between two tags.
    // Deliberately \s rather than \W: \W would also swallow punctuation-only text such as "<b>!!!</b>".
    private static readonly Regex TagWhiteSpaceRegex =
        new Regex(@">\s+<", RegexOptions.CultureInvariant);

    // <br>, <br/>, <br /> in any letter case.
    private static readonly Regex LineBreakRegex =
        new Regex(@"<br\s*/?>", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    // Anything from '<' up to the next '>', or up to the end of the input when the tag is never closed.
    private static readonly Regex StripFormattingRegex =
        new Regex(@"<[^>]*(>|$)", RegexOptions.CultureInvariant);

    /// <summary>
    /// Produces a plain-text rendering of the supplied HTML fragment.
    /// </summary>
    /// <param name="html">The HTML to convert. May be <c>null</c> or empty.</param>
    /// <returns>
    /// The text content of <paramref name="html"/> with tags removed, each
    /// <c>&lt;br&gt;</c> replaced by <see cref="Environment.NewLine"/> and HTML
    /// entities decoded; an empty string when <paramref name="html"/> is
    /// <c>null</c> or empty.
    /// </returns>
    public static string HtmlToPlainText(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        // Collapse whitespace that only exists to format the markup.
        var text = TagWhiteSpaceRegex.Replace(html, "><");

        // Line breaks must be converted before the generic tag stripping removes them.
        text = LineBreakRegex.Replace(text, Environment.NewLine);

        // Remove all remaining tags.
        text = StripFormattingRegex.Replace(text, string.Empty);

        // Decode entities last: decoding first would turn escaped text such as
        // "&lt;" into a real '<' that the tag stripping would then mistake for markup.
        return System.Net.WebUtility.HtmlDecode(text);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment