Last active
April 9, 2024 15:07
-
-
Save ArtemAvramenko/e9a2ae1c814450c701b1fc270caf37fd to your computer and use it in GitHub Desktop.
Convert HTML to plain text in C#
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Patterns applied in order by HtmlToPlainText; every match is replaced with a single space.
// Order matters: whole script/style/comment blocks must be removed before the generic tag
// pattern runs, otherwise only their delimiters would be stripped and their contents kept.
private static readonly Regex[] _htmlReplaces = new[] {
    // <script ...> ... </script> including the body. Tag names are case-insensitive in HTML,
    // and a closing tag may carry whitespace before '>'. CultureInvariant keeps the
    // case-insensitive match independent of the current culture (e.g. Turkish 'I').
    new Regex(
        @"<script\b[^<]*(?:(?!</script\s*>)<[^<]*)*</script\s*>",
        RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant,
        TimeSpan.FromSeconds(1)),
    // <style ...> ... </style> including the body, same rules as for script.
    new Regex(
        @"<style\b[^<]*(?:(?!</style\s*>)<[^<]*)*</style\s*>",
        RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant,
        TimeSpan.FromSeconds(1)),
    // HTML comments. They may contain '>' characters, which the generic tag pattern below
    // would treat as the end of the tag and leave the rest of the comment in the output.
    new Regex(@"<!--.*?-->", RegexOptions.Compiled | RegexOptions.Singleline, TimeSpan.FromSeconds(1)),
    // Any remaining tag.
    new Regex(@"<[^>]*>", RegexOptions.Compiled),
    // Runs of spaces (including those introduced by the replacements above).
    new Regex(@" +", RegexOptions.Compiled)
};
/// <summary>
/// Converts an HTML fragment to plain text.
/// </summary>
/// <remarks>
/// Each pattern in <c>_htmlReplaces</c> is applied in order and every match is replaced with
/// a single space. The result is split on CR/LF, each line is entity-decoded and trimmed,
/// empty lines are dropped, and the remaining lines are joined with <c>"\n"</c>.
/// </remarks>
/// <param name="html">The HTML to convert. May be null or empty.</param>
/// <returns>The plain-text content, or an empty string when <paramref name="html"/> is null or empty.</returns>
/// <exception cref="RegexMatchTimeoutException">A pattern with a match timeout exceeded it.</exception>
public static string HtmlToPlainText(string html)
{
    // Regex.Replace throws on null input; treat "no HTML" as "no text" instead.
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    foreach (var pattern in _htmlReplaces)
    {
        html = pattern.Replace(html, " ");
    }

    var lines = html
        .Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
        // Decode before trimming: entities such as &nbsp; decode to whitespace, which
        // would otherwise survive as leading/trailing padding or whitespace-only lines.
        .Select(line => WebUtility.HtmlDecode(line).Trim())
        .Where(line => line.Length > 0);

    return string.Join("\n", lines);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment