Skip to content

Instantly share code, notes, and snippets.

@ArtemAvramenko
Last active April 9, 2024 15:07
Show Gist options
  • Save ArtemAvramenko/e9a2ae1c814450c701b1fc270caf37fd to your computer and use it in GitHub Desktop.
Save ArtemAvramenko/e9a2ae1c814450c701b1fc270caf37fd to your computer and use it in GitHub Desktop.
Convert HTML to plain text in C#
/// <summary>
/// Ordered cleanup patterns used by <c>HtmlToPlainText</c>, which replaces every match with a single space.
/// </summary>
/// <remarks>
/// Order matters: whole script/style elements and comments must be removed before the generic
/// tag pattern runs, otherwise only their delimiters would be stripped and their contents would
/// leak into the text. The final pattern collapses the runs of spaces left behind by the
/// replacements. Patterns that may scan far ahead carry a one-second match timeout.
/// </remarks>
private static readonly Regex[] _htmlReplaces = new[] {
    // <script ...>...</script> including its body. Tag names are case-insensitive in HTML, and
    // CultureInvariant keeps the case folding stable (e.g. 'I' vs 'i' under Turkish culture).
    new Regex(@"<script\b[^<]*(?:(?!</script>)<[^<]*)*</script>", RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, TimeSpan.FromSeconds(1)),
    // <style ...>...</style> including its body.
    new Regex(@"<style\b[^<]*(?:(?!</style>)<[^<]*)*</style>", RegexOptions.Compiled | RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant, TimeSpan.FromSeconds(1)),
    // HTML comments. Handled separately because a comment may contain '>', which would make the
    // generic tag pattern below stop early and leave the rest of the comment in the output.
    new Regex(@"<!--.*?-->", RegexOptions.Compiled | RegexOptions.Singleline, TimeSpan.FromSeconds(1)),
    // Any remaining tag.
    new Regex(@"<[^>]*>", RegexOptions.Compiled),
    // Runs of spaces produced by the replacements above.
    new Regex(@" +", RegexOptions.Compiled)
};
/// <summary>
/// Converts an HTML fragment to plain text.
/// </summary>
/// <param name="html">The HTML markup to convert.</param>
/// <returns>
/// The non-empty text lines of <paramref name="html"/>, with markup removed, character
/// references decoded and surrounding whitespace trimmed, joined with <c>"\n"</c>.
/// Returns an empty string when no text remains.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is <c>null</c>.</exception>
/// <exception cref="RegexMatchTimeoutException">
/// One of the cleanup patterns configured with a match timeout exceeded it.
/// </exception>
public static string HtmlToPlainText(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException(nameof(html));
    }

    // Each pattern's matches are replaced by a single space so that text on either side
    // of a removed element does not get glued together.
    foreach (var pattern in _htmlReplaces)
    {
        html = pattern.Replace(html, " ");
    }

    var lines = html
        .Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
        // Decode first, then trim: entities such as &nbsp; decode to whitespace, and trimming
        // before decoding would let whitespace-only lines through the emptiness filter below.
        .Select(line => WebUtility.HtmlDecode(line).Trim())
        .Where(line => line.Length > 0)
        .ToArray();

    return string.Join("\n", lines);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment