Skip to content

Instantly share code, notes, and snippets.

@donomans
Created August 31, 2012 20:56
Show Gist options
  • Save donomans/3558866 to your computer and use it in GitHub Desktop.
Save donomans/3558866 to your computer and use it in GitHub Desktop.
Strip the HTML and get raw page content
/// <summary>
/// Strips HTML markup from <paramref name="source"/> and returns the remaining text.
/// </summary>
/// <remarks>
/// Processing happens in two passes:
/// 1) Non-breaking-space entities and the stray "Â" mis-encoding artifact are deleted,
///    line-break tags are turned into a single space, and every script block
///    (opening tag through closing tag, matched case-insensitively) is removed
///    together with its contents. An unterminated script block is removed through
///    the end of the input.
/// 2) The remaining text is scanned character by character. Everything between an
///    unquoted opening angle bracket and the next unquoted closing angle bracket is
///    dropped; angle brackets inside a quoted attribute value do not end the tag.
///    Carriage returns, line feeds and tabs are dropped, and runs of consecutive
///    spaces are collapsed into one space.
/// </remarks>
/// <param name="source">The HTML text to strip. May be null or empty.</param>
/// <returns>
/// The text content with leading and trailing whitespace trimmed, or an empty string
/// when <paramref name="source"/> is null or empty.
/// </returns>
private static String StripHtml(String source)
{
    if (String.IsNullOrEmpty(source))
    {
        return String.Empty;
    }

    const String scriptOpen = "<script";
    const String scriptClose = "</script>";

    // 1) Remove entities/artifacts and whole elements whose contents must not survive.
    source = source.Replace("&nbsp;", "").Replace("Â", "").Replace("<br>", " ").Replace("<br />", " ");

    // Ordinal, case-insensitive search avoids culture-dependent lower-casing
    // (e.g. the Turkish dotless i) and a second copy of the whole input.
    Int32 scriptStart = source.IndexOf(scriptOpen, StringComparison.OrdinalIgnoreCase);
    while (scriptStart >= 0) // >= 0: a script block may start at the very first character
    {
        Int32 closeStart = source.IndexOf(scriptClose, scriptStart, StringComparison.OrdinalIgnoreCase);
        if (closeStart < 0)
        {
            // No closing tag: everything after the opening tag is script, drop it all.
            source = source.Remove(scriptStart);
            break;
        }

        source = source.Remove(scriptStart, closeStart + scriptClose.Length - scriptStart);

        // Search from the beginning again: removing a block can join the text on
        // either side of it into a new opening tag that starts before scriptStart.
        scriptStart = source.IndexOf(scriptOpen, StringComparison.OrdinalIgnoreCase);
    }

    // 2) Drop everything between unquoted < and >, keeping only the text outside tags.
    System.Text.StringBuilder text = new System.Text.StringBuilder(source.Length);
    Boolean insideTag = false;
    Boolean inDoubleQuotes = false;
    Boolean inSingleQuotes = false;

    foreach (Char c in source)
    {
        switch (c)
        {
            case '\r':
            case '\n':
            case '\t':
                // Line breaks and tabs are never part of the output.
                continue;
            case '\'':
                // Quotes only have meaning inside a tag; in text they are ordinary characters.
                if (insideTag && !inDoubleQuotes)
                {
                    inSingleQuotes = !inSingleQuotes;
                    continue;
                }
                break;
            case '"':
                if (insideTag && !inSingleQuotes)
                {
                    inDoubleQuotes = !inDoubleQuotes;
                    continue;
                }
                break;
            case '>':
                // A bracket inside a quoted attribute value does not close the tag.
                if (!inDoubleQuotes && !inSingleQuotes)
                {
                    insideTag = false;
                }
                continue;
            case '<':
                if (!inDoubleQuotes && !inSingleQuotes)
                {
                    insideTag = true;
                }
                continue;
        }

        if (insideTag)
        {
            continue;
        }

        // Collapse runs of spaces so words stay separated by exactly one space.
        if (c == ' ' && text.Length > 0 && text[text.Length - 1] == ' ')
        {
            continue;
        }

        text.Append(c);
    }

    return text.ToString().Trim();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment