Created
August 31, 2012 20:56
-
-
Save donomans/3558866 to your computer and use it in GitHub Desktop.
Strip the HTML and get raw page content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Strips HTML markup from <paramref name="source"/> and returns the remaining text.
/// </summary>
/// <remarks>
/// Works in two passes:
/// 1) every <c>&lt;script ...&gt;...&lt;/script&gt;</c> element is removed together with its
///    contents (matched case-insensitively; an unterminated element is removed up to the
///    end of the input);
/// 2) the remaining characters are scanned once and everything between an unquoted
///    <c>&lt;</c> and the next unquoted <c>&gt;</c> is dropped. Quotes are only tracked while
///    inside a tag, so a <c>&gt;</c> inside an attribute value does not end the tag.
/// Carriage returns, line feeds and tabs are always dropped, as are the angle brackets themselves.
/// </remarks>
/// <param name="source">The HTML to strip. May be null or empty.</param>
/// <returns>
/// The text found outside of tags, trimmed of leading and trailing white space;
/// an empty string when <paramref name="source"/> is null or empty.
/// </returns>
private static String StripHtml(String source)
{
    if (String.IsNullOrEmpty(source))
    {
        return String.Empty;
    }

    const String ScriptOpen = "<script";
    const String ScriptClose = "</script>";

    // 1) look for tags or things that need to be fully removed (entire containing contents) and remove them
    // NOTE(review): the first literal on the next line, and the last literal in the spacing
    // clean-up further down, look like they may have been altered by HTML rendering
    // (possibly "&nbsp;" and a run of several spaces) - confirm against the original file.
    source = source.Replace(" ", "").Replace("Â", "").Replace("<br>", " ").Replace("<br />", " ");

    // Ordinal, case-insensitive search directly on the source: no culture-dependent lowering
    // and no second string that has to be kept in step with the first.
    Int32 scriptindex = source.IndexOf(ScriptOpen, StringComparison.OrdinalIgnoreCase);
    while (scriptindex >= 0) // >= 0 so that a script element at the very start is removed too
    {
        Int32 closeindex = source.IndexOf(ScriptClose, scriptindex, StringComparison.OrdinalIgnoreCase);
        if (closeindex < 0)
        {
            // No closing tag: drop everything from the opening tag to the end of the input.
            source = source.Remove(scriptindex);
        }
        else
        {
            source = source.Remove(scriptindex, closeindex + ScriptClose.Length - scriptindex);
        }

        // Search again from the start: a removal can join text on both sides into a new match.
        scriptindex = source.IndexOf(ScriptOpen, StringComparison.OrdinalIgnoreCase);
    }

    // really cheesy way for now to remove most of the extra spacing
    source = source.Replace(Environment.NewLine, "").Replace(" ", "");

    // 2) search entire contents for < and > tags outside of quotes and remove those pieces
    // The result can never be longer than the input, so one pre-sized buffer is enough.
    Char[] buffer = new Char[source.Length];
    Int32 length = 0;
    Boolean inside = false;   // currently between '<' and '>'
    Boolean dquotes = false;  // inside a "..." attribute value of the current tag
    Boolean squotes = false;  // inside a '...' attribute value of the current tag

    foreach (Char c in source)
    {
        switch (c)
        {
            case '\r':
            case '\n':
            case '\t':
                continue;
            case '\'':
                if (inside && !dquotes)
                {
                    squotes = !squotes;
                    continue;
                }
                break;
            case '"':
                if (inside && !squotes)
                {
                    dquotes = !dquotes;
                    continue;
                }
                break;
            case '>':
                // A quoted '>' belongs to an attribute value and does not close the tag.
                if (!dquotes && !squotes)
                {
                    inside = false;
                }
                continue;
            case '<':
                // A quoted '<' belongs to an attribute value and does not open a tag.
                if (!dquotes && !squotes)
                {
                    inside = true;
                }
                continue;
        }

        if (!inside)
        {
            buffer[length++] = c;
        }
    }

    return new String(buffer, 0, length).Trim();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment