Created
May 2, 2020 08:24
-
-
Save kovachwt/dd6b39f0af8abcc9451415dd6a8cecc2 to your computer and use it in GitHub Desktop.
Remove HTML tags from a Drupal node text dump
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace txtsredvach
{
    /// <summary>
    /// Command-line tool that reads a text dump line by line, splits it into
    /// chunks, strips the HTML markup from every chunk and appends the plain
    /// text to "&lt;input&gt;.processed.txt".
    /// </summary>
    class Program
    {
        // A line ending in </p> only closes a chunk once more than this many
        // lines have been accumulated.
        const int MinLinesBeforeParagraphSplit = 20;

        const RegexOptions HtmlRegexOptions = RegexOptions.IgnoreCase | RegexOptions.Compiled;

        // Lines that (trimmed, case-insensitively) equal one of these start a
        // new chunk. The second entry is a literal backslash + 'n' followed by
        // the break marker, not a real newline.
        static readonly string[] ChunkSeparators = { "body", @"\n<!--break-->" };

        // The regexes are built once: passing RegexOptions.Compiled patterns to
        // the static Regex.Replace on every call relies on a small shared cache
        // and can recompile them over and over.
        static readonly Regex RepeatedSpaces = new Regex(@"( )+", RegexOptions.Compiled);

        // A named, decimal or hexadecimal character reference such as
        // "&amp;", "&#160;" or "&#xA0;".
        static readonly Regex EntityPattern = new Regex(@"&(#?[0-9a-z]{1,10});", HtmlRegexOptions);

        // Entities with a fixed replacement (matched case-insensitively).
        // Everything else is handed to WebUtility.HtmlDecode.
        static readonly Dictionary<string, string> EntityOverrides =
            new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
            {
                { "nbsp", " " },
                { "bull", " * " },
                { "lsaquo", "<" },
                { "rsaquo", ">" },
                { "trade", "(tm)" },
                { "frasl", "/" },
                { "lt", "<" },
                { "gt", ">" },
                { "amp", "&" },
                { "quot", "\"" },
                { "copy", "(c)" },
                { "reg", "(r)" }
            };

        // Applied in order. \b keeps e.g. <pre> from matching the <p> rule and
        // <header> from matching the <head> rule.
        static readonly Rewrite[] TagRewrites =
        {
            // Remove the header (prepare first by clearing attributes).
            new Rewrite(@"<( )*head\b([^>])*>", "<head>"),
            new Rewrite(@"(<( )*(/)( )*head( )*>)", "</head>"),
            new Rewrite(@"(<head>).*?(</head>)", string.Empty),
            // Remove all scripts (prepare first by clearing attributes).
            new Rewrite(@"<( )*script\b([^>])*>", "<script>"),
            new Rewrite(@"(<( )*(/)( )*script( )*>)", "</script>"),
            new Rewrite(@"(<script>).*?(</script>)", string.Empty),
            // Remove all styles (prepare first by clearing attributes).
            new Rewrite(@"<( )*style\b([^>])*>", "<style>"),
            new Rewrite(@"(<( )*(/)( )*style( )*>)", "</style>"),
            new Rewrite(@"(<style>).*?(</style>)", string.Empty),
            // A tab in place of every <td> tag.
            new Rewrite(@"<( )*td\b([^>])*>", "\t"),
            // A line break in place of <br>, <div>, <tr> and <li> tags,
            // including self-closing forms and tags carrying attributes.
            new Rewrite(@"<( )*(br|div|tr|li)\b([^>])*>", "\r"),
            // A paragraph break (two line breaks) in place of <p> tags.
            new Rewrite(@"<( )*p\b([^>])*>", "\r\r"),
            // Remove whatever is still enclosed in < >: links, images, comments...
            new Rewrite(@"<[^>]*>", string.Empty)
        };

        // Applied in order after all line breaks have been turned into \r.
        static readonly Rewrite[] WhitespaceRewrites =
        {
            // Drop spaces caught between breaks and tabs.
            new Rewrite("(\r)( )+(\r)", "\r\r"),
            new Rewrite("(\t)( )+(\t)", "\t\t"),
            new Rewrite("(\t)( )+(\r)", "\t\r"),
            new Rewrite("(\r)( )+(\t)", "\r\t"),
            // Drop tabs caught between two breaks.
            new Rewrite("(\r)(\t)+(\r)", "\r\r"),
            // Several tabs following a break become a single tab.
            new Rewrite("(\r)(\t)+", "\r\t"),
            // More than 2 breaks become 2, more than 4 tabs become 4.
            new Rewrite("\r{3,}", "\r\r"),
            new Rewrite("\t{5,}", "\t\t\t\t")
        };

        static string outputFile;

        /// <summary>
        /// Entry point. Expects the input file name (relative to the current
        /// directory, or absolute) as the first argument.
        /// </summary>
        /// <param name="args">Command-line arguments; args[0] is the input file.</param>
        static void Main(string[] args)
        {
            if (args == null || args.Length == 0 || string.IsNullOrWhiteSpace(args[0]))
            {
                Console.Error.WriteLine("Usage: txtsredvach <input-file>");
                Environment.ExitCode = 1;
                return;
            }

            string file = Path.Combine(Directory.GetCurrentDirectory(), args[0]);
            if (!File.Exists(file))
            {
                Console.Error.WriteLine("File not found: " + file);
                Environment.ExitCode = 1;
                return;
            }

            outputFile = file + ".processed.txt";
            Console.WriteLine("Processing file " + file);

            int chunksdone = 0;
            using (FileStream fs = File.Open(file, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            using (StreamReader sr = new StreamReader(fs))
            {
                StringBuilder chunk = new StringBuilder();
                int numlines = 0;
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    string trimmed = line.Trim();
                    bool isSeparator = IsChunkSeparator(trimmed);
                    bool closesLongParagraph = !isSeparator
                        && numlines > MinLinesBeforeParagraphSplit
                        && trimmed.EndsWith("</p>", StringComparison.OrdinalIgnoreCase);

                    if (!isSeparator && !closesLongParagraph)
                    {
                        chunk.AppendLine(line);
                        numlines++;
                        continue;
                    }

                    if (closesLongParagraph)
                    {
                        // Unlike a pure separator line, this line carries text
                        // that belongs to the chunk it closes.
                        chunk.AppendLine(line);
                    }

                    processChunk(chunk.ToString());
                    chunk.Length = 0;
                    numlines = 0;
                    chunksdone++;
                    if (chunksdone % 100 == 0)
                    {
                        Console.WriteLine("Processed " + chunksdone + " chunks so far.");
                    }
                }

                // Whatever follows the last separator is a chunk too.
                if (chunk.Length > 0)
                {
                    processChunk(chunk.ToString());
                    chunksdone++;
                }
            }

            Console.WriteLine("Processed " + chunksdone + " chunks!");
            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        /// <summary>
        /// Tells whether an already trimmed line is one of the chunk separators
        /// (compared case-insensitively, independent of the current culture).
        /// </summary>
        static bool IsChunkSeparator(string trimmedLine)
        {
            foreach (string separator in ChunkSeparators)
            {
                if (string.Equals(trimmedLine, separator, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Converts one chunk to plain text and appends it, followed by an
        /// empty line, to the output file.
        /// </summary>
        /// <param name="chunk">Raw chunk text; null is treated as empty.</param>
        static void processChunk(string chunk)
        {
            string text = chunk ?? string.Empty;
            // The verbatim strings match a literal backslash followed by 'n' / 't'
            // in the input text, not real control characters.
            text = text.Replace(@"\n", " ");
            text = text.Replace(@"\t", " ");
            text = Html2Text(text, true);
            File.AppendAllText(outputFile, text.Trim() + Environment.NewLine + Environment.NewLine);
        }

        /// <summary>
        /// Strips HTML markup from <paramref name="source"/>. Line breaks in the
        /// result are represented by '\r'.
        /// </summary>
        /// <param name="source">HTML fragment; null yields an empty string.</param>
        /// <param name="convertEntities">
        /// When true, character references are decoded and unrecognised ones
        /// are removed; when false they are left untouched.
        /// </param>
        /// <returns>
        /// The plain text, or <paramref name="source"/> unchanged if the
        /// conversion fails.
        /// </returns>
        internal static string Html2Text(string source, bool convertEntities)
        {
            if (source == null)
            {
                return "";
            }

            try
            {
                // Source line breaks are only formatting: browsers render them
                // as a space. Tabs are dropped and runs of spaces collapse.
                string result = source.Replace("\r", " ").Replace("\n", " ");
                result = result.Replace("\t", string.Empty);
                result = RepeatedSpaces.Replace(result, " ");

                result = ApplyRewrites(result, TagRewrites);

                if (convertEntities)
                {
                    // Single pass, so "&amp;lt;" becomes "&lt;" and is not
                    // decoded a second time.
                    result = EntityPattern.Replace(result, DecodeEntity);
                }

                // Make line breaking consistent.
                result = result.Replace("\n", "\r");

                return ApplyRewrites(result, WhitespaceRewrites);
            }
            catch (Exception ex)
            {
                // Best effort: hand back the unprocessed text, but say so.
                Console.Error.WriteLine("Html2Text failed, chunk left unconverted: " + ex.Message);
                return source;
            }
        }

        /// <summary>Runs every rewrite rule over the text, in array order.</summary>
        static string ApplyRewrites(string text, Rewrite[] rewrites)
        {
            foreach (Rewrite rewrite in rewrites)
            {
                text = rewrite.Pattern.Replace(text, rewrite.Replacement);
            }
            return text;
        }

        /// <summary>
        /// Replacement for one character reference: a fixed override if there
        /// is one, otherwise the decoded character, otherwise nothing.
        /// </summary>
        static string DecodeEntity(Match match)
        {
            string replacement;
            if (EntityOverrides.TryGetValue(match.Groups[1].Value, out replacement))
            {
                return replacement;
            }

            // HtmlDecode returns its input unchanged when it does not know the
            // entity; such references are removed.
            string decoded = WebUtility.HtmlDecode(match.Value);
            return string.Equals(decoded, match.Value, StringComparison.Ordinal) ? string.Empty : decoded;
        }

        /// <summary>A compiled pattern together with its replacement text.</summary>
        sealed class Rewrite
        {
            public readonly Regex Pattern;
            public readonly string Replacement;

            public Rewrite(string pattern, string replacement)
            {
                Pattern = new Regex(pattern, HtmlRegexOptions);
                Replacement = replacement;
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment