Skip to content

Instantly share code, notes, and snippets.

@SamWM
Created February 1, 2012 10:02
Show Gist options
  • Save SamWM/1716310 to your computer and use it in GitHub Desktop.
Save SamWM/1716310 to your computer and use it in GitHub Desktop.
Clean Up Word's Messy HTML
using System;
using System.Reflection;
using System.Collections.Specialized;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
[assembly: AssemblyTitle("CleanWordHtml")]
[assembly: AssemblyDescription("Cleans up HTML generated by Microsoft Word")]
[assembly: AssemblyVersion("1.0.1.*")]
// Original code by Jeff Atwood, backported to .NET 1.1 and turned into a command-line tool: http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html
// Command line tool that strips the markup Microsoft Word adds when a document
// is saved as HTML. It reads the input file, removes Word-specific tags and
// attributes, replaces curly quotes and em dashes with HTML entities, and writes
// the result next to the input as "<name>.modified<ext>".
public class CleanUp
{
    // Option flags: set from the command line in Main, read by CleanWordHtml.
    static bool Mso = false;
    static bool IgnoreSpans = false;
    static bool IgnoreDivs = false;

    // Entry point. Accepts either a single argument (the file path) or a list of
    // options ("-path <file>", "-mso", "-ignorespans", "-ignoredivs").
    // Prints the usage text when no usable path was supplied.
    static void Main(string[] args)
    {
        string help = "Cleans up HTML generated by Microsoft Word" + Environment.NewLine + Environment.NewLine
            + "Usage:" + Environment.NewLine
            + "------" + Environment.NewLine
            + "CleanWordHtml \"path to file\"" + Environment.NewLine
            + "CleanWordHtml -path \"path to file\"" + Environment.NewLine + Environment.NewLine
            + " Other options:" + Environment.NewLine
            + " -mso (remove only classes generated by word)" + Environment.NewLine
            + " -ignorespans (don't remove span tags)" + Environment.NewLine
            + " -ignoredivs (don't remove div tags)";

        if (args.Length == 0 || IsNullOrEmpty(args[0]))
        {
            Console.WriteLine(help);
            return;
        }

        string filepath = string.Empty;
        if (args.Length == 1)
        {
            filepath = args[0];
        }
        else
        {
            for (int i = 0; i < args.Length; i++)
            {
                if (IsOption(args[i], "-path"))
                {
                    // "-path" needs a value after it; without one there is
                    // nothing to read, so show the usage text instead of
                    // indexing past the end of the array.
                    if (i + 1 >= args.Length)
                    {
                        Console.WriteLine(help);
                        return;
                    }
                    // Consume the value so it is not re-examined as an option.
                    filepath = args[++i];
                }
                else if (IsOption(args[i], "-mso"))
                {
                    Mso = true;
                }
                else if (IsOption(args[i], "-ignorespans"))
                {
                    IgnoreSpans = true;
                }
                else if (IsOption(args[i], "-ignoredivs"))
                {
                    IgnoreDivs = true;
                }
            }
        }

        if (IsNullOrEmpty(filepath))
        {
            Console.WriteLine(help);
            return;
        }

        // A bare file name is resolved against the current directory.
        if (Path.GetFileName(filepath) == filepath)
        {
            filepath = Path.Combine(Environment.CurrentDirectory, filepath);
        }

        if (!File.Exists(filepath))
        {
            Console.WriteLine("File '" + filepath + "' doesn't exist.");
            return;
        }

        string html = ReadAllText(filepath);
        Console.WriteLine("Input html is " + html.Length + " chars");

        html = CleanWordHtml(html);
        html = FixEntities(html);

        // Write beside the input: "report.html" -> "report.modified.html".
        string outputName = Path.GetFileNameWithoutExtension(filepath) + ".modified" + Path.GetExtension(filepath);
        filepath = Path.Combine(Path.GetDirectoryName(filepath), outputName);
        WriteAllText(filepath, html);
        Console.WriteLine("Cleaned html is " + html.Length + " chars. Saved to " + filepath);
    }

    // Returns true when arg equals option, ignoring case. The comparison uses the
    // invariant culture so the result does not depend on the user's locale.
    static bool IsOption(string arg, string option)
    {
        return string.Compare(arg, option, true, System.Globalization.CultureInfo.InvariantCulture) == 0;
    }

    // Applies a fixed sequence of regular-expression clean-ups to html and returns
    // the result. Which span/div/class patterns run depends on the static
    // IgnoreSpans, IgnoreDivs and Mso flags. All patterns are case-insensitive.
    static string CleanWordHtml(string html)
    {
        // Patterns whose matches are deleted outright, applied in this order.
        StringCollection removals = new StringCollection();
        if (!IgnoreSpans)
        {
            removals.Add(@"<(/?span|!\[)[^>]*?>");
        }
        if (!IgnoreDivs)
        {
            removals.Add(@"<(/?div|!\[)[^>]*?>");
        }
        if (!Mso)
        {
            // Get rid of classes. The whole attribute value is matched (quoted or
            // unquoted) so values containing '-' or spaces are not left half-removed.
            removals.Add(@"\s?class=(""[^""]*""|'[^']*'|[^\s>]+)");
        }
        else
        {
            // Get rid of office classes
            removals.Add(@"\s?class=[""']?Mso\w+[""']?");
        }
        // get rid of unnecessary tag spans (comments and title)
        removals.Add(@"<!--[\s\S]+?-->");
        removals.Add(@"<title>[\s\S]+?</title>");
        // get rid of inline style; the value normally contains ':' ';' '-' and
        // spaces, so match everything up to the closing quote.
        removals.Add(@"\s?style=(""[^""]*""|'[^']*'|[^\s>]+)");
        // Get rid of unnecessary tags
        removals.Add(@"<(meta|link|/?o:|/?style|/?font|/?st\d|/?head|/?html|body|/?body|!\[)[^>]*?>");
        // Get rid of empty tags (except table cells): an opening tag other than
        // th/td, optional &nbsp; entities, then the closing tag of the SAME name.
        removals.Add(@"<(?!t[hd]\b)(\w+)\b[^>]*>(&nbsp;)*</\1\s*>");
        // remove bizarre v: element attached to <img> tag
        removals.Add(@"\s+v:\w+=""[^""]+""");

        foreach (string pattern in removals)
        {
            html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
        }

        // remove extra lines: collapse a run of line breaks to a single one
        html = Regex.Replace(html, "(" + Regex.Escape(Environment.NewLine) + "){2,}", Environment.NewLine);
        // remove extra spaces: collapse a run of spaces to a single one so that
        // neighbouring words are not glued together
        html = Regex.Replace(html, @"( ){2,}", " ");

        // quote unquoted attributes
        //html = Regex.Replace(html, @"(\w+=)(\w+)(?=[ >])", @"$1""$2""", RegexOptions.IgnoreCase);
        return html;
    }

    // Replaces curly double quotes and em dashes with their named HTML entities.
    static string FixEntities(string html)
    {
        NameValueCollection entities = new NameValueCollection();
        entities.Add("“", "&ldquo;");
        entities.Add("”", "&rdquo;");
        entities.Add("—", "&mdash;");
        foreach (string character in entities.Keys)
        {
            html = html.Replace(character, entities[character]);
        }
        return html;
    }

    // Local stand-in for string.IsNullOrEmpty: true for null or a zero-length string.
    static bool IsNullOrEmpty(string value)
    {
        return value == null || value.Length == 0;
    }

    // Reads the file line by line and returns its text with every line terminated
    // by Environment.NewLine (so line endings are normalised and the result always
    // ends with a line break). StreamReader(path) decodes as UTF-8 unless a
    // byte-order mark says otherwise.
    static string ReadAllText(string path)
    {
        StringBuilder text = new StringBuilder();
        using (StreamReader reader = new StreamReader(path))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                text.Append(line).Append(Environment.NewLine);
            }
        }
        return text.ToString();
    }

    // Writes contents to path as UTF-8 without a byte-order mark, replacing any
    // existing file.
    static void WriteAllText(string path, string contents)
    {
        WriteAllText(path, contents, new UTF8Encoding(false, true));
    }

    // Writes contents to path using the given encoding, replacing any existing file.
    static void WriteAllText(string path, string contents, Encoding encoding)
    {
        using (StreamWriter writer = new StreamWriter(path, false, encoding))
        {
            writer.Write(contents);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment