SamWM · February 1, 2012 10:02
diff --git a/CleanWordHtml.cs b/CleanWordHtml.cs
 using System;
 using System.Reflection;
 using System.Collections.Specialized;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.IO;

 [assembly: AssemblyTitle("CleanWordHtml")]
 [assembly: AssemblyDescription("Cleans up HTML generated by Microsoft Word")]
 [assembly: AssemblyVersion("1.0.1.*")]

 // original code by Jeff Atwood, backported to .NET 1.1 and turned into a command line tool. http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html

 public class CleanUp
 {
 	static bool Mso = false;
 	static bool IgnoreSpans = false;
 	static bool IgnoreDivs = false;

 	/// <summary>
 	/// Command-line entry point. Resolves the input file from the arguments,
 	/// cleans its HTML and writes the result next to the input as
 	/// "name.modified.ext", reporting the character counts on the console.
 	/// </summary>
 	/// <param name="args">
 	/// Either a single file path, or a mix of "-path &lt;file&gt;", "-mso",
 	/// "-ignorespans" and "-ignoredivs" (switches are case-insensitive).
 	/// </param>
 	static void Main(string[] args)
 	{
 		string help = "Cleans up HTML generated by Microsoft Word" + Environment.NewLine + Environment.NewLine
 		 + "Usage:" + Environment.NewLine
 		 + "------" + Environment.NewLine
 		 + "CleanWordHtml \"path to file\"" + Environment.NewLine
 		 + "CleanWordHtml -path \"path to file\"" + Environment.NewLine + Environment.NewLine
 		 + "  Other options:" + Environment.NewLine
 		 + "   -mso (remove only classes generated by word)" + Environment.NewLine
 		 + "   -ignorespans (don't remove span tags)" + Environment.NewLine
 		 + "   -ignoredivs (don't remove div tags)";

 		string filepath = string.Empty;
 		if (args.Length == 0 || IsNullOrEmpty(args[0]))
 		{
 			Console.WriteLine(help);
 			return;
 		}

 		if (args.Length == 1)
 		{
 			// A lone argument is always treated as the file path.
 			filepath = args[0];
 		}
 		else
 		{
 			for (int i = 0; i < args.Length; i++)
 			{
 				// Lower-case with the invariant culture so switches are recognised
 				// regardless of the user's locale (e.g. Turkish casing of 'I').
 				string option = args[i].ToLower(System.Globalization.CultureInfo.InvariantCulture);
 				if (option == "-path")
 				{
 					// Only read the value when one actually follows; a trailing
 					// "-path" leaves filepath empty and falls through to the help text.
 					if (i + 1 < args.Length)
 					{
 						filepath = args[i + 1];
 					}
 				}
 				else if (option == "-mso")
 				{
 					Mso = true;
 				}
 				else if (option == "-ignorespans")
 				{
 					IgnoreSpans = true;
 				}
 				else if (option == "-ignoredivs")
 				{
 					IgnoreDivs = true;
 				}
 			}
 		}
 		if (IsNullOrEmpty(filepath))
 		{
 			Console.WriteLine(help);
 			return;
 		}
 		// A bare file name is resolved against the current directory.
 		if (Path.GetFileName(filepath) == filepath)
 		{
 			filepath = Path.Combine(Environment.CurrentDirectory, filepath);
 		}
 		if (!File.Exists(filepath))
 		{
 			Console.WriteLine("File '" + filepath + "' doesn't exist.");
 			return;
 		}
 		string html = ReadAllText(filepath);
 		Console.WriteLine("Input html is " + html.Length + " chars");
 		html = CleanWordHtml(html);
 		html = FixEntities(html);
 		// Output goes beside the input: "doc.html" becomes "doc.modified.html".
 		filepath = Path.Combine(Path.GetDirectoryName(filepath), Path.GetFileNameWithoutExtension(filepath) + ".modified" + Path.GetExtension(filepath));
 		WriteAllText(filepath, html);
 		Console.WriteLine("Cleaned html is " + html.Length + " chars. Saved to " + filepath);
 	}

 	/// <summary>
 	/// Strips Word-specific markup from an HTML string by applying a fixed
 	/// sequence of case-insensitive regular-expression removals, then collapses
 	/// runs of blank lines and spaces.
 	/// </summary>
 	/// <param name="html">The HTML text to clean.</param>
 	/// <returns>The cleaned HTML.</returns>
 	/// <remarks>
 	/// Behaviour depends on the static switches: IgnoreSpans and IgnoreDivs keep
 	/// span/div tags, and Mso restricts class removal to classes starting with "Mso".
 	/// </remarks>
 	static string CleanWordHtml(string html)
 	{
 		// Patterns whose matches are deleted outright, applied in insertion order.
 		StringCollection sc = new StringCollection();
 		if (!IgnoreSpans)
 		{
 			sc.Add(@"<(/?span|!\[)[^>]*?>");
 		}
 		if (!IgnoreDivs)
 		{
 			sc.Add(@"<(/?div|!\[)[^>]*?>");
 		}
 		if (!Mso)
 		{
 			// Get rid of classes
 			sc.Add(@"\s?class=[""']?\w+[""']?");
 		}
 		else
 		{
 			// Get rid of office classes
 			sc.Add(@"\s?class=[""']?Mso\w+[""']?");
 		}
 		// get rid of unnecessary tag spans (comments and title);
 		// [\s\S] matches any character including newlines.
 		sc.Add(@"<!--[\s\S]+?-->");
 		sc.Add(@"<title>[\s\S]+?</title>");
 		// get rid of inline style. The value is matched as a whole (double-quoted,
 		// single-quoted or bare) so declarations such as 'margin-bottom:0in' are
 		// removed completely instead of being cut at the first non-word character.
 		sc.Add(@"\s?style=(""[^""]*""|'[^']*'|[^\s>]+)");
 		// Get rid of unnecessary tags
 		sc.Add(@"<(meta|link|/?o:|/?style|/?font|/?st\d|/?head|/?html|body|/?body|!\[)[^>]*?>");
 		// Get rid of empty tags (except table cells)
 		// NOTE(review): [^(th|d)>] is a character class, not an alternation; it
 		// skips any tag containing t, h, d, (, ) or | after its first character.
 		// Left unchanged to avoid altering which tags are removed.
 		sc.Add(@"(<[^/][^(th|d)>]*>){1}(&nbsp;)*(</[^>]+>){1}");
 		// remove bizarre v: element attached to <img> tag
 		sc.Add(@"\s+v:\w+=""[^""]+""");
 		foreach (string s in sc)
 		{
 			html = Regex.Replace(html, s, "", RegexOptions.IgnoreCase);
 		}
 		// Collapse (rather than delete) runs of line breaks and spaces so that
 		// neighbouring lines and words are not fused together.
 		html = Regex.Replace(html, @"(" + Environment.NewLine + "){2,}", Environment.NewLine);
 		html = Regex.Replace(html, @"( ){2,}", " ");
 		// quote unquoted attributes
 		//html = Regex.Replace(html, @"(\w+=)(\w+)(?=[ >])", @"$1""$2""", RegexOptions.IgnoreCase);
 		return html;
 	}

 	/// <summary>
 	/// Replaces curly double quotes and em dashes with their named HTML entities.
 	/// </summary>
 	/// <param name="html">The text to process.</param>
 	/// <returns>The text with the three characters replaced by entities.</returns>
 	static string FixEntities(string html)
 	{
 		// Parallel tables: characters[i] is rewritten as entities[i].
 		string[] characters = new string[] { "“", "”", "—" };
 		string[] entities = new string[] { "&ldquo;", "&rdquo;", "&mdash;" };

 		string result = html;
 		for (int i = 0; i < characters.Length; i++)
 		{
 			result = result.Replace(characters[i], entities[i]);
 		}
 		return result;
 	}

 	/// <summary>
 	/// Reports whether a string is null or has zero length.
 	/// </summary>
 	/// <param name="value">The string to test; may be null.</param>
 	/// <returns>true for null or ""; false otherwise.</returns>
 	static bool IsNullOrEmpty(string value)
 	{
 		return value == null || value.Length == 0;
 	}

 	/// <summary>
 	/// Reads a text file line by line and returns its content with every line
 	/// terminated by Environment.NewLine.
 	/// </summary>
 	/// <param name="path">Path of the file to read.</param>
 	/// <returns>
 	/// The file's lines joined with Environment.NewLine; a terminator is also
 	/// appended after the last line.
 	/// </returns>
 	static string ReadAllText(string path)
 	{
 		StringBuilder buffer = new StringBuilder();
 		using (StreamReader reader = new StreamReader(path))
 		{
 			// ReadLine returns null once the end of the file is reached.
 			for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
 			{
 				buffer.Append(current);
 				buffer.Append(Environment.NewLine);
 			}
 		}
 		return buffer.ToString();
 	}

 	/// <summary>
 	/// Writes text to a file using UTF-8, delegating to the three-argument overload.
 	/// </summary>
 	/// <param name="path">Path of the file to write.</param>
 	/// <param name="contents">Text to write.</param>
 	static void WriteAllText(string path, string contents)
 	{
 		// UTF8Encoding(false, true): no byte-order mark, throw on invalid bytes.
 		Encoding utf8 = new UTF8Encoding(false, true);
 		WriteAllText(path, contents, utf8);
 	}

 	/// <summary>
 	/// Writes text to a file with the given encoding, replacing any existing file.
 	/// </summary>
 	/// <param name="path">Path of the file to write.</param>
 	/// <param name="contents">Text to write.</param>
 	/// <param name="encoding">Character encoding used for the output.</param>
 	static void WriteAllText(string path, string contents, Encoding encoding)
 	{
 		// append: false, so an existing file is overwritten.
 		using (TextWriter output = new StreamWriter(path, false, encoding))
 		{
 			output.Write(contents);
 		}
 	}
 }
	using System;
	using System.Reflection;
	using System.Collections.Specialized;
	using System.Text;
	using System.Text.RegularExpressions;
	using System.IO;

	[assembly: AssemblyTitle("CleanWordHtml")]
	[assembly: AssemblyDescription("Cleans up HTML generated by Microsoft Word")]
	[assembly: AssemblyVersion("1.0.1.*")]

	// original code by Jeff Atwood, backported to .NET 1.1 and turned into a command line tool. http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html

	public class CleanUp
	{
	static bool Mso = false;
	static bool IgnoreSpans = false;
	static bool IgnoreDivs = false;

	/// <summary>
	/// Command-line entry point. Resolves the input file from the arguments,
	/// cleans its HTML and writes the result next to the input as
	/// "name.modified.ext", reporting the character counts on the console.
	/// </summary>
	/// <param name="args">
	/// Either a single file path, or a mix of "-path &lt;file&gt;", "-mso",
	/// "-ignorespans" and "-ignoredivs" (switches are case-insensitive).
	/// </param>
	static void Main(string[] args)
	{
		string help = "Cleans up HTML generated by Microsoft Word" + Environment.NewLine + Environment.NewLine
		+ "Usage:" + Environment.NewLine
		+ "------" + Environment.NewLine
		+ "CleanWordHtml \"path to file\"" + Environment.NewLine
		+ "CleanWordHtml -path \"path to file\"" + Environment.NewLine + Environment.NewLine
		+ " Other options:" + Environment.NewLine
		+ " -mso (remove only classes generated by word)" + Environment.NewLine
		+ " -ignorespans (don't remove span tags)" + Environment.NewLine
		+ " -ignoredivs (don't remove div tags)";

		string filepath = string.Empty;
		// Logical OR: show the help text when no usable first argument exists.
		if (args.Length == 0 || IsNullOrEmpty(args[0]))
		{
			Console.WriteLine(help);
			return;
		}

		if (args.Length == 1)
		{
			// A lone argument is always treated as the file path.
			filepath = args[0];
		}
		else
		{
			for (int i = 0; i < args.Length; i++)
			{
				// Lower-case with the invariant culture so switches are recognised
				// regardless of the user's locale (e.g. Turkish casing of 'I').
				string option = args[i].ToLower(System.Globalization.CultureInfo.InvariantCulture);
				if (option == "-path")
				{
					// Only read the value when one actually follows; a trailing
					// "-path" leaves filepath empty and falls through to the help text.
					if (i + 1 < args.Length)
					{
						filepath = args[i + 1];
					}
				}
				else if (option == "-mso")
				{
					Mso = true;
				}
				else if (option == "-ignorespans")
				{
					IgnoreSpans = true;
				}
				else if (option == "-ignoredivs")
				{
					IgnoreDivs = true;
				}
			}
		}
		if (IsNullOrEmpty(filepath))
		{
			Console.WriteLine(help);
			return;
		}
		// A bare file name is resolved against the current directory.
		if (Path.GetFileName(filepath) == filepath)
		{
			filepath = Path.Combine(Environment.CurrentDirectory, filepath);
		}
		if (!File.Exists(filepath))
		{
			Console.WriteLine("File '" + filepath + "' doesn't exist.");
			return;
		}
		string html = ReadAllText(filepath);
		Console.WriteLine("Input html is " + html.Length + " chars");
		html = CleanWordHtml(html);
		html = FixEntities(html);
		// Output goes beside the input: "doc.html" becomes "doc.modified.html".
		filepath = Path.Combine(Path.GetDirectoryName(filepath), Path.GetFileNameWithoutExtension(filepath) + ".modified" + Path.GetExtension(filepath));
		WriteAllText(filepath, html);
		Console.WriteLine("Cleaned html is " + html.Length + " chars. Saved to " + filepath);
	}

	/// <summary>
	/// Strips Word-specific markup from an HTML string by applying a fixed
	/// sequence of case-insensitive regular-expression removals, then collapses
	/// runs of blank lines and spaces.
	/// </summary>
	/// <param name="html">The HTML text to clean.</param>
	/// <returns>The cleaned HTML.</returns>
	/// <remarks>
	/// Behaviour depends on the static switches: IgnoreSpans and IgnoreDivs keep
	/// span/div tags, and Mso restricts class removal to classes starting with "Mso".
	/// </remarks>
	static string CleanWordHtml(string html)
	{
		// Patterns whose matches are deleted outright, applied in insertion order.
		// The '|' characters below are regex alternations and must stay unescaped.
		StringCollection sc = new StringCollection();
		if (!IgnoreSpans)
		{
			sc.Add(@"<(/?span|!\[)[^>]*?>");
		}
		if (!IgnoreDivs)
		{
			sc.Add(@"<(/?div|!\[)[^>]*?>");
		}
		if (!Mso)
		{
			// Get rid of classes
			sc.Add(@"\s?class=[""']?\w+[""']?");
		}
		else
		{
			// Get rid of office classes
			sc.Add(@"\s?class=[""']?Mso\w+[""']?");
		}
		// get rid of unnecessary tag spans (comments and title);
		// [\s\S] matches any character including newlines.
		sc.Add(@"<!--[\s\S]+?-->");
		sc.Add(@"<title>[\s\S]+?</title>");
		// get rid of inline style. The value is matched as a whole (double-quoted,
		// single-quoted or bare) so declarations such as 'margin-bottom:0in' are
		// removed completely instead of being cut at the first non-word character.
		sc.Add(@"\s?style=(""[^""]*""|'[^']*'|[^\s>]+)");
		// Get rid of unnecessary tags
		sc.Add(@"<(meta|link|/?o:|/?style|/?font|/?st\d|/?head|/?html|body|/?body|!\[)[^>]*?>");
		// Get rid of empty tags (except table cells)
		// NOTE(review): [^(th|d)>] is a character class, not an alternation; it
		// skips any tag containing t, h, d, (, ) or | after its first character.
		sc.Add(@"(<[^/][^(th|d)>]*>){1}(&nbsp;)*(</[^>]+>){1}");
		// remove bizarre v: element attached to <img> tag
		sc.Add(@"\s+v:\w+=""[^""]+""");
		foreach (string s in sc)
		{
			html = Regex.Replace(html, s, "", RegexOptions.IgnoreCase);
		}
		// Collapse (rather than delete) runs of line breaks and spaces so that
		// neighbouring lines and words are not fused together.
		html = Regex.Replace(html, @"(" + Environment.NewLine + "){2,}", Environment.NewLine);
		html = Regex.Replace(html, @"( ){2,}", " ");
		// quote unquoted attributes
		//html = Regex.Replace(html, @"(\w+=)(\w+)(?=[ >])", @"$1""$2""", RegexOptions.IgnoreCase);
		return html;
	}

	/// <summary>
	/// Replaces curly double quotes and em dashes with their named HTML entities.
	/// </summary>
	/// <param name="html">The text to process.</param>
	/// <returns>The text with the three characters replaced by entities.</returns>
	static string FixEntities(string html)
	{
		// Key = literal character, value = named entity. The values must be the
		// entity text itself; mapping a character to itself would be a no-op.
		NameValueCollection nvc = new NameValueCollection();
		nvc.Add("“", "&ldquo;");
		nvc.Add("”", "&rdquo;");
		nvc.Add("—", "&mdash;");
		foreach (string key in nvc.Keys)
		{
			html = html.Replace(key, nvc[key]);
		}
		return html;
	}

	/// <summary>
	/// Reports whether a string is null or has zero length.
	/// </summary>
	/// <param name="value">The string to test; may be null.</param>
	/// <returns>true for null or ""; false otherwise.</returns>
	static bool IsNullOrEmpty(string value)
	{
		return value == null || value.Length == 0;
	}

	/// <summary>
	/// Reads a text file line by line and returns its content with every line
	/// terminated by Environment.NewLine.
	/// </summary>
	/// <param name="path">Path of the file to read.</param>
	/// <returns>
	/// The file's lines joined with Environment.NewLine; a terminator is also
	/// appended after the last line.
	/// </returns>
	static string ReadAllText(string path)
	{
		StringBuilder buffer = new StringBuilder();
		using (StreamReader reader = new StreamReader(path))
		{
			// ReadLine returns null once the end of the file is reached.
			for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
			{
				buffer.Append(current);
				buffer.Append(Environment.NewLine);
			}
		}
		return buffer.ToString();
	}

	/// <summary>
	/// Writes text to a file using UTF-8, delegating to the three-argument overload.
	/// </summary>
	/// <param name="path">Path of the file to write.</param>
	/// <param name="contents">Text to write.</param>
	static void WriteAllText(string path, string contents)
	{
		// UTF8Encoding(false, true): no byte-order mark, throw on invalid bytes.
		Encoding utf8 = new UTF8Encoding(false, true);
		WriteAllText(path, contents, utf8);
	}

	/// <summary>
	/// Writes text to a file with the given encoding, replacing any existing file.
	/// </summary>
	/// <param name="path">Path of the file to write.</param>
	/// <param name="contents">Text to write.</param>
	/// <param name="encoding">Character encoding used for the output.</param>
	static void WriteAllText(string path, string contents, Encoding encoding)
	{
		// append: false, so an existing file is overwritten.
		using (TextWriter output = new StreamWriter(path, false, encoding))
		{
			output.Write(contents);
		}
	}
	}