Created
February 1, 2012 10:02
-
-
Save SamWM/1716310 to your computer and use it in GitHub Desktop.
Clean Up Word's Messy HTML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Reflection; | |
using System.Collections.Specialized; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
using System.IO; | |
[assembly: AssemblyTitle("CleanWordHtml")] | |
[assembly: AssemblyDescription("Cleans up HTML generated by Microsoft Word")] | |
[assembly: AssemblyVersion("1.0.1.*")] | |
// original code by Jeff Atwood, backported to .NET 1.1 and turned into a command line tool. http://www.codinghorror.com/blog/2006/01/cleaning-words-nasty-html.html | |
/// <summary>
/// Command line tool that strips the markup clutter Microsoft Word adds when it
/// saves a document as HTML. The cleaned text is written next to the input file
/// as "name.modified.ext".
/// </summary>
/// <remarks>
/// Deliberately limited to APIs that exist in .NET 1.1 (no generics, no
/// String.IsNullOrEmpty, no File.ReadAllText), hence the small hand-written helpers.
/// </remarks>
public class CleanUp
{
    // Option flags: written by Main while parsing the command line, read by CleanWordHtml.
    static bool Mso = false;         // -mso: only strip class attributes whose value starts with "Mso"
    static bool IgnoreSpans = false; // -ignorespans: keep <span> tags
    static bool IgnoreDivs = false;  // -ignoredivs: keep <div> tags

    /// <summary>
    /// Entry point: parses the arguments, cleans the given file and saves the result.
    /// Prints the usage text when no usable file path was supplied.
    /// </summary>
    /// <param name="args">
    /// Either a single file path, or any mix of "-path &lt;file&gt;", "-mso",
    /// "-ignorespans" and "-ignoredivs" (option names are case-insensitive).
    /// </param>
    static void Main(string[] args)
    {
        string help = "Cleans up HTML generated by Microsoft Word" + Environment.NewLine + Environment.NewLine
            + "Usage:" + Environment.NewLine
            + "------" + Environment.NewLine
            + "CleanWordHtml \"path to file\"" + Environment.NewLine
            + "CleanWordHtml -path \"path to file\"" + Environment.NewLine + Environment.NewLine
            + " Other options:" + Environment.NewLine
            + " -mso (remove only classes generated by word)" + Environment.NewLine
            + " -ignorespans (don't remove span tags)" + Environment.NewLine
            + " -ignoredivs (don't remove div tags)";

        if (args.Length == 0 || IsNullOrEmpty(args[0]))
        {
            Console.WriteLine(help);
            return;
        }

        string filepath = string.Empty;
        if (args.Length == 1)
        {
            // A lone argument is always treated as the file to clean.
            filepath = args[0];
        }
        else
        {
            for (int i = 0; i < args.Length; i++)
            {
                if (IsOption(args[i], "-path"))
                {
                    // "-path" as the very last argument has no value; leave filepath
                    // untouched so the usage text is shown instead of crashing.
                    if (i + 1 < args.Length)
                    {
                        filepath = args[i + 1];
                        i++; // the value belongs to -path, do not parse it as an option
                    }
                }
                else if (IsOption(args[i], "-mso"))
                {
                    Mso = true;
                }
                else if (IsOption(args[i], "-ignorespans"))
                {
                    IgnoreSpans = true;
                }
                else if (IsOption(args[i], "-ignoredivs"))
                {
                    IgnoreDivs = true;
                }
            }
        }

        if (IsNullOrEmpty(filepath))
        {
            Console.WriteLine(help);
            return;
        }

        // A bare file name is resolved against the current directory.
        if (Path.GetFileName(filepath) == filepath)
        {
            filepath = Path.Combine(Environment.CurrentDirectory, filepath);
        }

        if (!File.Exists(filepath))
        {
            Console.WriteLine("File '" + filepath + "' doesn't exist.");
            return;
        }

        string html = ReadAllText(filepath);
        Console.WriteLine("Input html is " + html.Length + " chars");

        html = CleanWordHtml(html);
        html = FixEntities(html);

        // Never overwrite the input: "doc.html" is saved as "doc.modified.html".
        filepath = Path.Combine(
            Path.GetDirectoryName(filepath),
            Path.GetFileNameWithoutExtension(filepath) + ".modified" + Path.GetExtension(filepath));
        WriteAllText(filepath, html);
        Console.WriteLine("Cleaned html is " + html.Length + " chars. Saved to " + filepath);
    }

    /// <summary>
    /// Compares a command line argument with an option name, ignoring case.
    /// Uses the invariant culture so the result does not depend on the user's
    /// locale (e.g. Turkish casing of 'I').
    /// </summary>
    static bool IsOption(string arg, string name)
    {
        return String.Compare(arg, name, true, System.Globalization.CultureInfo.InvariantCulture) == 0;
    }

    /// <summary>
    /// Removes Word-specific markup from <paramref name="html"/> by applying a fixed
    /// sequence of case-insensitive regular expressions, then collapses runs of
    /// line breaks and spaces.
    /// </summary>
    /// <param name="html">The HTML text to clean.</param>
    /// <returns>The cleaned HTML text.</returns>
    static string CleanWordHtml(string html)
    {
        // Patterns whose matches are deleted outright, applied in insertion order.
        StringCollection sc = new StringCollection();
        if (!IgnoreSpans)
        {
            sc.Add(@"<(/?span|!\[)[^>]*?>");
        }
        if (!IgnoreDivs)
        {
            sc.Add(@"<(/?div|!\[)[^>]*?>");
        }
        if (!Mso)
        {
            // Get rid of classes. The value is consumed as a whole (double-quoted,
            // single-quoted or bare) so class="a b" does not leave ' b"' behind.
            sc.Add(@"\s?class=(""[^""]*""|'[^']*'|[^\s>]+)");
        }
        else
        {
            // Get rid of office classes. Only attributes consisting of a single
            // Mso* class are removed; anything else is left intact rather than
            // being cut in half.
            sc.Add(@"\s?class=(""Mso\w+""|'Mso\w+'|Mso\w+)");
        }
        // get rid of unnecessary tag spans (comments and title)
        sc.Add(@"<!--(\w|\W)+?-->");
        sc.Add(@"<title>(\w|\W)+?</title>");
        // get rid of inline style; the whole value is consumed so that
        // style="margin:0" does not leave ':0"' inside the tag
        sc.Add(@"\s?style=(""[^""]*""|'[^']*'|[^\s>]+)");
        // Get rid of unnecessary tags
        sc.Add(@"<(meta|link|/?o:|/?style|/?font|/?st\d|/?head|/?html|body|/?body|!\[)[^>]*?>");
        // Get rid of empty tags (except table cells)
        // NOTE(review): [^(th|d)>] is a character class, not an alternation: it skips
        // every tag containing t, h, d, '(', ')' or '|' after its first character
        // (e.g. <strong></strong>) and pairs any opening tag with any closing tag.
        // Left as is because tightening it changes which tags get removed - confirm
        // the intended set before rewriting.
        sc.Add(@"(<[^/][^(th|d)>]*>){1}( )*(</[^>]+>){1}");
        // remove bizarre v: element attached to <img> tag
        sc.Add(@"\s+v:\w+=""[^""]+""");

        foreach (string s in sc)
        {
            html = Regex.Replace(html, s, "", RegexOptions.IgnoreCase);
        }

        // remove extra lines: collapse blank lines to one line break instead of
        // deleting the break altogether, which would glue adjacent lines together
        html = Regex.Replace(
            html,
            "(" + Regex.Escape(Environment.NewLine) + "){2,}",
            Environment.NewLine,
            RegexOptions.IgnoreCase);
        // remove extra spaces: keep one space so "one.  two" does not become "one.two"
        html = Regex.Replace(html, @"( ){2,}", " ", RegexOptions.IgnoreCase);

        // quote unquoted attributes
        //html = Regex.Replace(html, @"(\w+=)(\w+)(?=[ >])", @"$1""$2""", RegexOptions.IgnoreCase);
        return html;
    }

    /// <summary>
    /// Replaces typographic quotes and the em dash with their named HTML entities.
    /// </summary>
    /// <param name="html">The HTML text to process.</param>
    /// <returns>The text with the three characters replaced by entities.</returns>
    static string FixEntities(string html)
    {
        // Keys are written as escapes so the mapping survives any re-encoding of
        // this source file.
        NameValueCollection nvc = new NameValueCollection();
        nvc.Add("\u201C", "&ldquo;"); // left double quotation mark
        nvc.Add("\u201D", "&rdquo;"); // right double quotation mark
        nvc.Add("\u2014", "&mdash;"); // em dash
        foreach (string key in nvc.Keys)
        {
            html = html.Replace(key, nvc[key]);
        }
        return html;
    }

    /// <summary>
    /// Returns true when <paramref name="value"/> is null or has zero length.
    /// </summary>
    static bool IsNullOrEmpty(string value)
    {
        return value == null || value.Length == 0;
    }

    /// <summary>
    /// Reads a text file line by line and returns its content with every line
    /// terminated by <see cref="Environment.NewLine"/> (so the result always ends
    /// with a line break and line endings are normalised).
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>The file content.</returns>
    static string ReadAllText(string path)
    {
        StringBuilder sb = new StringBuilder();
        // NOTE(review): StreamReader(path) uses its default encoding detection; Word
        // may save HTML in a legacy code page - confirm the input is read as intended.
        using (StreamReader sr = new StreamReader(path))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                sb.Append(line).Append(Environment.NewLine);
            }
        }
        return sb.ToString();
    }

    /// <summary>
    /// Writes <paramref name="contents"/> to <paramref name="path"/> as UTF-8 without
    /// a byte order mark, replacing any existing file.
    /// </summary>
    static void WriteAllText(string path, string contents)
    {
        WriteAllText(path, contents, new UTF8Encoding(false, true));
    }

    /// <summary>
    /// Writes <paramref name="contents"/> to <paramref name="path"/> using the given
    /// encoding, replacing any existing file.
    /// </summary>
    static void WriteAllText(string path, string contents, Encoding encoding)
    {
        // append: false - an existing file is overwritten
        using (StreamWriter sw = new StreamWriter(path, false, encoding))
        {
            sw.Write(contents);
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment