Created
July 11, 2013 00:32
-
-
Save plioi/5971498 to your computer and use it in GitHub Desktop.
WordPress's automatic <p> tag insertion meat grinder, ported to C#.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Converts text containing line breaks into HTML paragraphs, in the manner of WordPress's
/// automatic paragraph filter: chunks separated by blank lines are wrapped in &lt;p&gt;..&lt;/p&gt;,
/// remaining single newlines become &lt;br /&gt; tags, and block-level tags are left unwrapped.
/// </summary>
/// <param name="body">The raw text/HTML to process. May be null.</param>
/// <returns>
/// The processed markup, or an empty string when <paramref name="body"/> is null or
/// consists only of whitespace.
/// </returns>
static string linebreaks_wp(string body)
{
    // Null is treated like empty input instead of throwing on body.Trim().
    if (string.IsNullOrWhiteSpace(body))
        return "";

    //Ensure all newlines are simply \n and that we end with a \n
    body = body.Replace("\r\n", "\n")
               .Replace("\r", "\n");
    body = body + "\n";

    //Convert br-pairs into \n\n.
    body = Regex.Replace(body, @"<br />\s*<br />", "\n\n");

    const string allblocks = @"(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)";

    //Ensure that all 'block' open tags appear at the start of a line.
    body = Regex.Replace(body, @"(<" + allblocks + "[^>]*>)", "\n$1");

    //Ensure that all '/block' close tags are followed by a blank line.
    body = Regex.Replace(body, @"(</" + allblocks + ">)", "$1\n\n");

    if (body.Contains("<object"))
    {
        // No paragraph breaks inside object/embed: glue <param> and </embed> to their neighbours.
        body = Regex.Replace(body, @"\s*<param([^>]*)>\s*", "<param$1>");
        body = Regex.Replace(body, @"\s*</embed>\s*", "</embed>");
    }

    //Shrink long \n\n\n\n\n chains down to a single blank line, \n\n.
    body = Regex.Replace(body, @"\n\n+", "\n\n");

    //Split the whole body by blank lines. Regex.Split keeps empty strings when a separator sits
    //at the very start or end of the input, so drop them here (the equivalent of PHP's
    //PREG_SPLIT_NO_EMPTY). Otherwise an empty chunk turns into "<p></p>\n", whose leftover "\n"
    //survives as a stray leading newline in the result.
    var chunks = Regex.Split(body, @"\n\s*\n")
                      .Where(chunk => chunk.Length > 0);

    //Optimistically surround each chunk in a <p>..</p>.
    body = String.Join("", chunks.Select(chunk => "<p>" + chunk.Trim('\n') + "</p>\n"));

    //Clean away all-whitespace paragraphs.
    body = Regex.Replace(body, @"<p>\s*</p>", "");

    // For a <p> that precedes [no tags] followed by a closing div/address/form tag,
    // close the paragraph before closing the div/address/form.
    body = Regex.Replace(body, @"<p>([^<]+)</(div|address|form)>", "<p>$1</p></$2>");

    // Remove the optimistic <p>..</p> around block tags, like <p><h2>text</h2></p> => <h2>text</h2>.
    body = Regex.Replace(body, @"<p>\s*(</?" + allblocks + @"[^>]*>)\s*</p>", "$1");

    // Unwrap paragraphs that begin with a list item (works around a problem with nested lists).
    body = Regex.Replace(body, @"<p>(<li.+?)</p>", "$1");

    //Optimistic <p>..</p> around a <blockquote /> should become <blockquote><p>..</p></blockquote>.
    body = Regex.Replace(body, @"<p><blockquote([^>]*)>", "<blockquote$1><p>", RegexOptions.IgnoreCase);
    body = body.Replace("</blockquote></p>", "</p></blockquote>");

    //Strip the optimistic <p> or </p> from bare block tags like <p><block> or </block></p>.
    body = Regex.Replace(body, @"<p>\s*(</?" + allblocks + @"[^>]*>)", "$1");
    body = Regex.Replace(body, @"(</?" + allblocks + @"[^>]*>)\s*</p>", "$1");

    // Attempt to preserve \n found in script and style tags by swapping them for a placeholder
    // that the br-insertion step below does not touch.
    body = Regex.Replace(body, @"<(script|style).*?</\1>",
        m => m.Groups[0].Value.Replace("\n", "<WPPreserveNewline />"), RegexOptions.Singleline);

    // Convert [not-br-tag] [whitespace] [\n] into [br-tag] [\n]
    // iow, introduce a br tag for any \n that doesn't already have a br tag.
    body = Regex.Replace(body, @"(?<!<br />)\s*\n", "<br />\n");

    //Restore the newlines that were protected by the placeholder above.
    body = body.Replace("<WPPreserveNewline />", "\n");

    //Strip any whitespace-and-br-tag that followed a <block> opener or </block> closer.
    body = Regex.Replace(body, @"(</?" + allblocks + @"[^>]*>)\s*<br />", "$1");

    //Strip any br tag that precedes one of these special tags, since they already have their own linebreak behavior.
    body = Regex.Replace(body, @"<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)", "$1");

    if (body.Contains("<pre"))
    {
        // The leading inline (?is) switches on case-insensitive, single-line matching for the
        // whole pattern, so '.' also spans the newlines inside a <pre> element.
        body = Regex.Replace(body, "(?is)(<pre[^>]*>)(.*?)</pre>", clean_pre);
    }

    // Pull a newline that sits directly before the final </p> out of the result.
    body = Regex.Replace(body, @"\n</p>$", "</p>");

    return body;
}
/// <summary>
/// Match evaluator that removes paragraph and line-break markup from a matched &lt;pre&gt; element.
/// When capture groups 1 (the opening tag) and 2 (the inner text) both succeeded, the inner text
/// is cleaned, HTML-encoded and re-wrapped between the opening tag and "&lt;/pre&gt;"; otherwise
/// the whole match is cleaned and returned without encoding.
/// </summary>
/// <param name="m">The regex match to rewrite.</param>
/// <returns>The replacement text for the match.</returns>
private static string clean_pre(Match m)
{
    Group openingTag = m.Groups[1];
    Group innerText = m.Groups[2];
    bool hasTagAndContent = openingTag.Success && innerText.Success;

    // Both paths apply the same three substitutions; only the text they start from differs.
    string source = hasTagAndContent ? innerText.Value : m.Groups[0].Value;
    string cleaned = source.Replace("<br />", "")
                           .Replace("<p>", "\n")
                           .Replace("</p>", "");

    return hasTagAndContent
        ? openingTag.Value + HttpUtility.HtmlEncode(cleaned) + "</pre>"
        : cleaned;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment