plioi · July 11, 2013 00:32
diff --git a/linebreaks_wp.cs b/linebreaks_wp.cs
        /// <summary>
        /// Converts plain-text line breaks in an HTML fragment into markup: chunks separated
        /// by blank lines are wrapped in <c>&lt;p&gt;</c> elements and the remaining single
        /// newlines become <c>&lt;br /&gt;</c>, while block-level tags are kept out of paragraphs.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the comments reference PHP's PREG_SPLIT_NO_EMPTY and a WordPress-style
        /// newline placeholder, so this looks like a port of WordPress's wpautop - confirm.
        /// The regex passes are order-dependent; do not reorder them.
        /// </remarks>
        /// <param name="body">The text/HTML fragment to format. Null is treated as blank.</param>
        /// <returns>
        /// The formatted markup, or an empty string when <paramref name="body"/> is null or
        /// contains only whitespace.
        /// </returns>
        static string linebreaks_wp(string body)
        {
            // Null is handled like blank input instead of throwing on body.Trim().
            if (string.IsNullOrWhiteSpace(body))
            {
                return "";
            }

            // Ensure all newlines are simply \n and that we end with a \n.
            body = body.Replace("\r\n", "\n")
                       .Replace("\r", "\n");
            body = body + "\n";

            // Convert br-pairs into \n\n (a paragraph break).
            body = Regex.Replace(body, @"<br />\s*<br />", "\n\n");

            const string allblocks = @"(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)";

            // Ensure that all 'block' open tags appear at the start of a line.
            body = Regex.Replace(body, @"(<" + allblocks + "[^>]*>)", m => "\n" + m.Groups[1].Value);

            // Ensure that all '/block' close tags are followed by a blank line.
            body = Regex.Replace(body, @"(</" + allblocks + ">)", m => m.Groups[1].Value + "\n\n");

            if (body.Contains("<object"))
            {
                // Remove whitespace around <param ...> and </embed> so that no paragraph or
                // br markup gets generated inside object/embed.
                body = Regex.Replace(body, @"\s*<param([^>]*)>\s*", m => "<param" + m.Groups[1].Value + ">");
                body = Regex.Replace(body, @"\s*</embed>\s*", "</embed>");
            }

            // Shrink long \n\n\n\n\n chains down to a single blank line, \n\n.
            body = Regex.Replace(body, @"\n\n+", "\n\n");

            // Split the whole body by blank lines. Regex.Split keeps empty entries (PHP's
            // PREG_SPLIT_NO_EMPTY does not); a leading empty chunk would otherwise survive
            // as a bare \n and be turned into a stray <br /> further down, so drop them here.
            var chunks = Regex.Split(body, @"\n\s*\n").Where(chunk => chunk.Length > 0);

            // Optimistically surround each chunk in a <p>..</p>.
            body = String.Join("", chunks.Select(chunk => "<p>" + chunk.Trim('\n') + "</p>\n"));

            // Clean away all-whitespace paragraphs.
            body = Regex.Replace(body, @"<p>\s*</p>", "");

            // For a <p> that precedes [no tags] followed by a closing div/address/form tag,
            // close the paragraph before closing the div/address/form.
            body = Regex.Replace(body, @"<p>([^<]+)</(div|address|form)>",
                                 m => "<p>" + m.Groups[1].Value + "</p></" + m.Groups[2].Value + ">");

            // Remove the optimistic <p>..</p> around a lone block tag, like <p><hr /></p> => <hr />.
            body = Regex.Replace(body, @"<p>\s*(</?" + allblocks + @"[^>]*>)\s*</p>", m => m.Groups[1].Value);

            // Unwrap paragraphs whose content starts with a list item (the original note
            // only says "problem with nested lists").
            body = Regex.Replace(body, @"<p>(<li.+?)</p>", m => m.Groups[1].Value);

            // Optimistic <p>..</p> around a <blockquote> should become <blockquote><p>..</p></blockquote>.
            body = Regex.Replace(body, @"<p><blockquote([^>]*)>", m => "<blockquote" + m.Groups[1].Value + "><p>", RegexOptions.IgnoreCase);
            body = body.Replace("</blockquote></p>", "</p></blockquote>");

            // Strip the optimistic <p> before, and </p> after, any bare block tag,
            // i.e. <p><block> => <block> and </block></p> => </block>.
            body = Regex.Replace(body, @"<p>\s*(</?" + allblocks + @"[^>]*>)", m => m.Groups[1].Value);
            body = Regex.Replace(body, @"(</?" + allblocks + @"[^>]*>)\s*</p>", m => m.Groups[1].Value);

            // Protect the \n found inside script and style elements with a placeholder so the
            // br pass below leaves them alone.
            body = Regex.Replace(body, @"<(script|style).*?</\1>", m => m.Groups[0].Value.Replace("\n", "<WPPreserveNewline />"), RegexOptions.Singleline);

            // Convert [not-br-tag] [whitespace] [\n] into [br-tag] [\n]
            //  iow, introduce a br tag for any \n that doesn't already have a br tag.
            body = Regex.Replace(body, @"(?<!<br />)\s*\n", "<br />\n");

            // Restore the newlines that were protected by the placeholder above.
            body = body.Replace("<WPPreserveNewline />", "\n");

            // Strip any whitespace-and-br-tag that followed a <block> opener or </block> closer.
            body = Regex.Replace(body, @"(</?" + allblocks + @"[^>]*>)\s*<br />", m => m.Groups[1].Value);

            // Strip any br tag that precedes one of these special tags, since they already have their own linebreak behavior.
            body = Regex.Replace(body, @"<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)", m => m.Groups[1].Value);

            if (body.Contains("<pre"))
            {
                // The leading (?is) is honored by .NET as inline options: case-insensitive
                // matching, and '.' also matches \n so multi-line <pre> blocks are captured.
                body = Regex.Replace(body, "(?is)(<pre[^>]*>)(.*?)</pre>", clean_pre);
            }

            // Drop a newline that sits directly before the final closing </p>.
            body = Regex.Replace(body, @"\n</p>$", "</p>");
            return body;
        }

        /// <summary>
        /// Match evaluator that undoes paragraph/line-break markup inside a matched span:
        /// removes every <c>&lt;br /&gt;</c>, turns <c>&lt;p&gt;</c> into a newline and drops
        /// <c>&lt;/p&gt;</c>.
        /// </summary>
        /// <param name="m">
        /// The match to clean. When capture groups 1 and 2 both succeeded, group 1 is emitted
        /// verbatim, group 2 is cleaned and HTML-encoded, and a closing <c>&lt;/pre&gt;</c> is
        /// appended; otherwise the whole match is cleaned and returned without encoding.
        /// </param>
        /// <returns>The cleaned replacement text.</returns>
        private static string clean_pre(Match m)
        {
            var openingTag = m.Groups[1];
            var innerContent = m.Groups[2];
            bool hasBothGroups = openingTag.Success && innerContent.Success;

            // Both paths apply the same three replacements, in the same order; they only
            // differ in which text is cleaned and in how the result is wrapped.
            string cleaned = (hasBothGroups ? innerContent.Value : m.Groups[0].Value)
                .Replace("<br />", "")
                .Replace("<p>", "\n")
                .Replace("</p>", "");

            if (!hasBothGroups)
            {
                return cleaned;
            }

            return openingTag.Value + HttpUtility.HtmlEncode(cleaned) + "</pre>";
        }
	/// <summary>
	/// Converts plain-text line breaks in an HTML fragment into markup: chunks separated
	/// by blank lines are wrapped in <c>&lt;p&gt;</c> elements and the remaining single
	/// newlines become <c>&lt;br /&gt;</c>, while block-level tags are kept out of paragraphs.
	/// </summary>
	/// <remarks>
	/// NOTE(review): the comments reference PHP's PREG_SPLIT_NO_EMPTY and a WordPress-style
	/// newline placeholder, so this looks like a port of WordPress's wpautop - confirm.
	/// The regex passes are order-dependent; do not reorder them.
	/// </remarks>
	/// <param name="body">The text/HTML fragment to format. Null is treated as blank.</param>
	/// <returns>
	/// The formatted markup, or an empty string when <paramref name="body"/> is null or
	/// contains only whitespace.
	/// </returns>
	static string linebreaks_wp(string body)
	{
		// Null is handled like blank input instead of throwing on body.Trim().
		if (string.IsNullOrWhiteSpace(body))
		{
			return "";
		}

		// Ensure all newlines are simply \n and that we end with a \n.
		body = body.Replace("\r\n", "\n")
			.Replace("\r", "\n");
		body = body + "\n";

		// Convert br-pairs into \n\n (a paragraph break).
		body = Regex.Replace(body, @"<br />\s*<br />", "\n\n");

		// Alternatives must be separated by a bare '|': inside a verbatim string "\|" is a
		// literal pipe character and would stop every tag name from matching.
		const string allblocks = @"(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)";

		// Ensure that all 'block' open tags appear at the start of a line.
		body = Regex.Replace(body, @"(<" + allblocks + "[^>]*>)", m => "\n" + m.Groups[1].Value);

		// Ensure that all '/block' close tags are followed by a blank line.
		body = Regex.Replace(body, @"(</" + allblocks + ">)", m => m.Groups[1].Value + "\n\n");

		if (body.Contains("<object"))
		{
			// Remove whitespace around <param ...> and </embed> so that no paragraph or
			// br markup gets generated inside object/embed.
			body = Regex.Replace(body, @"\s*<param([^>]*)>\s*", m => "<param" + m.Groups[1].Value + ">");
			body = Regex.Replace(body, @"\s*</embed>\s*", "</embed>");
		}

		// Shrink long \n\n\n\n\n chains down to a single blank line, \n\n.
		body = Regex.Replace(body, @"\n\n+", "\n\n");

		// Split the whole body by blank lines. Regex.Split keeps empty entries (PHP's
		// PREG_SPLIT_NO_EMPTY does not); a leading empty chunk would otherwise survive
		// as a bare \n and be turned into a stray <br /> further down, so drop them here.
		var chunks = Regex.Split(body, @"\n\s*\n").Where(chunk => chunk.Length > 0);

		// Optimistically surround each chunk in a <p>..</p>.
		body = String.Join("", chunks.Select(chunk => "<p>" + chunk.Trim('\n') + "</p>\n"));

		// Clean away all-whitespace paragraphs.
		body = Regex.Replace(body, @"<p>\s*</p>", "");

		// For a <p> that precedes [no tags] followed by a closing div/address/form tag,
		// close the paragraph before closing the div/address/form.
		body = Regex.Replace(body, @"<p>([^<]+)</(div|address|form)>",
			m => "<p>" + m.Groups[1].Value + "</p></" + m.Groups[2].Value + ">");

		// Remove the optimistic <p>..</p> around a lone block tag, like <p><hr /></p> => <hr />.
		body = Regex.Replace(body, @"<p>\s*(</?" + allblocks + @"[^>]*>)\s*</p>", m => m.Groups[1].Value);

		// Unwrap paragraphs whose content starts with a list item (the original note
		// only says "problem with nested lists").
		body = Regex.Replace(body, @"<p>(<li.+?)</p>", m => m.Groups[1].Value);

		// Optimistic <p>..</p> around a <blockquote> should become <blockquote><p>..</p></blockquote>.
		body = Regex.Replace(body, @"<p><blockquote([^>]*)>", m => "<blockquote" + m.Groups[1].Value + "><p>", RegexOptions.IgnoreCase);
		body = body.Replace("</blockquote></p>", "</p></blockquote>");

		// Strip the optimistic <p> before, and </p> after, any bare block tag,
		// i.e. <p><block> => <block> and </block></p> => </block>.
		body = Regex.Replace(body, @"<p>\s*(</?" + allblocks + @"[^>]*>)", m => m.Groups[1].Value);
		body = Regex.Replace(body, @"(</?" + allblocks + @"[^>]*>)\s*</p>", m => m.Groups[1].Value);

		// Protect the \n found inside script and style elements with a placeholder so the
		// br pass below leaves them alone.
		body = Regex.Replace(body, @"<(script|style).*?</\1>", m => m.Groups[0].Value.Replace("\n", "<WPPreserveNewline />"), RegexOptions.Singleline);

		// Convert [not-br-tag] [whitespace] [\n] into [br-tag] [\n]
		// iow, introduce a br tag for any \n that doesn't already have a br tag.
		body = Regex.Replace(body, @"(?<!<br />)\s*\n", "<br />\n");

		// Restore the newlines that were protected by the placeholder above.
		body = body.Replace("<WPPreserveNewline />", "\n");

		// Strip any whitespace-and-br-tag that followed a <block> opener or </block> closer.
		body = Regex.Replace(body, @"(</?" + allblocks + @"[^>]*>)\s*<br />", m => m.Groups[1].Value);

		// Strip any br tag that precedes one of these special tags, since they already have their own linebreak behavior.
		body = Regex.Replace(body, @"<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)", m => m.Groups[1].Value);

		if (body.Contains("<pre"))
		{
			// The leading (?is) is honored by .NET as inline options: case-insensitive
			// matching, and '.' also matches \n so multi-line <pre> blocks are captured.
			body = Regex.Replace(body, "(?is)(<pre[^>]*>)(.*?)</pre>", clean_pre);
		}

		// Drop a newline that sits directly before the final closing </p>.
		body = Regex.Replace(body, @"\n</p>$", "</p>");
		return body;
	}

	/// <summary>
	/// Match evaluator that undoes paragraph/line-break markup inside a matched span:
	/// removes every <c>&lt;br /&gt;</c>, turns <c>&lt;p&gt;</c> into a newline and drops
	/// <c>&lt;/p&gt;</c>.
	/// </summary>
	/// <param name="m">
	/// The match to clean. When capture groups 1 and 2 both succeeded, group 1 is emitted
	/// verbatim, group 2 is cleaned and HTML-encoded, and a closing <c>&lt;/pre&gt;</c> is
	/// appended; otherwise the whole match is cleaned and returned without encoding.
	/// </param>
	/// <returns>The cleaned replacement text.</returns>
	private static string clean_pre(Match m)
	{
		bool captured = m.Groups[1].Success && m.Groups[2].Success;

		// Pick the text to clean: the second capture when both groups matched,
		// the entire match otherwise.
		string raw = captured ? m.Groups[2].Value : m.Groups[0].Value;

		string flattened = raw.Replace("<br />", "");
		flattened = flattened.Replace("<p>", "\n");
		flattened = flattened.Replace("</p>", "");

		// Only the captured form is encoded and re-wrapped in its opening tag and </pre>.
		return captured
			? m.Groups[1].Value + HttpUtility.HtmlEncode(flattened) + "</pre>"
			: flattened;
	}