bensaufley · October 16, 2015 10:24
diff --git a/wpautop.rb b/wpautop.rb
 ###
 # Replaces double line-breaks with paragraph elements.
 #
 # Runs a fixed sequence of regex substitutions over newline-formatted text:
 # runs separated by a blank line are wrapped in <p>...</p>, and (when +br+
 # is truthy) the remaining single line-breaks become <br /> tags.
 #
 # Additional behaviour visible in the implementation:
 # * <pre>...</pre> sections are swapped for placeholder tags before any
 #   processing and restored verbatim at the very end.
 # * "--" between two non-space characters becomes an em dash; "-" or "--"
 #   with a single whitespace character on each side becomes an en dash.
 # * Single-line text runs sitting between tags are passed through
 #   HTMLEntities#encode with the :decimal instruction. The +htmlentities+
 #   gem is required lazily, on the first non-blank call.
 # * Newlines inside <script>/<style> elements are not converted to <br />.
 #
 # @since 0.71
 #
 # @param pee [String] The text which has to be formatted.
 # @param br [Boolean] Optional. When truthy, remaining line-breaks are
 #   converted to <br /> after paragraphing. Default true. Only +false+ and
 #   +nil+ disable the conversion.
 # @return [String, nil] The converted text, or nil when +pee+ contains
 #   nothing but whitespace.
 ###
 def wpautop(pee, br = true)
   return if pee.strip.empty?

   # Placeholder tag => original "<pre ...>...</pre>" markup. Keys are
   # strings, so this has to be a Hash.
   pre_tags = {}

   pee = "#{pee}\n" # just to make things a little easier, pad the end

   if pee.include?('<pre')
     pee_parts = pee.split('</pre>')
     last_pee = pee_parts.pop
     pee = ''

     pee_parts.each_with_index do |pee_part, i|
       start = pee_part.index('<pre')

       # Malformed html? Keep the fragment as-is.
       if start.nil?
         pee += pee_part
         next
       end

       name = "<pre wp-pre-tag-#{i}></pre>"
       # split consumed the closing tag, so put it back on the saved block.
       pre_tags[name] = "#{pee_part[start..-1]}</pre>"

       # Exclusive range: everything strictly before "<pre".
       pee += pee_part[0...start] + name
     end

     pee += last_pee
   end

   # Extra text formatting
   require 'htmlentities'
   encoder = HTMLEntities.new(:expanded)
   pee.gsub!(/(?<=[^\s\n])((?<=\s)---?(?=\s)|--)(?=[^\s\n])/, "\u2014") # em dash
   pee.gsub!(/(?<=[^\s\n]\s)--?(?=\s[^\s\n])/, "\u2013") # en dash
   # Encode text runs only. The (?!<) guard stops a run from starting on a tag,
   # which would otherwise entity-encode markup at the start of the string or
   # directly after another tag.
   pee.gsub!(/(?<=\A|>)(?!<)(.+?)(?=\z|<)/) { |text| encoder.encode(text, :decimal) }

   pee.gsub!(%r{<br />\s*<br />}, "\n\n")
   # Space things out a little
   all_blocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
   pee.gsub!(/(<#{all_blocks}[^>]*>)/, "\n\\1")
   pee.gsub!(%r{(</#{all_blocks}>)}, "\\1\n\n")

   if pee.include?('<option')
     # no P/BR around option
     pee.gsub!(/\s*<option/, '<option')
     pee.gsub!(%r{</option>\s*}, '</option>')
   end

   if pee.include?('</object>')
     # no P/BR around param
     # The closing ">" is part of the match so the whitespace after the opening
     # tag is actually removed.
     pee.gsub!(/(<object[^>]*>)\s*/, '\1')
     pee.gsub!(%r{\s*</object>}, '</object>')
     pee.gsub!(%r{\s*(</?(?:param|embed)[^>]*>)\s*}, '\1')
   end

   if pee.include?('<source') || pee.include?('<track')
     # no P/BR around source and track
     pee.gsub!(/([<\[](?:audio|video)[^>\]]*[>\]])\s*/, '\1')
     pee.gsub!(%r{\s*([<\[]/(?:audio|video)[>\]])}, '\1')
     pee.gsub!(/\s*(<(?:source|track)[^>]*>)\s*/, '\1')
   end

   pee.gsub!(/\n\n+/, "\n\n") # take care of duplicates
   # make paragraphs, including one at the end
   pees = pee.split(/\n\s*\n/)
   pee = ''

   pees.each do |tinkle|
     pee += "<p>#{tinkle.gsub(/\A\n*(.+?)\n*\z/m, '\1')}</p>\n"
   end

   # Regexp literals are used below so that \s is the whitespace class; inside
   # a double-quoted String "\s" is just a space character.
   pee.gsub!(%r{<p>\s*</p>}, '') # under certain strange conditions it could create a P of entirely whitespace
   pee.gsub!(%r{<p>([^<]+)</(div|address|form)>}, '<p>\1</p></\2>')
   pee.gsub!(%r{<p>\s*(</?#{all_blocks}[^>]*>)\s*</p>}, '\1') # don't pee all over a tag
   pee.gsub!(%r{<p>(<li.+?)</p>}, '\1') # problem with nested lists
   pee.gsub!(/<p><blockquote([^>]*)>/i, '<blockquote\1><p>')
   pee.gsub!('</blockquote></p>', '</p></blockquote>')
   pee.gsub!(%r{<p>\s*(</?#{all_blocks}[^>]*>)}, '\1')
   pee.gsub!(%r{(</?#{all_blocks}[^>]*>)\s*</p>}, '\1')

   if br
     # Shield newlines inside <script>/<style> from the <br /> conversion.
     # /m lets ".*?" span lines and \1 is a real back-reference to the tag name.
     pee.gsub!(%r{<(script|style).*?</\1>}m) do |element|
       element.gsub("\n", '<WPPreserveNewline />')
     end
     pee.gsub!(%r{(?<!<br />)\s*\n}, "<br />\n")
     pee.gsub!('<WPPreserveNewline />', "\n")
   end

   pee.gsub!(%r{(</?#{all_blocks}[^>]*>)\s*<br />}, '\1')
   pee.gsub!(%r{<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)}, '\1')
   pee.gsub!(/\n<\/p>$/, '</p>')

   # Block form: the saved markup is inserted literally, so backslashes in the
   # <pre> content are not read as back-references.
   pre_tags.each do |placeholder, markup|
     pee.gsub!(placeholder) { markup }
   end

   pee
 end
	###
	# Replaces double line-breaks with paragraph elements.
	#
	# Runs a fixed sequence of regex substitutions over newline-formatted text:
	# runs separated by a blank line are wrapped in <p>...</p>, and (when +br+
	# is truthy) the remaining single line-breaks become <br /> tags.
	#
	# Additional behaviour visible in the implementation:
	# * <pre>...</pre> sections are swapped for placeholder tags before any
	#   processing and restored verbatim at the very end.
	# * "--" between two non-space characters becomes an em dash; "-" or "--"
	#   with a single whitespace character on each side becomes an en dash.
	# * Single-line text runs sitting between tags are passed through
	#   HTMLEntities#encode with the :decimal instruction. The +htmlentities+
	#   gem is required lazily, on the first non-blank call.
	# * Newlines inside <script>/<style> elements are not converted to <br />.
	#
	# @since 0.71
	#
	# @param pee [String] The text which has to be formatted.
	# @param br [Boolean] Optional. When truthy, remaining line-breaks are
	#   converted to <br /> after paragraphing. Default true. Only +false+ and
	#   +nil+ disable the conversion.
	# @return [String, nil] The converted text, or nil when +pee+ contains
	#   nothing but whitespace.
	###
	def wpautop(pee, br = true)
	  return if pee.strip.empty?

	  # Placeholder tag => original "<pre ...>...</pre>" markup. Keys are
	  # strings, so this has to be a Hash.
	  pre_tags = {}

	  pee = "#{pee}\n" # just to make things a little easier, pad the end

	  if pee.include?('<pre')
	    pee_parts = pee.split('</pre>')
	    last_pee = pee_parts.pop
	    pee = ''

	    pee_parts.each_with_index do |pee_part, i|
	      start = pee_part.index('<pre')

	      # Malformed html? Keep the fragment as-is.
	      if start.nil?
	        pee += pee_part
	        next
	      end

	      name = "<pre wp-pre-tag-#{i}></pre>"
	      # split consumed the closing tag, so put it back on the saved block.
	      pre_tags[name] = "#{pee_part[start..-1]}</pre>"

	      # Exclusive range: everything strictly before "<pre".
	      pee += pee_part[0...start] + name
	    end

	    pee += last_pee
	  end

	  # Extra text formatting
	  require 'htmlentities'
	  encoder = HTMLEntities.new(:expanded)
	  pee.gsub!(/(?<=[^\s\n])((?<=\s)---?(?=\s)|--)(?=[^\s\n])/, "\u2014") # em dash
	  pee.gsub!(/(?<=[^\s\n]\s)--?(?=\s[^\s\n])/, "\u2013") # en dash
	  # Encode text runs only. The (?!<) guard stops a run from starting on a tag,
	  # which would otherwise entity-encode markup at the start of the string or
	  # directly after another tag.
	  pee.gsub!(/(?<=\A|>)(?!<)(.+?)(?=\z|<)/) { |text| encoder.encode(text, :decimal) }

	  pee.gsub!(%r{<br />\s*<br />}, "\n\n")
	  # Space things out a little
	  all_blocks = '(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|form|map|area|blockquote|address|math|style|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
	  pee.gsub!(/(<#{all_blocks}[^>]*>)/, "\n\\1")
	  pee.gsub!(%r{(</#{all_blocks}>)}, "\\1\n\n")

	  if pee.include?('<option')
	    # no P/BR around option
	    pee.gsub!(/\s*<option/, '<option')
	    pee.gsub!(%r{</option>\s*}, '</option>')
	  end

	  if pee.include?('</object>')
	    # no P/BR around param
	    # The closing ">" is part of the match so the whitespace after the opening
	    # tag is actually removed.
	    pee.gsub!(/(<object[^>]*>)\s*/, '\1')
	    pee.gsub!(%r{\s*</object>}, '</object>')
	    pee.gsub!(%r{\s*(</?(?:param|embed)[^>]*>)\s*}, '\1')
	  end

	  if pee.include?('<source') || pee.include?('<track')
	    # no P/BR around source and track
	    pee.gsub!(/([<\[](?:audio|video)[^>\]]*[>\]])\s*/, '\1')
	    pee.gsub!(%r{\s*([<\[]/(?:audio|video)[>\]])}, '\1')
	    pee.gsub!(/\s*(<(?:source|track)[^>]*>)\s*/, '\1')
	  end

	  pee.gsub!(/\n\n+/, "\n\n") # take care of duplicates
	  # make paragraphs, including one at the end
	  pees = pee.split(/\n\s*\n/)
	  pee = ''

	  pees.each do |tinkle|
	    pee += "<p>#{tinkle.gsub(/\A\n*(.+?)\n*\z/m, '\1')}</p>\n"
	  end

	  # Regexp literals are used below so that \s is the whitespace class; inside
	  # a double-quoted String "\s" is just a space character.
	  pee.gsub!(%r{<p>\s*</p>}, '') # under certain strange conditions it could create a P of entirely whitespace
	  pee.gsub!(%r{<p>([^<]+)</(div|address|form)>}, '<p>\1</p></\2>')
	  pee.gsub!(%r{<p>\s*(</?#{all_blocks}[^>]*>)\s*</p>}, '\1') # don't pee all over a tag
	  pee.gsub!(%r{<p>(<li.+?)</p>}, '\1') # problem with nested lists
	  pee.gsub!(/<p><blockquote([^>]*)>/i, '<blockquote\1><p>')
	  pee.gsub!('</blockquote></p>', '</p></blockquote>')
	  pee.gsub!(%r{<p>\s*(</?#{all_blocks}[^>]*>)}, '\1')
	  pee.gsub!(%r{(</?#{all_blocks}[^>]*>)\s*</p>}, '\1')

	  if br
	    # Shield newlines inside <script>/<style> from the <br /> conversion.
	    # /m lets ".*?" span lines and \1 is a real back-reference to the tag name.
	    pee.gsub!(%r{<(script|style).*?</\1>}m) do |element|
	      element.gsub("\n", '<WPPreserveNewline />')
	    end
	    pee.gsub!(%r{(?<!<br />)\s*\n}, "<br />\n")
	    pee.gsub!('<WPPreserveNewline />', "\n")
	  end

	  pee.gsub!(%r{(</?#{all_blocks}[^>]*>)\s*<br />}, '\1')
	  pee.gsub!(%r{<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)}, '\1')
	  pee.gsub!(/\n<\/p>$/, '</p>')

	  # Block form: the saved markup is inserted literally, so backslashes in the
	  # <pre> content are not read as back-references.
	  pre_tags.each do |placeholder, markup|
	    pee.gsub!(placeholder) { markup }
	  end

	  pee
	end