searls · January 28, 2025 11:47
diff --git a/identifies_pattern_ranges.rb b/identifies_pattern_ranges.rb
 # Finds every occurrence of a pattern inside a string and reports where each
 # match sits, measured both in Unicode code points ("char") and in
 # NFC-normalized grapheme clusters ("grapheme").
 class Platforms::TruncatesContent::IdentifiesPatternRanges
  # A "#" that is not preceded by a word character, followed by one letter and
  # then any run of letters, combining marks, decimal digits, or underscores.
  HASHTAG_PATTERN = /\B#\p{L}[\p{L}\p{M}\p{Nd}_]*/

  # Ripped outta this https://github.com/amogil/url_regex/blob/master/lib/url_regex.rb
  URL_PATTERN_STRING = '
    # scheme
    (?:(?:https?|ftp)://)?
    # user:pass authentication
    (?:\S+(?::\S*)?@)?

    (?:
      # IP address exclusion
      # private & local networks
      (?!(?:10|127)(?:\.\d{1,3}){3})
      (?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})
      (?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})
      # IP address dotted notation octets
      # excludes loopback network 0.0.0.0
      # excludes reserved space >= 224.0.0.0
      # excludes network & broadcast addresses
      # (first & last IP address of each class)
      (?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
      (?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}
      (?:\.(?:[0-9]\d?|1\d\d|2[0-4]\d|25[0-5]))
      |
      # host name
      (?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)
      # domain name
      (?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*
      # TLD identifier based on top 200 https://dnsinstitute.com/research/popular-tld-rank/
      (?:\.(?:com|net|ru|org|info|in|ir|uk|au|de|ua|ca|tr|co|jp|vn|cn|gr|fr|tk|tw|id|br|io|xyz|it|nl|pl|za|us|eu|mx|ch|biz|me|il|es|online|by|xn--p1ai|nz|kr|cz|ro|cf|ar|club|my|tv|kz|cl|pk|pro|site|th|se|sg|cc|be|rs|top|ga|ma|hu|ae|su|dk|hk|at|ml|shop|store|ng|np|no|app|live|pe|ph|ie|lk|gq|edu|fi|ai|sa|pw|tech|bd|sk|ke|pt|az|space|mk|ge|tn|lt|dev|to|gov|md|asia|lv|uz|hr|mn|website|am|ws|life|fun|news|mobi|vip|ee|bg|la|ec|blog|cloud|si|work|uy|link|nu|ba|agency|icu|media|im|digital|do|bz|kg|is|world|al|ug|design|xxx|cm|mu|one|today|so|sh|tj|name|network|gg|ac|guru|best|studio|eg|fm|ms|cx|sc|ve|global|dz|sx|vc|group|qa|nf|cat|py|win|ki|ps|buzz|gt|finance|academy|host|ly|bo|travel|company|art|tz|zw|center|jo|cr|ltd|click|nyc|solutions|lu|tokyo|rocks|team|cy|games|coop|aero|market|cyou|video|ci))
      # TLD may end with dot
      \.?
    )

    # port number
    (?::\d{2,5})?

    # resource path
    (?:[/?#]\S*)?
  '.freeze
  # Compiled with the x flag so the whitespace and "#" comments embedded in the
  # string above are ignored, and with the i flag so matching is case-insensitive.
  URL_PATTERN = /#{URL_PATTERN_STRING}/xi

  # Scans +str+ for every non-overlapping match of +pattern+.
  #
  # @param str [String] the text to search
  # @param pattern [Regexp] the pattern to look for (capture groups are allowed)
  # @return [Array<Hash>] one hash per match, in order of appearance, with keys:
  #   :substring       - the full matched text
  #   :grapheme_index  - grapheme clusters in the NFC-normalized text before the match
  #   :grapheme_length - grapheme clusters in the NFC-normalized match
  #   :char_index      - character offset of the match within +str+
  #   :char_length     - number of characters in the match
  def identify(str, pattern)
    matches = []
    str.scan(pattern) do
      match_data = Regexp.last_match
      # Read the full match from the MatchData instead of scan's block argument:
      # when the pattern contains capture groups, scan yields an Array of the
      # groups rather than the matched String.
      substring = match_data[0]
      char_index = match_data.begin(0)
      matches << {
        substring: substring,
        grapheme_index: str[0...char_index].unicode_normalize(:nfc).grapheme_clusters.count,
        grapheme_length: substring.unicode_normalize(:nfc).grapheme_clusters.count,
        char_index: char_index,
        char_length: substring.length
      }
    end
    matches
  end
 end
diff --git a/identifies_pattern_ranges_test.rb b/identifies_pattern_ranges_test.rb
 require "test_helper"

 # Tests for IdentifiesPatternRanges#identify: each expected hash pairs a match
 # with its offset/length in grapheme clusters and in characters (code points).
 class Platforms::TruncatesContent
  class IdentifiesPatternRangesTest < ActiveSupport::TestCase
    setup do
      @subject = IdentifiesPatternRanges.new
    end

    # Plain ASCII input: grapheme and character offsets are identical.
    def test_stuff
      assert_equal [
        {substring: "a", grapheme_index: 0, grapheme_length: 1, char_index: 0, char_length: 1},
        {substring: "b", grapheme_index: 3, grapheme_length: 1, char_index: 3, char_length: 1},
        {substring: "c", grapheme_index: 6, grapheme_length: 1, char_index: 6, char_length: 1}
      ], @subject.identify("a1 b2 c3", /[a-z]/)
    end

    # URL detection, plus cases where multi-code-point emoji make the
    # character offsets run ahead of the grapheme offsets.
    def test_urls
      # Bare domain and full URL in ASCII text
      assert_equal [
        {substring: "example.com", grapheme_index: 20, grapheme_length: 11, char_index: 20, char_length: 11},
        {substring: "http://www.aol.com/hi?name=justin", grapheme_index: 36, grapheme_length: 33, char_index: 36, char_length: 33}
      ], @subject.identify("My favorite site is example.com and http://www.aol.com/hi?name=justin -- how about you?", IdentifiesPatternRanges::URL_PATTERN)

      assert_equal [
        {substring: "https://example.com?peeps=true", grapheme_index: 21, grapheme_length: 30, char_index: 21, char_length: 30}
      ], @subject.identify("I'm just full of for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
      # A single-code-point emoji before the URL shifts both indexes equally
      assert_equal [
        {substring: "https://example.com?peeps=true", grapheme_index: 23, grapheme_length: 30, char_index: 23, char_length: 30}
      ], @subject.identify("I'm just full of 💚 for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
      # Each 🐈‍⬛ counts as one grapheme but three characters, so char_index pulls ahead by two per occurrence
      assert_equal [
        {substring: "https://example.com?peeps=true", grapheme_index: 25, grapheme_length: 30, char_index: 27, char_length: 30}
      ], @subject.identify("I'm 🐈‍⬛ just full of 💚 for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
      assert_equal [
        {substring: "https://example.com?peeps=true", grapheme_index: 27, grapheme_length: 30, char_index: 31, char_length: 30}
      ], @subject.identify("I'm 🐈‍⬛ just full of 💚 for 🐈‍⬛ https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)

      # The match itself is a multi-code-point emoji: one grapheme, seven characters
      assert_equal [
        {substring: "🧑‍🧑‍🧒‍🧒", grapheme_index: 7, grapheme_length: 1, char_index: 7, char_length: 7},
        {substring: "🧑‍🧑‍🧒‍🧒", grapheme_index: 31, grapheme_length: 1, char_index: 39, char_length: 7}
      ], @subject.identify("Humble 🧑‍🧑‍🧒‍🧒 with a 🐈‍⬛ but mostly a 🧑‍🧑‍🧒‍🧒 of humans", /🧑‍🧑‍🧒‍🧒/)

      # Matches that mix emoji and word characters
      assert_equal [
        {substring: "🧑‍🧑‍🧒‍🧒a🐈‍⬛", grapheme_index: 15, grapheme_length: 3, char_index: 23, char_length: 11},
        {substring: "🧑‍🧑‍🧒‍🧒boop🐈‍⬛", grapheme_index: 32, grapheme_length: 6, char_index: 48, char_length: 14}
      ], @subject.identify("What if 🧑‍🧑‍🧒‍🧒🐈‍⬛ but 🧑‍🧑‍🧒‍🧒a🐈‍⬛ is sometimes 🧑‍🧑‍🧒‍🧒boop🐈‍⬛ but never 🧑‍🧑‍🧒‍🧒c🐈 in this", /🧑‍🧑‍🧒‍🧒\w+🐈‍⬛/)

      # URLs whose path or query string contains emoji
      assert_equal [
        {substring: "http://www.foo.co/🧑‍🧑‍🧒‍🧒🐈‍⬛", grapheme_index: 8, grapheme_length: 20, char_index: 8, char_length: 28},
        {substring: "https://me.com/pants?name=🧑‍🧑‍🧒‍🧒boop🐈‍⬛", grapheme_index: 37, grapheme_length: 32, char_index: 45, char_length: 40}
      ], @subject.identify("site 1: http://www.foo.co/🧑‍🧑‍🧒‍🧒🐈‍⬛ site 2: https://me.com/pants?name=🧑‍🧑‍🧒‍🧒boop🐈‍⬛ ", IdentifiesPatternRanges::URL_PATTERN)

      # limit TLDs to top 200 known ones
      assert_empty @subject.identify("Sometimes at the end of sentences.he forgets to put a space after the period", IdentifiesPatternRanges::URL_PATTERN)
    end

    # "#1notavalidhashtag" is skipped because a hashtag must start with a letter.
    def test_hashtags
      # As of 1/22/25, only mastodon supports emoji in hashtags so just #foo will count at the end here
      assert_equal [
        {substring: "#cool", grapheme_index: 7, grapheme_length: 5, char_index: 7, char_length: 5},
        {substring: "#cat", grapheme_index: 13, grapheme_length: 4, char_index: 13, char_length: 4},
        {substring: "#foo", grapheme_index: 37, grapheme_length: 4, char_index: 37, char_length: 4}
      ], @subject.identify("I am a #cool #cat #1notavalidhashtag #foo🐈‍⬛", IdentifiesPatternRanges::HASHTAG_PATTERN)
    end
  end
 end
diff --git a/truncates_content.rb b/truncates_content.rb
 module Platforms
  # Shortens content so it fits a platform's length limit (blog excerpt, tweet, etc.)
  # without cutting through a URL or hashtag. Cutting one in half could produce an
  # invalid link or an unintended hashtag in the truncated text.
  #
  # IdentifiesPatternRanges marks where every such "unbreakable" token starts and
  # ends; this class uses those ranges to pick a safe cut point.
  #
  # When space allows, a marker (…) is appended to the result. If one more unbreakable
  # token can be squeezed in by leaving the marker off, the token wins.
  class TruncatesContent
    def initialize
      @identifies_pattern_ranges = IdentifiesPatternRanges.new
    end

    # Truncates +content+ to at most +limit+ units.
    #
    # @param content [String] text to shorten; surrounding whitespace is stripped first
    # @param limit [Integer] maximum size of the result, counted according to +mode+
    # @param mode [Symbol] :grapheme_clusters (default) or :characters
    # @param marker [String] text appended to signal truncation; it is left off when
    #   it does not fit within +limit+ or would directly follow an unbreakable token
    #   without room for a separating space
    # @return [String] the stripped content if it fits, otherwise a shortened copy
    # @raise [ArgumentError] when +mode+ is not a known mode
    def truncate(content, limit, mode: :grapheme_clusters, marker: "…")
      # Strip before measuring so padding never forces a truncation and so the
      # match indexes computed below line up with the text that is returned.
      content = content.strip
      chars = split(content, mode)
      return content if chars.size <= limit

      # In theory, we could have alternate ellipsis markers
      marker_size = count(marker, mode)
      if marker_size > limit
        # The marker alone would blow the budget, so truncate without one.
        marker = ""
        marker_size = 0
      end
      # Clamp at zero: a negative range end would count from the end of the
      # array and keep nearly the whole string.
      limit_less_marker = [limit - marker_size, 0].max

      # Find all unbreakable tokens (e.g. URLs, hashtags) that might disrupt naive truncation, get the earliest one that intersects the limit
      unbreakables = @identifies_pattern_ranges.identify(content, IdentifiesPatternRanges::URL_PATTERN)
        .concat(@identifies_pattern_ranges.identify(content, IdentifiesPatternRanges::HASHTAG_PATTERN))
        .map { |match| normalize_match(match, mode) }
      unbreakable_token = unbreakables
        .select { |match| intersects?(match, limit_less_marker) }
        .min_by { |match| match[:index] }

      # Base the limit on the beginning of the earliest intersecting unbreakable token (e.g. URL, hashtag) or else the limit less the ellipsis
      truncated_chars = if unbreakable_token.nil?
        chars[0...limit_less_marker]
      elsif (perfect_fit = chars[0...(unbreakable_token[:index] + unbreakable_token[:length])]).size <= limit
        # include the unbreakable token if it fits without the marker appended
        perfect_fit
      else
        chars[0...unbreakable_token[:index]]
      end

      truncated_str = truncated_chars.join.strip
      return truncated_str if marker.empty?

      # If we're going to end the string on the boundary of an unbreakable, we can't put an ellipsis/marker after (b/c UI may fail to render links, hashtags correctly)
      last_index = count(truncated_str, mode) - 1
      if unbreakables.any? { |match| intersects?(match, last_index) }
        spaced_marker = "#{truncated_str} #{marker}"
        if count(spaced_marker, mode) <= limit
          # If there's room for the marker after a space, do that
          spaced_marker
        else
          # otherwise just end on the unbreakable token itself
          truncated_str
        end
      else
        "#{truncated_str}#{marker}"
      end
    end

    private

    # Size of +content+ in the units selected by +mode+.
    def count(content, mode)
      split(content, mode).size
    end

    # Splits +content+ into an array of units for +mode+; raises ArgumentError
    # for an unknown mode.
    def split(content, mode)
      case mode
      when :grapheme_clusters
        content.grapheme_clusters
      when :characters
        content.chars
      else
        raise ArgumentError, "Unknown mode: #{mode}"
      end
    end

    # Reduces a match hash from IdentifiesPatternRanges to the :index and
    # :length that apply to +mode+.
    def normalize_match(match, mode)
      case mode
      when :grapheme_clusters
        {substring: match[:substring], index: match[:grapheme_index], length: match[:grapheme_length]}
      when :characters
        {substring: match[:substring], index: match[:char_index], length: match[:char_length]}
      else
        raise ArgumentError, "Unknown mode: #{mode}"
      end
    end

    # True when position +limit+ falls strictly inside the match, i.e. after
    # its first unit and before its end.
    def intersects?(match, limit)
      match[:index] < limit && (match[:index] + match[:length]) > limit
    end
  end
 end
diff --git a/truncates_content_test.rb b/truncates_content_test.rb
 require "test_helper"

 module Platforms
  # Tests for TruncatesContent#truncate in its default grapheme-cluster mode.
  # In these fixtures 🧑‍🧑‍🧒‍🧒 counts as a single unit toward the limit.
  class TruncatesContentTest < ActiveSupport::TestCase
    setup do
      @subject = TruncatesContent.new
    end

    def test_basics_counting_grapheme_clusters
      # Regardless of whether there's plenty of length
      assert_equal "Hello", @subject.truncate("   Hello  ", 50)

      # Plain text: content that fits is returned whole; otherwise the marker takes the last slot
      assert_equal "Hello", @subject.truncate("Hello", 5)
      assert_equal "Hello…", @subject.truncate("Hello, world!", 6)
      assert_equal "Hello, world!", @subject.truncate("Hello, world!", 14)
      assert_equal "Hello, world!", @subject.truncate("Hello, world!", 13)
      assert_equal "Hello, worl…", @subject.truncate("Hello, world!", 12)
      # A bare domain is kept whole or dropped whole, never cut
      assert_equal "🧑‍🧑‍🧒‍🧒 acme.co", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 10)
      assert_equal "🧑‍🧑‍🧒‍🧒 acme.co", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 9)
      assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 8)
      assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 www.example.com is cool", 8)
      # Room for the URL:
      assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 30)
      # Don't put a marker right after an unbreakable token (since URLs, hashtags might break if followed by …)
      assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true …", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 29)
      # Skip the marker entirely if it would be right after the URL and there's no room for it
      assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 28)
      # TODO: failing because we are limiting with space for the marker first, which results in the marker limit impinging on the URL even though it'd fit
      # NOTE(review): this TODO may be stale. TruncatesContent#truncate has a
      # "perfect fit" branch that keeps a token ending exactly at the limit;
      # confirm whether this assertion still fails before relying on the TODO.
      assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 27)
      # No room for the url
      assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 26)
      # Really not enough room for the url
      assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 4)

      # Hashtags follow the same rules as URLs
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 27)
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 26)
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 25)
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒…", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 24)

      # A URL followed by a hashtag: the hashtag is dropped first, then the URL
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 42)
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 41)
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 30)
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 29)
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 28)
      assert_equal "Follow me 🧑‍🧑‍🧒‍🧒…", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 27)

      # Using a multi-grapheme marker
      assert_equal "I am #thirsty", @subject.truncate("I am #thirsty", 14, marker: "🧑‍🧑‍🧒‍🧒")
      assert_equal "I am #thirsty", @subject.truncate("I am #thirsty", 13, marker: "🧑‍🧑‍🧒‍🧒")
      assert_equal "I am🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 12, marker: "🧑‍🧑‍🧒‍🧒")
      assert_equal "I am🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 5, marker: "🧑‍🧑‍🧒‍🧒")
      assert_equal "I a🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 4, marker: "🧑‍🧑‍🧒‍🧒")
    end
  end
 end
	# Finds every occurrence of a pattern inside a string and reports where each
	# match sits, measured both in Unicode code points ("char") and in
	# NFC-normalized grapheme clusters ("grapheme").
	class Platforms::TruncatesContent::IdentifiesPatternRanges
	  # A "#" that is not preceded by a word character, followed by one letter and
	  # then any run of letters, combining marks, decimal digits, or underscores.
	  HASHTAG_PATTERN = /\B#\p{L}[\p{L}\p{M}\p{Nd}_]*/

	  # Ripped outta this https://github.com/amogil/url_regex/blob/master/lib/url_regex.rb
	  URL_PATTERN_STRING = '
	    # scheme
	    (?:(?:https?|ftp)://)?
	    # user:pass authentication
	    (?:\S+(?::\S*)?@)?

	    (?:
	      # IP address exclusion
	      # private & local networks
	      (?!(?:10|127)(?:\.\d{1,3}){3})
	      (?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})
	      (?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})
	      # IP address dotted notation octets
	      # excludes loopback network 0.0.0.0
	      # excludes reserved space >= 224.0.0.0
	      # excludes network & broadcast addresses
	      # (first & last IP address of each class)
	      (?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
	      (?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}
	      (?:\.(?:[0-9]\d?|1\d\d|2[0-4]\d|25[0-5]))
	      |
	      # host name
	      (?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)
	      # domain name
	      (?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*
	      # TLD identifier based on top 200 https://dnsinstitute.com/research/popular-tld-rank/
	      (?:\.(?:com|net|ru|org|info|in|ir|uk|au|de|ua|ca|tr|co|jp|vn|cn|gr|fr|tk|tw|id|br|io|xyz|it|nl|pl|za|us|eu|mx|ch|biz|me|il|es|online|by|xn--p1ai|nz|kr|cz|ro|cf|ar|club|my|tv|kz|cl|pk|pro|site|th|se|sg|cc|be|rs|top|ga|ma|hu|ae|su|dk|hk|at|ml|shop|store|ng|np|no|app|live|pe|ph|ie|lk|gq|edu|fi|ai|sa|pw|tech|bd|sk|ke|pt|az|space|mk|ge|tn|lt|dev|to|gov|md|asia|lv|uz|hr|mn|website|am|ws|life|fun|news|mobi|vip|ee|bg|la|ec|blog|cloud|si|work|uy|link|nu|ba|agency|icu|media|im|digital|do|bz|kg|is|world|al|ug|design|xxx|cm|mu|one|today|so|sh|tj|name|network|gg|ac|guru|best|studio|eg|fm|ms|cx|sc|ve|global|dz|sx|vc|group|qa|nf|cat|py|win|ki|ps|buzz|gt|finance|academy|host|ly|bo|travel|company|art|tz|zw|center|jo|cr|ltd|click|nyc|solutions|lu|tokyo|rocks|team|cy|games|coop|aero|market|cyou|video|ci))
	      # TLD may end with dot
	      \.?
	    )

	    # port number
	    (?::\d{2,5})?

	    # resource path
	    (?:[/?#]\S*)?
	  '.freeze
	  # Compiled with the x flag so the whitespace and "#" comments embedded in the
	  # string above are ignored, and with the i flag so matching is case-insensitive.
	  URL_PATTERN = /#{URL_PATTERN_STRING}/xi

	  # Scans +str+ for every non-overlapping match of +pattern+.
	  #
	  # @param str [String] the text to search
	  # @param pattern [Regexp] the pattern to look for (capture groups are allowed)
	  # @return [Array<Hash>] one hash per match, in order of appearance, with keys:
	  #   :substring       - the full matched text
	  #   :grapheme_index  - grapheme clusters in the NFC-normalized text before the match
	  #   :grapheme_length - grapheme clusters in the NFC-normalized match
	  #   :char_index      - character offset of the match within +str+
	  #   :char_length     - number of characters in the match
	  def identify(str, pattern)
	    matches = []
	    str.scan(pattern) do
	      match_data = Regexp.last_match
	      # Read the full match from the MatchData instead of scan's block argument:
	      # when the pattern contains capture groups, scan yields an Array of the
	      # groups rather than the matched String.
	      substring = match_data[0]
	      char_index = match_data.begin(0)
	      matches << {
	        substring: substring,
	        grapheme_index: str[0...char_index].unicode_normalize(:nfc).grapheme_clusters.count,
	        grapheme_length: substring.unicode_normalize(:nfc).grapheme_clusters.count,
	        char_index: char_index,
	        char_length: substring.length
	      }
	    end
	    matches
	  end
	end
	require "test_helper"

	# Tests for IdentifiesPatternRanges#identify: each expected hash pairs a match
	# with its offset/length in grapheme clusters and in characters (code points).
	class Platforms::TruncatesContent
	class IdentifiesPatternRangesTest < ActiveSupport::TestCase
	setup do
	@subject = IdentifiesPatternRanges.new
	end

	# Plain ASCII input: grapheme and character offsets are identical.
	def test_stuff
	assert_equal [
	{substring: "a", grapheme_index: 0, grapheme_length: 1, char_index: 0, char_length: 1},
	{substring: "b", grapheme_index: 3, grapheme_length: 1, char_index: 3, char_length: 1},
	{substring: "c", grapheme_index: 6, grapheme_length: 1, char_index: 6, char_length: 1}
	], @subject.identify("a1 b2 c3", /[a-z]/)
	end

	# URL detection, plus cases where multi-code-point emoji make the
	# character offsets run ahead of the grapheme offsets.
	def test_urls
	# Bare domain and full URL in ASCII text
	assert_equal [
	{substring: "example.com", grapheme_index: 20, grapheme_length: 11, char_index: 20, char_length: 11},
	{substring: "http://www.aol.com/hi?name=justin", grapheme_index: 36, grapheme_length: 33, char_index: 36, char_length: 33}
	], @subject.identify("My favorite site is example.com and http://www.aol.com/hi?name=justin -- how about you?", IdentifiesPatternRanges::URL_PATTERN)

	assert_equal [
	{substring: "https://example.com?peeps=true", grapheme_index: 21, grapheme_length: 30, char_index: 21, char_length: 30}
	], @subject.identify("I'm just full of for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
	# A single-code-point emoji before the URL shifts both indexes equally
	assert_equal [
	{substring: "https://example.com?peeps=true", grapheme_index: 23, grapheme_length: 30, char_index: 23, char_length: 30}
	], @subject.identify("I'm just full of 💚 for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
	# Each 🐈‍⬛ counts as one grapheme but three characters, so char_index pulls ahead by two per occurrence
	assert_equal [
	{substring: "https://example.com?peeps=true", grapheme_index: 25, grapheme_length: 30, char_index: 27, char_length: 30}
	], @subject.identify("I'm 🐈‍⬛ just full of 💚 for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
	assert_equal [
	{substring: "https://example.com?peeps=true", grapheme_index: 27, grapheme_length: 30, char_index: 31, char_length: 30}
	], @subject.identify("I'm 🐈‍⬛ just full of 💚 for 🐈‍⬛ https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)

	# The match itself is a multi-code-point emoji: one grapheme, seven characters
	assert_equal [
	{substring: "🧑‍🧑‍🧒‍🧒", grapheme_index: 7, grapheme_length: 1, char_index: 7, char_length: 7},
	{substring: "🧑‍🧑‍🧒‍🧒", grapheme_index: 31, grapheme_length: 1, char_index: 39, char_length: 7}
	], @subject.identify("Humble 🧑‍🧑‍🧒‍🧒 with a 🐈‍⬛ but mostly a 🧑‍🧑‍🧒‍🧒 of humans", /🧑‍🧑‍🧒‍🧒/)

	# Matches that mix emoji and word characters
	assert_equal [
	{substring: "🧑‍🧑‍🧒‍🧒a🐈‍⬛", grapheme_index: 15, grapheme_length: 3, char_index: 23, char_length: 11},
	{substring: "🧑‍🧑‍🧒‍🧒boop🐈‍⬛", grapheme_index: 32, grapheme_length: 6, char_index: 48, char_length: 14}
	], @subject.identify("What if 🧑‍🧑‍🧒‍🧒🐈‍⬛ but 🧑‍🧑‍🧒‍🧒a🐈‍⬛ is sometimes 🧑‍🧑‍🧒‍🧒boop🐈‍⬛ but never 🧑‍🧑‍🧒‍🧒c🐈 in this", /🧑‍🧑‍🧒‍🧒\w+🐈‍⬛/)

	# URLs whose path or query string contains emoji
	assert_equal [
	{substring: "http://www.foo.co/🧑‍🧑‍🧒‍🧒🐈‍⬛", grapheme_index: 8, grapheme_length: 20, char_index: 8, char_length: 28},
	{substring: "https://me.com/pants?name=🧑‍🧑‍🧒‍🧒boop🐈‍⬛", grapheme_index: 37, grapheme_length: 32, char_index: 45, char_length: 40}
	], @subject.identify("site 1: http://www.foo.co/🧑‍🧑‍🧒‍🧒🐈‍⬛ site 2: https://me.com/pants?name=🧑‍🧑‍🧒‍🧒boop🐈‍⬛ ", IdentifiesPatternRanges::URL_PATTERN)

	# limit TLDs to top 200 known ones
	assert_empty @subject.identify("Sometimes at the end of sentences.he forgets to put a space after the period", IdentifiesPatternRanges::URL_PATTERN)
	end

	# "#1notavalidhashtag" is skipped because a hashtag must start with a letter.
	def test_hashtags
	# As of 1/22/25, only mastodon supports emoji in hashtags so just #foo will count at the end here
	assert_equal [
	{substring: "#cool", grapheme_index: 7, grapheme_length: 5, char_index: 7, char_length: 5},
	{substring: "#cat", grapheme_index: 13, grapheme_length: 4, char_index: 13, char_length: 4},
	{substring: "#foo", grapheme_index: 37, grapheme_length: 4, char_index: 37, char_length: 4}
	], @subject.identify("I am a #cool #cat #1notavalidhashtag #foo🐈‍⬛", IdentifiesPatternRanges::HASHTAG_PATTERN)
	end
	end
	end
	module Platforms
	  # Shortens content so it fits a platform's length limit (blog excerpt, tweet, etc.)
	  # without cutting through a URL or hashtag. Cutting one in half could produce an
	  # invalid link or an unintended hashtag in the truncated text.
	  #
	  # IdentifiesPatternRanges marks where every such "unbreakable" token starts and
	  # ends; this class uses those ranges to pick a safe cut point.
	  #
	  # When space allows, a marker (…) is appended to the result. If one more unbreakable
	  # token can be squeezed in by leaving the marker off, the token wins.
	  class TruncatesContent
	    def initialize
	      @identifies_pattern_ranges = IdentifiesPatternRanges.new
	    end

	    # Truncates +content+ to at most +limit+ units.
	    #
	    # @param content [String] text to shorten; surrounding whitespace is stripped first
	    # @param limit [Integer] maximum size of the result, counted according to +mode+
	    # @param mode [Symbol] :grapheme_clusters (default) or :characters
	    # @param marker [String] text appended to signal truncation; it is left off when
	    #   it does not fit within +limit+ or would directly follow an unbreakable token
	    #   without room for a separating space
	    # @return [String] the stripped content if it fits, otherwise a shortened copy
	    # @raise [ArgumentError] when +mode+ is not a known mode
	    def truncate(content, limit, mode: :grapheme_clusters, marker: "…")
	      # Strip before measuring so padding never forces a truncation and so the
	      # match indexes computed below line up with the text that is returned.
	      content = content.strip
	      chars = split(content, mode)
	      return content if chars.size <= limit

	      # In theory, we could have alternate ellipsis markers
	      marker_size = count(marker, mode)
	      if marker_size > limit
	        # The marker alone would blow the budget, so truncate without one.
	        marker = ""
	        marker_size = 0
	      end
	      # Clamp at zero: a negative range end would count from the end of the
	      # array and keep nearly the whole string.
	      limit_less_marker = [limit - marker_size, 0].max

	      # Find all unbreakable tokens (e.g. URLs, hashtags) that might disrupt naive truncation, get the earliest one that intersects the limit
	      unbreakables = @identifies_pattern_ranges.identify(content, IdentifiesPatternRanges::URL_PATTERN)
	        .concat(@identifies_pattern_ranges.identify(content, IdentifiesPatternRanges::HASHTAG_PATTERN))
	        .map { |match| normalize_match(match, mode) }
	      unbreakable_token = unbreakables
	        .select { |match| intersects?(match, limit_less_marker) }
	        .min_by { |match| match[:index] }

	      # Base the limit on the beginning of the earliest intersecting unbreakable token (e.g. URL, hashtag) or else the limit less the ellipsis
	      truncated_chars = if unbreakable_token.nil?
	        chars[0...limit_less_marker]
	      elsif (perfect_fit = chars[0...(unbreakable_token[:index] + unbreakable_token[:length])]).size <= limit
	        # include the unbreakable token if it fits without the marker appended
	        perfect_fit
	      else
	        chars[0...unbreakable_token[:index]]
	      end

	      truncated_str = truncated_chars.join.strip
	      return truncated_str if marker.empty?

	      # If we're going to end the string on the boundary of an unbreakable, we can't put an ellipsis/marker after (b/c UI may fail to render links, hashtags correctly)
	      last_index = count(truncated_str, mode) - 1
	      if unbreakables.any? { |match| intersects?(match, last_index) }
	        spaced_marker = "#{truncated_str} #{marker}"
	        if count(spaced_marker, mode) <= limit
	          # If there's room for the marker after a space, do that
	          spaced_marker
	        else
	          # otherwise just end on the unbreakable token itself
	          truncated_str
	        end
	      else
	        "#{truncated_str}#{marker}"
	      end
	    end

	    private

	    # Size of +content+ in the units selected by +mode+.
	    def count(content, mode)
	      split(content, mode).size
	    end

	    # Splits +content+ into an array of units for +mode+; raises ArgumentError
	    # for an unknown mode.
	    def split(content, mode)
	      case mode
	      when :grapheme_clusters
	        content.grapheme_clusters
	      when :characters
	        content.chars
	      else
	        raise ArgumentError, "Unknown mode: #{mode}"
	      end
	    end

	    # Reduces a match hash from IdentifiesPatternRanges to the :index and
	    # :length that apply to +mode+.
	    def normalize_match(match, mode)
	      case mode
	      when :grapheme_clusters
	        {substring: match[:substring], index: match[:grapheme_index], length: match[:grapheme_length]}
	      when :characters
	        {substring: match[:substring], index: match[:char_index], length: match[:char_length]}
	      else
	        raise ArgumentError, "Unknown mode: #{mode}"
	      end
	    end

	    # True when position +limit+ falls strictly inside the match, i.e. after
	    # its first unit and before its end.
	    def intersects?(match, limit)
	      match[:index] < limit && (match[:index] + match[:length]) > limit
	    end
	  end
	end
	require "test_helper"

	module Platforms
	# Tests for TruncatesContent#truncate in its default grapheme-cluster mode.
	# In these fixtures 🧑‍🧑‍🧒‍🧒 counts as a single unit toward the limit.
	class TruncatesContentTest < ActiveSupport::TestCase
	setup do
	@subject = TruncatesContent.new
	end

	def test_basics_counting_grapheme_clusters
	# Regardless of whether there's plenty of length
	assert_equal "Hello", @subject.truncate(" Hello ", 50)

	# Plain text: content that fits is returned whole; otherwise the marker takes the last slot
	assert_equal "Hello", @subject.truncate("Hello", 5)
	assert_equal "Hello…", @subject.truncate("Hello, world!", 6)
	assert_equal "Hello, world!", @subject.truncate("Hello, world!", 14)
	assert_equal "Hello, world!", @subject.truncate("Hello, world!", 13)
	assert_equal "Hello, worl…", @subject.truncate("Hello, world!", 12)
	# A bare domain is kept whole or dropped whole, never cut
	assert_equal "🧑‍🧑‍🧒‍🧒 acme.co", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 10)
	assert_equal "🧑‍🧑‍🧒‍🧒 acme.co", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 9)
	assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 8)
	assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 www.example.com is cool", 8)
	# Room for the URL:
	assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 30)
	# Don't put a marker right after an unbreakable token (since URLs, hashtags might break if followed by …)
	assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true …", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 29)
	# Skip the marker entirely if it would be right after the URL and there's no room for it
	assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 28)
	# TODO: failing because we are limiting with space for the marker first, which results in the marker limit impinging on the URL even though it'd fit
	# NOTE(review): this TODO may be stale. TruncatesContent#truncate has a
	# "perfect fit" branch that keeps a token ending exactly at the limit;
	# confirm whether this assertion still fails before relying on the TODO.
	assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 27)
	# No room for the url
	assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 26)
	# Really not enough room for the url
	assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 4)

	# Hashtags follow the same rules as URLs
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 27)
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 26)
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 25)
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒…", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 24)

	# A URL followed by a hashtag: the hashtag is dropped first, then the URL
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 42)
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 41)
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 30)
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 29)
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 28)
	assert_equal "Follow me 🧑‍🧑‍🧒‍🧒…", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 27)

	# Using a multi-grapheme marker
	assert_equal "I am #thirsty", @subject.truncate("I am #thirsty", 14, marker: "🧑‍🧑‍🧒‍🧒")
	assert_equal "I am #thirsty", @subject.truncate("I am #thirsty", 13, marker: "🧑‍🧑‍🧒‍🧒")
	assert_equal "I am🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 12, marker: "🧑‍🧑‍🧒‍🧒")
	assert_equal "I am🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 5, marker: "🧑‍🧑‍🧒‍🧒")
	assert_equal "I a🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 4, marker: "🧑‍🧑‍🧒‍🧒")
	end
	end
	end