Skip to content

Instantly share code, notes, and snippets.

@searls
Last active January 28, 2025 11:47
Show Gist options
  • Save searls/9d8ee42929da99ae268477eb20818da6 to your computer and use it in GitHub Desktop.
Save searls/9d8ee42929da99ae268477eb20818da6 to your computer and use it in GitHub Desktop.
Truncating content to fit into a tweet-like post without accidentally truncating in the middle of a URL or a hashtag is… not fun?
module Platforms
  class TruncatesContent
    # Finds every occurrence of a pattern in a string and reports where each one
    # sits, measured both in characters (code points) and in grapheme clusters.
    #
    # The namespaces are opened with nested definitions (rather than the compact
    # `class A::B::C` form) so this file loads regardless of whether
    # Platforms::TruncatesContent has been defined yet.
    class IdentifiesPatternRanges
      # A "#" that is not glued to a preceding word character, followed by a
      # letter and then any run of letters, marks, decimal digits or underscores.
      HASHTAG_PATTERN = /\B#\p{L}[\p{L}\p{M}\p{Nd}_]*/

      # Adapted from https://github.com/amogil/url_regex/blob/master/lib/url_regex.rb
      # The string is compiled below in extended mode (/x), so the "#" lines
      # inside it are regex comments, not Ruby comments.
      URL_PATTERN_STRING = '
# scheme
(?:(?:https?|ftp)://)?
# user:pass authentication
(?:\S+(?::\S*)?@)?
(?:
# IP address exclusion
# private & local networks
(?!(?:10|127)(?:\.\d{1,3}){3})
(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})
(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}
(?:\.(?:[0-9]\d?|1\d\d|2[0-4]\d|25[0-5]))
|
# host name
(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)
# domain name
(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*
# TLD identifier based on top 200 https://dnsinstitute.com/research/popular-tld-rank/
(?:\.(?:com|net|ru|org|info|in|ir|uk|au|de|ua|ca|tr|co|jp|vn|cn|gr|fr|tk|tw|id|br|io|xyz|it|nl|pl|za|us|eu|mx|ch|biz|me|il|es|online|by|xn--p1ai|nz|kr|cz|ro|cf|ar|club|my|tv|kz|cl|pk|pro|site|th|se|sg|cc|be|rs|top|ga|ma|hu|ae|su|dk|hk|at|ml|shop|store|ng|np|no|app|live|pe|ph|ie|lk|gq|edu|fi|ai|sa|pw|tech|bd|sk|ke|pt|az|space|mk|ge|tn|lt|dev|to|gov|md|asia|lv|uz|hr|mn|website|am|ws|life|fun|news|mobi|vip|ee|bg|la|ec|blog|cloud|si|work|uy|link|nu|ba|agency|icu|media|im|digital|do|bz|kg|is|world|al|ug|design|xxx|cm|mu|one|today|so|sh|tj|name|network|gg|ac|guru|best|studio|eg|fm|ms|cx|sc|ve|global|dz|sx|vc|group|qa|nf|cat|py|win|ki|ps|buzz|gt|finance|academy|host|ly|bo|travel|company|art|tz|zw|center|jo|cr|ltd|click|nyc|solutions|lu|tokyo|rocks|team|cy|games|coop|aero|market|cyou|video|ci))
# TLD may end with dot
\.?
)
# port number
(?::\d{2,5})?
# resource path
(?:[/?#]\S*)?
'.freeze
      URL_PATTERN = /#{URL_PATTERN_STRING}/xi

      # Scans +str+ for +pattern+ and describes each match.
      #
      # @param str [String] text to search
      # @param pattern [Regexp] pattern to look for; it may contain capture
      #   groups, the whole match is always what gets reported
      # @return [Array<Hash>] one hash per match, in scan order, with keys:
      #   :substring       - the full matched text
      #   :grapheme_index  - grapheme clusters in the NFC-normalized text before the match
      #   :grapheme_length - grapheme clusters in the NFC-normalized match
      #   :char_index      - character offset of the match within +str+
      #   :char_length     - number of characters in the match
      def identify(str, pattern)
        matches = []
        str.scan(pattern) do
          match_data = Regexp.last_match
          # Read the whole match from MatchData: the block argument of #scan
          # turns into an array of groups as soon as the pattern has captures.
          substring = match_data[0]
          char_index = match_data.begin(0)
          matches << {
            substring: substring,
            grapheme_index: grapheme_count(str[0...char_index]),
            grapheme_length: grapheme_count(substring),
            char_index: char_index,
            char_length: substring.length
          }
        end
        matches
      end

      private

      # Number of grapheme clusters in +text+ after NFC normalization.
      def grapheme_count(text)
        text.unicode_normalize(:nfc).grapheme_clusters.count
      end
    end
  end
end
require "test_helper"
# Tests for IdentifiesPatternRanges#identify: each assertion pins the substring
# plus its offset and length, measured in grapheme clusters and in characters.
# NOTE: several fixtures contain emoji built from multiple code points joined by
# invisible zero-width joiners; the expected offsets depend on them, so edit
# those string literals with care.
class Platforms::TruncatesContent
class IdentifiesPatternRangesTest < ActiveSupport::TestCase
setup do
@subject = IdentifiesPatternRanges.new
end
# Plain ASCII input with an ad-hoc pattern: grapheme and character offsets agree.
def test_stuff
assert_equal [
{substring: "a", grapheme_index: 0, grapheme_length: 1, char_index: 0, char_length: 1},
{substring: "b", grapheme_index: 3, grapheme_length: 1, char_index: 3, char_length: 1},
{substring: "c", grapheme_index: 6, grapheme_length: 1, char_index: 6, char_length: 1}
], @subject.identify("a1 b2 c3", /[a-z]/)
end
# URL detection, with and without a scheme, and with emoji before or inside the URL.
def test_urls
assert_equal [
{substring: "example.com", grapheme_index: 20, grapheme_length: 11, char_index: 20, char_length: 11},
{substring: "http://www.aol.com/hi?name=justin", grapheme_index: 36, grapheme_length: 33, char_index: 36, char_length: 33}
], @subject.identify("My favorite site is example.com and http://www.aol.com/hi?name=justin -- how about you?", IdentifiesPatternRanges::URL_PATTERN)
assert_equal [
{substring: "https://example.com?peeps=true", grapheme_index: 21, grapheme_length: 30, char_index: 21, char_length: 30}
], @subject.identify("I'm just full of for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
# A single-code-point emoji shifts both offsets by the same amount
assert_equal [
{substring: "https://example.com?peeps=true", grapheme_index: 23, grapheme_length: 30, char_index: 23, char_length: 30}
], @subject.identify("I'm just full of 💚 for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
# Multi-code-point emoji before the URL: the character offset runs ahead of the grapheme offset
assert_equal [
{substring: "https://example.com?peeps=true", grapheme_index: 25, grapheme_length: 30, char_index: 27, char_length: 30}
], @subject.identify("I'm 🐈‍⬛ just full of 💚 for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
assert_equal [
{substring: "https://example.com?peeps=true", grapheme_index: 27, grapheme_length: 30, char_index: 31, char_length: 30}
], @subject.identify("I'm 🐈‍⬛ just full of 💚 for 🐈‍⬛ https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
# The match itself is a multi-code-point emoji: one grapheme, seven characters
assert_equal [
{substring: "🧑‍🧑‍🧒‍🧒", grapheme_index: 7, grapheme_length: 1, char_index: 7, char_length: 7},
{substring: "🧑‍🧑‍🧒‍🧒", grapheme_index: 31, grapheme_length: 1, char_index: 39, char_length: 7}
], @subject.identify("Humble 🧑‍🧑‍🧒‍🧒 with a 🐈‍⬛ but mostly a 🧑‍🧑‍🧒‍🧒 of humans", /🧑‍🧑‍🧒‍🧒/)
assert_equal [
{substring: "🧑‍🧑‍🧒‍🧒a🐈‍⬛", grapheme_index: 15, grapheme_length: 3, char_index: 23, char_length: 11},
{substring: "🧑‍🧑‍🧒‍🧒boop🐈‍⬛", grapheme_index: 32, grapheme_length: 6, char_index: 48, char_length: 14}
], @subject.identify("What if 🧑‍🧑‍🧒‍🧒🐈‍⬛ but 🧑‍🧑‍🧒‍🧒a🐈‍⬛ is sometimes 🧑‍🧑‍🧒‍🧒boop🐈‍⬛ but never 🧑‍🧑‍🧒‍🧒c🐈 in this", /🧑‍🧑‍🧒‍🧒\w+🐈‍⬛/)
# Emoji inside the URL path and query string are part of the match
assert_equal [
{substring: "http://www.foo.co/🧑‍🧑‍🧒‍🧒🐈‍⬛", grapheme_index: 8, grapheme_length: 20, char_index: 8, char_length: 28},
{substring: "https://me.com/pants?name=🧑‍🧑‍🧒‍🧒boop🐈‍⬛", grapheme_index: 37, grapheme_length: 32, char_index: 45, char_length: 40}
], @subject.identify("site 1: http://www.foo.co/🧑‍🧑‍🧒‍🧒🐈‍⬛ site 2: https://me.com/pants?name=🧑‍🧑‍🧒‍🧒boop🐈‍⬛ ", IdentifiesPatternRanges::URL_PATTERN)
# limit TLDs to top 200 known ones
assert_empty @subject.identify("Sometimes at the end of sentences.he forgets to put a space after the period", IdentifiesPatternRanges::URL_PATTERN)
end
# Hashtag detection: a tag starting with a digit is skipped, and a trailing emoji is left out of the match.
def test_hashtags
# As of 1/22/25, only mastodon supports emoji in hashtags so just #foo will count at the end here
assert_equal [
{substring: "#cool", grapheme_index: 7, grapheme_length: 5, char_index: 7, char_length: 5},
{substring: "#cat", grapheme_index: 13, grapheme_length: 4, char_index: 13, char_length: 4},
{substring: "#foo", grapheme_index: 37, grapheme_length: 4, char_index: 37, char_length: 4}
], @subject.identify("I am a #cool #cat #1notavalidhashtag #foo🐈‍⬛", IdentifiesPatternRanges::HASHTAG_PATTERN)
end
end
end
module Platforms
  # Shortens content so it fits a length-limited social post (tweet, toot, etc.)
  # without cutting through a URL or a hashtag. A URL or hashtag chopped in the
  # middle would turn into an invalid link or an unintended tag, so each one is
  # treated as an unbreakable token: it is either kept whole or dropped whole.
  #
  # IdentifiesPatternRanges reports where every such token starts and ends;
  # this class then picks a cut point that does not fall inside any of them.
  #
  # When space allows, a marker ("…" by default) is appended to show that the
  # text was shortened. The marker is never glued directly onto a token (a UI
  # may stop rendering the link or hashtag correctly): it is separated by a
  # space if that still fits, and left off otherwise. If one more token can be
  # squeezed in by giving up the marker, the token wins.
  class TruncatesContent
    def initialize
      @identifies_pattern_ranges = IdentifiesPatternRanges.new
    end

    # Truncates +content+ to at most +limit+ units.
    #
    # @param content [String] text to shorten; surrounding whitespace is
    #   stripped before measuring, because it never appears in the result
    # @param limit [Integer] maximum size of the result, in units of +mode+;
    #   negative values are treated as 0
    # @param mode [Symbol] :grapheme_clusters (default) or :characters
    # @param marker [String] appended when text was cut and there is room for
    #   it; dropped entirely when it is longer than +limit+
    # @return [String] the stripped content if it already fits, otherwise the
    #   shortened text
    # @raise [ArgumentError] if +mode+ is not one of the supported symbols
    def truncate(content, limit, mode: :grapheme_clusters, marker: "…")
      # Measure the text the caller will actually get back: whitespace that is
      # stripped from the result must not eat into the limit or shift the
      # token offsets relative to the result.
      content = content.strip
      limit = [limit, 0].max
      chars = split(content, mode)
      return content if chars.size <= limit

      # A marker that cannot fit would push the result over the limit (and a
      # negative cut-off would make the slice below wrap around from the end).
      marker = "" if count(marker, mode) > limit
      limit_less_marker = limit - count(marker, mode)

      unbreakables = find_unbreakables(content, mode)
      truncated_str = safe_slice(chars, unbreakables, limit_less_marker, limit).join.strip
      append_marker(truncated_str, marker, unbreakables, limit, mode)
    end

    private

    # Every URL and hashtag in +content+, as {substring:, index:, length:}
    # hashes expressed in units of +mode+.
    def find_unbreakables(content, mode)
      urls = @identifies_pattern_ranges.identify(content, IdentifiesPatternRanges::URL_PATTERN)
      hashtags = @identifies_pattern_ranges.identify(content, IdentifiesPatternRanges::HASHTAG_PATTERN)
      (urls + hashtags).map { |match| normalize_match(match, mode) }
    end

    # The leading part of +chars+ to keep: up to the room left for the marker,
    # unless that cut would land inside a token.
    def safe_slice(chars, unbreakables, limit_less_marker, limit)
      # Only the earliest token straddling the cut point matters
      token = unbreakables
        .select { |match| intersects?(match, limit_less_marker) }
        .min_by { |match| match[:index] }
      return chars[0...limit_less_marker] unless token

      # Keep the token if it fits once the marker's room is given up,
      # otherwise cut just before it
      perfect_fit = chars[0...(token[:index] + token[:length])]
      (perfect_fit.size <= limit) ? perfect_fit : chars[0...token[:index]]
    end

    # Adds +marker+ to +truncated_str+, keeping it off the tail of a token.
    def append_marker(truncated_str, marker, unbreakables, limit, mode)
      return truncated_str if marker.empty?

      last_index = count(truncated_str, mode) - 1
      ends_on_token = unbreakables.any? { |match| intersects?(match, last_index) }
      return "#{truncated_str}#{marker}" unless ends_on_token

      # If there's room for the marker after a space, do that; otherwise just
      # end on the unbreakable token itself
      spaced_marker = "#{truncated_str} #{marker}"
      (count(spaced_marker, mode) <= limit) ? spaced_marker : truncated_str
    end

    # Size of +content+ in units of +mode+.
    def count(content, mode)
      split(content, mode).size
    end

    # Breaks +content+ into the units selected by +mode+.
    def split(content, mode)
      case mode
      when :grapheme_clusters
        content.grapheme_clusters
      when :characters
        content.chars
      else
        raise ArgumentError, "Unknown mode: #{mode}"
      end
    end

    # Picks the grapheme- or character-based offsets out of a match reported
    # by IdentifiesPatternRanges, depending on +mode+.
    def normalize_match(match, mode)
      case mode
      when :grapheme_clusters
        {substring: match[:substring], index: match[:grapheme_index], length: match[:grapheme_length]}
      when :characters
        {substring: match[:substring], index: match[:char_index], length: match[:char_length]}
      else
        raise ArgumentError, "Unknown mode: #{mode}"
      end
    end

    # True when position +limit+ falls strictly inside +match+: after its
    # first unit and before its end.
    def intersects?(match, limit)
      match[:index] < limit && (match[:index] + match[:length]) > limit
    end
  end
end
require "test_helper"
# Tests for TruncatesContent#truncate in its default (grapheme cluster) mode.
# Each assertion pairs an input and a limit with the exact expected output.
# NOTE: several fixtures contain an emoji built from multiple code points joined
# by invisible zero-width joiners; edit those string literals with care.
module Platforms
class TruncatesContentTest < ActiveSupport::TestCase
setup do
@subject = TruncatesContent.new
end
def test_basics_counting_grapheme_clusters
# Regardless of whether there's plenty of length
assert_equal "Hello", @subject.truncate(" Hello ", 50)
assert_equal "Hello", @subject.truncate("Hello", 5)
assert_equal "Hello…", @subject.truncate("Hello, world!", 6)
assert_equal "Hello, world!", @subject.truncate("Hello, world!", 14)
assert_equal "Hello, world!", @subject.truncate("Hello, world!", 13)
assert_equal "Hello, worl…", @subject.truncate("Hello, world!", 12)
# The multi-code-point emoji is expected to count as a single unit toward the limit
assert_equal "🧑‍🧑‍🧒‍🧒 acme.co", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 10)
assert_equal "🧑‍🧑‍🧒‍🧒 acme.co", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 9)
assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 acme.co", 8)
assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 www.example.com is cool", 8)
# Room for the URL:
assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 30)
# Don't put a marker right after an unbreakable token (since URLs, hashtags might break if followed by …)
assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true …", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 29)
# Skip the marker entirely if it would be right after the URL and there's no room for it
assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 28)
# TODO: failing because we are limiting with space for the marker first, which results in the marker limit impinging on the URL even though it'd fit
# NOTE(review): tracing truncate by hand suggests this case is already handled by its "perfect fit" branch — confirm by running, then drop the TODO
assert_equal "🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 27)
# No room for the url
assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 26)
# Really not enough room for the url
assert_equal "🧑‍🧑‍🧒‍🧒…", @subject.truncate("🧑‍🧑‍🧒‍🧒 https://a.co/foo?bar=true <-- free links!", 4)
# Hashtags are expected to be kept whole or dropped whole, like URLs
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 27)
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 26)
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 25)
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒…", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 #followfriday #peeps", 24)
# A scheme-less URL followed by a hashtag
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 42)
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 41)
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co …", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 30)
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 29)
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 28)
assert_equal "Follow me 🧑‍🧑‍🧒‍🧒…", @subject.truncate("Follow me 🧑‍🧑‍🧒‍🧒 justin.searls.co #followfriday", 27)
# Using a multi-grapheme marker
assert_equal "I am #thirsty", @subject.truncate("I am #thirsty", 14, marker: "🧑‍🧑‍🧒‍🧒")
assert_equal "I am #thirsty", @subject.truncate("I am #thirsty", 13, marker: "🧑‍🧑‍🧒‍🧒")
assert_equal "I am🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 12, marker: "🧑‍🧑‍🧒‍🧒")
assert_equal "I am🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 5, marker: "🧑‍🧑‍🧒‍🧒")
assert_equal "I a🧑‍🧑‍🧒‍🧒", @subject.truncate("I am #thirsty", 4, marker: "🧑‍🧑‍🧒‍🧒")
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment