Last active
January 28, 2025 11:47
-
-
Save searls/9d8ee42929da99ae268477eb20818da6 to your computer and use it in GitHub Desktop.
Truncating content to fit into a tweet-like post without accidentally truncating in the middle of a URL or a hashtag is… not fun?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Platforms
  class TruncatesContent
    # Locates every occurrence of a pattern inside a string and reports where
    # each one sits, measured both in characters (code points) and in grapheme
    # clusters, so callers can pick whichever unit their length limit uses.
    class IdentifiesPatternRanges
      # A "#" that does not directly follow a word character, then a letter,
      # then any run of letters, combining marks, decimal digits or underscores.
      HASHTAG_PATTERN = /\B#\p{L}[\p{L}\p{M}\p{Nd}_]*/

      # Adapted from https://github.com/amogil/url_regex/blob/master/lib/url_regex.rb
      # Compiled below in extended mode, so whitespace and "#" comments inside
      # the string are ignored by the regexp engine.
      URL_PATTERN_STRING = '
        # scheme
        (?:(?:https?|ftp)://)?
        # user:pass authentication
        (?:\S+(?::\S*)?@)?
        (?:
          # IP address exclusion
          # private & local networks
          (?!(?:10|127)(?:\.\d{1,3}){3})
          (?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})
          (?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})
          # IP address dotted notation octets
          # excludes loopback network 0.0.0.0
          # excludes reserved space >= 224.0.0.0
          # excludes network & broadcast addresses
          # (first & last IP address of each class)
          (?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
          (?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}
          (?:\.(?:[0-9]\d?|1\d\d|2[0-4]\d|25[0-5]))
        |
          # host name
          (?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)
          # domain name
          (?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*
          # TLD identifier based on top 200 https://dnsinstitute.com/research/popular-tld-rank/
          (?:\.(?:com|net|ru|org|info|in|ir|uk|au|de|ua|ca|tr|co|jp|vn|cn|gr|fr|tk|tw|id|br|io|xyz|it|nl|pl|za|us|eu|mx|ch|biz|me|il|es|online|by|xn--p1ai|nz|kr|cz|ro|cf|ar|club|my|tv|kz|cl|pk|pro|site|th|se|sg|cc|be|rs|top|ga|ma|hu|ae|su|dk|hk|at|ml|shop|store|ng|np|no|app|live|pe|ph|ie|lk|gq|edu|fi|ai|sa|pw|tech|bd|sk|ke|pt|az|space|mk|ge|tn|lt|dev|to|gov|md|asia|lv|uz|hr|mn|website|am|ws|life|fun|news|mobi|vip|ee|bg|la|ec|blog|cloud|si|work|uy|link|nu|ba|agency|icu|media|im|digital|do|bz|kg|is|world|al|ug|design|xxx|cm|mu|one|today|so|sh|tj|name|network|gg|ac|guru|best|studio|eg|fm|ms|cx|sc|ve|global|dz|sx|vc|group|qa|nf|cat|py|win|ki|ps|buzz|gt|finance|academy|host|ly|bo|travel|company|art|tz|zw|center|jo|cr|ltd|click|nyc|solutions|lu|tokyo|rocks|team|cy|games|coop|aero|market|cyou|video|ci))
          # TLD may end with dot
          \.?
        )
        # port number
        (?::\d{2,5})?
        # resource path
        (?:[/?#]\S*)?
      '.freeze

      # Case-insensitive, extended-mode compilation of URL_PATTERN_STRING.
      URL_PATTERN = /#{URL_PATTERN_STRING}/xi

      # Finds every match of +pattern+ in +str+.
      #
      # @param str [String] text to search
      # @param pattern [Regexp] pattern to look for; capture groups are allowed
      # @return [Array<Hash>] one hash per match, in order of appearance, with:
      #   :substring       - the full matched text
      #   :grapheme_index  - grapheme clusters preceding the match
      #   :grapheme_length - grapheme clusters in the match
      #   :char_index      - characters preceding the match
      #   :char_length     - characters in the match
      def identify(str, pattern)
        matches = []
        str.scan(pattern) do
          match_data = Regexp.last_match
          # Read the whole match from the MatchData: when the pattern contains
          # capture groups, scan yields an Array of the groups, not the match.
          substring = match_data[0]
          char_index = match_data.begin(0)
          matches << {
            substring: substring,
            grapheme_index: grapheme_count(str[0...char_index]),
            grapheme_length: grapheme_count(substring),
            char_index: char_index,
            char_length: substring.length
          }
        end
        matches
      end

      private

      # Number of grapheme clusters in +text+ after NFC normalization.
      def grapheme_count(text)
        text.unicode_normalize(:nfc).grapheme_clusters.count
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "test_helper" | |
class Platforms::TruncatesContent
  # Exercises IdentifiesPatternRanges#identify against plain text, URLs,
  # hashtags and emoji sequences built from several code points.
  class IdentifiesPatternRangesTest < ActiveSupport::TestCase
    # The emoji fixtures are spelled with explicit escapes because they contain
    # zero-width joiners (U+200D), which are invisible in most editors and easy
    # to lose when the file is copied around.
    ZWJ = "\u200D"
    # Adult + adult + child + child family: 7 code points, 1 grapheme cluster.
    FAMILY = "\u{1F9D1}#{ZWJ}\u{1F9D1}#{ZWJ}\u{1F9D2}#{ZWJ}\u{1F9D2}"
    # Black cat (cat + ZWJ + black square): 3 code points, 1 grapheme cluster.
    BLACK_CAT = "\u{1F408}#{ZWJ}\u2B1B"
    # Plain cat: 1 code point.
    CAT = "\u{1F408}"
    # Green heart: 1 code point.
    GREEN_HEART = "\u{1F49A}"

    setup do
      @subject = IdentifiesPatternRanges.new
    end

    def test_single_character_matches
      assert_equal [
        {substring: "a", grapheme_index: 0, grapheme_length: 1, char_index: 0, char_length: 1},
        {substring: "b", grapheme_index: 3, grapheme_length: 1, char_index: 3, char_length: 1},
        {substring: "c", grapheme_index: 6, grapheme_length: 1, char_index: 6, char_length: 1}
      ], @subject.identify("a1 b2 c3", /[a-z]/)
    end

    def test_urls
      assert_equal [
        {substring: "example.com", grapheme_index: 20, grapheme_length: 11, char_index: 20, char_length: 11},
        {substring: "http://www.aol.com/hi?name=justin", grapheme_index: 36, grapheme_length: 33, char_index: 36, char_length: 33}
      ], @subject.identify("My favorite site is example.com and http://www.aol.com/hi?name=justin -- how about you?", IdentifiesPatternRanges::URL_PATTERN)

      assert_equal [
        {substring: "https://example.com?peeps=true", grapheme_index: 21, grapheme_length: 30, char_index: 21, char_length: 30}
      ], @subject.identify("I'm just full of for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
    end

    # Emoji ahead of the URL push the character index further than the
    # grapheme index once they span more than one code point.
    def test_urls_preceded_by_emoji
      assert_equal [
        {substring: "https://example.com?peeps=true", grapheme_index: 23, grapheme_length: 30, char_index: 23, char_length: 30}
      ], @subject.identify("I'm just full of #{GREEN_HEART} for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)

      assert_equal [
        {substring: "https://example.com?peeps=true", grapheme_index: 25, grapheme_length: 30, char_index: 27, char_length: 30}
      ], @subject.identify("I'm #{BLACK_CAT} just full of #{GREEN_HEART} for https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)

      assert_equal [
        {substring: "https://example.com?peeps=true", grapheme_index: 27, grapheme_length: 30, char_index: 31, char_length: 30}
      ], @subject.identify("I'm #{BLACK_CAT} just full of #{GREEN_HEART} for #{BLACK_CAT} https://example.com?peeps=true <- this is awesome! ", IdentifiesPatternRanges::URL_PATTERN)
    end

    def test_patterns_made_of_emoji
      assert_equal [
        {substring: FAMILY, grapheme_index: 7, grapheme_length: 1, char_index: 7, char_length: 7},
        {substring: FAMILY, grapheme_index: 31, grapheme_length: 1, char_index: 39, char_length: 7}
      ], @subject.identify("Humble #{FAMILY} with a #{BLACK_CAT} but mostly a #{FAMILY} of humans", /#{FAMILY}/)

      assert_equal [
        {substring: "#{FAMILY}a#{BLACK_CAT}", grapheme_index: 15, grapheme_length: 3, char_index: 23, char_length: 11},
        {substring: "#{FAMILY}boop#{BLACK_CAT}", grapheme_index: 32, grapheme_length: 6, char_index: 48, char_length: 14}
      ], @subject.identify("What if #{FAMILY}#{BLACK_CAT} but #{FAMILY}a#{BLACK_CAT} is sometimes #{FAMILY}boop#{BLACK_CAT} but never #{FAMILY}c#{CAT} in this", /#{FAMILY}\w+#{BLACK_CAT}/)
    end

    def test_urls_containing_emoji
      assert_equal [
        {substring: "http://www.foo.co/#{FAMILY}#{BLACK_CAT}", grapheme_index: 8, grapheme_length: 20, char_index: 8, char_length: 28},
        {substring: "https://me.com/pants?name=#{FAMILY}boop#{BLACK_CAT}", grapheme_index: 37, grapheme_length: 32, char_index: 45, char_length: 40}
      ], @subject.identify("site 1: http://www.foo.co/#{FAMILY}#{BLACK_CAT} site 2: https://me.com/pants?name=#{FAMILY}boop#{BLACK_CAT} ", IdentifiesPatternRanges::URL_PATTERN)
    end

    def test_urls_require_a_known_tld
      # limit TLDs to top 200 known ones
      assert_empty @subject.identify("Sometimes at the end of sentences.he forgets to put a space after the period", IdentifiesPatternRanges::URL_PATTERN)
    end

    def test_hashtags
      # As of 1/22/25, only mastodon supports emoji in hashtags so just #foo will count at the end here
      assert_equal [
        {substring: "#cool", grapheme_index: 7, grapheme_length: 5, char_index: 7, char_length: 5},
        {substring: "#cat", grapheme_index: 13, grapheme_length: 4, char_index: 13, char_length: 4},
        {substring: "#foo", grapheme_index: 37, grapheme_length: 4, char_index: 37, char_length: 4}
      ], @subject.identify("I am a #cool #cat #1notavalidhashtag #foo#{BLACK_CAT}", IdentifiesPatternRanges::HASHTAG_PATTERN)
    end
  end
end
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Platforms
  # Shortens content so it fits a social post (blog excerpt, tweet, etc.)
  # without cutting through a URL or hashtag, since a half-truncated token
  # produces invalid links and unintended hashtags in the published text.
  #
  # IdentifiesPatternRanges marks where every such "unbreakable" token starts
  # and ends; this class then chooses a cut point that never falls inside one.
  #
  # When space allows, a marker (…) is appended to the shortened text. If one
  # more unbreakable token can be squeezed in by giving up the marker, the
  # token wins and the marker is left off.
  class TruncatesContent
    def initialize
      @identifies_pattern_ranges = IdentifiesPatternRanges.new
    end

    # Shortens +content+ to at most +limit+ units.
    #
    # @param content [String] text to shorten; surrounding whitespace is ignored
    # @param limit [Integer] maximum size of the result; negatives count as 0
    # @param mode [Symbol] unit of measure, :grapheme_clusters or :characters
    # @param marker [String] text appended to show that content was cut; it is
    #   dropped when it is larger than +limit+
    # @return [String] the stripped content when it fits, otherwise the
    #   shortened text
    # @raise [ArgumentError] when +mode+ is not a supported unit
    def truncate(content, limit, mode: :grapheme_clusters, marker: "…")
      # Strip before measuring so padding neither forces a needless truncation
      # nor shifts the token offsets relative to the text that is returned.
      content = content.strip
      limit = [limit, 0].max
      chars = split(content, mode)
      return content if chars.size <= limit

      # In theory, we could have alternate ellipsis markers
      marker_size = count(marker, mode)
      if marker_size > limit
        # A marker that cannot fit would make the cutoff negative, and a
        # negative range end keeps nearly the whole string instead of cutting.
        marker = ""
        marker_size = 0
      end
      cutoff = limit - marker_size # leave room for the marker

      # Of the tokens that straddle the cutoff, only the earliest one matters.
      unbreakables = unbreakable_tokens(content, mode)
      straddling = unbreakables
        .select { |token| intersects?(token, cutoff) }
        .min_by { |token| token[:index] }

      truncated = kept_chars(chars, straddling, limit, cutoff).join.rstrip
      with_marker(truncated, marker, unbreakables, limit, mode)
    end

    private

    # Every URL and hashtag in +content+, as {substring:, index:, length:}
    # hashes measured in the unit selected by +mode+.
    def unbreakable_tokens(content, mode)
      @identifies_pattern_ranges.identify(content, IdentifiesPatternRanges::URL_PATTERN)
        .concat(@identifies_pattern_ranges.identify(content, IdentifiesPatternRanges::HASHTAG_PATTERN))
        .map { |match| normalize_match(match, mode) }
    end

    # Picks the leading slice of +chars+ to keep: everything up to the cutoff
    # when no token straddles it; otherwise through the end of the token when
    # that fits within +limit+ without a marker, or up to the token's start.
    def kept_chars(chars, token, limit, cutoff)
      return chars[0...cutoff] unless token

      through_token = chars[0...(token[:index] + token[:length])]
      (through_token.size <= limit) ? through_token : chars[0...token[:index]]
    end

    # Appends +marker+ to +truncated+. A marker directly after a URL or hashtag
    # may stop a UI from rendering the token correctly, so when the text ends
    # inside or on such a token the marker is separated by a space, or omitted
    # when the extra space does not fit.
    def with_marker(truncated, marker, unbreakables, limit, mode)
      return truncated if marker.empty?

      last_index = count(truncated, mode) - 1
      return "#{truncated}#{marker}" unless unbreakables.any? { |token| intersects?(token, last_index) }

      spaced = "#{truncated} #{marker}"
      (count(spaced, mode) <= limit) ? spaced : truncated
    end

    # Size of +content+ in the unit selected by +mode+.
    def count(content, mode)
      split(content, mode).size
    end

    # Breaks +content+ into an array of units selected by +mode+.
    def split(content, mode)
      case mode
      when :grapheme_clusters
        content.grapheme_clusters
      when :characters
        content.chars
      else
        raise ArgumentError, "Unknown mode: #{mode}"
      end
    end

    # Reduces an IdentifiesPatternRanges match to the index/length pair that
    # corresponds to +mode+.
    def normalize_match(match, mode)
      case mode
      when :grapheme_clusters
        {substring: match[:substring], index: match[:grapheme_index], length: match[:grapheme_length]}
      when :characters
        {substring: match[:substring], index: match[:char_index], length: match[:char_length]}
      else
        raise ArgumentError, "Unknown mode: #{mode}"
      end
    end

    # True when +limit+ falls strictly after the start of +match+ and strictly
    # before its end.
    def intersects?(match, limit)
      match[:index] < limit && (match[:index] + match[:length]) > limit
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "test_helper" | |
module Platforms
  # Exercises TruncatesContent#truncate in its default grapheme-cluster mode.
  class TruncatesContentTest < ActiveSupport::TestCase
    # Adult + adult + child + child family emoji, spelled with explicit escapes
    # because its zero-width joiners (U+200D) are invisible in most editors.
    # It is several code points but a single grapheme cluster.
    FAMILY = "\u{1F9D1}\u200D\u{1F9D1}\u200D\u{1F9D2}\u200D\u{1F9D2}"

    LINK_POST = "#{FAMILY} https://a.co/foo?bar=true <-- free links!"
    HASHTAG_POST = "Follow me #{FAMILY} #followfriday #peeps"
    LINK_AND_HASHTAG_POST = "Follow me #{FAMILY} justin.searls.co #followfriday"

    setup do
      @subject = TruncatesContent.new
    end

    def test_plain_text
      # Regardless of whether there's plenty of length
      assert_equal "Hello", @subject.truncate(" Hello ", 50)
      assert_equal "Hello", @subject.truncate("Hello", 5)
      assert_equal "Hello…", @subject.truncate("Hello, world!", 6)
      assert_equal "Hello, world!", @subject.truncate("Hello, world!", 14)
      assert_equal "Hello, world!", @subject.truncate("Hello, world!", 13)
      assert_equal "Hello, worl…", @subject.truncate("Hello, world!", 12)
    end

    def test_bare_domains
      assert_equal "#{FAMILY} acme.co", @subject.truncate("#{FAMILY} acme.co", 10)
      assert_equal "#{FAMILY} acme.co", @subject.truncate("#{FAMILY} acme.co", 9)
      assert_equal "#{FAMILY}…", @subject.truncate("#{FAMILY} acme.co", 8)
      assert_equal "#{FAMILY}…", @subject.truncate("#{FAMILY} www.example.com is cool", 8)
    end

    def test_urls
      # Room for the URL:
      assert_equal "#{FAMILY} https://a.co/foo?bar=true <…", @subject.truncate(LINK_POST, 30)
      # Don't put a marker right after an unbreakable token (since URLs, hashtags might break if followed by …)
      assert_equal "#{FAMILY} https://a.co/foo?bar=true …", @subject.truncate(LINK_POST, 29)
      # Skip the marker entirely if it would be right after the URL and there's no room for it
      assert_equal "#{FAMILY} https://a.co/foo?bar=true", @subject.truncate(LINK_POST, 28)
      # TODO (from the original author): reported as failing because room is reserved for the
      # marker first, which makes the marker limit impinge on the URL even though it would fit.
      # NOTE(review): confirm whether this still fails before relying on it.
      assert_equal "#{FAMILY} https://a.co/foo?bar=true", @subject.truncate(LINK_POST, 27)
      # No room for the url
      assert_equal "#{FAMILY}…", @subject.truncate(LINK_POST, 26)
      # Really not enough room for the url
      assert_equal "#{FAMILY}…", @subject.truncate(LINK_POST, 4)
    end

    def test_hashtags
      assert_equal "Follow me #{FAMILY} #followfriday …", @subject.truncate(HASHTAG_POST, 27)
      assert_equal "Follow me #{FAMILY} #followfriday", @subject.truncate(HASHTAG_POST, 26)
      assert_equal "Follow me #{FAMILY} #followfriday", @subject.truncate(HASHTAG_POST, 25)
      assert_equal "Follow me #{FAMILY}…", @subject.truncate(HASHTAG_POST, 24)
    end

    def test_url_followed_by_hashtag
      assert_equal "Follow me #{FAMILY} justin.searls.co #followfriday", @subject.truncate(LINK_AND_HASHTAG_POST, 42)
      assert_equal "Follow me #{FAMILY} justin.searls.co …", @subject.truncate(LINK_AND_HASHTAG_POST, 41)
      assert_equal "Follow me #{FAMILY} justin.searls.co …", @subject.truncate(LINK_AND_HASHTAG_POST, 30)
      assert_equal "Follow me #{FAMILY} justin.searls.co", @subject.truncate(LINK_AND_HASHTAG_POST, 29)
      assert_equal "Follow me #{FAMILY} justin.searls.co", @subject.truncate(LINK_AND_HASHTAG_POST, 28)
      assert_equal "Follow me #{FAMILY}…", @subject.truncate(LINK_AND_HASHTAG_POST, 27)
    end

    # The marker here is a single grapheme cluster built from several code
    # points, so in grapheme-cluster mode it still takes up one unit.
    def test_multi_code_point_marker
      assert_equal "I am #thirsty", @subject.truncate("I am #thirsty", 14, marker: FAMILY)
      assert_equal "I am #thirsty", @subject.truncate("I am #thirsty", 13, marker: FAMILY)
      assert_equal "I am#{FAMILY}", @subject.truncate("I am #thirsty", 12, marker: FAMILY)
      assert_equal "I am#{FAMILY}", @subject.truncate("I am #thirsty", 5, marker: FAMILY)
      assert_equal "I a#{FAMILY}", @subject.truncate("I am #thirsty", 4, marker: FAMILY)
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment