Created
July 30, 2025 22:04
-
-
Save bensheldon/e544bc7a7b6e8f2da89a1f4a6cee0909 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true
# https://www.twilio.com/docs/glossary/what-is-gsm-7-character-encoding
require 'set' # Set and Enumerable#to_set are only loaded implicitly on Ruby >= 3.2

# Reports how a text message would be counted and split when sent as SMS.
#
# A message whose characters all belong to GSM7_CHARACTERS is counted as GSM-7
# (characters from the escaped table count twice); any other message is counted
# as UCS-2, i.e. in UTF-16 code units.
#
# @example
#   counter = SmsCounter.new("Hello {world}")
#   counter.gsm7?      # => true
#   counter.characters # => 15
#   counter.segments   # => 1
class SmsCounter
  # Maximum number of segments used when reporting limits.
  MAX_SEGMENTS = 10

  # Capacity of a message that fits in one segment, and the per-segment
  # capacity once a message has to be split into several segments.
  GSM7_SINGLE_SEGMENT_LENGTH = 160
  GSM7_MULTI_SEGMENT_LENGTH = 153
  UCS2_SINGLE_SEGMENT_LENGTH = 70
  UCS2_MULTI_SEGMENT_LENGTH = 67

  # Historical misspellings ("USC2"), kept so existing references keep working.
  USC2_SINGLE_SEGMENT_LENGTH = UCS2_SINGLE_SEGMENT_LENGTH
  USC2_MULTI_SEGMENT_LENGTH = UCS2_MULTI_SEGMENT_LENGTH

  # Characters counted as a single GSM-7 character.
  GSM7_BASE_CHARACTERS = [
    '@', '£', '$', '¥', 'è', 'é', 'ù', 'ì', 'ò', 'Ç', "\n", 'Ø', 'ø', "\r", 'Å', 'å',
    'Δ', '_', 'Φ', 'Γ', 'Λ', 'Ω', 'Π', 'Ψ', 'Σ', 'Θ', 'Ξ', 'Æ', 'æ', 'ß', 'É',
    ' ', '!', '"', '#', '¤', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
    '¡', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ñ', 'Ü', '§',
    '¿', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ö', 'ñ', 'ü', 'à'
  ].to_set.freeze

  # GSM-7 characters that are counted as two characters each (see #characters).
  GSM7_ESCAPED_CHARACTERS = [
    '|', '^', '€', '{', '}', '[', ']', '~', '\\'
  ].to_set.freeze

  # Every character that keeps a message in GSM-7.
  GSM7_CHARACTERS = (GSM7_BASE_CHARACTERS + GSM7_ESCAPED_CHARACTERS).freeze

  # @return [String] the message being measured
  attr_reader :message

  # @param message [String] the text to measure
  # @raise [ArgumentError] if message is not a String
  def initialize(message)
    raise ArgumentError, "message must be a string" unless message.is_a?(String)

    @message = message
  end

  # @return [Sanitizer] a fresh sanitizer for this message
  def sanitizer
    Sanitizer.new(message)
  end

  # @return [SmsCounter] a new counter whose message has been sanitized
  def sanitized
    self.class.new(sanitizer.message)
  end

  # @return [Boolean] true when sanitizing would change the message; this
  #   includes surrounding whitespace being stripped
  def sanitizable?
    sanitized.message != message
  end

  # @return [Boolean] true when every character is in GSM7_CHARACTERS
  def gsm7?
    unicode_characters.empty?
  end

  # @return [Boolean] true when at least one character is outside GSM7_CHARACTERS
  def ucs2?
    !gsm7?
  end

  # Length of the message in the units of its encoding.
  #
  # @return [Integer] GSM-7 count (escaped characters count as 2), or the
  #   number of UTF-16 code units for a UCS-2 message
  def characters
    if gsm7?
      message.each_char.sum { |char| GSM7_ESCAPED_CHARACTERS.include?(char) ? 2 : 1 }
    else
      # A UTF-16 byte size is always even, so integer division is exact.
      message.encode('UTF-16BE').bytesize / 2
    end
  end

  # @return [Integer] how many characters fit into MAX_SEGMENTS segments for
  #   this message's encoding
  def max_characters
    MAX_SEGMENTS * multi_segment_length
  end

  # @return [Array<String>] the distinct characters outside GSM7_CHARACTERS,
  #   in order of first appearance
  def unicode_characters
    @_unicode_characters ||= message.each_char.reject { |char| GSM7_CHARACTERS.include?(char) }.uniq
  end

  # @return [Integer] number of segments needed; never less than 1, even for
  #   an empty message
  def segments
    return 1 if characters <= single_segment_length

    characters.fdiv(multi_segment_length).ceil
  end

  # @return [String] one-line summary of character and segment usage, followed
  #   by the non-GSM-7 characters when there are any
  def human
    summary = "#{characters} of #{max_characters} characters; #{segments} of #{MAX_SEGMENTS} segments"
    return summary if unicode_characters.empty?

    "#{summary}; Complex characters: #{unicode_characters.join(', ')}"
  end

  private

  # Capacity of a message that fits in one segment, for the detected encoding.
  def single_segment_length
    gsm7? ? GSM7_SINGLE_SEGMENT_LENGTH : UCS2_SINGLE_SEGMENT_LENGTH
  end

  # Per-segment capacity of a split message, for the detected encoding.
  def multi_segment_length
    gsm7? ? GSM7_MULTI_SEGMENT_LENGTH : UCS2_MULTI_SEGMENT_LENGTH
  end

  # Strips surrounding whitespace and replaces common typographic characters
  # with GSM-7 lookalikes.
  class Sanitizer
    # Replacement text for each character that gets rewritten.
    GSM7_CHARACTER_LOOKALIKES = {
      '–' => '-', # en dash
      '—' => '-', # em dash
      '‘' => "'", # left single quotation mark
      '’' => "'", # right single quotation mark
      '“' => '"', # left double quotation mark
      '”' => '"', # right double quotation mark
      '…' => '...', # ellipsis
      '°' => 'o', # degree symbol
      '©' => '(c)', # copyright symbol
      '®' => '(r)', # registered trademark symbol
      '™' => '(tm)', # trademark symbol
      # NOTE(review): '€' is also listed in GSM7_ESCAPED_CHARACTERS, so this
      # rewrites a character that already counts as GSM-7 - confirm intended.
      '€' => 'EUR', # euro sign
      '•' => '*', # bullet point
      '»' => '>>', # right-pointing double angle quotation mark
      '«' => '<<', # left-pointing double angle quotation mark
      "\u00A0" => ' ', # non-breaking space, escaped so the key stays visible
    }.freeze

    # @return [String] the message as given, before sanitizing
    attr_reader :original_message

    # @param original_message [String] the text to sanitize
    # @raise [ArgumentError] if original_message is not a String
    def initialize(original_message)
      raise ArgumentError, "original_message must be a string" unless original_message.is_a?(String)

      @original_message = original_message
    end

    # @return [String] the stripped message with lookalikes replaced
    def message
      sanitized[:sanitized_message]
    end

    # @return [Array<String>] distinct characters that are neither in
    #   GSM7_CHARACTERS nor a key of GSM7_CHARACTER_LOOKALIKES
    def illegal_characters
      sanitized[:illegal_characters]
    end

    # @return [Array<String>] lookalike characters that were replaced, in the
    #   order they are listed in GSM7_CHARACTER_LOOKALIKES
    def sanitized_characters
      sanitized[:sanitized_characters]
    end

    private

    # Memoized result of #sanitize.
    def sanitized
      @_sanitized ||= sanitize
    end

    # Builds the sanitized message together with the replaced and the
    # remaining non-GSM-7 characters.
    def sanitize
      stripped = original_message.strip
      replaced = GSM7_CHARACTER_LOOKALIKES.keys.select { |lookalike| stripped.include?(lookalike) }

      # A character that has a lookalike is reported as sanitized, not illegal.
      illegal = stripped.each_char.reject do |char|
        GSM7_CHARACTERS.include?(char) || GSM7_CHARACTER_LOOKALIKES.key?(char)
      end.uniq

      # No replacement text contains a lookalike key, so the substitutions
      # cannot affect one another.
      sanitized_message = replaced.reduce(stripped) do |text, lookalike|
        text.gsub(lookalike, GSM7_CHARACTER_LOOKALIKES.fetch(lookalike))
      end

      {
        sanitized_message: sanitized_message,
        illegal_characters: illegal,
        sanitized_characters: replaced,
      }
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment