Skip to content

Instantly share code, notes, and snippets.

@bensheldon
Created July 30, 2025 22:04
Show Gist options
  • Save bensheldon/e544bc7a7b6e8f2da89a1f4a6cee0909 to your computer and use it in GitHub Desktop.
Save bensheldon/e544bc7a7b6e8f2da89a1f4a6cee0909 to your computer and use it in GitHub Desktop.
# frozen_string_literal: true
# https://www.twilio.com/docs/glossary/what-is-gsm-7-character-encoding
# Estimates how an SMS body will be encoded and how many segments it occupies.
#
# A message made up solely of characters in GSM7_CHARACTERS is counted as
# GSM-7; any other character switches the whole message to UCS-2 counting.
#
# @example
#   counter = SmsCounter.new("Hello")
#   counter.gsm7?      # => true
#   counter.characters # => 5
#   counter.segments   # => 1
class SmsCounter
  # Segment ceiling used for reporting in #max_characters and #human.
  MAX_SEGMENTS = 10

  # Character capacity of a message that fits in one segment, and the
  # per-segment capacity applied once a message needs several segments.
  GSM7_SINGLE_SEGMENT_LENGTH = 160
  GSM7_MULTI_SEGMENT_LENGTH = 153
  UCS2_SINGLE_SEGMENT_LENGTH = 70
  UCS2_MULTI_SEGMENT_LENGTH = 67

  # Historical misspellings ("USC2") kept as aliases so existing references
  # to these constants keep resolving.
  USC2_SINGLE_SEGMENT_LENGTH = UCS2_SINGLE_SEGMENT_LENGTH
  USC2_MULTI_SEGMENT_LENGTH = UCS2_MULTI_SEGMENT_LENGTH

  # Characters counted as one unit each in a GSM-7 message.
  GSM7_BASE_CHARACTERS = [
    '@', '£', '$', '¥', 'è', 'é', 'ù', 'ì', 'ò', 'Ç', "\n", 'Ø', 'ø', "\r", 'Å', 'å',
    'Δ', '_', 'Φ', 'Γ', 'Λ', 'Ω', 'Π', 'Ψ', 'Σ', 'Θ', 'Ξ', 'Æ', 'æ', 'ß', 'É',
    ' ', '!', '"', '#', '¤', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
    '¡', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ñ', 'Ü', '§',
    '¿', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ö', 'ñ', 'ü', 'à'
  ].to_set.freeze

  # Characters still treated as GSM-7 but counted as two units each
  # (see #characters).
  GSM7_ESCAPED_CHARACTERS = [
    '|', '^', '€', '{', '}', '[', ']', '~', '\\'
  ].to_set.freeze

  # Every character that keeps a message in GSM-7.
  GSM7_CHARACTERS = (GSM7_BASE_CHARACTERS + GSM7_ESCAPED_CHARACTERS).freeze

  # @return [String] the message being measured
  attr_reader :message

  # @param message [String] the SMS body to measure
  # @raise [ArgumentError] if message is not a String
  def initialize(message)
    raise ArgumentError, "message must be a string" unless message.is_a?(String)

    @message = message
  end

  # @return [Sanitizer] a fresh sanitizer for the current message
  def sanitizer
    Sanitizer.new(message)
  end

  # @return [SmsCounter] a new counter whose message has been sanitized
  def sanitized
    self.class.new(sanitizer.message)
  end

  # @return [Boolean] true when sanitizing would change the message
  #   (this includes stripping surrounding whitespace)
  def sanitizable?
    sanitized.message != message
  end

  # @return [Boolean] true when every character is in GSM7_CHARACTERS
  def gsm7?
    unicode_characters.empty?
  end

  # @return [Boolean] true when at least one character is outside GSM7_CHARACTERS
  def ucs2?
    !gsm7?
  end

  # Length of the message in encoding units.
  #
  # For GSM-7 each escaped character counts as two and every other character
  # as one. Otherwise the count is the number of UTF-16 code units, so a
  # character outside the Basic Multilingual Plane counts as two.
  #
  # @return [Integer]
  def characters
    if gsm7?
      message.each_char.sum { |char| GSM7_ESCAPED_CHARACTERS.include?(char) ? 2 : 1 }
    else
      # UTF-16 code units are two bytes each, so the byte size is always even.
      message.encode('UTF-16BE').bytesize / 2
    end
  end

  # @return [Integer] MAX_SEGMENTS times the multi-segment capacity of the
  #   encoding the current message is counted in
  def max_characters
    MAX_SEGMENTS * (gsm7? ? GSM7_MULTI_SEGMENT_LENGTH : UCS2_MULTI_SEGMENT_LENGTH)
  end

  # @return [Array<String>] the distinct characters that are not in
  #   GSM7_CHARACTERS, in order of first appearance (memoized)
  def unicode_characters
    @_unicode_characters ||= message.each_char.reject { |char| GSM7_CHARACTERS.include?(char) }.uniq
  end

  # Number of segments needed for the message. An empty message reports 1.
  #
  # NOTE(review): this is a purely arithmetic estimate (total units divided by
  # the per-segment capacity); it does not model two-unit characters that
  # would straddle a segment boundary — confirm that is acceptable.
  #
  # @return [Integer]
  def segments
    single_limit, multi_limit =
      if gsm7?
        [GSM7_SINGLE_SEGMENT_LENGTH, GSM7_MULTI_SEGMENT_LENGTH]
      else
        [UCS2_SINGLE_SEGMENT_LENGTH, UCS2_MULTI_SEGMENT_LENGTH]
      end

    count = characters
    return 1 if count <= single_limit

    count.fdiv(multi_limit).ceil
  end

  # @return [String] a one-line summary of character and segment usage,
  #   followed by the non-GSM-7 characters when there are any
  def human
    summary = "#{characters} of #{max_characters} characters; #{segments} of #{MAX_SEGMENTS} segments"
    return summary if gsm7?

    "#{summary}; Complex characters: #{unicode_characters.join(', ')}"
  end

  # Strips surrounding whitespace and swaps typographic look-alikes for
  # plain replacements, reporting what was replaced and what is still
  # outside GSM7_CHARACTERS.
  class Sanitizer
    # Look-alike character => replacement text. Applied in this order.
    #
    # NOTE(review): '€' is also listed in GSM7_ESCAPED_CHARACTERS, so its entry
    # rewrites a character that already counts as GSM-7 (two units become
    # three) — confirm this is intended.
    GSM7_CHARACTER_LOOKALIKES = {
      '–' => '-', # en dash
      '—' => '-', # em dash
      '‘' => "'", # left single quotation mark
      '’' => "'", # right single quotation mark
      '“' => '"', # left double quotation mark
      '”' => '"', # right double quotation mark
      '…' => '...', # ellipsis
      '°' => 'o', # degree symbol
      '©' => '(c)', # copyright symbol
      '®' => '(r)', # registered trademark symbol
      '™' => '(tm)', # trademark symbol
      '€' => 'EUR', # euro sign
      '•' => '*', # bullet point
      '»' => '>>', # right-pointing double angle quotation mark
      '«' => '<<', # left-pointing double angle quotation mark
      # Written as an escape: a literal U+00A0 is indistinguishable from an
      # ordinary space and is easily lost when the file is copied or reformatted.
      "\u00A0" => ' ', # non-breaking space (U+00A0)
    }.freeze

    # @return [String] the message exactly as it was given
    attr_reader :original_message

    # @param original_message [String] the text to sanitize
    # @raise [ArgumentError] if original_message is not a String
    def initialize(original_message)
      raise ArgumentError, "original_message must be a string" unless original_message.is_a?(String)

      @original_message = original_message
    end

    # @return [String] the stripped message with look-alikes replaced
    def message
      sanitized[:sanitized_message]
    end

    # @return [Array<String>] distinct characters outside GSM7_CHARACTERS
    #   that have no look-alike replacement
    def illegal_characters
      sanitized[:illegal_characters]
    end

    # @return [Array<String>] the look-alike characters that were replaced,
    #   in GSM7_CHARACTER_LOOKALIKES order
    def sanitized_characters
      sanitized[:sanitized_characters]
    end

    private

    # Memoized result of #sanitize.
    def sanitized
      @_sanitized ||= sanitize
    end

    # Builds the sanitized message together with the replaced and the
    # still-illegal characters.
    #
    # @return [Hash{Symbol => Object}] with keys :sanitized_message,
    #   :illegal_characters and :sanitized_characters
    def sanitize
      text = original_message.strip
      illegal = text.each_char.reject { |char| GSM7_CHARACTERS.include?(char) }.uniq
      replaced = []

      GSM7_CHARACTER_LOOKALIKES.each do |lookalike, replacement|
        next unless text.include?(lookalike)

        # Non-mutating gsub keeps this independent of whether strings are frozen.
        text = text.gsub(lookalike, replacement)
        illegal.delete(lookalike)
        replaced << lookalike
      end

      {
        sanitized_message: text,
        illegal_characters: illegal,
        sanitized_characters: replaced,
      }
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment