bensheldon · July 30, 2025 22:04
diff --git a/sms_counter.rb b/sms_counter.rb
 # frozen_string_literal: true

 # https://www.twilio.com/docs/glossary/what-is-gsm-7-character-encoding
 # Measures an SMS body: decides whether it can be sent as GSM-7 or needs UCS-2,
 # counts the units it occupies, and works out how many segments that takes.
 #
 # @example
 #   counter = SmsCounter.new('Hello world')
 #   counter.gsm7?    # => true
 #   counter.segments # => 1
 #   counter.human    # => "11 of 1530 characters; 1 of 10 segments"
 class SmsCounter
  # Segment budget used by #max_characters and reported by #human.
  MAX_SEGMENTS = 10

  # GSM-7: up to 160 counted characters fit in one segment; anything longer is
  # divided into chunks of 153 (see #segments).
  GSM7_SINGLE_SEGMENT_LENGTH = 160
  GSM7_MULTI_SEGMENT_LENGTH = 153

  # UCS-2: the same scheme with 70 / 67 UTF-16 code units.
  UCS2_SINGLE_SEGMENT_LENGTH = 70
  UCS2_MULTI_SEGMENT_LENGTH = 67

  # Original (misspelled) constant names, kept so existing references keep working.
  USC2_SINGLE_SEGMENT_LENGTH = UCS2_SINGLE_SEGMENT_LENGTH
  USC2_MULTI_SEGMENT_LENGTH = UCS2_MULTI_SEGMENT_LENGTH

  # Characters counted as one unit each in a GSM-7 message.
  GSM7_BASE_CHARACTERS = [
    '@', '£', '$', '¥', 'è', 'é', 'ù', 'ì', 'ò', 'Ç', "\n", 'Ø', 'ø', "\r", 'Å', 'å',
    'Δ', '_', 'Φ', 'Γ', 'Λ', 'Ω', 'Π', 'Ψ', 'Σ', 'Θ', 'Ξ', 'Æ', 'æ', 'ß', 'É',
    ' ', '!', '"', '#', '¤', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
    '¡', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ñ', 'Ü', '§',
    '¿', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ö', 'ñ', 'ü', 'à'
  ].to_set.freeze

  # Characters counted as two units each in a GSM-7 message (see #characters).
  GSM7_ESCAPED_CHARACTERS = [
    '|', '^', '€', '{', '}', '[', ']', '~', '\\'
  ].to_set.freeze

  # Every character that keeps a message in GSM-7.
  GSM7_CHARACTERS = (GSM7_BASE_CHARACTERS + GSM7_ESCAPED_CHARACTERS).freeze

  # @return [String] the message being measured
  attr_reader :message

  # @param message [String] the SMS body
  # @raise [ArgumentError] if message is not a String
  def initialize(message)
    raise ArgumentError, "message must be a string" unless message.is_a?(String)

    @message = message
  end

  # @return [Sanitizer] sanitizer for this message (built once and reused)
  def sanitizer
    @_sanitizer ||= Sanitizer.new(message)
  end

  # @return [SmsCounter] a new counter whose message has been sanitized
  def sanitized
    self.class.new(sanitizer.message)
  end

  # @return [Boolean] true when sanitizing would change the message
  def sanitizable?
    sanitizer.message != message
  end

  # @return [Boolean] true when every character is in GSM7_CHARACTERS
  def gsm7?
    unicode_characters.empty?
  end

  # @return [Boolean] true when at least one character is outside GSM7_CHARACTERS
  def ucs2?
    !gsm7?
  end

  # Size of the message in the units of its encoding: GSM-7 characters (escaped
  # characters count twice) or UTF-16 code units.
  #
  # @return [Integer]
  def characters
    if gsm7?
      message.each_char.sum { |char| GSM7_ESCAPED_CHARACTERS.include?(char) ? 2 : 1 }
    else
      # UTF-16 code units are two bytes each, so the byte size is always even.
      message.encode('UTF-16BE').bytesize / 2
    end
  end

  # @return [Integer] the most units MAX_SEGMENTS multi-part segments can carry
  def max_characters
    MAX_SEGMENTS * (gsm7? ? GSM7_MULTI_SEGMENT_LENGTH : UCS2_MULTI_SEGMENT_LENGTH)
  end

  # @return [Array<String>] the distinct non-GSM-7 characters, in order of first appearance
  def unicode_characters
    @_unicode_characters ||= message.each_char.reject { |char| GSM7_CHARACTERS.include?(char) }.uniq
  end

  # Number of segments needed for the message.
  #
  # NOTE(review): this is a plain ceiling division; it does not account for a
  # two-unit character landing on a segment boundary - confirm that is acceptable.
  #
  # @return [Integer]
  def segments
    single_limit, multi_limit =
      if gsm7?
        [GSM7_SINGLE_SEGMENT_LENGTH, GSM7_MULTI_SEGMENT_LENGTH]
      else
        [UCS2_SINGLE_SEGMENT_LENGTH, UCS2_MULTI_SEGMENT_LENGTH]
      end

    size = characters
    return 1 if size <= single_limit

    size.fdiv(multi_limit).ceil
  end

  # @return [String] a one-line summary, listing the non-GSM-7 characters when there are any
  def human
    summary = "#{characters} of #{max_characters} characters; #{segments} of #{MAX_SEGMENTS} segments"
    return summary if unicode_characters.empty?

    "#{summary}; Complex characters: #{unicode_characters.join(', ')}"
  end

  # Strips surrounding whitespace from a message and swaps known look-alike
  # characters for GSM-7 replacements.
  class Sanitizer
    # Look-alike character => replacement text.
    #
    # NOTE(review): '€' is also listed in GSM7_ESCAPED_CHARACTERS, so it gets
    # replaced even though it does not force UCS-2 - confirm this is intended.
    GSM7_CHARACTER_LOOKALIKES = {
      '–' => '-', # en dash
      '—' => '-', # em dash
      '‘' => "'", # left single quotation mark
      '’' => "'", # right single quotation mark
      '“' => '"', # left double quotation mark
      '”' => '"', # right double quotation mark
      '…' => '...', # ellipsis
      '°' => 'o', # degree symbol
      '©' => '(c)', # copyright symbol
      '®' => '(r)', # registered trademark symbol
      '™' => '(tm)', # trademark symbol
      '€' => 'EUR', # euro sign
      '•' => '*', # bullet point
      '»' => '>>', # right-pointing double angle quotation mark
      '«' => '<<', # left-pointing double angle quotation mark
      "\u00A0" => ' ', # non-breaking space; escaped so it cannot be mistaken for a plain space
    }.freeze

    # @return [String] the message exactly as it was given
    attr_reader :original_message

    # @param original_message [String] the message to sanitize
    # @raise [ArgumentError] if original_message is not a String
    def initialize(original_message)
      raise ArgumentError, "original_message must be a string" unless original_message.is_a?(String)

      @original_message = original_message
    end

    # @return [String] the stripped message with look-alikes replaced
    def message
      sanitized[:sanitized_message]
    end

    # @return [Array<String>] distinct characters that are neither GSM-7 nor a known look-alike
    def illegal_characters
      sanitized[:illegal_characters]
    end

    # @return [Array<String>] look-alike characters that were replaced, in table order
    def sanitized_characters
      sanitized[:sanitized_characters]
    end

    private

    def sanitized
      @_sanitized ||= sanitize
    end

    # Builds the result hash behind #message, #illegal_characters and #sanitized_characters.
    def sanitize
      # String#strip returns a new string, so original_message is never mutated.
      stripped = original_message.strip

      replaced = GSM7_CHARACTER_LOOKALIKES.keys.select { |lookalike| stripped.include?(lookalike) }
      unsupported = stripped.each_char.reject do |char|
        GSM7_CHARACTERS.include?(char) || GSM7_CHARACTER_LOOKALIKES.key?(char)
      end
      sanitized_message = replaced.reduce(stripped) do |text, lookalike|
        text.gsub(lookalike, GSM7_CHARACTER_LOOKALIKES.fetch(lookalike))
      end

      {
        sanitized_message: sanitized_message,
        illegal_characters: unsupported.uniq,
        sanitized_characters: replaced,
      }
    end
  end
 end
	# frozen_string_literal: true

	# https://www.twilio.com/docs/glossary/what-is-gsm-7-character-encoding
	# Measures an SMS body: decides whether it can be sent as GSM-7 or needs UCS-2,
	# counts the units it occupies, and works out how many segments that takes.
	#
	# @example
	#   counter = SmsCounter.new('Hello world')
	#   counter.gsm7?    # => true
	#   counter.segments # => 1
	#   counter.human    # => "11 of 1530 characters; 1 of 10 segments"
	class SmsCounter
	  # Segment budget used by #max_characters and reported by #human.
	  MAX_SEGMENTS = 10

	  # GSM-7: up to 160 counted characters fit in one segment; anything longer is
	  # divided into chunks of 153 (see #segments).
	  GSM7_SINGLE_SEGMENT_LENGTH = 160
	  GSM7_MULTI_SEGMENT_LENGTH = 153

	  # UCS-2: the same scheme with 70 / 67 UTF-16 code units.
	  UCS2_SINGLE_SEGMENT_LENGTH = 70
	  UCS2_MULTI_SEGMENT_LENGTH = 67

	  # Original (misspelled) constant names, kept so existing references keep working.
	  USC2_SINGLE_SEGMENT_LENGTH = UCS2_SINGLE_SEGMENT_LENGTH
	  USC2_MULTI_SEGMENT_LENGTH = UCS2_MULTI_SEGMENT_LENGTH

	  # Characters counted as one unit each in a GSM-7 message.
	  GSM7_BASE_CHARACTERS = [
	    '@', '£', '$', '¥', 'è', 'é', 'ù', 'ì', 'ò', 'Ç', "\n", 'Ø', 'ø', "\r", 'Å', 'å',
	    'Δ', '_', 'Φ', 'Γ', 'Λ', 'Ω', 'Π', 'Ψ', 'Σ', 'Θ', 'Ξ', 'Æ', 'æ', 'ß', 'É',
	    ' ', '!', '"', '#', '¤', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
	    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
	    '¡', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
	    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ñ', 'Ü', '§',
	    '¿', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
	    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ö', 'ñ', 'ü', 'à'
	  ].to_set.freeze

	  # Characters counted as two units each in a GSM-7 message (see #characters).
	  GSM7_ESCAPED_CHARACTERS = [
	    '|', '^', '€', '{', '}', '[', ']', '~', '\\'
	  ].to_set.freeze

	  # Every character that keeps a message in GSM-7.
	  GSM7_CHARACTERS = (GSM7_BASE_CHARACTERS + GSM7_ESCAPED_CHARACTERS).freeze

	  # @return [String] the message being measured
	  attr_reader :message

	  # @param message [String] the SMS body
	  # @raise [ArgumentError] if message is not a String
	  def initialize(message)
	    raise ArgumentError, "message must be a string" unless message.is_a?(String)

	    @message = message
	  end

	  # @return [Sanitizer] sanitizer for this message (built once and reused)
	  def sanitizer
	    @_sanitizer ||= Sanitizer.new(message)
	  end

	  # @return [SmsCounter] a new counter whose message has been sanitized
	  def sanitized
	    self.class.new(sanitizer.message)
	  end

	  # @return [Boolean] true when sanitizing would change the message
	  def sanitizable?
	    sanitizer.message != message
	  end

	  # @return [Boolean] true when every character is in GSM7_CHARACTERS
	  def gsm7?
	    unicode_characters.empty?
	  end

	  # @return [Boolean] true when at least one character is outside GSM7_CHARACTERS
	  def ucs2?
	    !gsm7?
	  end

	  # Size of the message in the units of its encoding: GSM-7 characters (escaped
	  # characters count twice) or UTF-16 code units.
	  #
	  # @return [Integer]
	  def characters
	    if gsm7?
	      message.each_char.sum { |char| GSM7_ESCAPED_CHARACTERS.include?(char) ? 2 : 1 }
	    else
	      # UTF-16 code units are two bytes each, so the byte size is always even.
	      message.encode('UTF-16BE').bytesize / 2
	    end
	  end

	  # @return [Integer] the most units MAX_SEGMENTS multi-part segments can carry
	  def max_characters
	    MAX_SEGMENTS * (gsm7? ? GSM7_MULTI_SEGMENT_LENGTH : UCS2_MULTI_SEGMENT_LENGTH)
	  end

	  # @return [Array<String>] the distinct non-GSM-7 characters, in order of first appearance
	  def unicode_characters
	    @_unicode_characters ||= message.each_char.reject { |char| GSM7_CHARACTERS.include?(char) }.uniq
	  end

	  # Number of segments needed for the message.
	  #
	  # NOTE(review): this is a plain ceiling division; it does not account for a
	  # two-unit character landing on a segment boundary - confirm that is acceptable.
	  #
	  # @return [Integer]
	  def segments
	    single_limit, multi_limit =
	      if gsm7?
	        [GSM7_SINGLE_SEGMENT_LENGTH, GSM7_MULTI_SEGMENT_LENGTH]
	      else
	        [UCS2_SINGLE_SEGMENT_LENGTH, UCS2_MULTI_SEGMENT_LENGTH]
	      end

	    size = characters
	    return 1 if size <= single_limit

	    size.fdiv(multi_limit).ceil
	  end

	  # @return [String] a one-line summary, listing the non-GSM-7 characters when there are any
	  def human
	    summary = "#{characters} of #{max_characters} characters; #{segments} of #{MAX_SEGMENTS} segments"
	    return summary if unicode_characters.empty?

	    "#{summary}; Complex characters: #{unicode_characters.join(', ')}"
	  end

	  # Strips surrounding whitespace from a message and swaps known look-alike
	  # characters for GSM-7 replacements.
	  class Sanitizer
	    # Look-alike character => replacement text.
	    #
	    # NOTE(review): '€' is also listed in GSM7_ESCAPED_CHARACTERS, so it gets
	    # replaced even though it does not force UCS-2 - confirm this is intended.
	    GSM7_CHARACTER_LOOKALIKES = {
	      '–' => '-', # en dash
	      '—' => '-', # em dash
	      '‘' => "'", # left single quotation mark
	      '’' => "'", # right single quotation mark
	      '“' => '"', # left double quotation mark
	      '”' => '"', # right double quotation mark
	      '…' => '...', # ellipsis
	      '°' => 'o', # degree symbol
	      '©' => '(c)', # copyright symbol
	      '®' => '(r)', # registered trademark symbol
	      '™' => '(tm)', # trademark symbol
	      '€' => 'EUR', # euro sign
	      '•' => '*', # bullet point
	      '»' => '>>', # right-pointing double angle quotation mark
	      '«' => '<<', # left-pointing double angle quotation mark
	      "\u00A0" => ' ', # non-breaking space; escaped so it cannot be mistaken for a plain space
	    }.freeze

	    # @return [String] the message exactly as it was given
	    attr_reader :original_message

	    # @param original_message [String] the message to sanitize
	    # @raise [ArgumentError] if original_message is not a String
	    def initialize(original_message)
	      raise ArgumentError, "original_message must be a string" unless original_message.is_a?(String)

	      @original_message = original_message
	    end

	    # @return [String] the stripped message with look-alikes replaced
	    def message
	      sanitized[:sanitized_message]
	    end

	    # @return [Array<String>] distinct characters that are neither GSM-7 nor a known look-alike
	    def illegal_characters
	      sanitized[:illegal_characters]
	    end

	    # @return [Array<String>] look-alike characters that were replaced, in table order
	    def sanitized_characters
	      sanitized[:sanitized_characters]
	    end

	    private

	    def sanitized
	      @_sanitized ||= sanitize
	    end

	    # Builds the result hash behind #message, #illegal_characters and #sanitized_characters.
	    def sanitize
	      # String#strip returns a new string, so original_message is never mutated.
	      stripped = original_message.strip

	      replaced = GSM7_CHARACTER_LOOKALIKES.keys.select { |lookalike| stripped.include?(lookalike) }
	      unsupported = stripped.each_char.reject do |char|
	        GSM7_CHARACTERS.include?(char) || GSM7_CHARACTER_LOOKALIKES.key?(char)
	      end
	      sanitized_message = replaced.reduce(stripped) do |text, lookalike|
	        text.gsub(lookalike, GSM7_CHARACTER_LOOKALIKES.fetch(lookalike))
	      end

	      {
	        sanitized_message: sanitized_message,
	        illegal_characters: unsupported.uniq,
	        sanitized_characters: replaced,
	      }
	    end
	  end
	end