Last active
August 29, 2015 14:20
-
-
Save cypres/dcc12fbd8a17bdf1ff94 to your computer and use it in GitHub Desktop.
Sample code to count the number of characters in an SMS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cassert> // cassert | |
#include <cmath> // ceil | |
#include <iostream> // cout | |
#include <cstring> // memset, memcpy | |
// Given a NUL-terminated UTF-8 encoded string, calculate the length (in GSM
// characters) of the resulting GSM 03.38 converted string.
//
// Every input character counts as one GSM character; UTF-8 characters that
// can not be represented are assumed to be replaced with a single '?'. The
// characters ^ { } ~ [ ] \ | and € count as two, because they are sent as an
// escape followed by a second character.
//
// The input is expected to be valid UTF-8. A multi-byte sequence that is cut
// short by the terminator is still counted as one character, and the scan
// stops at the terminator instead of stepping over it.
size_t SmsLength(const char *str) {
  size_t length = 0;
  while (*str != '\0') {
    // Work on the unsigned byte value so the comparisons below do not depend
    // on whether plain char is signed on the target platform.
    const unsigned char lead = static_cast<unsigned char>(*str);
    ++length;
    if (__builtin_expect(lead < 0x80, 1)) {
      // Plain ASCII: one byte, possibly one of the escaped characters.
      ++str;
      switch (lead) {
        case '^': case '{': case '}': case '~':
        case '[': case ']': case '\\': case '|':
          ++length;  // Add an extra since these chars will be escaped
          break;
        default:
          break;
      }
      continue;
    }
    // The only non-ASCII char that needs to be escaped is € (E2 82 AC); all
    // other multi-byte chars count once. The short-circuit evaluation keeps
    // the look-ahead from reading beyond the terminator.
    if (__builtin_expect(lead == 0xE2, 0) &&
        static_cast<unsigned char>(str[1]) == 0x82 &&
        static_cast<unsigned char>(str[2]) == 0xAC) {
      ++length;
    }
    // Width of the sequence, derived from the high nibble of the lead byte.
    size_t width = 2;
    switch (lead & 0xF0) {
      case 0xE0:
        width = 3;
        break;
      case 0xF0:
        width = 4;
        break;
      default:
        break;
    }
    // Skip the sequence byte by byte so a truncated sequence can never move
    // the cursor past the NUL terminator.
    for (size_t i = 0; i < width && *str != '\0'; ++i) {
      ++str;
    }
  }
  return length;
}
// Get the number of SMSes an UTF-8 string will be split into | |
// Assumes | |
// - The string will be sent using GSM 03.38 encoding | |
// - The SMSC will use 8-bit reference numbers, leaving 153 chars pr. SMS | |
// If message contains unicode chars outside GSM 03.38 it should be sent as | |
// UCS-2, with lower amounts of chars for each split, see comments below. | |
int SmsSplits(const char *str) { | |
size_t length = SmsLength(str); | |
size_t chars_per_sms = 153; // 152 for 16-bit reference numbers, 66 for UCS-2 | |
size_t chars_single_sms = 160; // 70 for UCS-2 | |
if (length <= chars_single_sms) { | |
return 1; | |
} else { | |
return ceil(static_cast<double>(length)/chars_per_sms); | |
} | |
} | |
// Compile with: `clang++ -Ofast -g smslen.cc -o smslen`
// Run `time ./smslen`
// Self-test and micro-benchmark for SmsLength() and SmsSplits().
int main() {
  const char t[] = "Hej v€rden. Hvordan har du det i dag? "
                   "Jeg glæder mig til at se dig.";
  // Benchmark length calculation
  for (int i = 0; i < 10000000; i++) {
    assert(SmsLength(t) == 68);
  }
  // Try some unicode
  assert(SmsLength("~€") == 4);
  assert(SmsLength("こんにちは") == 5);  // would be sent as ????? with GSM
  // Various test cases for SmsSplits()
  const char s160[] = "Lorem ipsum dolor sit amet, consectetur adipiscing "
      "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. Nam ornare "
      "augue vel nisi tempus, id cras amet.";
  int split160 = SmsSplits(s160);
  std::cout << "SMS with 160 bytes of lipsum: " << split160 << '\n';
  assert(split160 == 1);
  // Copy the text including its NUL terminator; the buffer must hold all 160
  // characters plus the terminator, otherwise SmsSplits() would read past it.
  char s161[sizeof(s160)];
  std::memcpy(s161, s160, sizeof(s160));
  s161[159] = '~';  // ~ is two GSM 03.38 chars, so the result will be 161
  int split161 = SmsSplits(s161);
  std::cout << "SMS with 161 bytes of lipsum: " << split161 << '\n';
  assert(split161 == 2);
  char s305[306] = { 0 };  // zero-filled, so byte 305 stays the terminator
  std::memset(s305, 'a', 305);
  int split305 = SmsSplits(s305);
  std::cout << "SMS with 305 'a's: " << split305 << '\n';
  assert(split305 == 2);
  return 0;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Count how many GSM 03.38 characters a UTF-8 string occupies.
 *
 * Each character counts once; every occurrence of ^ { } \ ~ € | [ ] counts
 * one extra time because the regular expression below matches it.
 *
 * @param string $utf8String UTF-8 encoded message text
 * @return int character count plus the number of matched special characters
 */
function countGsm0338Length($utf8String)
{
    $escapedCount = preg_match_all('/[\\^{}\\\~€|\\[\\]]/mu', $utf8String, $matches);
    return mb_strlen($utf8String, 'utf-8') + $escapedCount;
}
/**
 * Compute how many SMS parts a message is split into.
 *
 * @param string $message UTF-8 encoded message text
 * @return int|float 1 when the message fits a single SMS, otherwise the
 *                   result of ceil() over the per-part capacity
 */
function countSmsSplits($message)
{
    $singleLimit = 160; // 70 for UCS-2
    $partLimit = 153; // 152 for 16-bit reference numbers, 66 for UCS-2
    $gsmLength = countGsm0338Length($message);
    return ($gsmLength <= $singleLimit) ? 1 : ceil($gsmLength / $partLimit);
}
// Mixed text containing € (counted twice) and æ (counted once).
$t = "Hej v€rden. Hvordan har du det i dag? Jeg glæder mig til at se dig.";
assert(countGsm0338Length($t) == 68);
assert(countSmsSplits($t) == 1);

// Exactly 160 plain characters still fit into a single SMS.
$s160 = "Lorem ipsum dolor sit amet, consectetur adipiscing "
    . "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. "
    . "Nam ornare augue vel nisi tempus, id cras amet.";
assert(countSmsSplits($s160) == 1);

// Offset 160 is one past the last character, so this appends a '~'.
$s160[160] = '~';
assert(countSmsSplits($s160) == 2);

// 305 plain characters need two parts.
$s305 = str_repeat('Lorem', 61);
assert(countSmsSplits($s305) == 2);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf8 | |
from __future__ import division | |
import re | |
from math import ceil | |
def smslen(message):
    """Return the number of GSM 03.38 characters needed for ``message``.

    Every character counts once; each occurrence of one of the characters
    ``\\ ^ { } ~ € | [ ]`` counts one extra time because it is sent escaped.

    :param message: the message text as a unicode string
    :returns: ``len(message)`` plus the number of escaped characters
    :raises Exception: if ``message`` is not a unicode string
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    if not isinstance(message, type(u'')):
        raise Exception('Need unicode aware strings to get sms len')
    # Character class: backslash ^ { } ~ euro-sign | [ ]
    escaped = re.findall(u'[\\\\^{}~\u20ac|\\[\\]]', message,
                         flags=re.M | re.U)
    return len(message) + len(escaped)
def smssplit(message):
    """Return the number of SMS parts ``message`` is split into.

    A message whose ``smslen`` is at most 160 fits a single SMS; a longer one
    is divided into parts of 153 characters, rounded up.
    """
    single_limit = 160  # 70 for UCS-2
    part_limit = 153  # 152 for 16-bit reference numbers, 66 for UCS-2
    gsm_length = smslen(message)
    if gsm_length > single_limit:
        return ceil(gsm_length / part_limit)
    return 1
# Mixed text containing the euro sign (counted twice).
t = unicode("Hej v€rden. Hvordan har du det i dag? "
            "Jeg glæder mig til at se dig.", 'utf8')
assert smslen(t) == 68
assert smslen(u"こんにちは") == 5

# Exactly 160 plain characters fit into a single SMS.
s160 = (u"Lorem ipsum dolor sit amet, consectetur adipiscing "
        "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. Nam ornare "
        "augue vel nisi tempus, id cras amet.")
assert smssplit(s160) == 1

# 159 plain characters followed by '~', which counts twice: 161 in total.
s161 = s160[:159] + '~'
assert smssplit(s161) == 2

# 305 plain characters need two parts.
s305 = u"Lorem" * 61
assert smssplit(s305) == 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function CountSmsSplits(string message): | |
length := number of UTF-8 decoded characters in message; // Unicode-aware, e.g. mb_strlen
// Add the number of times these special chars occur in message: '^{}~€[]\|', e.g. with a regex.
length := length + count of regular expression matches("/[\\^{}\\\~€|\\[\\]]/mu", message)
// Calculate number of splits | |
charsPerMessageInChain := 153 // 152 for 16-bit reference numbers, 66 for UCS-2 | |
charsSingleMessage := 160 // 70 for UCS-2 | |
if length <= charsSingleMessage: | |
return 1 | |
else: | |
return ceiling(length / charsPerMessageInChain) | |
end if | |
end function |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment