Last active
October 20, 2016 05:15
-
-
Save EdisonChendi/7b6f8f3c3415e990263be7b1a588b77a to your computer and use it in GitHub Desktop.
Sensibly truncate a str/bytes (Python 3) or str/unicode (Python 2) string to a size limit measured in encoded bytes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
def trunc(s, limit, coding="UTF-8", postfix="..."):
    """Truncate text so that its encoded size does not exceed ``limit`` bytes.

    Characters are never split: the cut always falls on a character
    boundary, so the kept prefix may encode to fewer than ``limit`` bytes.

    Args:
        s: Text to truncate. A byte string (``bytes``/``bytearray``; ``str``
            on Python 2) is first decoded with ``coding``; a text string is
            used as is.
        limit: Maximum number of bytes the kept prefix may occupy once
            encoded with ``coding``.
        coding: Codec used both to decode byte input and to measure the
            size of each character.
        postfix: Marker appended only when something was actually cut off.
            It is not counted against ``limit``.

    Returns:
        A text (unicode) string: ``s`` itself (decoded if necessary) when it
        fits within ``limit``, otherwise the longest fitting prefix followed
        by ``postfix``.
    """
    text = s.decode(coding) if isinstance(s, (bytes, bytearray)) else s
    used = 0
    for index, char in enumerate(text):
        used += len(char.encode(coding))
        if used > limit:
            # ``char`` is the first character that no longer fits.
            return text[:index] + postfix
    # Everything fit: return the full text. Slicing by the last loop index
    # here would silently drop the final character.
    return text
# Demo. Runs unchanged on Python 2 and Python 3 (3.3+ for the u"" prefix):
# text literals are written as u"..." and byte strings are produced with an
# explicit .encode(), so the input types are the same on both interpreters.

# Text (unicode) input, measured as UTF-8.
a = u"你好世界," * 100
trunc_a = trunc(a, 50)
print(trunc_a)

# Byte-string input, decoded and measured as UTF-8.
b = (u"你好世界," * 100).encode("UTF-8")
trunc_b = trunc(b, 50)
print(trunc_b)

# Byte-string input in a non-default codec, with a custom postfix.
c = u'你好上你你你你你好上海,好上海,好上海,好上海,好上海,海'.encode("gb2312")
trunc_c = trunc(c, 20, coding="gb2312", postfix=u"呃呃呃")  # postfix must be text (unicode) on py2
print(trunc_c)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment