Skip to content

Instantly share code, notes, and snippets.

@kovrov
Created July 28, 2011 14:20
Show Gist options
  • Save kovrov/1111629 to your computer and use it in GitHub Desktop.
Save kovrov/1111629 to your computer and use it in GitHub Desktop.
def isFirstCodepoint(b):
    """Return True if byte value *b* starts a UTF-8 encoded character.

    UTF-8 continuation bytes always have the bit pattern ``10xxxxxx``.
    Every other byte is either a single-byte (ASCII) character or the
    lead byte of a multi-byte sequence, so it marks a character boundary.

    *b* is an integer byte value, e.g. an element of a ``bytearray``.
    """
    # Keep only the two high bits; 0b10 there identifies a continuation byte.
    top_two_bits = b & 0b11000000
    return top_two_bits != 0b10000000
def splitUtf8(arr, i):
    """Split UTF-8 data *arr* near index *i* without cutting a character.

    If ``arr[i]`` is a continuation byte (bit pattern ``10xxxxxx``), the
    split point is moved back to the lead byte of that character, so both
    returned pieces begin on a character boundary.

    Parameters:
        arr: sequence of integer byte values supporting slicing, e.g. a
            ``bytearray``.
        i: requested split offset. An offset at or past the end of *arr*
            puts everything in the first piece.

    Returns:
        A ``(head, tail)`` tuple of slices of *arr* with
        ``head + tail == arr``.
    """
    # Splitting at (or beyond) the end is a valid boundary; there is no
    # byte to inspect there, so do not index into the buffer.
    if i >= len(arr):
        return arr[:i], arr[i:]

    # Back up over continuation bytes until a lead/ASCII byte is found, or
    # the start of the buffer is reached (malformed leading bytes).
    while i > 0 and (arr[i] & 0b11000000) == 0b10000000:
        i -= 1
    return arr[:i], arr[i:]
# Demo: split a sample UTF-8 stream at every interior byte offset and print
# the two pieces, showing that each split lands on a character boundary.
b = bytearray([0b01110111,  # one-byte Latin character
               0b11010000,  # two-byte Cyrillic character
               0b10110110,
               0b11101000,  # three-byte CJK ideograph
               0b10001000,
               0b10010001,
               0b11110001,  # four-byte sequence (no real four-byte character was found)
               0b10000001,
               0b10000001,
               0b10000001])

# Single-argument print(...) parses on both Python 2 and Python 3.
print("sampe utf-8 stream '%s'\n" % b.decode('utf-8'))
for i in range(1, len(b)):
    # Decode each piece before formatting so that Python 3 shows the text
    # rather than the bytearray repr.
    pieces = ["'%s'" % part.decode('utf-8') for part in splitUtf8(b, i)]
    print("%d %s" % (i, " ".join(pieces)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment