Skip to content

Instantly share code, notes, and snippets.

@pathumego
Forked from azmfaridee/iscii2unicode.py
Created January 3, 2018 15:34
Show Gist options
  • Save pathumego/81672787807c23f19518c622d9e7ebb8 to your computer and use it in GitHub Desktop.
Save pathumego/81672787807c23f19518c622d9e7ebb8 to your computer and use it in GitHub Desktop.
ISCII to Unicode Converter Script
#!/usr/bin/env python
# released under BSD License
# mary <[email protected]> aka meyarivan <[email protected]>
# inspired by ICU [ http://oss.software.ibm.com/icu/ ]
# The code is still alpha quality: it contains redundant code and is probably
# incorrect in places.
# If you find errors, please submit bug reports at indlinux.
# For usage, either run the script or scroll down to the end of the script.
import sys
import array
# Generic Constants
# Names starting with ISCII_ are single-byte ISCII values; the other names are
# Unicode code points, block bounds, masks or sentinels.
ISCII_ATR = 0x00EF  # prefix byte: the following byte is an attribute/script code
ATR_MASK = 0x004F
DANDA = 0x0964  # Unicode value that iscii_to_unicode gives for ISCII_DANDA
DELTA = 0x0080  # code point distance between two consecutive script blocks
DEV_ANUDATTA = 0x0952  # produced for ISCII_EXT followed by 0xB8
DOUBLE_DANDA = 0x0965  # produced for two consecutive ISCII_DANDA bytes
ISCII_EXT = 0x00F0  # prefix byte: the following byte is an extended code
EXT_RANGE_BEGIN = 0x00A1  # inclusive bounds of the bytes accepted after ISCII_EXT
EXT_RANGE_END = 0x00EE
HALANT = 0x094d  # Unicode value that iscii_to_unicode gives for ISCII_HALANT
INDIC_BLOCK_BEGIN = 0x0900
INDIC_BLOCK_END = 0x0D7F
INVALID_CHAR = 0xFFFF  # same value as the "no mapping" entries of iscii_to_unicode
ISCII_BEGIN = 0x00A0
ISCII_DANDA = 0x00EA
ISCII_HALANT = 0x00E8
ISCII_INV = 0x00D9  # prefix byte handled by Parser.handle_inv
ISCII_NUKTA = 0x00E9
LF = 0x000A
NO_CHAR = 0xFFFE  # sentinel: no pending / no resulting character
UNI_BEGIN = 0x0900  # bounds of the 128-slot block used as the mapping template
UNI_END = 0x097F
ZWJ = 0x200d  # ZERO WIDTH JOINER
ZWNJ = 0x200c  # ZERO WIDTH NON-JOINER
# Map between the ISCII script codes (the byte that follows the ATR switch)
# and the script index used for Unicode.
# The values are 0-based script indices: they select the column of
# validation_table and the key of script_maps, and index * 0x80 is the offset
# of the script's block from 0x0900.  They are one less than the 1-9 numbers
# accepted by Parser.set_script.  -1 marks the DEFAULT code, which names no
# script of its own.
# Bengali and Assamese have the same script except for two characters
# (according to ISCII-91), so both share index 1.
ISCII_SCRIPTS = {
0x40 : -1, # DEFAULT
0x42 : 0, # DEVNAG
0x43 : 1, # BENGALI
0x44 : 5, # TAMIL
0x45 : 6, # TELUGU
0x46 : 1, # ASSAMESE
0x47 : 4, # ORIYA
0x48 : 7, # KANNADA
0x49 : 8, # MALAYALAM
0x4A : 3, # GUJARATI
0x4B : 2 # PUNJABI
}
# ISCII special characters: prefix bytes that are interpreted together with
# the byte that follows them (see Parser.post_analysis)
ISCII_SPECIALS = [ISCII_ATR, ISCII_EXT, ISCII_INV]
# Template mapping from the upper-half ISCII bytes (0xA0-0xFF) to Unicode.
# Apart from 0xd9 (-> 0x200d) every mapped value lies in the 0x0900-0x097F
# block; make_script_maps() derives the per-script tables from it.
# 0xffff marks bytes without a mapping (0xeb-0xf0 and 0xfb-0xff), which
# make_script_maps() skips.
iscii_to_unicode = {
0xa0 : 0x900,
0xa1 : 0x901,
0xa2 : 0x902,
0xa3 : 0x903,
0xa4 : 0x905,
0xa5 : 0x906,
0xa6 : 0x907,
0xa7 : 0x908,
0xa8 : 0x909,
0xa9 : 0x90a,
0xaa : 0x90b,
0xab : 0x90e,
0xac : 0x90f,
0xad : 0x910,
0xae : 0x90d,
0xaf : 0x912,
0xb0 : 0x913,
0xb1 : 0x914,
0xb2 : 0x911,
0xb3 : 0x915,
0xb4 : 0x916,
0xb5 : 0x917,
0xb6 : 0x918,
0xb7 : 0x919,
0xb8 : 0x91a,
0xb9 : 0x91b,
0xba : 0x91c,
0xbb : 0x91d,
0xbc : 0x91e,
0xbd : 0x91f,
0xbe : 0x920,
0xbf : 0x921,
0xc0 : 0x922,
0xc1 : 0x923,
0xc2 : 0x924,
0xc3 : 0x925,
0xc4 : 0x926,
0xc5 : 0x927,
0xc6 : 0x928,
0xc7 : 0x929,
0xc8 : 0x92a,
0xc9 : 0x92b,
0xca : 0x92c,
0xcb : 0x92d,
0xcc : 0x92e,
0xcd : 0x92f,
0xce : 0x95f,
0xcf : 0x930,
0xd0 : 0x931,
0xd1 : 0x932,
0xd2 : 0x933,
0xd3 : 0x934,
0xd4 : 0x935,
0xd5 : 0x936,
0xd6 : 0x937,
0xd7 : 0x938,
0xd8 : 0x939,
0xd9 : 0x200d,
0xda : 0x93e,
0xdb : 0x93f,
0xdc : 0x940,
0xdd : 0x941,
0xde : 0x942,
0xdf : 0x943,
0xe0 : 0x946,
0xe1 : 0x947,
0xe2 : 0x948,
0xe3 : 0x945,
0xe4 : 0x94a,
0xe5 : 0x94b,
0xe6 : 0x94c,
0xe7 : 0x949,
0xe8 : 0x94d,
0xe9 : 0x93c,
0xea : 0x964,
0xeb : 0xffff,
0xec : 0xffff,
0xed : 0xffff,
0xee : 0xffff,
0xef : 0xffff,
0xf0 : 0xffff,
0xf1 : 0x966,
0xf2 : 0x967,
0xf3 : 0x968,
0xf4 : 0x969,
0xf5 : 0x96a,
0xf6 : 0x96b,
0xf7 : 0x96c,
0xf8 : 0x96d,
0xf9 : 0x96e,
0xfa : 0x96f,
0xfb : 0xffff,
0xfc : 0xffff,
0xfd : 0xffff,
0xfe : 0xffff,
0xff : 0xffff
}
"""
# code to generate the validation_table
( ya ya .. agreed that it is kludgy)
# need python 2.3
import unicodedata
UNI_BEGIN = 0x0900
UNI_END = 0x097F
DELTA = 0x80
SCRS = 9
table = []
for char in range(UNI_END - UNI_BEGIN + 1):
res = [0] * SCRS
for scr in range(0, SCRS):
val = unichr(UNI_BEGIN + (scr * DELTA) + char)
res[scr] = int(unicodedata.name(val, None) is not None)
table.append(res)
"""
# validation_table[offset][script] is 1 when the slot `offset` (0x00-0x7f)
# exists in the block of script index `script`, i.e. for the code point
# 0x0900 + script * 0x80 + offset, and 0 otherwise.  The nine columns follow
# the script indices of ISCII_SCRIPTS: 0 Devanagari, 1 Bengali, 2 Punjabi,
# 3 Gujarati, 4 Oriya, 5 Tamil, 6 Telugu, 7 Kannada, 8 Malayalam.
# According to the snippet kept in the string literal above, the rows were
# generated from unicodedata, so they reflect the Unicode data of the Python
# version used at that time.  The trailing comment of each row is its offset
# in hex and decimal.
validation_table = [
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x0 0
[1, 1, 0, 1, 1, 0, 1, 0, 0], # 0x1 1
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x2 2
[1, 1, 0, 1, 1, 1, 1, 1, 1], # 0x3 3
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x4 4
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x5 5
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6 6
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x7 7
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x8 8
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x9 9
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0xa 10
[1, 1, 0, 1, 1, 0, 1, 1, 1], # 0xb 11
[1, 1, 0, 0, 1, 0, 1, 1, 1], # 0xc 12
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0xd 13
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0xe 14
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0xf 15
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x10 16
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0x11 17
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0x12 18
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x13 19
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x14 20
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x15 21
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x16 22
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x17 23
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x18 24
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x19 25
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x1a 26
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x1b 27
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x1c 28
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x1d 29
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x1e 30
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x1f 31
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x20 32
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x21 33
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x22 34
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x23 35
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x24 36
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x25 37
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x26 38
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x27 39
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x28 40
[1, 0, 0, 0, 0, 1, 0, 0, 0], # 0x29 41
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x2a 42
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x2b 43
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x2c 44
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x2d 45
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x2e 46
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x2f 47
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x30 48
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0x31 49
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x32 50
[1, 0, 1, 1, 1, 1, 1, 1, 1], # 0x33 51
[1, 0, 0, 0, 0, 1, 0, 0, 1], # 0x34 52
[1, 0, 1, 1, 0, 1, 1, 1, 1], # 0x35 53
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x36 54
[1, 1, 0, 1, 1, 1, 1, 1, 1], # 0x37 55
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x38 56
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x39 57
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x3a 58
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x3b 59
[1, 1, 1, 1, 1, 0, 0, 0, 0], # 0x3c 60
[1, 0, 0, 1, 1, 0, 0, 0, 0], # 0x3d 61
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x3e 62
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x3f 63
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x40 64
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x41 65
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x42 66
[1, 1, 0, 1, 1, 0, 1, 1, 1], # 0x43 67
[1, 1, 0, 1, 0, 0, 1, 1, 0], # 0x44 68
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0x45 69
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0x46 70
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x47 71
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x48 72
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0x49 73
[1, 0, 0, 0, 0, 1, 1, 1, 1], # 0x4a 74
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x4b 75
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x4c 76
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x4d 77
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x4e 78
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x4f 79
[1, 0, 0, 1, 0, 0, 0, 0, 0], # 0x50 80
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x51 81
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x52 82
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x53 83
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x54 84
[0, 0, 0, 0, 0, 0, 1, 1, 0], # 0x55 85
[0, 0, 0, 0, 1, 0, 1, 1, 0], # 0x56 86
[0, 1, 0, 0, 1, 1, 0, 0, 1], # 0x57 87
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x58 88
[1, 0, 1, 0, 0, 0, 0, 0, 0], # 0x59 89
[1, 0, 1, 0, 0, 0, 0, 0, 0], # 0x5a 90
[1, 0, 1, 0, 0, 0, 0, 0, 0], # 0x5b 91
[1, 1, 1, 0, 1, 0, 0, 0, 0], # 0x5c 92
[1, 1, 0, 0, 1, 0, 0, 0, 0], # 0x5d 93
[1, 0, 1, 0, 0, 0, 0, 1, 0], # 0x5e 94
[1, 1, 0, 0, 1, 0, 0, 0, 0], # 0x5f 95
[1, 1, 0, 1, 1, 0, 1, 1, 1], # 0x60 96
[1, 1, 0, 0, 1, 0, 1, 1, 1], # 0x61 97
[1, 1, 0, 0, 0, 0, 0, 0, 0], # 0x62 98
[1, 1, 0, 0, 0, 0, 0, 0, 0], # 0x63 99
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x64 100
[1, 0, 0, 0, 0, 0, 0, 0, 0], # 0x65 101
[1, 1, 1, 1, 1, 0, 1, 1, 1], # 0x66 102
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x67 103
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x68 104
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x69 105
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6a 106
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6b 107
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6c 108
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6d 109
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6e 110
[1, 1, 1, 1, 1, 1, 1, 1, 1], # 0x6f 111
[1, 1, 1, 0, 1, 1, 0, 0, 0], # 0x70 112
[0, 1, 1, 0, 0, 1, 0, 0, 0], # 0x71 113
[0, 1, 1, 0, 0, 1, 0, 0, 0], # 0x72 114
[0, 1, 1, 0, 0, 0, 0, 0, 0], # 0x73 115
[0, 1, 1, 0, 0, 0, 0, 0, 0], # 0x74 116
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x75 117
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x76 118
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x77 119
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x78 120
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x79 121
[0, 1, 0, 0, 0, 0, 0, 0, 0], # 0x7a 122
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x7b 123
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x7c 124
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x7d 125
[0, 0, 0, 0, 0, 0, 0, 0, 0], # 0x7e 126
[0, 0, 0, 0, 0, 0, 0, 0, 0] # 0x7f 127
]
# Special characters formed by the combination of an ISCII byte and a
# following nukta.
# Keys are the ISCII byte that precedes ISCII_NUKTA; values are the single
# code point (all in the 0x0900-0x097F block) that stands for the pair.
# Looked up through Parser.is_nukta_special.
nukta_specials = {
0xA6 : 0x090c,
0xEA : 0x093D,
0xDF : 0x0944,
0xA1 : 0x0950,
0xB3 : 0x0958,
0xB4 : 0x0959,
0xB5 : 0x095A,
0xBA : 0x095B,
0xBF : 0x095C,
0xC0 : 0x095D,
0xC9 : 0x095E,
0xAA : 0x0960,
0xA7 : 0x0961,
0xDB : 0x0962,
0xDC : 0x0963
}
# Per-script overrides that make_script_maps() applies after the generic
# tables are built.  Keys are (script index, ISCII byte) tuples, values are
# Unicode code points.
#
# This table used to hold the two points which differ between Assamese and
# Bengali according to the charts in the ISCII-91 documentation
# (0xCF -> 0x09F0 and 0xD4 -> 0x09F1), keyed on script index 5.  Index 5 is
# Tamil (see ISCII_SCRIPTS), so the entries replaced two Tamil letters with
# Bengali-block characters and never had any effect on Assamese text.
# Assamese shares index 1 with Bengali, so the overrides cannot be attached
# to it without changing Bengali output as well; they are therefore disabled.
# TODO: give Assamese a script index of its own and restore
#       (index, 0xCF) : 0x09F0 and (index, 0xD4) : 0x09F1 under that index.
special_maps = {}
def make_script_maps():
    """Build the ISCII byte -> Unicode code point table of every script.

    Returns a dict keyed by script index (0-8).  Each value is a dict that
    maps an ISCII byte to the code point it stands for in that script:

    * bytes below 0xA0 map to themselves;
    * bytes 0xA0-0xFF are looked up in iscii_to_unicode.  A value inside the
      0x0900-0x097F template block is kept only when validation_table marks
      that slot for the script, and is shifted by index * 0x80 into the
      script's own block;
    * the danda (0x0964) and values outside the template block (the zero
      width joiner used for INV) are stored unchanged for every script;
    * bytes 0xEB-0xF0 and 0xFB-0xFF have no mapping and are left out;
    * finally the overrides of special_maps are applied.

    A byte missing from a script's dict is not valid for that script.
    """
    # list() keeps the expression working where range() is not a list
    unmapped = list(range(0xEB, 0xF1)) + list(range(0xFB, 0xFF + 1))
    scripts = {}
    for script in range(9):
        table = {}
        for ch in range(0xA0):
            table[ch] = ch
        for ch in range(0xA0, 0xFF + 1):
            if ch in unmapped:
                continue
            code = iscii_to_unicode[ch]
            if code == 0x0964 or not (0x0900 <= code <= 0x097F):
                # Script independent: Unicode has a single danda for all of
                # these scripts, and ZWJ is not part of any script block, so
                # neither may be validated per script or shifted by the
                # script offset.
                table[ch] = code
            elif validation_table[code & 0xFF][script]:
                table[ch] = code + (script * 0x80)
        scripts[script] = table
    for key in special_maps:
        scripts[key[0]][key[1]] = special_maps[key]
    return scripts
## Build the script map (iscii -> unicode) for all the supported scripts
## once, at import time, so that converting a byte is a plain dictionary lookup
script_maps = make_script_maps()
####
def make_invalid_maps():
    """Return, per script, a flag for every byte value telling if it is invalid.

    The result is a dict keyed by script index (0-8).  Each value is a dict
    with one entry per byte 0-255 that is True when the byte has no entry in
    script_maps for that script and is not one of ISCII_SPECIALS either.
    """
    maps = {}
    for script in range(9):
        known = script_maps[script]
        flags = {}
        for byte in range(0x100):
            flags[byte] = (byte not in known) and (byte not in ISCII_SPECIALS)
        maps[script] = flags
    return maps
## setup the map of invalid characters for each script
invalid_chars = make_invalid_maps()
## tmp code
## iscii_modifying[byte] is 1 for the bytes that get special treatment when
## they follow another byte (the ISCII specials, halant, nukta and danda)
## and 0 for every other byte value
_tmp = ISCII_SPECIALS + [ISCII_HALANT, ISCII_NUKTA, ISCII_DANDA]
iscii_modifying = dict.fromkeys(range(0xFF + 1), 0)
for i in range(0xFF + 1):
    if i in _tmp:
        iscii_modifying[i] = 1
## end of tmp code
def to_utf8(y):
    """
    converts an array of integers to utf8 string

    y is an iterable of Unicode code points.  The result is a byte string
    (`str` on Python 2, `bytes` on Python 3) holding their UTF-8 encoding.
    Raises ValueError for negative values and for values too large to fit
    the four byte form.
    """
    out = bytearray()
    for x in y:
        if x < 0x080:
            out.append(x)
        elif x < 0x0800:
            out.append((x >> 6) | 0xC0)
            out.append((x & 0x3F) | 0x80)
        elif x < 0x10000:
            out.append((x >> 12) | 0xE0)
            out.append(((x >> 6) & 0x3F) | 0x80)
            out.append((x & 0x3F) | 0x80)
        else:
            out.append((x >> 18) | 0xF0)
            # every byte after the lead byte is a continuation byte and
            # needs the 0x80 marker, including this one
            out.append(((x >> 12) & 0x3F) | 0x80)
            out.append(((x >> 6) & 0x3F) | 0x80)
            out.append((x & 0x3F) | 0x80)
    return bytes(out)
class IllegalInput(Exception):
    """Error raised for input values the converter cannot accept.

    The value passed to the constructor is kept in the `exception`
    attribute; str() of the error is the repr() of that value.
    """

    def __init__(self, e):
        self.exception = e

    def __str__(self):
        return "%r" % (self.exception,)
class Parser:
    """Incremental ISCII to Unicode converter for a single input stream.

    Feed the input to iscii2utf8() chunk by chunk; converted code points pile
    up in `dest` until write_output() encodes them and writes them to stdout.
    The active script is chosen with set_script() and can be changed by ATR
    sequences found in the input.
    """

    # code point produced for EXT followed by 0xBF
    # (DEVANAGARI ABBREVIATION SIGN)
    DEV_ABBR_SIGN = 0x0970

    def __init__(self):
        self.delta = 0           # offset of the current script block from 0x0900
        self.curr_script = 0     # 0-based index of the current script
        self.default_script = 0  # script restored by the DEFAULT ATR code
        self.curr_mask = 0       # current mask to unicode
        # The next attributes and `stat` are kept for compatibility; no
        # method of this class reads them.
        self.prev_char = self.src_char = self.dest_char = NO_CHAR
        self.dest = []           # converted code points not yet written
        self.pos = 0             # bytes consumed by the last iscii2utf8() call
        self.stat = [0] * 10

    def write_output(self):
        """Encode the pending code points as UTF-8, write them to stdout and
        empty the buffer."""
        out = to_utf8(self.dest)
        stream = sys.stdout
        if isinstance(out, bytes):
            # raw bytes must bypass the text layer where stdout has one
            stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(out)
        self.dest = []

    def _switch_script(self, n):
        """Make the 0-based script index n the current script."""
        self.curr_script = n
        self.delta = n * DELTA

    def set_script(self, i):
        """
        set the value of delta to reflect the current codepage

        i is the script number 1-9 as listed in the usage text.  The script
        also becomes the one that the DEFAULT ATR code switches back to.
        Raises IllegalInput for any other value.
        """
        if i not in range(1, 10):
            raise IllegalInput("Invalid Value for ATR %s" % (hex(i)))
        self.default_script = i - 1
        self._switch_script(i - 1)

    def isvalid(self, i):
        """Return the validation_table flag of the code point i (only its low
        byte is used) for the current script."""
        return validation_table[i & 0xFF][self.curr_script]

    def isvalid_iscii(self, x):
        """Return True when the byte x is acceptable in the current script.

        invalid_chars holds a flag for every byte value, so the flag itself
        has to be tested rather than the presence of the key.  Values
        without a flag count as invalid.
        """
        return not invalid_chars[self.curr_script].get(x, True)

    def is_nukta_special(self, i):
        """Return the code point for byte i followed by a nukta, or None."""
        return nukta_specials.get(i, None)

    def handle_ext(self, curr_char):
        """Convert the byte that follows EXT.

        Only 0xBF and 0xB8 are supported.  Returns the resulting code point
        when it is valid for the current script; otherwise reports the byte
        on stderr and returns None.
        NOTE(review): the code point is returned as a Devanagari value even
        when another script is active - confirm that this is intended.
        """
        dest_char = None
        if EXT_RANGE_BEGIN <= curr_char <= EXT_RANGE_END:
            if curr_char == 0xBF:
                dest_char = self.DEV_ABBR_SIGN
            elif curr_char == 0xB8:
                dest_char = DEV_ANUDATTA
        if dest_char is not None and self.isvalid(dest_char):
            return dest_char
        sys.stderr.write("Invalid input after EXT %s\n" % (hex(curr_char)))
        return None

    def handle_atr(self, i):
        """Apply the attribute code i that follows ATR; always returns None.

        Script codes listed in ISCII_SCRIPTS switch the current script, the
        DEFAULT code switches back to the script chosen with set_script(),
        and every other attribute code is ignored.
        """
        if i in ISCII_SCRIPTS:
            # ISCII_SCRIPTS holds 0-based indices, so they are applied
            # directly instead of going through the 1-based set_script()
            script = ISCII_SCRIPTS[i]
            if script < 0:
                script = self.default_script
            self._switch_script(script)
            sys.stderr.write("Handling ATR: setting script to %s\n" % (i,))
        else:
            # ignore all other ATR markers
            sys.stderr.write("Handling ATR: ignored\n")
        return None

    def handle_inv(self, i):
        """Return the replacement for INV followed by the byte i: a space
        when i is the halant, the zero width joiner otherwise.

        NOTE(review): the byte i itself is consumed and produces no output -
        confirm that this is intended for bytes other than the halant.
        """
        if i == ISCII_HALANT:
            return 0x0020
        return ZWJ

    def post_analysis(self, prev_char, src_char):
        """Dispatch src_char to the handler of the special byte prev_char.

        Returns the code point to emit, or None when there is nothing to
        emit or prev_char is not one of the special bytes.
        """
        if prev_char == ISCII_ATR:
            return self.handle_atr(src_char)
        if prev_char == ISCII_EXT:
            return self.handle_ext(src_char)
        if prev_char == ISCII_INV:
            return self.handle_inv(src_char)
        return None

    def _emit(self, ch):
        """Append ch to the output; values up to 0xFF are ISCII bytes and go
        through the table of the current script first."""
        if ch <= 0xFF:
            ch = script_maps[self.curr_script][ch]
        self.dest.append(ch)

    def iscii2utf8(self, src, flush = 0):
        """Convert the ISCII bytes in src and append the result to `dest`.

        A byte can combine with the byte after it, so the last usable byte
        is normally held back.  The return value (also stored in `pos`) is
        the number of leading bytes of src that are fully dealt with; the
        caller must pass src[pos:] again in front of the next chunk.  With
        a true `flush` (end of input) nothing is held back and all of src is
        consumed.  Bytes that are invalid for the current script are
        reported on stderr and skipped.
        """
        codes = array.array('B', src).tolist()
        prev_char = NO_CHAR
        prev_at = 0  # index in codes of the byte held in prev_char
        for at, curr_char in enumerate(codes):
            if invalid_chars[self.curr_script][curr_char]:
                # just ignore the invalid iscii characters
                sys.stderr.write('ignoring invalid iscii char %s\n'
                                 % (hex(curr_char)))
                continue
            if prev_char == NO_CHAR:
                prev_char, prev_at = curr_char, at
                continue
            dest_char = NO_CHAR
            add_prev = 0
            if prev_char in ISCII_SPECIALS:
                ret = self.post_analysis(prev_char, curr_char)
                if ret is not None:
                    dest_char = ret
                prev_char = NO_CHAR
            elif (not iscii_modifying[curr_char]) or \
                    (curr_char in ISCII_SPECIALS):
                # no combination: prev_char is emitted below and curr_char
                # becomes the pending byte
                pass
            elif (curr_char == ISCII_DANDA) and (prev_char == ISCII_DANDA):
                dest_char = DOUBLE_DANDA
                prev_char = NO_CHAR
            elif (curr_char == ISCII_HALANT) and (prev_char == ISCII_HALANT):
                dest_char = ZWNJ
                add_prev = 1
            elif curr_char == ISCII_NUKTA:
                if prev_char == ISCII_HALANT:
                    dest_char = ZWJ
                    add_prev = 1
                else:
                    tmp = self.is_nukta_special(prev_char)
                    # nukta_specials holds Devanagari code points: use one
                    # only when the slot exists in the current script and
                    # move it into that script's block
                    if tmp and self.isvalid(tmp):
                        dest_char = tmp + self.delta
                        prev_char = NO_CHAR
            if add_prev == 1:
                self._emit(prev_char)
                prev_char = NO_CHAR
            if dest_char != NO_CHAR:
                self._emit(dest_char)
            elif prev_char != NO_CHAR:
                self._emit(prev_char)
                prev_char, prev_at = curr_char, at
        if flush and prev_char != NO_CHAR:
            # end of input: nothing can follow the pending byte any more
            if prev_char == ISCII_INV:
                self.dest.append(ZWJ)
            elif prev_char in ISCII_SPECIALS:
                sys.stderr.write('ignoring truncated iscii sequence %s\n'
                                 % (hex(prev_char)))
            else:
                self._emit(prev_char)
            prev_char = NO_CHAR
        # Everything in front of the pending byte is consumed.  Reporting a
        # position (not a count) keeps the caller's src[pos:] slice correct
        # when ignored bytes follow the pending byte.
        if prev_char == NO_CHAR:
            self.pos = len(codes)
        else:
            self.pos = prev_at
        return self.pos
def show_usage(name):
    """Write the usage text for the program called `name` to stderr and
    exit with status 1."""
    template = """
Usage:
%s script
where script is a number between 1-9
1 - devnag
2 - bengali / assamese
3 - punjabi
4 - gujarati
5 - oriya
6 - tamil
7 - telugu
8 - kannada
9 - malayalam
the program reads from stdin and writes to stdout
any msgs to the user (error msgs etc) are printed on stderr
"""
    message = template % name
    sys.stderr.write(message + "\n")
    sys.exit(1)
# number of bytes read from stdin per iteration
chunk_size = 4096
if __name__ == '__main__':
    # the only argument is the script number (1-9); anything else shows the
    # usage text, which exits
    try:
        i = int(sys.argv[1])
        if i not in range(1, 10):
            raise ValueError
    except (ValueError, IndexError):
        show_usage(sys.argv[0])
    mypar = Parser()
    mypar.set_script(i)
    # ISCII is a byte encoding: read raw bytes where stdin has a text layer
    source = getattr(sys.stdin, 'buffer', sys.stdin)
    leftover = b''
    while 1:
        chunk = source.read(chunk_size)
        # an empty read means end of input: convert what is left and flush
        flush = int(not chunk)
        data = leftover + chunk
        used = mypar.iscii2utf8(data, flush)
        # bytes the parser held back are passed again with the next chunk
        leftover = data[used:]
        mypar.write_output()
        if flush:
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment