-
-
Save pathumego/81672787807c23f19518c622d9e7ebb8 to your computer and use it in GitHub Desktop.
ISCII to Unicode Converter Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Released under the BSD License.
# mary <[email protected]> aka meyarivan <[email protected]>
# Inspired by ICU [ http://oss.software.ibm.com/icu/ ]
# The code is still in alpha stage: it contains redundant code and is
# probably incorrect in places.
# If you find errors, please submit bug reports at indlinux.
# For usage, either run the script or scroll down to the end of the script.
import sys | |
import array | |
# Generic Constants
ISCII_ATR = 0x00EF            # ISCII byte dispatched to Parser.handle_atr
ATR_MASK = 0x004F
DANDA = 0x0964
DELTA = 0x0080                # code point distance between two script blocks
DEV_ABBR_SIGN = 0x0970        # was referenced by Parser.handle_ext but never defined
DEV_ANUDATTA = 0x0952
DOUBLE_DANDA = 0x0965
ISCII_EXT = 0x00F0            # ISCII byte dispatched to Parser.handle_ext
EXT_RANGE_BEGIN = 0x00A1
EXT_RANGE_END = 0x00EE
HALANT = 0x094d
INDIC_BLOCK_BEGIN = 0x0900
INDIC_BLOCK_END = 0x0D7F
INVALID_CHAR = 0xFFFF
ISCII_BEGIN = 0x00A0
ISCII_DANDA = 0x00EA
ISCII_HALANT = 0x00E8
ISCII_INV = 0x00D9            # ISCII byte dispatched to Parser.handle_inv
ISCII_NUKTA = 0x00E9
LF = 0x000A
NO_CHAR = 0xFFFE              # sentinel: "no character pending"
UNI_BEGIN = 0x0900
UNI_END = 0x097F
ZWJ = 0x200d
ZWNJ = 0x200c
# Map between the ISCII scripts, as selected by the byte that follows the
# ATR switch, and the 0-based index of the script in Unicode (the column
# used in validation_table and the key used in script_maps).
# Bengali and Assamese share one script index: they differ in only two
# characters (according to ISCII 91).
ISCII_SCRIPTS = {
    0x40: -1,  # DEFAULT
    0x42: 0,   # DEVNAG
    0x43: 1,   # BENGALI
    0x44: 5,   # TAMIL
    0x45: 6,   # TELUGU
    0x46: 1,   # ASSAMESE
    0x47: 4,   # ORIYA
    0x48: 7,   # KANNADA
    0x49: 8,   # MALAYALAM
    0x4A: 3,   # GUJARATI
    0x4B: 2,   # PUNJABI
}
# ISCII special characters: bytes that are never mapped on their own but
# change how the byte that follows them is interpreted (see
# Parser.post_analysis). Kept as a list because it is concatenated with
# another list further down.
ISCII_SPECIALS = [ISCII_ATR, ISCII_EXT, ISCII_INV]
# ISCII byte -> code point in the Devanagari block (0x0900-0x097F).
# make_script_maps() shifts these values into the block of each other script.
# 0xffff marks bytes that have no mapping of their own.
iscii_to_unicode = {
    0xa0: 0x900,
    0xa1: 0x901,
    0xa2: 0x902,
    0xa3: 0x903,
    0xa4: 0x905,
    0xa5: 0x906,
    0xa6: 0x907,
    0xa7: 0x908,
    0xa8: 0x909,
    0xa9: 0x90a,
    0xaa: 0x90b,
    0xab: 0x90e,
    0xac: 0x90f,
    0xad: 0x910,
    0xae: 0x90d,
    0xaf: 0x912,
    0xb0: 0x913,
    0xb1: 0x914,
    0xb2: 0x911,
    0xb3: 0x915,
    0xb4: 0x916,
    0xb5: 0x917,
    0xb6: 0x918,
    0xb7: 0x919,
    0xb8: 0x91a,
    0xb9: 0x91b,
    0xba: 0x91c,
    0xbb: 0x91d,
    0xbc: 0x91e,
    0xbd: 0x91f,
    0xbe: 0x920,
    0xbf: 0x921,
    0xc0: 0x922,
    0xc1: 0x923,
    0xc2: 0x924,
    0xc3: 0x925,
    0xc4: 0x926,
    0xc5: 0x927,
    0xc6: 0x928,
    0xc7: 0x929,
    0xc8: 0x92a,
    0xc9: 0x92b,
    0xca: 0x92c,
    0xcb: 0x92d,
    0xcc: 0x92e,
    0xcd: 0x92f,
    0xce: 0x95f,
    0xcf: 0x930,
    0xd0: 0x931,
    0xd1: 0x932,
    0xd2: 0x933,
    0xd3: 0x934,
    0xd4: 0x935,
    0xd5: 0x936,
    0xd6: 0x937,
    0xd7: 0x938,
    0xd8: 0x939,
    0xd9: 0x200d,
    0xda: 0x93e,
    0xdb: 0x93f,
    0xdc: 0x940,
    0xdd: 0x941,
    0xde: 0x942,
    0xdf: 0x943,
    0xe0: 0x946,
    0xe1: 0x947,
    0xe2: 0x948,
    0xe3: 0x945,
    0xe4: 0x94a,
    0xe5: 0x94b,
    0xe6: 0x94c,
    0xe7: 0x949,
    0xe8: 0x94d,
    0xe9: 0x93c,
    0xea: 0x964,
    0xeb: 0xffff,
    0xec: 0xffff,
    0xed: 0xffff,
    0xee: 0xffff,
    0xef: 0xffff,
    0xf0: 0xffff,
    0xf1: 0x966,
    0xf2: 0x967,
    0xf3: 0x968,
    0xf4: 0x969,
    0xf5: 0x96a,
    0xf6: 0x96b,
    0xf7: 0x96c,
    0xf8: 0x96d,
    0xf9: 0x96e,
    0xfa: 0x96f,
    0xfb: 0xffff,
    0xfc: 0xffff,
    0xfd: 0xffff,
    0xfe: 0xffff,
    0xff: 0xffff,
}
# validation_table[offset][script] is 1 when the code point at
# UNI_BEGIN + script * DELTA + offset is assigned in Unicode, else 0.
# Rows are offsets 0x00-0x7F inside a script block; columns are the nine
# 0-based script indices used as values in ISCII_SCRIPTS.
#
# Code that was used to generate the table (agreed, it is kludgy);
# it needs Python 2.3:
#
#   import unicodedata
#   UNI_BEGIN = 0x0900
#   UNI_END = 0x097F
#   DELTA = 0x80
#   SCRS = 9
#   table = []
#   for char in range(UNI_END - UNI_BEGIN + 1):
#       res = [0] * SCRS
#       for scr in range(0, SCRS):
#           val = unichr(UNI_BEGIN + (scr * DELTA) + char)
#           res[scr] = int(unicodedata.name(val, None) is not None)
#       table.append(res)
validation_table = [
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x0 0
    [1, 1, 0, 1, 1, 0, 1, 0, 0],  # 0x1 1
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x2 2
    [1, 1, 0, 1, 1, 1, 1, 1, 1],  # 0x3 3
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x4 4
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x5 5
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x6 6
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x7 7
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x8 8
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x9 9
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0xa 10
    [1, 1, 0, 1, 1, 0, 1, 1, 1],  # 0xb 11
    [1, 1, 0, 0, 1, 0, 1, 1, 1],  # 0xc 12
    [1, 0, 0, 1, 0, 0, 0, 0, 0],  # 0xd 13
    [1, 0, 0, 0, 0, 1, 1, 1, 1],  # 0xe 14
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0xf 15
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x10 16
    [1, 0, 0, 1, 0, 0, 0, 0, 0],  # 0x11 17
    [1, 0, 0, 0, 0, 1, 1, 1, 1],  # 0x12 18
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x13 19
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x14 20
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x15 21
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x16 22
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x17 23
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x18 24
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x19 25
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x1a 26
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x1b 27
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x1c 28
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x1d 29
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x1e 30
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x1f 31
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x20 32
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x21 33
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x22 34
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x23 35
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x24 36
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x25 37
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x26 38
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x27 39
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x28 40
    [1, 0, 0, 0, 0, 1, 0, 0, 0],  # 0x29 41
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x2a 42
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x2b 43
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x2c 44
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x2d 45
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x2e 46
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x2f 47
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x30 48
    [1, 0, 0, 0, 0, 1, 1, 1, 1],  # 0x31 49
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x32 50
    [1, 0, 1, 1, 1, 1, 1, 1, 1],  # 0x33 51
    [1, 0, 0, 0, 0, 1, 0, 0, 1],  # 0x34 52
    [1, 0, 1, 1, 0, 1, 1, 1, 1],  # 0x35 53
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x36 54
    [1, 1, 0, 1, 1, 1, 1, 1, 1],  # 0x37 55
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x38 56
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x39 57
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x3a 58
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x3b 59
    [1, 1, 1, 1, 1, 0, 0, 0, 0],  # 0x3c 60
    [1, 0, 0, 1, 1, 0, 0, 0, 0],  # 0x3d 61
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x3e 62
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x3f 63
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x40 64
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x41 65
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x42 66
    [1, 1, 0, 1, 1, 0, 1, 1, 1],  # 0x43 67
    [1, 1, 0, 1, 0, 0, 1, 1, 0],  # 0x44 68
    [1, 0, 0, 1, 0, 0, 0, 0, 0],  # 0x45 69
    [1, 0, 0, 0, 0, 1, 1, 1, 1],  # 0x46 70
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x47 71
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x48 72
    [1, 0, 0, 1, 0, 0, 0, 0, 0],  # 0x49 73
    [1, 0, 0, 0, 0, 1, 1, 1, 1],  # 0x4a 74
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x4b 75
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x4c 76
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x4d 77
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x4e 78
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x4f 79
    [1, 0, 0, 1, 0, 0, 0, 0, 0],  # 0x50 80
    [1, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x51 81
    [1, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x52 82
    [1, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x53 83
    [1, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x54 84
    [0, 0, 0, 0, 0, 0, 1, 1, 0],  # 0x55 85
    [0, 0, 0, 0, 1, 0, 1, 1, 0],  # 0x56 86
    [0, 1, 0, 0, 1, 1, 0, 0, 1],  # 0x57 87
    [1, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x58 88
    [1, 0, 1, 0, 0, 0, 0, 0, 0],  # 0x59 89
    [1, 0, 1, 0, 0, 0, 0, 0, 0],  # 0x5a 90
    [1, 0, 1, 0, 0, 0, 0, 0, 0],  # 0x5b 91
    [1, 1, 1, 0, 1, 0, 0, 0, 0],  # 0x5c 92
    [1, 1, 0, 0, 1, 0, 0, 0, 0],  # 0x5d 93
    [1, 0, 1, 0, 0, 0, 0, 1, 0],  # 0x5e 94
    [1, 1, 0, 0, 1, 0, 0, 0, 0],  # 0x5f 95
    [1, 1, 0, 1, 1, 0, 1, 1, 1],  # 0x60 96
    [1, 1, 0, 0, 1, 0, 1, 1, 1],  # 0x61 97
    [1, 1, 0, 0, 0, 0, 0, 0, 0],  # 0x62 98
    [1, 1, 0, 0, 0, 0, 0, 0, 0],  # 0x63 99
    [1, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x64 100
    [1, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x65 101
    [1, 1, 1, 1, 1, 0, 1, 1, 1],  # 0x66 102
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x67 103
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x68 104
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x69 105
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x6a 106
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x6b 107
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x6c 108
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x6d 109
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x6e 110
    [1, 1, 1, 1, 1, 1, 1, 1, 1],  # 0x6f 111
    [1, 1, 1, 0, 1, 1, 0, 0, 0],  # 0x70 112
    [0, 1, 1, 0, 0, 1, 0, 0, 0],  # 0x71 113
    [0, 1, 1, 0, 0, 1, 0, 0, 0],  # 0x72 114
    [0, 1, 1, 0, 0, 0, 0, 0, 0],  # 0x73 115
    [0, 1, 1, 0, 0, 0, 0, 0, 0],  # 0x74 116
    [0, 1, 0, 0, 0, 0, 0, 0, 0],  # 0x75 117
    [0, 1, 0, 0, 0, 0, 0, 0, 0],  # 0x76 118
    [0, 1, 0, 0, 0, 0, 0, 0, 0],  # 0x77 119
    [0, 1, 0, 0, 0, 0, 0, 0, 0],  # 0x78 120
    [0, 1, 0, 0, 0, 0, 0, 0, 0],  # 0x79 121
    [0, 1, 0, 0, 0, 0, 0, 0, 0],  # 0x7a 122
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x7b 123
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x7c 124
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x7d 125
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x7e 126
    [0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0x7f 127
]
# Special characters formed by an ISCII byte followed by NUKTA:
# preceding ISCII byte -> code point in the Devanagari block.
nukta_specials = {
    0xA6: 0x090c,
    0xEA: 0x093D,
    0xDF: 0x0944,
    0xA1: 0x0950,
    0xB3: 0x0958,
    0xB4: 0x0959,
    0xB5: 0x095A,
    0xBA: 0x095B,
    0xBF: 0x095C,
    0xC0: 0x095D,
    0xC9: 0x095E,
    0xAA: 0x0960,
    0xA7: 0x0961,
    0xDB: 0x0962,
    0xDC: 0x0963,
}
## The two points which are different between Assamese and Bengali
## (according to the charts in the ISCII-91 documentation), keyed by ISCII
## byte.
assamese_specials = {
    0xCF: 0x09F0,
    0xD4: 0x09F1,
}

## Per-script overrides applied by make_script_maps(), keyed by
## (script index, ISCII byte).
## The Assamese pairs above used to be registered here under script index 5.
## ISCII_SCRIPTS assigns index 5 to Tamil and gives Assamese the same index
## as Bengali (1), so those entries never affected Assamese text and instead
## replaced two Tamil letters with Assamese code points. They are kept in
## assamese_specials until Assamese gets a script index of its own.
special_maps = {}
def make_script_maps():
    """
    build the iscii -> unicode map of every supported script

    Returns a dict keyed by script index (0-8). Each value is a dict that
    maps an ISCII byte to a Unicode code point:

    * bytes below 0xA0 map to themselves;
    * bytes 0xA0-0xFF are looked up in iscii_to_unicode and kept only when
      validation_table marks the character as present in that script; the
      code point is then shifted by 0x80 per script index;
    * ZWJ and DANDA are not tied to one script block, so they are mapped
      unshifted for every script;
    * entries of special_maps override the result.

    Bytes 0xEB-0xF0 and 0xFB-0xFF have no mapping and are left out.
    """
    unmapped = set(range(0xEB, 0xF1)) | set(range(0xFB, 0xFF + 1))
    # adding the script offset to these would yield unrelated code points
    script_neutral = (ZWJ, DANDA)
    scripts = {}
    for i in range(9):
        curr_scr = {}
        for ch in range(0xA0):
            curr_scr[ch] = ch
        for ch in range(0xA0, 0xFF + 1):
            if ch in unmapped:
                continue
            t = iscii_to_unicode[ch]
            if t in script_neutral:
                curr_scr[ch] = t
            elif validation_table[t & 0xFF][i]:
                curr_scr[ch] = t + (i * 0x80)
        scripts[i] = curr_scr
    for (script, ch), code_point in special_maps.items():
        scripts[script][ch] = code_point
    return scripts


## setup the script map (iscii -> unicode) for all the supported scripts
## hope it speeds up things
script_maps = make_script_maps()
####
def make_invalid_maps():
    """
    build the table of invalid iscii bytes of every supported script

    Returns a dict keyed by script index (0-8). Each value maps every byte
    0x00-0xFF to a boolean that is true when the byte has no entry in
    script_maps for that script and is not one of ISCII_SPECIALS.
    """
    maps = {}
    for i in range(9):
        known = script_maps[i]
        curr_map = {}
        for j in range(0xFF + 1):
            curr_map[j] = (j not in known) and (j not in ISCII_SPECIALS)
        maps[i] = curr_map
    return maps


## setup the map of invalid characters for each script
invalid_chars = make_invalid_maps()
## tmp code
## iscii_modifying[byte] is 1 for the bytes that get context dependent
## treatment in Parser.iscii2utf8 (the specials, halant, nukta and danda)
## and 0 for every other byte
iscii_modifying = {}
_tmp = ISCII_SPECIALS + [ISCII_HALANT, ISCII_NUKTA, ISCII_DANDA]
for i in range(0xFF + 1):
    iscii_modifying[i] = int(i in _tmp)
## end of tmp code
def to_utf8(y):
    """
    converts an array of integers to utf8 string

    y is an iterable of Unicode code points (integers). The result is a
    byte string: `str` on Python 2, `bytes` on Python 3.

    Raises ValueError for a value outside 0-0x10FFFF.
    """
    out = bytearray()
    for x in y:
        if x < 0 or x > 0x10FFFF:
            raise ValueError("not a Unicode code point: %r" % (x,))
        if x < 0x080:
            out.append(x)
        elif x < 0x0800:
            out.append((x >> 6) | 0xC0)
            out.append((x & 0x3F) | 0x80)
        elif x < 0x10000:
            out.append((x >> 12) | 0xE0)
            out.append(((x >> 6) & 0x3F) | 0x80)
            out.append((x & 0x3F) | 0x80)
        else:
            out.append((x >> 18) | 0xF0)
            # every byte after the lead byte must carry the 0x80 marker
            out.append(((x >> 12) & 0x3F) | 0x80)
            out.append(((x >> 6) & 0x3F) | 0x80)
            out.append((x & 0x3F) | 0x80)
    return bytes(out)
class IllegalInput(Exception):
    """
    raised when the converter is given a value it cannot accept

    The offending value or message is kept in `self.exception`;
    str() of the error is the repr() of that value.
    """

    def __init__(self, e):
        # initialise the base class too, so `args` is populated explicitly
        Exception.__init__(self, e)
        self.exception = e

    def __str__(self):
        return repr(self.exception)
class Parser:
    """
    converts a stream of ISCII bytes to Unicode code points

    Feed chunks of input to iscii2utf8(); the converted code points
    collect in `self.dest` until write_output() encodes and prints them.
    The active script is chosen with set_script() and can be changed by an
    ATR sequence inside the input.
    """

    # U+0970; kept on the class so that handle_ext does not depend on a
    # module level name
    _DEV_ABBR_SIGN = 0x0970

    def __init__(self):
        self.delta = 0
        self.curr_script = 0  # script index 0 until set_script() is called
        self.curr_mask = 0  # current mask to unicode
        self.prev_char = self.src_char = self.dest_char = NO_CHAR
        self.dest = []
        self.pos = 0
        self.stat = [0] * 10

    def write_output(self):
        """
        encode the pending code points as utf8, write them to stdout and
        empty the buffer
        """
        out = to_utf8(self.dest)
        if isinstance(out, bytes):
            # on Python 3 raw bytes have to go to the underlying buffer
            getattr(sys.stdout, 'buffer', sys.stdout).write(out)
        else:
            sys.stdout.write(out)
        self.dest = []

    def set_script(self, i):
        """
        set the value of delta to reflect the current codepage

        i is the 1-based script number (1-9) shown by show_usage().
        Raises IllegalInput for any other value.
        """
        if i not in range(1, 10):
            raise IllegalInput("Invalid Value for ATR %s" % (hex(i)))
        n = i - 1
        self.curr_script = n
        self.delta = n * DELTA
        return

    def isvalid(self, i):
        """
        tell whether the Devanagari block code point i has a counterpart
        in the current script
        """
        return validation_table[i & 0xFF][self.curr_script]

    def isvalid_iscii(self, x):
        """
        tell whether the iscii byte x is acceptable in the current script
        """
        # the per script tables hold a flag for every byte, so the flag has
        # to be read; a key test would be true for any byte
        return not invalid_chars[self.curr_script].get(x, True)

    def is_nukta_special(self, i):
        """
        return the code point formed by byte i followed by nukta, or None
        """
        return nukta_specials.get(i, None)

    def handle_ext(self, curr_char):
        """
        handle the byte that follows EXT

        Returns the code point to emit, or None (after a message on
        stderr) when the byte is not supported after EXT or is not valid
        in the current script.
        """
        if EXT_RANGE_BEGIN <= curr_char <= EXT_RANGE_END:
            dest_char = None
            if curr_char == 0xBF:
                dest_char = self._DEV_ABBR_SIGN
            elif curr_char == 0xB8:
                dest_char = DEV_ANUDATTA
            if (dest_char is not None) and self.isvalid(dest_char):
                return dest_char
        sys.stderr.write("Invalid input after EXT %s\n" % (hex(curr_char)))
        return None

    def handle_atr(self, i):
        """
        handle the byte that follows ATR: switch to the script it names

        Bytes that are not listed in ISCII_SCRIPTS are ignored. The
        DEFAULT entry (-1) leaves the current script untouched.
        Always returns None, an ATR sequence produces no output.
        """
        script = ISCII_SCRIPTS.get(i)
        if script is None:
            # ignore all other ATR markers
            sys.stderr.write("Handling ATR: ignored\n")
        elif script < 0:
            sys.stderr.write("Handling ATR: keeping current script\n")
        else:
            # ISCII_SCRIPTS holds 0-based indices, set_script is 1-based
            self.set_script(script + 1)
            sys.stderr.write("Handling ATR: setting script to %s\n" % (i,))
        return None

    def handle_inv(self, i):
        """
        handle the byte that follows INV: a space when it is a halant,
        ZWJ otherwise
        """
        if i == ISCII_HALANT:
            return 0x0020
        return ZWJ

    def post_analysis(self, prev_char, src_char):
        """
        dispatch src_char to the handler of the special byte prev_char

        Returns the code point to emit or None.
        """
        if prev_char == ISCII_ATR:
            return self.handle_atr(src_char)
        if prev_char == ISCII_EXT:
            return self.handle_ext(src_char)
        if prev_char == ISCII_INV:
            return self.handle_inv(src_char)
        return None

    def _emit(self, ch):
        """
        append one character to the output buffer; values up to 0xFF are
        iscii bytes and go through the map of the current script, larger
        values are code points already
        """
        if ch <= 0xFF:
            ch = script_maps[self.curr_script][ch]
        self.dest.append(ch)

    def iscii2utf8(self, src, flush = 0):
        """
        convert the iscii bytes in src and append the result to self.dest

        A byte can only be converted once the byte after it is known, so
        the last valid byte of src is normally held back. The return value
        (also stored in self.pos) is the number of leading bytes of src
        that were fully handled; the caller has to pass src[pos:] again in
        front of the next chunk. With a true `flush` everything is handled
        and len(src) is returned.

        Bytes that are invalid in the current script are reported on
        stderr and skipped. When such bytes follow the held back byte they
        are part of the remainder and get reported again on the next call.
        """
        codes = array.array('B', src).tolist()
        total = len(codes)
        prev_char = NO_CHAR
        held_at = total  # index in codes of the byte kept in prev_char
        for index in range(total):
            curr_char = codes[index]
            if invalid_chars[self.curr_script][curr_char]:
                # just ignore the invalid iscii characters
                sys.stderr.write('ignoring invalid iscii char %s\n'
                                 % (hex(curr_char),))
                continue
            if prev_char == NO_CHAR:
                prev_char = curr_char
                held_at = index
                continue
            dest_char = NO_CHAR
            add_prev = 0
            if prev_char in ISCII_SPECIALS:
                # the special and the byte after it are used up together
                ret = self.post_analysis(prev_char, curr_char)
                if ret is not None:
                    dest_char = ret
                prev_char = NO_CHAR
            elif (curr_char == ISCII_DANDA) and (prev_char == ISCII_DANDA):
                dest_char = DOUBLE_DANDA
                prev_char = NO_CHAR
            elif (curr_char == ISCII_HALANT) and (prev_char == ISCII_HALANT):
                dest_char = ZWNJ
                add_prev = 1
            elif curr_char == ISCII_NUKTA:
                if prev_char == ISCII_HALANT:
                    dest_char = ZWJ
                    add_prev = 1
                else:
                    tmp = self.is_nukta_special(prev_char)
                    # nukta_specials holds Devanagari code points: use one
                    # only if the current script has it, moved to the
                    # block of that script
                    if tmp and self.isvalid(tmp):
                        dest_char = tmp + self.delta
                        prev_char = NO_CHAR
            if add_prev:
                self._emit(prev_char)
                prev_char = NO_CHAR
            if dest_char != NO_CHAR:
                self._emit(dest_char)
            elif prev_char != NO_CHAR:
                # nothing combined: the held byte goes out, the current
                # one is held back in its place
                self._emit(prev_char)
                prev_char = curr_char
                held_at = index
        if flush and (prev_char != NO_CHAR):
            if prev_char == ISCII_INV:
                self._emit(ZWJ)
            elif prev_char in ISCII_SPECIALS:
                # ATR / EXT without the byte they apply to
                sys.stderr.write('ignoring incomplete sequence after %s\n'
                                 % (hex(prev_char),))
            else:
                self._emit(prev_char)
            prev_char = NO_CHAR
        if prev_char == NO_CHAR:
            self.pos = total
        else:
            self.pos = held_at
        return self.pos
def show_usage(name):
    """
    print the usage message for the program called `name` on stderr and
    exit with status 1
    """
    usage = """
Usage:
%s script
where script is a number between 1-9
1 - devnag
2 - bengali / assamese
3 - punjabi
4 - gujarati
5 - oriya
6 - tamil
7 - telugu
8 - kannada
9 - malayalam
the program reads from stdin and writes to stdout
any msgs to the user (error msgs etc) are printed on stderr
""" % (name)
    sys.stderr.write(usage + "\n")
    sys.exit(1)
chunk_size = 4096


def main(argv=None):
    """
    convert iscii read from stdin to utf8 on stdout

    argv[1] is the script number (1-9); anything else prints the usage
    message and exits. argv defaults to sys.argv.
    """
    if argv is None:
        argv = sys.argv
    try:
        i = int(argv[1])
        if i not in range(1, 10):
            raise ValueError
    except (ValueError, IndexError):
        show_usage(argv[0])
    mypar = Parser()
    mypar.set_script(i)
    # iscii is a byte encoding: on Python 3 the bytes have to be read from
    # the underlying buffer, not from the text layer
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    y = b''
    flush = 0
    while not flush:
        x = stdin.read(chunk_size)
        if not x:
            flush = 1
        # bytes the parser could not handle yet go in front of the new chunk
        x = y + x
        n = mypar.iscii2utf8(x, flush)
        y = x[n:]
        mypar.write_output()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment