Last active
December 19, 2015 16:25
-
-
Save lifthrasiir/ac96f6a927cfb27c0f6f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a part of rust-encoding. | |
# Copyright (c) 2013-2015, Kang Seonghoon. | |
# See README.md and LICENSE.txt for details. | |
import urllib | |
import sys | |
import os | |
import os.path | |
def whatwg_index(name, comments): | |
try: os.mkdir('.cache') | |
except Exception: pass | |
cached_path = os.path.join('.cache', '%s.txt' % name) | |
if os.path.exists(cached_path): | |
print >>sys.stderr, '(cached)', | |
else: | |
try: | |
urllib.urlretrieve('http://encoding.spec.whatwg.org/index-%s.txt' % name, | |
cached_path) | |
except Exception: | |
try: os.unlink(cached_path) | |
except Exception: pass | |
raise | |
for line in open(cached_path): | |
line = line.strip() | |
if not line: continue | |
if line.startswith('#'): | |
comments.append('//' + line[1:]) | |
continue | |
parts = line.split(None, 2) | |
key = int(parts[0], 0) | |
value = int(parts[1], 0) | |
yield key, value | |
def mkdir_and_open(crate, name): | |
dirname = os.path.join(os.path.dirname(__file__), crate) | |
try: | |
os.mkdir(dirname) | |
except Exception: | |
pass | |
return open(os.path.join(dirname, '%s.rs' % name.replace('-', '_')), 'wb') | |
def write_header(f, name, comments): | |
print >>f, '// AUTOGENERATED FROM index-%s.txt, ORIGINAL COMMENT FOLLOWS:' % name | |
print >>f, '//' | |
for line in comments: | |
print >>f, line | |
def write_comma_separated(f, prefix, l, width=80): | |
buffered = '' | |
for i in l: | |
i = str(i) | |
if len(prefix) + len(buffered) + len(i) <= width: | |
buffered += i | |
else: | |
print >>f, prefix + buffered.rstrip() | |
buffered = i | |
if buffered: | |
print >>f, prefix + buffered.rstrip() | |
def make_minimal_trie(invdata, lowerlimit=0x10000, linearlimit=0): | |
maxvalue = max(invdata.keys()) + 1 | |
best = 0xffffffff | |
besttrie = None | |
for triebits in xrange(1, 21): | |
locallinearlimit = min(linearlimit, 16//triebits) | |
lower = [0] | |
upper = [] | |
lowermap = {tuple(lower): 0} | |
for i in xrange(0, maxvalue, 1<<triebits): | |
blk = [invdata.get(j) for j in xrange(i, i + (1<<triebits))] | |
if locallinearlimit: | |
# use a linear search for smaller number of valid entries | |
shift = 0 | |
newblk = [0] | |
for j, x in enumerate(blk): | |
if x is not None: | |
newblk.append(x) | |
newblk[0] |= j << shift | |
shift += triebits | |
if len(newblk) < len(blk) and len(newblk) <= locallinearlimit + 1: | |
if not newblk: newblk.append(0) | |
blk = newblk | |
loweridx = lowermap.get(tuple(blk)) | |
if loweridx is None: | |
loweridx = len(lower) | |
lowermap[tuple(blk)] = loweridx | |
lower += blk | |
upper.append(loweridx) | |
if len(lower) < lowerlimit and best >= len(lower) + len(upper): | |
best = len(lower) + len(upper) | |
besttrie = (triebits, lower, upper) | |
return besttrie | |
def make_minimal_search(invdata): | |
maxvalue = max(invdata.keys()) + 1 | |
best = 0xffffffff | |
bestsearch = None | |
for lowerbits in xrange(2, 7): | |
lower = [] | |
upper = [] | |
import collections; freq = collections.Counter() | |
for i in xrange(0, maxvalue, 1<<lowerbits): | |
blocks = [0] | |
for j in xrange(i, i + (1<<lowerbits)): | |
v = invdata.get(j) | |
if v is None: | |
if not isinstance(blocks[-1], int): | |
blocks.append(0) | |
blocks[-1] += 1 | |
else: | |
if not isinstance(blocks[-1], list): | |
blocks.append([]) | |
blocks[-1].append(v) | |
if len(blocks) % 2 == 1: | |
last = blocks.pop() | |
assert isinstance(last, int) | |
items = [] # (prior gap, list of values) or (prior gap, start value, count) | |
for gap, block in zip(blocks[0::2], blocks[1::2]): | |
last = None | |
if gap <= 1 and items and len(items[-1]) == 2: | |
run = gap | |
gap, current = items.pop() | |
else: | |
run = 0 | |
current = [] | |
assert all(isinstance(x, int) for x in block) | |
block.append(None) # a dummy entry, ignored | |
for x in block: | |
if x is not None and last is not None and x - last == 1: | |
run += 1 | |
else: | |
if run >= 3: | |
assert last is not None | |
if current: | |
items.append((gap, current)) | |
gap = 0 | |
current = [] | |
items.append((gap, last - run + 1, run)) | |
run = 0 | |
if run > 0: | |
for j in xrange(-run + 1, 1): | |
current.append(last + j if last is not None else None) | |
run = 1 | |
last = x | |
assert (last, run) == (None, 1) | |
if current: | |
items.append((gap, current)) | |
upper.append(len(lower)) | |
freq[len(items)] += 1 | |
for item in items: | |
if len(item) == 2: | |
gap, block = item | |
assert 0 <= gap < 0x100 | |
assert 0 < len(block) <= 0x100 | |
lower.append((gap, len(block) - 1, False)) | |
lower += block | |
else: | |
gap, start, count = item | |
assert 0 <= gap < 0x100 | |
assert 0 < count <= 0x100 | |
lower.append((gap, count - 1, True)) | |
lower.append(start) | |
upper.append(len(lower)) | |
if best >= len(lower) * 2 + len(upper) * 2: | |
best = len(lower) * 2 + len(upper) * 2 | |
bestsearch = (lowerbits, lower, upper) | |
bestfreq = freq | |
#print bestfreq, sum(k*v for k,v in bestfreq.items()) | |
#import pprint; pprint.pprint(bestsearch) | |
return bestsearch | |
def make_minimal_lut(data, lowerlimit=0x10000): | |
maxvalue = max(data.keys()) + 1 | |
best = 0xffffffff | |
besttab = None | |
bestfreq = None | |
for lowerbits in xrange(3, 9): | |
lower = [] | |
upper = [] | |
loweroff = 0 | |
import collections; freq = [] | |
for i in xrange(0, maxvalue, 1<<lowerbits): | |
blk = [data[j] - j if j in data else None for j in xrange(i, i + (1<<lowerbits))] | |
if all(x is None for x in blk): | |
minblk = maxblk = 0 | |
else: | |
minblk = min(x for x in blk if x is not None) | |
maxblk = max(x for x in blk if x is not None) | |
pp = sorted(x - minblk for x in blk if x is not None) | |
maxreach = 0 | |
for k in xrange(len(pp)): | |
for l in xrange(k, len(pp)): | |
if pp[l] - pp[k] >= 255: break | |
maxreach = max(maxreach, l - k + 1) | |
freq.append(pp) | |
if maxblk - minblk < 255: | |
blk = [x - minblk if x is not None else 255 for x in blk] | |
blk = [x | (y << 8) for x, y in zip(blk[0::2], blk[1::2])] | |
upper.append(len(lower) | (minblk << 16)) | |
lower += blk | |
else: | |
upper.append(len(lower)) | |
lower += blk | |
if len(lower) < lowerlimit and best >= len(lower) * 2 + len(upper) * 4: | |
best = len(lower) * 2 + len(upper) * 4 | |
besttrie = (lowerbits, lower, upper) | |
bestfreq = freq | |
for i in bestfreq: print i | |
return besttrie | |
def generate_single_byte_index(crate, name): | |
modname = name.replace('-', '_') | |
data = [None] * 128 | |
invdata = {} | |
comments = [] | |
for key, value in whatwg_index(name, comments): | |
assert 0 <= key < 128 and 0 <= value < 0xffff and data[key] is None and value not in invdata | |
data[key] = value | |
invdata[value] = key | |
# generate a trie with a minimal amount of data | |
triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000) | |
print 'minimal trie:', triebits, len(lower), len(upper), '=', len(lower)*2+len(upper)*2, 'bytes' | |
#import collections | |
#print [collections.Counter(x >> 8 << 8 for x in lower[i:i+(1<<triebits)] if x is not None) for i in xrange(0, len(lower), 1<<triebits)] | |
lowerbitsx, lowerx, upperx = make_minimal_search(invdata) | |
assert all(isinstance(x, tuple) or x is None or 0 <= x < 0x100 for x in lowerx) | |
lowerxsz = sum(2 if isinstance(x, tuple) else 1 for x in lowerx) | |
print 'minimal search:', lowerbitsx, len(lowerx), len(upperx), '=', lowerxsz+len(upperx)*2, 'bytes' | |
with mkdir_and_open(crate, name) as f: | |
write_header(f, name, comments) | |
print >>f | |
print >>f, "static FORWARD_TABLE: &'static [u16] = &[" | |
write_comma_separated(f, ' ', | |
['%d, ' % (0xffff if value is None else value) for value in data]) | |
print >>f, ']; // %d entries' % len(data) | |
print >>f | |
print >>f, '/// Returns the index code point for pointer `code` in this index.' | |
print >>f, '#[inline]' | |
print >>f, 'pub fn forward(code: u8) -> u16 {' | |
print >>f, ' FORWARD_TABLE[(code - 0x80) as usize]' | |
print >>f, '}' | |
print >>f | |
print >>f, "static BACKWARD_TABLE_LOWER: &'static [u8] = &[" | |
write_comma_separated(f, ' ', ['%d, ' % (0 if v is None else v+0x80) for v in lower]) | |
print >>f, ']; // %d entries' % len(lower) | |
print >>f | |
print >>f, "static BACKWARD_TABLE_UPPER: &'static [u16] = &[" | |
write_comma_separated(f, ' ', ['%d, ' % v for v in upper]) | |
print >>f, ']; // %d entries' % len(upper) | |
print >>f | |
print >>f, '/// Returns the index pointer for code point `code` in this index.' | |
print >>f, '#[inline]' | |
print >>f, 'pub fn backward(code: u32) -> u8 {' | |
print >>f, ' let offset = (code >> %d) as usize;' % triebits | |
print >>f, ' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper) | |
print >>f, ' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<<triebits)-1) | |
print >>f, '}' | |
print >>f | |
print >>f, '#[cfg(test)]' | |
print >>f, 'single_byte_tests!(' | |
print >>f, ' mod = %s' % modname | |
print >>f, ');' | |
return 2 * len(data), len(lower) + 2 * len(upper) | |
#return 2 * len(data), lowerxsz + 2 * len(upperx) | |
def generate_multi_byte_index(crate, name): | |
modname = name.replace('-', '_') | |
# some indices need an additional function for efficient mapping. | |
premap = lambda i: i | |
#if name == 'euc-kr': | |
# def premap(i): | |
# r, c = divmod(i, 190) | |
# if c >= 96: | |
# if r < 44: pass | |
# elif r < 47: return None | |
# elif r < 72: r -= 3 | |
# elif r < 73: return None | |
# else: r -= 4 | |
# return r * (190 - 96) + (c - 96) | |
# else: | |
# if c < 26: pass | |
# elif c < 32: return None | |
# elif c < 58: c -= 6 | |
# elif c < 64: return None | |
# else: c -= 12 | |
# return (125 - 4) * (190 - 96) + r * (96 - 12) + c | |
#elif name == 'jis0208': | |
# def premap(i): | |
# if i < 690: pass | |
# elif i < 1128: return None | |
# elif i < 1220: i -= 438 | |
# elif i < 1410: return None | |
# elif i < 7808: i -= 628 | |
# elif i < 8272: return None | |
# elif i < 8648: i -= 1092 | |
# elif i < 10716: return None | |
# else: i -= 3160 | |
# return i | |
#elif name == 'jis0212': | |
# def premap(i): | |
# if i < 175: pass | |
# elif i < 534: return None | |
# elif i < 1027: i -= 359 | |
# elif i < 1410: return None | |
# else: i -= 742 | |
# return i | |
data = {} | |
invdata = {} | |
dups = [] | |
comments = [] | |
morebits = False | |
for key, value in whatwg_index(name, comments): | |
assert 0 <= key < 0xffff and 0 <= value < 0x110000 and value != 0xffff and key not in data | |
if value >= 0x10001: | |
assert (value >> 16) == 2 | |
morebits = True | |
data[key] = value | |
if value not in invdata: | |
invdata[value] = key | |
else: | |
dups.append(key) | |
# Big5 has four two-letter forward mappings, we use special entries for them | |
if name == 'big5': | |
specialidx = [1133, 1135, 1164, 1166] | |
assert all(key not in data for key in specialidx) | |
assert all(value not in invdata for value in xrange(len(specialidx))) | |
for value, key in enumerate(specialidx): | |
data[key] = value | |
dups.append(key) # no consistency testing for them | |
# generate a trie with a minimal amount of data | |
triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000) | |
print 'minimal trie:', triebits, len(lower), len(upper), '=', len(lower)*2+len(upper)*2, 'bytes' | |
#import collections | |
#print [collections.Counter(x >> 8 << 8 for x in lower[i:i+(1<<triebits)] if x is not None) for i in xrange(0, len(lower), 1<<triebits)] | |
lowerbitsx, lowerx, upperx = make_minimal_search(invdata) | |
print 'minimal search:', lowerbitsx, len(lowerx), len(upperx), '=', len(lowerx)*2+len(upperx)*2, 'bytes' | |
# JIS X 0208 index has two ranges [8272,8836) and [8836,11280) to support two slightly | |
# different encodings EUC-JP and Shift_JIS; the default backward function would favor | |
# the former, so we need a separate mapping for the latter. | |
# | |
# fortunately for us, all allocated codes in [8272,8836) have counterparts in others, | |
# so we only need a smaller remapping from [8272,8836) to other counterparts. | |
remap = None | |
if name == 'jis0208': | |
REMAP_MIN = 8272 | |
REMAP_MAX = 8835 | |
invdataminusremap = {} | |
for key, value in data.items(): | |
if value not in invdataminusremap and not REMAP_MIN <= key <= REMAP_MAX: | |
invdataminusremap[value] = key | |
remap = [] | |
for i in xrange(REMAP_MIN, REMAP_MAX+1): | |
if i in data: | |
assert data[i] in invdataminusremap | |
value = invdataminusremap[data[i]] | |
assert value < 0x10000 | |
remap.append(value) | |
else: | |
remap.append(0xffff) | |
newdata = {} | |
for key, value in data.items(): | |
key = premap(key) | |
assert key is not None and 0 <= key < 0x10000 and key not in newdata | |
newdata[key] = value | |
data = newdata | |
#print 'uncompressed', len(data)*2 | |
#lowerbits, tablower, tabupper = make_minimal_lut(data) | |
#print 'compressed', lowerbits, len(tablower)*2 + len(tabupper)*4, | |
#print 'ratio', len([x for x in tabupper if x <= 0xffff]), 'out of', len(tabupper) | |
minkey = min(data) | |
maxkey = max(data) + 1 | |
with mkdir_and_open(crate, name) as f: | |
write_header(f, name, comments) | |
print >>f | |
print >>f, "static FORWARD_TABLE: &'static [u16] = &[" | |
write_comma_separated(f, ' ', | |
['%d, ' % ((data.get(key, 0xffff) - key) & 0xffff if key in data else 0xffff) for key in xrange(minkey, maxkey)]) | |
print >>f, ']; // %d entries' % (maxkey - minkey) | |
if morebits: | |
print >>f | |
print >>f, "static FORWARD_TABLE_MORE: &'static [u32] = &[" | |
bits = [] | |
for i in xrange(minkey, maxkey, 32): | |
v = 0 | |
for j in xrange(32): | |
v |= (data.get(i+j, 0) >= 0x10000) << j | |
bits.append(v) | |
write_comma_separated(f, ' ', ['%d, ' % v for v in bits]) | |
print >>f, ']; // %d entries' % len(bits) | |
print >>f | |
print >>f, '/// Returns the index code point for pointer `code` in this index.' | |
print >>f, '#[inline]' | |
print >>f, 'pub fn forward(code: u16) -> u32 {' | |
if minkey != 0: | |
print >>f, ' let code = (code as usize).wrapping_sub(%d);' % minkey | |
else: | |
print >>f, ' let code = code as usize;' | |
print >>f, ' if code < %d {' % (maxkey - minkey) | |
if morebits: | |
print >>f, ' (FORWARD_TABLE[code] as u32) | ' + \ | |
'(((FORWARD_TABLE_MORE[code >> 5] >> (code & 31)) & 1) << 17)' | |
else: | |
print >>f, ' FORWARD_TABLE[code] as u32' | |
print >>f, ' } else {' | |
print >>f, ' 0xffff' | |
print >>f, ' }' | |
print >>f, '}' | |
print >>f | |
print >>f, "static BACKWARD_TABLE_LOWER: &'static [u16] = &[" | |
write_comma_separated(f, ' ', ['%d, ' % (0xffff if v is None else v) for v in lower]) | |
print >>f, ']; // %d entries' % len(lower) | |
print >>f | |
print >>f, "static BACKWARD_TABLE_UPPER: &'static [u16] = &[" | |
write_comma_separated(f, ' ', ['%d, ' % v for v in upper]) | |
print >>f, ']; // %d entries' % len(upper) | |
if remap: | |
print >>f | |
print >>f, "static BACKWARD_TABLE_REMAPPED: &'static [u16] = &[" | |
write_comma_separated(f, ' ', ['%d, ' % v for v in remap]) | |
print >>f, ']; // %d entries' % len(remap) | |
print >>f | |
print >>f, '/// Returns the index pointer for code point `code` in this index.' | |
print >>f, '#[inline]' | |
print >>f, 'pub fn backward(code: u32) -> u16 {' | |
print >>f, ' let offset = (code >> %d) as usize;' % triebits | |
print >>f, ' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper) | |
print >>f, ' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<<triebits)-1) | |
print >>f, '}' | |
if remap: | |
print >>f | |
assert name == 'jis0208' | |
print >>f, '/// Returns the index shift_jis pointer for code point `code`.' | |
print >>f, '#[inline]' | |
print >>f, 'pub fn backward_remapped(code: u32) -> u16 {' | |
print >>f, ' let value = backward(code);' | |
print >>f, ' if %d <= value && value <= %d {' % (REMAP_MIN, REMAP_MAX) | |
print >>f, ' BACKWARD_TABLE_REMAPPED[(value - %d) as usize]' % REMAP_MIN | |
print >>f, ' } else {' | |
print >>f, ' value' | |
print >>f, ' }' | |
print >>f, '}' | |
print >>f | |
print >>f, '#[cfg(test)]' | |
print >>f, 'multi_byte_tests!(' | |
print >>f, ' mod = %s,' % modname | |
if remap: | |
print >>f, ' remap = [%d, %d],' % (REMAP_MIN, REMAP_MAX) | |
if dups: | |
print >>f, ' dups = [' | |
write_comma_separated(f, ' ', ['%d, ' % v for v in sorted(dups)]) | |
print >>f, ' ]' | |
else: | |
print >>f, ' dups = []' | |
print >>f, ');' | |
forwardsz = 2 * (maxkey - minkey) | |
backwardsz = 2 * len(lower) + 2 * len(upper) | |
#backwardsz = len(lowerx) * 2 + 2 * len(upperx) | |
if morebits: backwardsz += 4 * ((maxkey - minkey + 31) // 32) | |
if remap: backwardsz += 2 * len(remap) | |
return forwardsz, backwardsz | |
def generate_multi_byte_range_lbound_index(crate, name): | |
modname = name.replace('-', '_') | |
data = [] | |
comments = [] | |
for key, value in whatwg_index(name, comments): | |
data.append((key, value)) | |
assert data and data == sorted(data) | |
minkey, minvalue = data[0] | |
maxkey, maxvalue = data[-1] | |
if data[0] != (0, 0): | |
data.insert(0, (0, 0)) | |
maxlog2 = 0 | |
while 2**(maxlog2 + 1) <= len(data): | |
maxlog2 += 1 | |
if name == 'gb18030-ranges': | |
keyubound = 0x110000 | |
valueubound = 126 * 10 * 126 * 10 | |
else: | |
keyubound = maxkey + 1 | |
valueubound = maxvalue + 1 | |
with mkdir_and_open(crate, name) as f: | |
write_header(f, name, comments) | |
print >>f | |
print >>f, "static FORWARD_TABLE: &'static [u32] = &[" | |
write_comma_separated(f, ' ', ['%d, ' % value for key, value in data]) | |
print >>f, ']; // %d entries' % len(data) | |
print >>f | |
print >>f, "static BACKWARD_TABLE: &'static [u32] = &[" | |
write_comma_separated(f, ' ', ['%d, ' % key for key, value in data]) | |
print >>f, ']; // %d entries' % len(data) | |
print >>f | |
print >>f, '/// Returns the index code point for pointer `code` in this index.' | |
print >>f, '#[inline]' | |
print >>f, 'pub fn forward(code: u32) -> u32 {' | |
if minkey > 0: | |
print >>f, ' if code < %d { return 0xffffffff; }' % minkey | |
if name == 'gb18030-ranges': # has "invalid" region inside | |
print >>f, ' if (code > 39419 && code < 189000) || code > 1237575 { return 0xffffffff; }' | |
print >>f, ' let mut i = if code >= BACKWARD_TABLE[%d] {%d} else {0};' % \ | |
(2**maxlog2 - 1, len(data) - 2**maxlog2 + 1) | |
for i in xrange(maxlog2-1, -1, -1): | |
print >>f, ' if code >= BACKWARD_TABLE[i%s] { i += %d; }' % \ | |
('+%d' % (2**i-1) if i > 0 else '', 2**i) | |
print >>f, ' (code - BACKWARD_TABLE[i-1]) + FORWARD_TABLE[i-1]' | |
print >>f, '}' | |
print >>f | |
print >>f, '/// Returns the index pointer for code point `code` in this index.' | |
print >>f, '#[inline]' | |
print >>f, 'pub fn backward(code: u32) -> u32 {' | |
if minvalue > 0: | |
print >>f, ' if code < %d { return 0xffffffff; }' % minvalue | |
print >>f, ' let mut i = if code >= FORWARD_TABLE[%d] {%d} else {0};' % \ | |
(2**maxlog2 - 1, len(data) - 2**maxlog2 + 1) | |
for i in xrange(maxlog2-1, -1, -1): | |
print >>f, ' if code >= FORWARD_TABLE[i%s] { i += %d; }' % \ | |
('+%d' % (2**i-1) if i > 0 else '', 2**i) | |
print >>f, ' (code - FORWARD_TABLE[i-1]) + BACKWARD_TABLE[i-1]' | |
print >>f, '}' | |
print >>f | |
print >>f, '#[cfg(test)]' | |
print >>f, 'multi_byte_range_tests!(' | |
print >>f, ' mod = %s,' % modname | |
print >>f, ' key = [%d, %d], key < %d,' % (minkey, maxkey, keyubound) | |
print >>f, ' value = [%d, %d], value < %d' % (minvalue, maxvalue, valueubound) | |
print >>f, ');' | |
return 4 * len(data), 4 * len(data) | |
INDICES = { | |
'singlebyte/ibm866': generate_single_byte_index, | |
'singlebyte/iso-8859-2': generate_single_byte_index, | |
'singlebyte/iso-8859-3': generate_single_byte_index, | |
'singlebyte/iso-8859-4': generate_single_byte_index, | |
'singlebyte/iso-8859-5': generate_single_byte_index, | |
'singlebyte/iso-8859-6': generate_single_byte_index, | |
'singlebyte/iso-8859-7': generate_single_byte_index, | |
'singlebyte/iso-8859-8': generate_single_byte_index, | |
'singlebyte/iso-8859-10': generate_single_byte_index, | |
'singlebyte/iso-8859-13': generate_single_byte_index, | |
'singlebyte/iso-8859-14': generate_single_byte_index, | |
'singlebyte/iso-8859-15': generate_single_byte_index, | |
'singlebyte/iso-8859-16': generate_single_byte_index, | |
'singlebyte/koi8-r': generate_single_byte_index, | |
'singlebyte/koi8-u': generate_single_byte_index, | |
'singlebyte/macintosh': generate_single_byte_index, | |
'singlebyte/windows-874': generate_single_byte_index, | |
'singlebyte/windows-1250': generate_single_byte_index, | |
'singlebyte/windows-1251': generate_single_byte_index, | |
'singlebyte/windows-1252': generate_single_byte_index, | |
'singlebyte/windows-1253': generate_single_byte_index, | |
'singlebyte/windows-1254': generate_single_byte_index, | |
'singlebyte/windows-1255': generate_single_byte_index, | |
'singlebyte/windows-1256': generate_single_byte_index, | |
'singlebyte/windows-1257': generate_single_byte_index, | |
'singlebyte/windows-1258': generate_single_byte_index, | |
'singlebyte/x-mac-cyrillic': generate_single_byte_index, | |
'tradchinese/big5': generate_multi_byte_index, | |
'korean/euc-kr': generate_multi_byte_index, | |
'simpchinese/gb18030': generate_multi_byte_index, | |
'japanese/jis0208': generate_multi_byte_index, | |
'japanese/jis0212': generate_multi_byte_index, | |
'simpchinese/gb18030-ranges': generate_multi_byte_range_lbound_index, | |
} | |
if __name__ == '__main__': | |
import sys | |
filters = sys.argv[1:] | |
totalsz = 0 | |
for index, generate in INDICES.items(): | |
crate, _, index = index.partition('/') | |
if filters and all(s not in index for s in filters): continue | |
print >>sys.stderr, 'generating index %s...' % index, | |
forwardsz, backwardsz = generate(crate, index) | |
totalsz += forwardsz + backwardsz | |
print >>sys.stderr, '%d + %d = %d bytes.' % (forwardsz, backwardsz, forwardsz + backwardsz) | |
print >>sys.stderr, 'total %d bytes.' % totalsz |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment