Skip to content

Instantly share code, notes, and snippets.

@lifthrasiir
Last active December 19, 2015 16:25
Show Gist options
  • Save lifthrasiir/ac96f6a927cfb27c0f6f to your computer and use it in GitHub Desktop.
Save lifthrasiir/ac96f6a927cfb27c0f6f to your computer and use it in GitHub Desktop.
# This is a part of rust-encoding.
# Copyright (c) 2013-2015, Kang Seonghoon.
# See README.md and LICENSE.txt for details.
import urllib
import sys
import os
import os.path
def whatwg_index(name, comments):
try: os.mkdir('.cache')
except Exception: pass
cached_path = os.path.join('.cache', '%s.txt' % name)
if os.path.exists(cached_path):
print >>sys.stderr, '(cached)',
else:
try:
urllib.urlretrieve('http://encoding.spec.whatwg.org/index-%s.txt' % name,
cached_path)
except Exception:
try: os.unlink(cached_path)
except Exception: pass
raise
for line in open(cached_path):
line = line.strip()
if not line: continue
if line.startswith('#'):
comments.append('//' + line[1:])
continue
parts = line.split(None, 2)
key = int(parts[0], 0)
value = int(parts[1], 0)
yield key, value
def mkdir_and_open(crate, name):
dirname = os.path.join(os.path.dirname(__file__), crate)
try:
os.mkdir(dirname)
except Exception:
pass
return open(os.path.join(dirname, '%s.rs' % name.replace('-', '_')), 'wb')
def write_header(f, name, comments):
print >>f, '// AUTOGENERATED FROM index-%s.txt, ORIGINAL COMMENT FOLLOWS:' % name
print >>f, '//'
for line in comments:
print >>f, line
def write_comma_separated(f, prefix, l, width=80):
buffered = ''
for i in l:
i = str(i)
if len(prefix) + len(buffered) + len(i) <= width:
buffered += i
else:
print >>f, prefix + buffered.rstrip()
buffered = i
if buffered:
print >>f, prefix + buffered.rstrip()
def make_minimal_trie(invdata, lowerlimit=0x10000, linearlimit=0):
maxvalue = max(invdata.keys()) + 1
best = 0xffffffff
besttrie = None
for triebits in xrange(1, 21):
locallinearlimit = min(linearlimit, 16//triebits)
lower = [0]
upper = []
lowermap = {tuple(lower): 0}
for i in xrange(0, maxvalue, 1<<triebits):
blk = [invdata.get(j) for j in xrange(i, i + (1<<triebits))]
if locallinearlimit:
# use a linear search for smaller number of valid entries
shift = 0
newblk = [0]
for j, x in enumerate(blk):
if x is not None:
newblk.append(x)
newblk[0] |= j << shift
shift += triebits
if len(newblk) < len(blk) and len(newblk) <= locallinearlimit + 1:
if not newblk: newblk.append(0)
blk = newblk
loweridx = lowermap.get(tuple(blk))
if loweridx is None:
loweridx = len(lower)
lowermap[tuple(blk)] = loweridx
lower += blk
upper.append(loweridx)
if len(lower) < lowerlimit and best >= len(lower) + len(upper):
best = len(lower) + len(upper)
besttrie = (triebits, lower, upper)
return besttrie
def make_minimal_search(invdata):
maxvalue = max(invdata.keys()) + 1
best = 0xffffffff
bestsearch = None
for lowerbits in xrange(2, 7):
lower = []
upper = []
import collections; freq = collections.Counter()
for i in xrange(0, maxvalue, 1<<lowerbits):
blocks = [0]
for j in xrange(i, i + (1<<lowerbits)):
v = invdata.get(j)
if v is None:
if not isinstance(blocks[-1], int):
blocks.append(0)
blocks[-1] += 1
else:
if not isinstance(blocks[-1], list):
blocks.append([])
blocks[-1].append(v)
if len(blocks) % 2 == 1:
last = blocks.pop()
assert isinstance(last, int)
items = [] # (prior gap, list of values) or (prior gap, start value, count)
for gap, block in zip(blocks[0::2], blocks[1::2]):
last = None
if gap <= 1 and items and len(items[-1]) == 2:
run = gap
gap, current = items.pop()
else:
run = 0
current = []
assert all(isinstance(x, int) for x in block)
block.append(None) # a dummy entry, ignored
for x in block:
if x is not None and last is not None and x - last == 1:
run += 1
else:
if run >= 3:
assert last is not None
if current:
items.append((gap, current))
gap = 0
current = []
items.append((gap, last - run + 1, run))
run = 0
if run > 0:
for j in xrange(-run + 1, 1):
current.append(last + j if last is not None else None)
run = 1
last = x
assert (last, run) == (None, 1)
if current:
items.append((gap, current))
upper.append(len(lower))
freq[len(items)] += 1
for item in items:
if len(item) == 2:
gap, block = item
assert 0 <= gap < 0x100
assert 0 < len(block) <= 0x100
lower.append((gap, len(block) - 1, False))
lower += block
else:
gap, start, count = item
assert 0 <= gap < 0x100
assert 0 < count <= 0x100
lower.append((gap, count - 1, True))
lower.append(start)
upper.append(len(lower))
if best >= len(lower) * 2 + len(upper) * 2:
best = len(lower) * 2 + len(upper) * 2
bestsearch = (lowerbits, lower, upper)
bestfreq = freq
#print bestfreq, sum(k*v for k,v in bestfreq.items())
#import pprint; pprint.pprint(bestsearch)
return bestsearch
def make_minimal_lut(data, lowerlimit=0x10000):
maxvalue = max(data.keys()) + 1
best = 0xffffffff
besttab = None
bestfreq = None
for lowerbits in xrange(3, 9):
lower = []
upper = []
loweroff = 0
import collections; freq = []
for i in xrange(0, maxvalue, 1<<lowerbits):
blk = [data[j] - j if j in data else None for j in xrange(i, i + (1<<lowerbits))]
if all(x is None for x in blk):
minblk = maxblk = 0
else:
minblk = min(x for x in blk if x is not None)
maxblk = max(x for x in blk if x is not None)
pp = sorted(x - minblk for x in blk if x is not None)
maxreach = 0
for k in xrange(len(pp)):
for l in xrange(k, len(pp)):
if pp[l] - pp[k] >= 255: break
maxreach = max(maxreach, l - k + 1)
freq.append(pp)
if maxblk - minblk < 255:
blk = [x - minblk if x is not None else 255 for x in blk]
blk = [x | (y << 8) for x, y in zip(blk[0::2], blk[1::2])]
upper.append(len(lower) | (minblk << 16))
lower += blk
else:
upper.append(len(lower))
lower += blk
if len(lower) < lowerlimit and best >= len(lower) * 2 + len(upper) * 4:
best = len(lower) * 2 + len(upper) * 4
besttrie = (lowerbits, lower, upper)
bestfreq = freq
for i in bestfreq: print i
return besttrie
def generate_single_byte_index(crate, name):
modname = name.replace('-', '_')
data = [None] * 128
invdata = {}
comments = []
for key, value in whatwg_index(name, comments):
assert 0 <= key < 128 and 0 <= value < 0xffff and data[key] is None and value not in invdata
data[key] = value
invdata[value] = key
# generate a trie with a minimal amount of data
print
triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000)
print 'minimal trie:', triebits, len(lower), len(upper), '=', len(lower)*2+len(upper)*2, 'bytes'
#import collections
#print [collections.Counter(x >> 8 << 8 for x in lower[i:i+(1<<triebits)] if x is not None) for i in xrange(0, len(lower), 1<<triebits)]
lowerbitsx, lowerx, upperx = make_minimal_search(invdata)
assert all(isinstance(x, tuple) or x is None or 0 <= x < 0x100 for x in lowerx)
lowerxsz = sum(2 if isinstance(x, tuple) else 1 for x in lowerx)
print 'minimal search:', lowerbitsx, len(lowerx), len(upperx), '=', lowerxsz+len(upperx)*2, 'bytes'
with mkdir_and_open(crate, name) as f:
write_header(f, name, comments)
print >>f
print >>f, "static FORWARD_TABLE: &'static [u16] = &["
write_comma_separated(f, ' ',
['%d, ' % (0xffff if value is None else value) for value in data])
print >>f, ']; // %d entries' % len(data)
print >>f
print >>f, '/// Returns the index code point for pointer `code` in this index.'
print >>f, '#[inline]'
print >>f, 'pub fn forward(code: u8) -> u16 {'
print >>f, ' FORWARD_TABLE[(code - 0x80) as usize]'
print >>f, '}'
print >>f
print >>f, "static BACKWARD_TABLE_LOWER: &'static [u8] = &["
write_comma_separated(f, ' ', ['%d, ' % (0 if v is None else v+0x80) for v in lower])
print >>f, ']; // %d entries' % len(lower)
print >>f
print >>f, "static BACKWARD_TABLE_UPPER: &'static [u16] = &["
write_comma_separated(f, ' ', ['%d, ' % v for v in upper])
print >>f, ']; // %d entries' % len(upper)
print >>f
print >>f, '/// Returns the index pointer for code point `code` in this index.'
print >>f, '#[inline]'
print >>f, 'pub fn backward(code: u32) -> u8 {'
print >>f, ' let offset = (code >> %d) as usize;' % triebits
print >>f, ' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper)
print >>f, ' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<<triebits)-1)
print >>f, '}'
print >>f
print >>f, '#[cfg(test)]'
print >>f, 'single_byte_tests!('
print >>f, ' mod = %s' % modname
print >>f, ');'
return 2 * len(data), len(lower) + 2 * len(upper)
#return 2 * len(data), lowerxsz + 2 * len(upperx)
def generate_multi_byte_index(crate, name):
modname = name.replace('-', '_')
# some indices need an additional function for efficient mapping.
premap = lambda i: i
#if name == 'euc-kr':
# def premap(i):
# r, c = divmod(i, 190)
# if c >= 96:
# if r < 44: pass
# elif r < 47: return None
# elif r < 72: r -= 3
# elif r < 73: return None
# else: r -= 4
# return r * (190 - 96) + (c - 96)
# else:
# if c < 26: pass
# elif c < 32: return None
# elif c < 58: c -= 6
# elif c < 64: return None
# else: c -= 12
# return (125 - 4) * (190 - 96) + r * (96 - 12) + c
#elif name == 'jis0208':
# def premap(i):
# if i < 690: pass
# elif i < 1128: return None
# elif i < 1220: i -= 438
# elif i < 1410: return None
# elif i < 7808: i -= 628
# elif i < 8272: return None
# elif i < 8648: i -= 1092
# elif i < 10716: return None
# else: i -= 3160
# return i
#elif name == 'jis0212':
# def premap(i):
# if i < 175: pass
# elif i < 534: return None
# elif i < 1027: i -= 359
# elif i < 1410: return None
# else: i -= 742
# return i
data = {}
invdata = {}
dups = []
comments = []
morebits = False
for key, value in whatwg_index(name, comments):
assert 0 <= key < 0xffff and 0 <= value < 0x110000 and value != 0xffff and key not in data
if value >= 0x10001:
assert (value >> 16) == 2
morebits = True
data[key] = value
if value not in invdata:
invdata[value] = key
else:
dups.append(key)
# Big5 has four two-letter forward mappings, we use special entries for them
if name == 'big5':
specialidx = [1133, 1135, 1164, 1166]
assert all(key not in data for key in specialidx)
assert all(value not in invdata for value in xrange(len(specialidx)))
for value, key in enumerate(specialidx):
data[key] = value
dups.append(key) # no consistency testing for them
# generate a trie with a minimal amount of data
print
triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000)
print 'minimal trie:', triebits, len(lower), len(upper), '=', len(lower)*2+len(upper)*2, 'bytes'
#import collections
#print [collections.Counter(x >> 8 << 8 for x in lower[i:i+(1<<triebits)] if x is not None) for i in xrange(0, len(lower), 1<<triebits)]
lowerbitsx, lowerx, upperx = make_minimal_search(invdata)
print 'minimal search:', lowerbitsx, len(lowerx), len(upperx), '=', len(lowerx)*2+len(upperx)*2, 'bytes'
# JIS X 0208 index has two ranges [8272,8836) and [8836,11280) to support two slightly
# different encodings EUC-JP and Shift_JIS; the default backward function would favor
# the former, so we need a separate mapping for the latter.
#
# fortunately for us, all allocated codes in [8272,8836) have counterparts in others,
# so we only need a smaller remapping from [8272,8836) to other counterparts.
remap = None
if name == 'jis0208':
REMAP_MIN = 8272
REMAP_MAX = 8835
invdataminusremap = {}
for key, value in data.items():
if value not in invdataminusremap and not REMAP_MIN <= key <= REMAP_MAX:
invdataminusremap[value] = key
remap = []
for i in xrange(REMAP_MIN, REMAP_MAX+1):
if i in data:
assert data[i] in invdataminusremap
value = invdataminusremap[data[i]]
assert value < 0x10000
remap.append(value)
else:
remap.append(0xffff)
newdata = {}
for key, value in data.items():
key = premap(key)
assert key is not None and 0 <= key < 0x10000 and key not in newdata
newdata[key] = value
data = newdata
#print
#print 'uncompressed', len(data)*2
#lowerbits, tablower, tabupper = make_minimal_lut(data)
#print 'compressed', lowerbits, len(tablower)*2 + len(tabupper)*4,
#print 'ratio', len([x for x in tabupper if x <= 0xffff]), 'out of', len(tabupper)
minkey = min(data)
maxkey = max(data) + 1
with mkdir_and_open(crate, name) as f:
write_header(f, name, comments)
print >>f
print >>f, "static FORWARD_TABLE: &'static [u16] = &["
write_comma_separated(f, ' ',
['%d, ' % ((data.get(key, 0xffff) - key) & 0xffff if key in data else 0xffff) for key in xrange(minkey, maxkey)])
print >>f, ']; // %d entries' % (maxkey - minkey)
if morebits:
print >>f
print >>f, "static FORWARD_TABLE_MORE: &'static [u32] = &["
bits = []
for i in xrange(minkey, maxkey, 32):
v = 0
for j in xrange(32):
v |= (data.get(i+j, 0) >= 0x10000) << j
bits.append(v)
write_comma_separated(f, ' ', ['%d, ' % v for v in bits])
print >>f, ']; // %d entries' % len(bits)
print >>f
print >>f, '/// Returns the index code point for pointer `code` in this index.'
print >>f, '#[inline]'
print >>f, 'pub fn forward(code: u16) -> u32 {'
if minkey != 0:
print >>f, ' let code = (code as usize).wrapping_sub(%d);' % minkey
else:
print >>f, ' let code = code as usize;'
print >>f, ' if code < %d {' % (maxkey - minkey)
if morebits:
print >>f, ' (FORWARD_TABLE[code] as u32) | ' + \
'(((FORWARD_TABLE_MORE[code >> 5] >> (code & 31)) & 1) << 17)'
else:
print >>f, ' FORWARD_TABLE[code] as u32'
print >>f, ' } else {'
print >>f, ' 0xffff'
print >>f, ' }'
print >>f, '}'
print >>f
print >>f, "static BACKWARD_TABLE_LOWER: &'static [u16] = &["
write_comma_separated(f, ' ', ['%d, ' % (0xffff if v is None else v) for v in lower])
print >>f, ']; // %d entries' % len(lower)
print >>f
print >>f, "static BACKWARD_TABLE_UPPER: &'static [u16] = &["
write_comma_separated(f, ' ', ['%d, ' % v for v in upper])
print >>f, ']; // %d entries' % len(upper)
if remap:
print >>f
print >>f, "static BACKWARD_TABLE_REMAPPED: &'static [u16] = &["
write_comma_separated(f, ' ', ['%d, ' % v for v in remap])
print >>f, ']; // %d entries' % len(remap)
print >>f
print >>f, '/// Returns the index pointer for code point `code` in this index.'
print >>f, '#[inline]'
print >>f, 'pub fn backward(code: u32) -> u16 {'
print >>f, ' let offset = (code >> %d) as usize;' % triebits
print >>f, ' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper)
print >>f, ' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<<triebits)-1)
print >>f, '}'
if remap:
print >>f
assert name == 'jis0208'
print >>f, '/// Returns the index shift_jis pointer for code point `code`.'
print >>f, '#[inline]'
print >>f, 'pub fn backward_remapped(code: u32) -> u16 {'
print >>f, ' let value = backward(code);'
print >>f, ' if %d <= value && value <= %d {' % (REMAP_MIN, REMAP_MAX)
print >>f, ' BACKWARD_TABLE_REMAPPED[(value - %d) as usize]' % REMAP_MIN
print >>f, ' } else {'
print >>f, ' value'
print >>f, ' }'
print >>f, '}'
print >>f
print >>f, '#[cfg(test)]'
print >>f, 'multi_byte_tests!('
print >>f, ' mod = %s,' % modname
if remap:
print >>f, ' remap = [%d, %d],' % (REMAP_MIN, REMAP_MAX)
if dups:
print >>f, ' dups = ['
write_comma_separated(f, ' ', ['%d, ' % v for v in sorted(dups)])
print >>f, ' ]'
else:
print >>f, ' dups = []'
print >>f, ');'
forwardsz = 2 * (maxkey - minkey)
backwardsz = 2 * len(lower) + 2 * len(upper)
#backwardsz = len(lowerx) * 2 + 2 * len(upperx)
if morebits: backwardsz += 4 * ((maxkey - minkey + 31) // 32)
if remap: backwardsz += 2 * len(remap)
return forwardsz, backwardsz
def generate_multi_byte_range_lbound_index(crate, name):
modname = name.replace('-', '_')
data = []
comments = []
for key, value in whatwg_index(name, comments):
data.append((key, value))
assert data and data == sorted(data)
minkey, minvalue = data[0]
maxkey, maxvalue = data[-1]
if data[0] != (0, 0):
data.insert(0, (0, 0))
maxlog2 = 0
while 2**(maxlog2 + 1) <= len(data):
maxlog2 += 1
if name == 'gb18030-ranges':
keyubound = 0x110000
valueubound = 126 * 10 * 126 * 10
else:
keyubound = maxkey + 1
valueubound = maxvalue + 1
with mkdir_and_open(crate, name) as f:
write_header(f, name, comments)
print >>f
print >>f, "static FORWARD_TABLE: &'static [u32] = &["
write_comma_separated(f, ' ', ['%d, ' % value for key, value in data])
print >>f, ']; // %d entries' % len(data)
print >>f
print >>f, "static BACKWARD_TABLE: &'static [u32] = &["
write_comma_separated(f, ' ', ['%d, ' % key for key, value in data])
print >>f, ']; // %d entries' % len(data)
print >>f
print >>f, '/// Returns the index code point for pointer `code` in this index.'
print >>f, '#[inline]'
print >>f, 'pub fn forward(code: u32) -> u32 {'
if minkey > 0:
print >>f, ' if code < %d { return 0xffffffff; }' % minkey
if name == 'gb18030-ranges': # has "invalid" region inside
print >>f, ' if (code > 39419 && code < 189000) || code > 1237575 { return 0xffffffff; }'
print >>f, ' let mut i = if code >= BACKWARD_TABLE[%d] {%d} else {0};' % \
(2**maxlog2 - 1, len(data) - 2**maxlog2 + 1)
for i in xrange(maxlog2-1, -1, -1):
print >>f, ' if code >= BACKWARD_TABLE[i%s] { i += %d; }' % \
('+%d' % (2**i-1) if i > 0 else '', 2**i)
print >>f, ' (code - BACKWARD_TABLE[i-1]) + FORWARD_TABLE[i-1]'
print >>f, '}'
print >>f
print >>f, '/// Returns the index pointer for code point `code` in this index.'
print >>f, '#[inline]'
print >>f, 'pub fn backward(code: u32) -> u32 {'
if minvalue > 0:
print >>f, ' if code < %d { return 0xffffffff; }' % minvalue
print >>f, ' let mut i = if code >= FORWARD_TABLE[%d] {%d} else {0};' % \
(2**maxlog2 - 1, len(data) - 2**maxlog2 + 1)
for i in xrange(maxlog2-1, -1, -1):
print >>f, ' if code >= FORWARD_TABLE[i%s] { i += %d; }' % \
('+%d' % (2**i-1) if i > 0 else '', 2**i)
print >>f, ' (code - FORWARD_TABLE[i-1]) + BACKWARD_TABLE[i-1]'
print >>f, '}'
print >>f
print >>f, '#[cfg(test)]'
print >>f, 'multi_byte_range_tests!('
print >>f, ' mod = %s,' % modname
print >>f, ' key = [%d, %d], key < %d,' % (minkey, maxkey, keyubound)
print >>f, ' value = [%d, %d], value < %d' % (minvalue, maxvalue, valueubound)
print >>f, ');'
return 4 * len(data), 4 * len(data)
INDICES = {
'singlebyte/ibm866': generate_single_byte_index,
'singlebyte/iso-8859-2': generate_single_byte_index,
'singlebyte/iso-8859-3': generate_single_byte_index,
'singlebyte/iso-8859-4': generate_single_byte_index,
'singlebyte/iso-8859-5': generate_single_byte_index,
'singlebyte/iso-8859-6': generate_single_byte_index,
'singlebyte/iso-8859-7': generate_single_byte_index,
'singlebyte/iso-8859-8': generate_single_byte_index,
'singlebyte/iso-8859-10': generate_single_byte_index,
'singlebyte/iso-8859-13': generate_single_byte_index,
'singlebyte/iso-8859-14': generate_single_byte_index,
'singlebyte/iso-8859-15': generate_single_byte_index,
'singlebyte/iso-8859-16': generate_single_byte_index,
'singlebyte/koi8-r': generate_single_byte_index,
'singlebyte/koi8-u': generate_single_byte_index,
'singlebyte/macintosh': generate_single_byte_index,
'singlebyte/windows-874': generate_single_byte_index,
'singlebyte/windows-1250': generate_single_byte_index,
'singlebyte/windows-1251': generate_single_byte_index,
'singlebyte/windows-1252': generate_single_byte_index,
'singlebyte/windows-1253': generate_single_byte_index,
'singlebyte/windows-1254': generate_single_byte_index,
'singlebyte/windows-1255': generate_single_byte_index,
'singlebyte/windows-1256': generate_single_byte_index,
'singlebyte/windows-1257': generate_single_byte_index,
'singlebyte/windows-1258': generate_single_byte_index,
'singlebyte/x-mac-cyrillic': generate_single_byte_index,
'tradchinese/big5': generate_multi_byte_index,
'korean/euc-kr': generate_multi_byte_index,
'simpchinese/gb18030': generate_multi_byte_index,
'japanese/jis0208': generate_multi_byte_index,
'japanese/jis0212': generate_multi_byte_index,
'simpchinese/gb18030-ranges': generate_multi_byte_range_lbound_index,
}
if __name__ == '__main__':
import sys
filters = sys.argv[1:]
totalsz = 0
for index, generate in INDICES.items():
crate, _, index = index.partition('/')
if filters and all(s not in index for s in filters): continue
print >>sys.stderr, 'generating index %s...' % index,
forwardsz, backwardsz = generate(crate, index)
totalsz += forwardsz + backwardsz
print >>sys.stderr, '%d + %d = %d bytes.' % (forwardsz, backwardsz, forwardsz + backwardsz)
print >>sys.stderr, 'total %d bytes.' % totalsz
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment