Skip to content

Instantly share code, notes, and snippets.

@shonenada
Last active August 29, 2015 14:01
Show Gist options
  • Save shonenada/322ea9e76eb343eb32da to your computer and use it in GitHub Desktop.
Save shonenada/322ea9e76eb343eb32da to your computer and use it in GitHub Desktop.
#-*- coding: utf-8 -*-
import sys
import struct
# Byte offset in the file at which the pinyin table section begins.
TABLE_START = 0x1540
# Byte offset at which the pinyin table ends and the word records begin.
CHINESE_START = 0x2628
# Four bytes expected at the very start of the pinyin table section.
# Written as bytes literals so the comparison against data read in 'rb'
# mode also works on Python 3 (on Python 2 the b prefix changes nothing).
TABLE_START_FLAG = b'\x9D\x01\x00\x00'
# First twelve bytes expected at the start of a *.scel file.
SCEL_FLAG = b'\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00'
def unpack(code):
    """Decode a two-byte buffer as a little-endian unsigned 16-bit integer.

    Args:
        code: Exactly two bytes.

    Returns:
        The decoded integer, in the range 0..65535.

    Raises:
        struct.error: If ``code`` is not exactly two bytes long.
    """
    # '<' pins the byte order; a bare 'H' would follow the host CPU and
    # misread the file on big-endian machines.
    return struct.unpack('<H', code)[0]
def byte2str(byte):
    """Convert raw UTF-16 little-endian bytes into a text string.

    Spaces are dropped and tab characters are turned into newlines; every
    other character is kept as decoded.

    Args:
        byte: The raw bytes to decode.

    Returns:
        The decoded text with the space/tab substitutions applied.

    Raises:
        UnicodeDecodeError: If ``byte`` is not valid UTF-16-LE (for
            example, an odd number of bytes).
    """
    # Decoding the whole buffer at once keeps surrogate pairs together and
    # works unchanged on both Python 2 and Python 3.
    text = byte.decode('utf-16-le')
    return text.replace(' ', '').replace('\t', '\n')
def get_table(data):
    """Build the index-to-pinyin lookup from the pinyin table section.

    After the four-byte start flag, the section is read as consecutive
    entries: a 2-byte index, a 2-byte size, then ``size`` bytes of text
    that are decoded with ``byte2str``.

    Args:
        data: The raw bytes of the pinyin table section.

    Returns:
        A dict mapping each entry index to its decoded text, or ``None``
        when ``data`` does not begin with ``TABLE_START_FLAG``.
    """
    if data[0:4] != TABLE_START_FLAG:
        return None

    entries = data[4:]
    total = len(entries)
    table = {}
    cursor = 0
    while cursor < total:
        index = unpack(entries[cursor:cursor + 2])
        size = unpack(entries[cursor + 2:cursor + 4])
        # The entry text sits right after the two 2-byte header fields.
        text_start = cursor + 4
        text_end = text_start + size
        table[index] = byte2str(entries[text_start:text_end])
        cursor = text_end
    return table
def get_word_pinyin(data, table):
    """Translate a run of 2-byte table indexes into a pinyin string.

    Args:
        data: Bytes holding consecutive little-endian unsigned 16-bit
            indexes.
        table: Mapping from index to pinyin text.

    Returns:
        The looked-up table values concatenated in order.

    Raises:
        struct.error: If ``data`` has an odd number of bytes.
        KeyError: If an index is missing from ``table``.
    """
    # One bulk unpack replaces a slice-and-unpack per index and avoids the
    # Python-2-only ``xrange``.
    indexes = struct.unpack('<%dH' % (len(data) // 2), data)
    return ''.join(table[index] for index in indexes)
def get_chinese(data, table):
    """Parse the word section into ``(count, pinyin, word)`` tuples.

    The section is read as consecutive groups. Each group starts with a
    2-byte number of words sharing one pinyin, a 2-byte byte-length of the
    pinyin index list, and the index list itself. It is followed by that
    many word records: a 2-byte word byte-length, the word bytes, a 2-byte
    extension byte-length, and the extension, whose first two bytes are
    read as ``count``.

    Args:
        data: The raw bytes of the word section.
        table: Mapping from pinyin index to pinyin text.

    Returns:
        A list of ``(count, pinyin, word)`` tuples in file order.
        NOTE(review): ``count`` is presumably a word frequency -- confirm
        against the format description.

    Raises:
        struct.error: If a 2-byte field is cut short by the end of
            ``data``.
    """
    pos = 0
    results = []
    length = len(data)
    while pos < length:
        same = unpack(data[pos:pos + 2])
        pos += 2
        pinyin_table_len = unpack(data[pos:pos + 2])
        pos += 2
        pinyin = get_word_pinyin(data[pos:pos + pinyin_table_len], table)
        pos += pinyin_table_len
        # ``range`` works on both Python 2 and 3; the index is not needed.
        for _ in range(same):
            chinese_len = unpack(data[pos:pos + 2])
            pos += 2
            word = byte2str(data[pos:pos + chinese_len])
            pos += chinese_len
            ext_len = unpack(data[pos:pos + 2])
            pos += 2
            # Read the leading field of the extension, then skip all of it.
            count = unpack(data[pos:pos + 2])
            results.append((count, pinyin, word))
            pos += ext_len
    return results
def analyse(filename):
    """Read a *.scel file and return its words.

    Args:
        filename: Path of the file to read.

    Returns:
        The list of ``(count, pinyin, word)`` tuples produced by
        ``get_chinese``.

    Raises:
        SystemExit: With a message on stderr and a non-zero status when
            the file does not start with ``SCEL_FLAG`` or its pinyin table
            does not start with the expected flag.
    """
    with open(filename, 'rb') as scel:
        data = scel.read()

    # Passing a string to sys.exit prints it on stderr and exits with
    # status 1, so a bad input is no longer reported as success.
    if data[0:12] != SCEL_FLAG:
        sys.exit("not *.scel: %s" % filename)

    table = get_table(data[TABLE_START:CHINESE_START])
    if table is None:
        # Without this guard the None table would surface later as an
        # unrelated TypeError during the pinyin lookup.
        sys.exit("pinyin table not found: %s" % filename)

    return get_chinese(data[CHINESE_START:], table)
def main(scel_files):
    """Convert each *.scel file into a UTF-8 text file next to it.

    For every input path, a file named ``<path>.txt`` is written with one
    ``count pinyin word`` line per entry returned by ``analyse``.

    Args:
        scel_files: Iterable of paths to *.scel files.
    """
    # io.open accepts an encoding on both Python 2 and Python 3, which
    # removes the need for the Python-2-only ``unicode()`` builtin.
    import io

    for f in scel_files:
        results = analyse(f)
        with io.open('%s%s' % (f, '.txt'), 'w', encoding='utf-8') as out_stream:
            out_stream.writelines(u"%s %s %s\n" % item for item in results)
if __name__ == '__main__':
    # Command-line entry point: every argument after the script name is
    # treated as the path of a file to convert.
    script_name, scel_paths = sys.argv[0], sys.argv[1:]
    if not scel_paths:
        print("Usage: %s path_to_scel [path_to_scel]..." % script_name)
        sys.exit(0)
    main(scel_paths)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment