Last active
August 29, 2015 14:01
-
-
Save shonenada/322ea9e76eb343eb32da to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -*- | |
import sys | |
import struct | |
TABLE_START = 0x1540 | |
CHINESE_START = 0x2628 | |
TABLE_START_FLAG = '\x9D\x01\x00\x00' | |
SCEL_FLAG = '\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00' | |
def unpack(code): | |
fmt_chr = struct.unpack('H', code) | |
return fmt_chr[0] | |
def byte2str(byte): | |
"""将原始字节码转换成字符串""" | |
output = [] | |
length = len(byte) | |
for pos in xrange(0, len(byte), 2): | |
unicode_code = byte[pos : pos + 2] | |
# transform uni to unsigned short. | |
fmt_chr = unpack(unicode_code) | |
# transform fmt_chr to unicode by its Unicode code. | |
char = unichr(fmt_chr) | |
if char not in ('\t', ' '): | |
output.append(char) | |
elif char == '\t': | |
output.append('\n') | |
return ''.join(output) | |
def get_table(data): | |
pinyin_table = {} | |
flag = data[0:4] | |
if flag != TABLE_START_FLAG: | |
return None | |
pos = 0 | |
byte = data[4:] | |
length = len(byte) | |
while pos < length: | |
idx_pos = byte[pos : pos + 2] | |
index = unpack(idx_pos) | |
pos += 2 | |
size_pos = byte[pos : pos + 2] | |
size = unpack(size_pos) | |
pos += 2 | |
pinyin = byte2str(byte[pos : pos + size]) | |
pinyin_table[index] = pinyin | |
pos += size | |
return pinyin_table | |
def get_word_pinyin(data, table): | |
output = [] | |
for pos in xrange(0, len(data), 2): | |
uni = data[pos : pos + 2] | |
index = unpack(uni) | |
output.append(table[index]) | |
return ''.join(output) | |
def get_chinese(data, table): | |
pos = 0 | |
results = [] | |
length = len(data) | |
while pos < length: | |
same = unpack(data[pos : pos + 2]) | |
pos += 2 | |
pinyin_table_len = unpack(data[pos : pos + 2]) | |
pos += 2 | |
pinyin = get_word_pinyin(data[pos : pos + pinyin_table_len], table) | |
pos += pinyin_table_len | |
for i in xrange(same): | |
chinese_len = unpack(data[pos : pos + 2]) | |
pos += 2 | |
word = byte2str(data[pos : pos + chinese_len]) | |
pos += chinese_len | |
ext_len = unpack(data[pos : pos + 2]) | |
pos += 2 | |
count = unpack(data[pos : pos + 2]) | |
results.append((count, pinyin, word)) | |
pos += ext_len | |
return results | |
def analyse(filename): | |
with open(filename, 'rb') as scel: | |
data = scel.read() | |
if data[0:12] != SCEL_FLAG: | |
print("not *.scel") | |
sys.exit(0) | |
table = get_table(data[TABLE_START : CHINESE_START]) | |
results = get_chinese(data[CHINESE_START:], table) | |
return results | |
def main(scel_files): | |
for f in scel_files: | |
results = analyse(f) | |
with open('%s%s' % (f, '.txt'), 'w') as out_stream: | |
for item in results: | |
line = "%s %s %s" % item | |
out_stream.write(unicode(line).encode('utf-8')) | |
out_stream.write('\n') | |
if __name__ == '__main__': | |
args = sys.argv | |
if len(args) < 2: | |
print("Usage: %s path_to_scel [path_to_scel]..." % args[0]) | |
sys.exit(0) | |
args = args[1:] | |
main(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment