Skip to content

Instantly share code, notes, and snippets.

@suapapa
Created April 3, 2013 23:12
Show Gist options
  • Save suapapa/5306329 to your computer and use it in GitHub Desktop.
Save suapapa/5306329 to your computer and use it in GitHub Desktop.
Script that generates go-charset's cp949.dat data file from CP949.TXT.
#!/usr/bin/env python
# CP949 specs;
# http://ko.wikipedia.org/wiki/%EC%BD%94%EB%93%9C_%ED%8E%98%EC%9D%B4%EC%A7%80_949
# http://msdn.microsoft.com/ko-kr/goglobal/cc305154.aspx
import re
# Matches one mapping row: two "0x"-prefixed runs of uppercase hex digits
# separated by optional whitespace; each run is captured as a group.
ptn = re.compile(r'0x([A-F0-9]*)\s*0x([A-F0-9]*)')

# Running state shared by the conversion loop below.
# Counters (and the start code of the chunk being built) all begin at zero.
chunk_begin = chunk_cnt = code_cnt = 0
# Payload of the chunk being built, and all serialized chunks so far.
chunk_buf = data_buf = ""
def makeChunkData(begin, chunk):
    """Serialize one chunk as a 4-byte big-endian header plus its payload.

    Args:
        begin: Integer written as the first two header bytes (high byte
            first); must fit in 16 bits.
        chunk: Byte-string payload. Its length is written as the last two
            header bytes (high byte first) and must fit in 16 bits.

    Returns:
        A byte string of ``len(chunk) + 4`` bytes: header, then ``chunk``.

    Raises:
        ValueError: If ``begin`` or ``len(chunk)`` is outside 0..0xFFFF.
            Masking such a value down to 16 bits would silently write a
            wrong header.
    """
    cl = len(chunk)
    for label, value in (("begin", begin), ("chunk length", cl)):
        if not 0 <= value <= 0xffff:
            raise ValueError("%s %r does not fit in 16 bits" % (label, value))
    # bytes(bytearray(...)) yields a byte string regardless of whether
    # chr() returns bytes or text on the running interpreter.
    header = bytes(bytearray([begin >> 8, begin & 0xff, cl >> 8, cl & 0xff]))
    return header + chunk
# Native code of the previously accepted mapping; 0 means "none seen yet".
last_native = 0

# XXX: download CP949.TXT from:
# http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT
with open("CP949.TXT") as mapping_file:
    for line in mapping_file:
        if line.startswith("#"):
            continue

        # Only lines holding exactly one "0x<native> 0x<unicode>" pair are
        # treated as mappings; everything else is skipped.
        rst = ptn.findall(line)
        if len(rst) != 1:
            continue

        curr_native, curr_unicode = int(rst[0][0], 16), int(rst[0][1], 16)
        # Identity mappings are not stored in the output.
        if curr_native == curr_unicode:
            continue

        code_cnt += 1
        if chunk_begin == 0:
            chunk_begin = curr_native

        # A gap in the native code sequence closes the current chunk and
        # starts a new one at the current code.
        if (curr_native - 1 != last_native) and (last_native != 0):
            data_buf += makeChunkData(chunk_begin, chunk_buf)
            chunk_cnt += 1
            chunk_begin = curr_native
            chunk_buf = ""

        chunk_buf += unichr(curr_unicode).encode('utf-8')
        last_native = curr_native

# Flush the chunk that was still being accumulated when the input ended.
data_buf += makeChunkData(chunk_begin, chunk_buf)
chunk_cnt += 1

# The output is binary data, so it must be opened in binary mode: text mode
# on Windows rewrites every 0x0A byte as 0x0D 0x0A and corrupts the file.
with open('cp949.dat', 'wb') as wp:
    # File header: code count, then chunk count, each as 16 bits, high byte
    # first.
    wp.write(chr((code_cnt & 0xff00) >> 8))
    wp.write(chr(code_cnt & 0xff))
    wp.write(chr((chunk_cnt & 0xff00) >> 8))
    wp.write(chr(chunk_cnt & 0xff))
    wp.write(data_buf)

print("chunk_cnt = %s code_cnt = %s" % (chunk_cnt, code_cnt))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment