-
-
Save yejingx/1ab4abf9d8fd370684af to your computer and use it in GitHub Desktop.
CJK Auto Formatting
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
from __future__ import (unicode_literals, print_function) | |
import os | |
import re | |
import sys | |
import codecs | |
# StreamReader factory that decodes a byte stream as UTF-8 (used for stdin).
utf8_reader = codecs.lookup('utf-8').streamreader
# Inclusive (first, last) code-point pairs treated as CJK ideographs.
# Each pair spans the whole Unicode block rather than only the code points
# assigned in an older Unicode version, so later additions (for example
# U+9FBC..U+9FFF) are recognised too. This also matches the
# [\u4e00-\u9fff] class used by the punctuation regexes in this file.
cjk_range = [
    ('\u3400', '\u4DBF'),  # CJK Unified Ideographs Extension A
    ('\u4E00', '\u9FFF'),  # CJK Unified Ideographs
    ('\uF900', '\uFAFF'),  # CJK Compatibility Ideographs
    ('\U00020000', '\U0002A6DF'),  # CJK Unified Ideographs Extension B
    ('\U0002F800', '\U0002FA1F'),  # CJK Compatibility Ideographs Supplement
]
# Inclusive (first, last) code-point pairs treated as punctuation or
# whitespace; no automatic space is inserted next to these characters.
punc_range = [
    ('\u0000', '\u0020'),  # control characters and the ASCII space
    ('\u201c', '\u201d'),  # left / right double quotation marks
    ('\u3000', '\u303f'),  # CJK Symbols and Punctuation
    ('\uff00', '\uffef'),  # Halfwidth and Fullwidth Forms
]
def _chinese_auto_spacing(text):
    """Insert a space at each boundary between CJK and non-CJK characters.

    A character is CJK when it falls in one of the ``cjk_range`` pairs and
    punctuation when it falls in one of the ``punc_range`` pairs. A single
    space is added between two adjacent characters when exactly one of them
    is CJK and neither of them is punctuation.

    :param text: the text to process, iterated character by character.
    :returns: the text with the extra spaces inserted.
    """
    def in_any(ch, ranges):
        # True when ch lies inside at least one inclusive (first, last) pair.
        return any(first <= ch <= last for first, last in ranges)

    pieces = []
    seen_first = False
    last_cjk = last_punc = False
    for ch in text:
        cjk = in_any(ch, cjk_range)
        punc = in_any(ch, punc_range)
        # Never pad around punctuation; otherwise pad on a CJK/non-CJK switch.
        if seen_first and not (punc or last_punc) and cjk != last_cjk:
            pieces.append(' ')
        pieces.append(ch)
        last_cjk, last_punc = cjk, punc
        seen_first = True
    return ''.join(pieces)
def _punc_sub(text):
    """Normalise whitespace and convert ASCII punctuation to CJK forms.

    Steps, in order:

    1. Collapse every run of spaces/tabs into one space.
    2. Drop spaces/tabs that precede ``. , : ; ( )``.
    3. Turn straight single or double quotes that wrap a run of CJK
       ideographs into curly double quotes.
    4. Turn a full stop that directly follows a CJK ideograph into the
       ideographic full stop (U+3002).
    5. Replace every ASCII ``, : ; ( )`` with its fullwidth counterpart.
    6. Drop one optional space/tab on either side of the CJK punctuation
       produced above.

    Fullwidth characters are written as ``\\uXXXX`` escapes so they cannot be
    silently folded to their ASCII look-alikes, which would turn step 5 into
    a no-op.

    :param text: text to normalise.
    :returns: the normalised text.
    """
    fullwidth = {
        ',': '\uff0c',  # FULLWIDTH COMMA
        ':': '\uff1a',  # FULLWIDTH COLON
        ';': '\uff1b',  # FULLWIDTH SEMICOLON
        '(': '\uff08',  # FULLWIDTH LEFT PARENTHESIS
        ')': '\uff09',  # FULLWIDTH RIGHT PARENTHESIS
    }
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'[ \t]+([.,:;()])', r'\1', text)
    text = re.sub(r'"([\u4e00-\u9fff]+)"', r'“\1”', text)
    text = re.sub(r"'([\u4e00-\u9fff]+)'", r'“\1”', text)
    text = re.sub(r'([\u4e00-\u9fff])\.', r'\1。', text)
    text = re.sub(r'[,:;()]', lambda m: fullwidth[m.group(0)], text)
    # Fullwidth comma, colon, ideographic full stop, fullwidth semicolon,
    # fullwidth parentheses and curly double quotes take no surrounding space.
    text = re.sub(
        r'[ \t]?([\uff0c\uff1a\u3002\uff1b\uff08\uff09\u201c\u201d])[ \t]?',
        r'\1', text)
    return text
def process_text(text):
    """Format *text* and return the result encoded as UTF-8.

    The text is first passed through ``_punc_sub`` and the result is then
    passed through ``_chinese_auto_spacing``.

    :param text: the text to format.
    :returns: the formatted text, encoded with ``.encode('utf-8')``.
    """
    spaced = _chinese_auto_spacing(_punc_sub(text))
    return spaced.encode('utf-8')
if __name__ == '__main__':
    # Usage: script.py [FILE]
    # Reads UTF-8 text from FILE when exactly one argument is given and it
    # names an existing file; otherwise reads UTF-8 text from stdin. The
    # formatted text is written to stdout as UTF-8.
    text = None
    if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
        # The context manager closes the file even if decoding fails.
        with codecs.open(sys.argv[1], 'rb', 'utf-8') as source:
            text = source.read()
    if text is None:
        # Python 3 exposes the byte stream as sys.stdin.buffer; where that
        # attribute is absent (Python 2), sys.stdin itself yields bytes.
        text = utf8_reader(getattr(sys.stdin, 'buffer', sys.stdin)).read()
    # process_text() returns UTF-8 bytes, so on Python 3 they must go to the
    # underlying byte stream rather than the text-mode sys.stdout.
    sink = getattr(sys.stdout, 'buffer', sys.stdout)
    sink.write(process_text(text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment