Skip to content

Instantly share code, notes, and snippets.

@yzhangcs
Last active November 17, 2024 05:01
Show Gist options
  • Save yzhangcs/e093d77c357142e4ccf9d19f0d3d4928 to your computer and use it in GitHub Desktop.
Save yzhangcs/e093d77c357142e4ccf9d19f0d3d4928 to your computer and use it in GitHub Desktop.
Convert full-width characters to half-width ones.
# -*- coding: utf-8 -*-
import argparse
import unicodedata
# Full-width forms -> ASCII:
#   U+3000 (ideographic space)  -> U+0020
#   U+FF01..U+FF5D ('!' .. '}') -> U+0021..U+007D
# Every full-width form sits exactly 0xFEE0 above its ASCII counterpart, so
# the table is generated from code points instead of being spelled out with
# literal characters (which are easily mangled by width-folding tools).
# The table stops at '}' like the hand-written listing did, so the
# full-width tilde (U+FF5E) is deliberately left untouched.
# NOTE(review): the key of the space entry is taken to be U+3000, since
# U+FF00 is unassigned -- confirm against the upstream file.
MAP = {'\u3000': ' '}
MAP.update((chr(code + 0xFEE0), chr(code)) for code in range(0x21, 0x7E))
def ispunct(token):
    r'''Check whether every character of a token is punctuation.

    Parameters:
        token (str): the string to inspect.

    Returns:
        bool: True if the Unicode general category of each character starts
        with 'P'. An empty token yields True, because `all` of nothing is
        True.
    '''
    return all(unicodedata.category(char).startswith('P')
               for char in token)
def isfullwidth(token):
    r'''Check whether every character of a token is a wide character.

    Parameters:
        token (str): the string to inspect.

    Returns:
        bool: True if the East Asian Width property of each character is
        'W' (wide), 'F' (full-width) or 'A' (ambiguous). An empty token
        yields True, because `all` of nothing is True.
    '''
    return all(unicodedata.east_asian_width(char) in ('W', 'F', 'A')
               for char in token)
def islatin(token):
    r'''Check whether every character of a token is a Latin character.

    Parameters:
        token (str): the string to inspect.

    Returns:
        bool: True if the Unicode name of each character contains 'LATIN'.
        An empty token yields True, because `all` of nothing is True.
    '''
    # Some characters (e.g. control characters such as '\n') have no Unicode
    # name; the '' default makes them count as non-Latin instead of letting
    # unicodedata.name raise ValueError.
    return all('LATIN' in unicodedata.name(char, '')
               for char in token)
def isdigit(token):
    r'''Check whether every character of a token is a digit.

    Parameters:
        token (str): the string to inspect.

    Returns:
        bool: True if the Unicode name of each character contains 'DIGIT'.
        An empty token yields True, because `all` of nothing is True.
    '''
    # Some characters (e.g. control characters such as '\t') have no Unicode
    # name; the '' default makes them count as non-digits instead of letting
    # unicodedata.name raise ValueError.
    return all('DIGIT' in unicodedata.name(char, '')
               for char in token)
def tohalfwidth(token):
    r'''Convert a string to its NFKC-normalized form.

    NFKC folds full-width forms to their half-width equivalents, but it also
    rewrites every other compatibility character (e.g. mathematical script
    letters become plain Latin letters).

    Parameters:
        token (str): the string to convert.

    Returns:
        str: the NFKC normalization of `token`.
    '''
    return unicodedata.normalize('NFKC', token)
def full2half(fin, fout, narrow=False):
    r'''Convert full-width characters to half-width ones.

    Each line of `fin` is stripped of leading and trailing whitespace,
    converted, and written to `fout` followed by a newline. The input is
    read completely before the output is opened.

    Parameters:
        fin (str): the file to convert.
        fout (str): the file to save.
        narrow (bool):
            True to convert only the characters listed in MAP;
            False to apply NFKC normalization to each line.
    '''
    # Pin the encoding: the platform default is locale-dependent and may be
    # unable to decode or encode full-width and CJK text.
    with open(fin, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f]
    if narrow:
        # One translation table, applied to each line in a single pass.
        table = str.maketrans(MAP)
        lines = [line.translate(table) for line in lines]
    else:
        lines = [tohalfwidth(line) for line in lines]
    with open(fout, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)
if __name__ == "__main__":
    # Command-line entry point: parse the options and convert one file.
    parser = argparse.ArgumentParser(
        description='Convert full-width characters to half-width ones.'
    )
    # Both paths are mandatory; without `required=True` a missing option
    # would reach open() as None and fail with an unhelpful TypeError.
    parser.add_argument('--fin', required=True, help='the file to convert')
    parser.add_argument('--fout', required=True, help='the file to save')
    parser.add_argument('--narrow', action='store_true',
                        help='only convert the characters in the above table')
    args = parser.parse_args()
    full2half(args.fin, args.fout, args.narrow)
# 0123456789
# ABCDEFGHIJKLMNOPQRSTUVWXYZ
# abcdefghijklmnopqrstuvwxyz
# 他们这里洋文好的人多得很呐
#  　！＂＃＄％＆＇（）＊＋，－．／
# ０１２３４５６７８９
# ：；＜＝＞？＠
# ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ
# ［＼］＾＿｀
# ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ
# ｛｜｝
# 𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩
# 𝓪𝓫𝓬𝓭𝓮𝓯𝓰𝓱𝓲𝓳𝓴𝓵𝓶𝓷𝓸𝓹𝓺𝓻𝓼𝓽𝓾𝓿𝔀𝔁𝔂𝔃
# 𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩 -> ABCDEFGHIJKLMNOPQRSTUVWXYZ
python full2half.py --fin=demo.txt --fout=out.txt
# 𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩 -> 𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩
python full2half.py --fin=demo.txt --fout=out.narrow.txt --narrow
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment