Last active
November 17, 2024 05:01
-
-
Save yzhangcs/e093d77c357142e4ccf9d19f0d3d4928 to your computer and use it in GitHub Desktop.
Convert full-width characters to half-width ones.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import argparse | |
import unicodedata | |
# Narrow conversion table: full-width form -> ASCII counterpart.
# The full-width forms U+FF01-U+FF5E sit at a fixed offset of 0xFEE0 above
# the printable ASCII characters U+0021-U+007E, so the table is generated
# from code points instead of being spelled out literal by literal (which
# is fragile: full-width literals are easily normalized away by tooling).
# The full-width (ideographic) space U+3000 is outside that block and is
# mapped onto the ASCII space explicitly.
MAP = {'\u3000': ' '}
MAP.update((chr(code + 0xFEE0), chr(code)) for code in range(0x21, 0x7F))
def ispunct(token):
    r'''Check whether a token consists of punctuation only.

    Parameters:
        token (str): the string to inspect.

    Returns:
        True if the Unicode general category of every character starts
        with 'P'; an empty token yields True (``all`` of nothing).
    '''
    return all(unicodedata.category(char).startswith('P') for char in token)
def isfullwidth(token):
    r'''Check whether a token consists of full-width characters only.

    Parameters:
        token (str): the string to inspect.

    Returns:
        True if the East Asian Width property of every character is
        'W' (wide), 'F' (full-width) or 'A' (ambiguous); an empty token
        yields True (``all`` of nothing).
    '''
    wide_classes = ('W', 'F', 'A')
    return all(unicodedata.east_asian_width(char) in wide_classes
               for char in token)
def islatin(token):
    r'''Check whether a token consists of Latin characters only.

    Parameters:
        token (str): the string to inspect.

    Returns:
        True if the Unicode name of every character contains 'LATIN';
        an empty token yields True (``all`` of nothing).
    '''
    # Characters without a Unicode name (e.g. '\n' or '\t') make
    # unicodedata.name raise ValueError unless a default is supplied;
    # the empty default simply classifies them as non-Latin.
    return all('LATIN' in unicodedata.name(char, '') for char in token)
def isdigit(token):
    r'''Check whether a token consists of digit characters only.

    Parameters:
        token (str): the string to inspect.

    Returns:
        True if the Unicode name of every character contains 'DIGIT';
        an empty token yields True (``all`` of nothing).
    '''
    # Characters without a Unicode name (e.g. '\n' or '\t') make
    # unicodedata.name raise ValueError unless a default is supplied;
    # the empty default simply classifies them as non-digits.
    return all('DIGIT' in unicodedata.name(char, '') for char in token)
def tohalfwidth(token):
    r'''Convert a string to its half-width form via NFKC normalization.

    NFKC replaces every compatibility character with its canonical
    equivalent, so besides full-width forms it also rewrites other
    compatibility characters such as mathematical script letters.

    Parameters:
        token (str): the string to convert.

    Returns:
        The NFKC-normalized string.
    '''
    return unicodedata.normalize('NFKC', token)
def full2half(fin, fout, narrow=False):
    r'''Convert full-width characters to half-width ones.

    Each line of the input is stripped of leading and trailing whitespace,
    converted, and written to the output followed by a newline.

    Parameters:
        fin (str): the file to convert (read as UTF-8).
        fout (str): the file to save (written as UTF-8).
        narrow (bool):
            True if only convert the characters listed in MAP
            (the range [FF00, FF5F));
            False to apply NFKC normalization to every character.
    '''
    # Decode explicitly as UTF-8: the locale default encoding may be unable
    # to represent full-width text at all.
    # The input is read completely before the output is opened, so fin and
    # fout may name the same file.
    with open(fin, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f]
    if narrow:
        # Table lookup per character; anything not in MAP passes through.
        lines = [''.join(MAP.get(char, char) for char in line)
                 for line in lines]
    else:
        lines = [tohalfwidth(line) for line in lines]
    with open(fout, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)
if __name__ == "__main__":
    # Command-line entry point: parse the paths and mode, then convert.
    parser = argparse.ArgumentParser(
        description='Convert full-width characters to half-width ones.'
    )
    # Both paths are mandatory; without required=True a missing option
    # would reach open() as None and fail with an opaque TypeError instead
    # of an argparse usage message.
    parser.add_argument('--fin', required=True, help='the file to convert')
    parser.add_argument('--fout', required=True, help='the file to save')
    parser.add_argument('--narrow', action='store_true',
                        help='only convert the characters in the above table')
    args = parser.parse_args()
    full2half(args.fin, args.fout, args.narrow)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 0123456789 | |
# ABCDEFGHIJKLMNOPQRSTUVWXYZ | |
# abcdefghijklmnopqrstuvwxyz | |
# 他们这里洋文好的人多得很呐 | |
# !"#$%&'()*+,-./ | |
# 0123456789 | |
# :;<=>?@ | |
# ABCDEFGHIJKLMNOPQRSTUVWXYZ | |
# [\]^_` | |
# abcdefghijklmnopqrstuvwxyz | |
# {|} | |
# 𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩 | |
# 𝓪𝓫𝓬𝓭𝓮𝓯𝓰𝓱𝓲𝓳𝓴𝓵𝓶𝓷𝓸𝓹𝓺𝓻𝓼𝓽𝓾𝓿𝔀𝔁𝔂𝔃 | |
# Default mode (NFKC normalization): 𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩 -> ABCDEFGHIJKLMNOPQRSTUVWXYZ
python full2half.py --fin=demo.txt --fout=out.txt
# Narrow mode (table lookup only): 𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩 -> 𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩
python full2half.py --fin=demo.txt --fout=out.narrow.txt --narrow
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment