Last active
April 7, 2024 12:14
-
-
Save hjeffrey/df08677b4d33753510ced204bfc34ed3 to your computer and use it in GitHub Desktop.
把康熙部首替换为简体中文
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Replace Kangxi radical characters with the equivalent Simplified Chinese characters
import glob
import json
import os
import sys
# Inspect how a string is encoded
def encodingValue(key):
    """Print *key* next to its JSON-escaped form, separated by "-".

    ``json.dumps`` escapes non-ASCII characters by default, so the output
    shows which code point a visually ambiguous character really is.
    """
    print(key + "-" + json.dumps(key))
# Load the Kangxi-radical -> Simplified Chinese lookup table
def initKangxiDic():
    """Populate the module-level ``kangxi_map`` and return it.

    Keys are characters from the Unicode "Kangxi Radicals" block
    (U+2F00-U+2FD5); values are the ordinary CJK ideographs that should
    replace them. The table is partial: radicals that are not listed are
    left untouched by the replacement step.

    Returns:
        The dictionary that was stored in the global ``kangxi_map``.
    """
    global kangxi_map
    # Kangxi radicals: https://unicode-table.com/cn/blocks/kangxi-radicals/
    # Mapping supplemented from
    # https://raw.githubusercontent.com/furuiyang0715/spider_notes/master/codes/kangxi.json
    kangxi_map = {
        "⼀": "一", "⼄": "乙", "⼆": "二", "⼈": "人", "⼉": "儿", "⼊": "入",
        "⼋": "八", "⼏": "几", "⼑": "刀", "⼒": "力", "⼔": "匕", "⼗": "十",
        "⼘": "卜", "⼚": "厂", "⼜": "又", "⼝": "口", "⼞": "口", "⼟": "土",
        "⼠": "士", "⼣": "夕", "⼤": "大", "⼥": "女", "⼦": "子", "⼨": "寸",
        "⼩": "小", "⼫": "尸", "⼭": "山", "⼯": "工", "⼰": "己", "⼲": "干",
        "⼴": "广", "⼸": "弓", "⼼": "心", "⼽": "戈", "⼿": "手", "⽀": "支",
        "⽂": "文", "⽃": "斗", "⽄": "斤", "⽅": "方", "⽆": "无", "⽇": "日",
        "⽈": "曰", "⽉": "月", "⽊": "木", "⽋": "欠", "⽌": "止", "⽍": "歹",
        "⽏": "毋", "⽐": "比", "⽑": "毛", "⽒": "氏", "⽓": "气", "⽔": "水",
        "⽕": "火", "⽖": "爪", "⽗": "父", "⽚": "片", "⽛": "牙", "⽜": "牛",
        "⽝": "犬", "⽞": "玄", "⽟": "玉", "⽠": "瓜", "⽡": "瓦", "⽢": "甘",
        "⽣": "生", "⽤": "用", "⽥": "田", "⽩": "白", "⽪": "皮", "⽫": "皿",
        "⽬": "目", "⽭": "矛", "⽮": "矢", "⽯": "石", "⽰": "示", "⽲": "禾",
        "⽳": "穴", "⽴": "立", "⽵": "竹", "⽶": "米", "⽸": "缶", "⽹": "网",
        "⽺": "羊", "⽻": "羽", "⽼": "老", "⽽": "而", "⽿": "耳", "⾁": "肉",
        "⾂": "臣", "⾃": "自", "⾄": "至", "⾆": "舌", "⾈": "舟", "⾉": "艮",
        "⾊": "色", "⾍": "虫", "⾎": "血", "⾏": "行", "⾐": "衣",
        # U+2F92 is the "see" radical; it was previously mapped to 儿 by mistake.
        "⾒": "见",
        "⾓": "角", "⾔": "言", "⾕": "谷", "⾖": "豆", "⾚": "赤", "⾛": "走",
        "⾜": "足", "⾝": "身", "⾞": "车", "⾟": "辛", "⾠": "辰", "⾢": "邑",
        "⾣": "酉", "⾤": "采", "⾥": "里", "⾦": "金", "⾧": "长", "⾨": "门",
        "⾩": "阜", "⾪": "隶", "⾬": "雨", "⾭": "青", "⾮": "非", "⾯": "面",
        "⾰": "革", "⾲": "韭", "⾳": "音", "⾴": "页", "⾵": "风", "⾶": "飞",
        "⾷": "食", "⾸": "首", "⾹": "香", "⾺": "马", "⾻": "骨", "⾼": "高",
        "⿁": "鬼", "⿂": "鱼", "⿃": "鸟", "⿄": "卤", "⿅": "鹿", "⿇": "麻",
        "⿉": "黍", "⿊": "黑", "⿍": "鼎", "⿎": "鼓", "⿏": "鼠", "⿐": "鼻",
        "⿒": "齿", "⿓": "龙", "⿔": "龟",
        # U+2FD5 is the "flute" radical; it was previously mapped to 仑 by mistake.
        "⿕": "龠",
    }
    # Debug aid: uncomment to print the escaped form of every replacement.
    # for key in kangxi_map:
    #     encodingValue(kangxi_map[key])
    return kangxi_map
# Replace Kangxi radical characters in a file
def repaceKangxi(fileName, newFileName):
    """Copy *fileName* to *newFileName*, replacing Kangxi radicals.

    Every key of the global ``kangxi_map`` found in the input is replaced
    by its mapped value. ``initKangxiDic()`` must have been called first so
    that ``kangxi_map`` exists.

    The replacement is done on UTF-8 bytes rather than decoded text. That
    keeps the result independent of the platform's default encoding,
    preserves the original line endings, and behaves the same on Python 2
    (where the map holds byte strings) and Python 3 (where it holds text).

    Args:
        fileName: Path of the UTF-8 encoded file to read.
        newFileName: Path of the file to write; overwritten if it exists.
    """
    replacements = []
    for radical, plain in kangxi_map.items():
        # Normalise both sides to UTF-8 bytes whatever the interpreter gave us.
        if not isinstance(radical, bytes):
            radical = radical.encode('utf-8')
        if not isinstance(plain, bytes):
            plain = plain.encode('utf-8')
        replacements.append((radical, plain))

    # Read everything before opening the output, so that passing the same
    # path for input and output still works.
    with open(fileName, 'rb') as src:
        data = src.read()

    for radical, plain in replacements:
        data = data.replace(radical, plain)

    with open(newFileName, 'wb') as dst:
        dst.write(data)
# Script entry point: <script> <path-to-file>
# Writes a converted copy next to the input file, named fix_kangxi_<name>
# (or fix_kangxi<N>_<name> when that name is already taken).
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # No path given: report it instead of failing with IndexError.
        print("请输入正确的文件路径")
        sys.exit(1)

    path = sys.argv[1]
    # Python 2 passes arguments as byte strings; Python 3 already gives text,
    # which has no .decode().
    if isinstance(path, bytes):
        path = path.decode('utf-8')

    if os.path.isfile(path):
        initKangxiDic()
        folder, baseName = os.path.split(path)
        # os.path.join keeps the output beside the input even when the path
        # has no directory part (plain concatenation would yield "/fix_...").
        newPath = os.path.join(folder, 'fix_kangxi_' + baseName)
        i = 1
        # Never overwrite an earlier result: pick the first unused name.
        while os.path.exists(newPath):
            i += 1
            newPath = os.path.join(folder, 'fix_kangxi' + str(i) + '_' + baseName)
        repaceKangxi(path, newPath)
        print(u"转换完成: " + newPath)
    else:
        # Directories and nonexistent paths are both rejected.
        print("请输入正确的文件路径")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this work 🤗. It helped me deal with the unpleasant characters in my text data.