Last active
January 10, 2022 15:48
-
-
Save HirbodBehnam/d0fa09179ed36975de93db6d5b70cd55 to your computer and use it in GitHub Desktop.
This script reads UnicodeData.txt, builds a character decomposition table from it, and writes that table to a JSON file which can be used from other programming languages. It then uses the table to decompose the Unicode characters in a text file. I used this script to fix the Persian characters extracted from PDF files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import json

# Matches a tagged decomposition field such as "<isolated> 0644 0627" and
# captures the space-separated hexadecimal code points that follow the tag.
_TAGGED_DECOMPOSITION = re.compile(r'^<.+> (.+)$')

# Number of characters read from the input file per iteration.
_CHUNK_SIZE = 64 * 1024


def _load_decompositions(lines):
    """Build a {character: replacement string} table from UnicodeData.txt lines.

    Each line is split on ';'. Field 0 is the character's code point in hex
    and field 5 is its decomposition. Only decompositions that start with a
    ``<tag>`` are kept; entries with an empty or untagged field 5 are ignored.
    The replacement is the concatenation of the listed code points; it is not
    decomposed any further.

    Args:
        lines: An iterable of text lines in UnicodeData.txt format.

    Returns:
        A dict mapping a single character to its replacement string.
    """
    table = {}
    for line in lines:
        fields = line.split(';')
        # Blank or truncated lines have no decomposition field; skip them
        # instead of failing with an IndexError.
        if len(fields) <= 5 or fields[5] == '':
            continue
        match = _TAGGED_DECOMPOSITION.search(fields[5])
        if match is None:
            continue
        table[chr(int(fields[0], 16))] = ''.join(
            chr(int(code_point, 16)) for code_point in match.group(1).split()
        )
    return table


def _decompose_stream(source, sink, table):
    """Copy text from source to sink, replacing every character in table.

    Args:
        source: A text file object opened for reading.
        sink: A text file object opened for writing.
        table: A dict mapping single characters to replacement strings.
    """
    translation = str.maketrans(table)
    # The mapping is per character, so the text can be processed in large
    # chunks rather than one read()/write() call per character.
    while True:
        chunk = source.read(_CHUNK_SIZE)
        if not chunk:
            break
        sink.write(chunk.translate(translation))


def main():
    """Create decomposition.json from UnicodeData.txt and fix in.txt into out.txt."""
    # Get this file from http://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    with open('UnicodeData.txt', 'r', encoding='utf-8') as data:
        fixset = _load_decompositions(data)
    # Print to JSON for other apps
    with open('decomposition.json', 'w', encoding='utf-8') as decomposition:
        json.dump(fixset, decomposition, separators=(',', ":"))
    # Fix file
    with open('in.txt', 'r', encoding='utf-8') as source:
        with open('out.txt', 'w', encoding='utf-8') as sink:
            _decompose_stream(source, sink, fixset)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment