Last active
September 30, 2022 19:38
-
-
Save avalonv/33874f2e4b841394488d556eb65fc604 to your computer and use it in GitHub Desktop.
Replace unicode symbols with ascii ones
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# this will TRY to replace a set of unicode characters with a corresponding set
# of US-ASCII ones. requires unidecode (https://github.com/avian2/unidecode),
# run 'pip3 install unidecode' to install.
# useful for translating mathematical symbols commonly found in PDFs into | |
# plain ascii ones, NOT recommended for transliterating text that doesn't use | |
# the latin alphabet, though it can still spot instances of that text for you. | |
# there's no guarantee the replacements it suggests will be accurate, so you
# should carefully inspect the suggestion for each individual line before
# writing.
# This code is licensed under the terms of the GNU General Public License v3 | |
# gwyn oscuro 2022 | |
from sys import argv, exit

from unidecode import unidecode as cleanse
# symbols to explicitly ignore (append to end of string, don't use commas) | |
ignore = '−' | |
# any symbols with unicode codes higher than this value will be replaced. | |
# set the cutoff on a case by case basis, start with a high value value and | |
# decrease it accordingly to filter more stuff, there's no universal solution | |
# ex: 8320 will ignore things like em dashes and quotes, ideal for book quotes | |
cutoff = 8230 | |
if len(argv) > 1: | |
file = argv[1] | |
else: | |
print('Please supply a file') | |
exit(1) | |
with open(file, newline='\n', mode='r', encoding='utf8') as f: | |
rlines = f.readlines() | |
width = len(str(len(rlines))) | |
grace = [ord(c) for c in ignore] | |
grace.extend(range(0,cutoff+1)) | |
converts = [] | |
impenitents = [] | |
targets = [] | |
total = 0 | |
for i, line in enumerate(rlines): | |
line = line.strip() | |
sins = ''.join((c for c in line if not ord(c) in grace)) | |
if any(sins): | |
total += 1 | |
repenters = cleanse(sins) | |
fmt_unicd = "{:>{width}} | unicode: '{}'" | |
fmt_ascii = "{:>{width}} | ascii: '{}'" | |
if len(sins) == len(repenters): | |
converts.append(sins) | |
targets.append(i) | |
print(fmt_unicd.format(i+1, sins, width=width)) | |
print(fmt_ascii.format(' ', repenters, width=width)) | |
else: | |
impenitents.append(i+1) | |
print("{:>{width}} | WARNING: char count mismatch, discarding". | |
format(i+1, width=width)) | |
print(fmt_unicd.format(' ', sins, width=width)) | |
print(fmt_ascii.format(' ', repenters, width=width)) | |
print("########################################") | |
print("-Total offending lines:", total) | |
if any(impenitents): | |
impenitents = ', '.join((str(n) for n in impenitents)) | |
print("-Please manually check lines", impenitents) | |
if any(converts): | |
converts = list(set(''.join(converts))) | |
converts.sort() | |
print("-Offending characters:") | |
print(converts) | |
print("-Replace these characters with ascii?") | |
if input(">type YES to proceed, anything else to quit: ").lower() == 'yes': | |
wlines = rlines | |
for i, line in enumerate(rlines): | |
if i in targets: | |
for c in converts: | |
if c in line: | |
line = line.replace(c, cleanse(c)) | |
wlines[i] = line | |
with open(file, newline='\n', mode='w', encoding='utf8') as f: | |
f.writelines(wlines) | |
else: | |
print("-Aborting") | |
exit(0) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment