avalonv · September 30, 2022 19:38
diff --git a/unicode2ascii.py b/unicode2ascii.py
 #!/usr/bin/python3
 # this will TRY to replace a set of unicode characters with a corresponding set
 # of us-ascii ones. requires unidecode (https://github.com/avian2/unidecode),
 # run 'pip3 install unidecode' to install.
 # useful for translating mathematical symbols commonly found in PDFs into
 # plain ascii ones, NOT recommended for transliterating text that doesn't use
 # the latin alphabet, though it can still spot instances of that text for you.
 # there's no guarantee the replacements it suggests will be accurate, so you
 # should carefully inspect the suggestion for each individual line before
 # writing.
 # This code is licensed under the terms of the GNU General Public License v3
 # gwyn oscuro 2022
 from unidecode import unidecode as cleanse
 from sys import argv, exit

 # symbols to explicitly ignore (append to end of string, don't use commas).
 # these are never reported, even when their code point is above the cutoff
 ignore = '−'

 # any symbols with unicode code points higher than this value will be
 # replaced. set the cutoff on a case by case basis: start with a high value
 # and decrease it to filter more stuff, there's no universal solution.
 # ex: 8230 (U+2026, the horizontal ellipsis) will ignore things like em
 # dashes, curly quotes and ellipses, ideal for book quotes
 cutoff = 8230

 # the file to inspect (and possibly rewrite) is the first command line
 # argument; without it there is nothing to do
 if len(argv) < 2:
    print('Please supply a file')
    exit(1)
 file = argv[1]

 # newline='\n' splits lines on '\n' only and hands the line endings back
 # untranslated
 with open(file, newline='\n', mode='r', encoding='utf8') as f:
    rlines = list(f)
 # digits in the highest line number, used to align the report column
 width = len(str(len(rlines)))

 # code points that are never reported: everything up to and including the
 # cutoff plus the explicitly ignored symbols. this is a set so that the
 # per-character lookup in the loop below is a hash lookup rather than a
 # linear scan over cutoff+1 integers
 grace = set(range(cutoff + 1))
 grace.update(ord(c) for c in ignore)
 converts = []     # offending characters of each line that can be fixed
 impenitents = []  # 1-based numbers of the lines that need a manual check
 targets = []      # 0-based indexes of the lines that can be fixed
 total = 0         # number of lines with at least one offending character

 # report layout: right-aligned line number column, then the characters
 fmt_unicd = "{:>{width}} | unicode: '{}'"
 fmt_ascii = "{:>{width}} |   ascii: '{}'"

 for i, line in enumerate(rlines):
    line = line.strip()
    # every character of this line whose code point is not in grace
    sins = ''.join(c for c in line if ord(c) not in grace)
    if not sins:
        continue
    total += 1
    repenters = cleanse(sins)
    if len(sins) == len(repenters):
        # same length before and after: queue the line for replacement
        converts.append(sins)
        targets.append(i)
        print(fmt_unicd.format(i+1, sins, width=width))
        print(fmt_ascii.format(' ', repenters, width=width))
    else:
        # the ascii version is longer or shorter than the original, so the
        # line is only reported and left out of the replacement targets
        impenitents.append(i+1)
        print("{:>{width}} | WARNING: char count mismatch, discarding".
              format(i+1, width=width))
        print(fmt_unicd.format(' ', sins, width=width))
        print(fmt_ascii.format(' ', repenters, width=width))

 # summary of the scan, followed by the optional in-place rewrite
 print("########################################")
 print("-Total offending lines:", total)
 if impenitents:
    impenitents = ', '.join(str(n) for n in impenitents)
    print("-Please manually check lines", impenitents)
 if converts:
    # unique offending characters across all fixable lines, sorted
    converts = sorted(set(''.join(converts)))
    print("-Offending characters:")
    print(converts)
    print("-Replace these characters with ascii?")
    if input(">type YES to proceed, anything else to quit: ").lower() != 'yes':
        print("-Aborting")
        exit(0)
    # one translation table, built once, replaces every offender in a single
    # pass per line instead of one str.replace() call per character
    table = str.maketrans({c: cleanse(c) for c in converts})
    fixable = set(targets)
    # only lines queued during the scan are touched; the rest are written
    # back exactly as they were read
    wlines = [line.translate(table) if i in fixable else line
              for i, line in enumerate(rlines)]
    with open(file, newline='\n', mode='w', encoding='utf8') as f:
        f.writelines(wlines)
	#!/usr/bin/python3
# this will TRY to replace a set of unicode characters with a corresponding set
# of us-ascii ones. requires unidecode (https://github.com/avian2/unidecode),
# run 'pip3 install unidecode' to install.
# useful for translating mathematical symbols commonly found in PDFs into
# plain ascii ones, NOT recommended for transliterating text that doesn't use
# the latin alphabet, though it can still spot instances of that text for you.
# there's no guarantee the replacements it suggests will be accurate, so you
# should carefully inspect the suggestion for each individual line before
# writing.
# This code is licensed under the terms of the GNU General Public License v3
# gwyn oscuro 2022
from unidecode import unidecode as cleanse
from sys import argv, exit

# symbols to explicitly ignore (append to end of string, don't use commas).
# these are never reported, even when their code point is above the cutoff
ignore = '−'

# any symbols with unicode code points higher than this value will be
# replaced. set the cutoff on a case by case basis: start with a high value
# and decrease it to filter more stuff, there's no universal solution.
# ex: 8230 (U+2026, the horizontal ellipsis) will ignore things like em
# dashes, curly quotes and ellipses, ideal for book quotes
cutoff = 8230

# the file to inspect (and possibly rewrite) is the first command line argument
if len(argv) > 1:
    file = argv[1]
else:
    print('Please supply a file')
    exit(1)

# newline='\n' splits lines on '\n' only and hands the line endings back
# untranslated
with open(file, newline='\n', mode='r', encoding='utf8') as f:
    rlines = f.readlines()
    # digits in the highest line number, used to align the report column
    width = len(str(len(rlines)))

# code points that are never reported: everything up to and including the
# cutoff plus the explicitly ignored symbols. this is a set so that the
# per-character lookup in the loop below is a hash lookup rather than a
# linear scan over cutoff+1 integers
grace = set(range(cutoff + 1))
grace.update(ord(c) for c in ignore)
converts = []     # offending characters of each line that can be fixed
impenitents = []  # 1-based numbers of the lines that need a manual check
targets = []      # 0-based indexes of the lines that can be fixed
total = 0         # number of lines with at least one offending character

# report layout: right-aligned line number column, then the characters
fmt_unicd = "{:>{width}} | unicode: '{}'"
fmt_ascii = "{:>{width}} |   ascii: '{}'"

for i, line in enumerate(rlines):
    line = line.strip()
    # every character of this line whose code point is not in grace
    sins = ''.join(c for c in line if ord(c) not in grace)
    if not sins:
        continue
    total += 1
    repenters = cleanse(sins)
    if len(sins) == len(repenters):
        # same length before and after: queue the line for replacement
        converts.append(sins)
        targets.append(i)
        print(fmt_unicd.format(i+1, sins, width=width))
        print(fmt_ascii.format(' ', repenters, width=width))
    else:
        # the ascii version is longer or shorter than the original, so the
        # line is only reported and left out of the replacement targets
        impenitents.append(i+1)
        print("{:>{width}} | WARNING: char count mismatch, discarding".
              format(i+1, width=width))
        print(fmt_unicd.format(' ', sins, width=width))
        print(fmt_ascii.format(' ', repenters, width=width))

print("########################################")
print("-Total offending lines:", total)
if impenitents:
    impenitents = ', '.join(str(n) for n in impenitents)
    print("-Please manually check lines", impenitents)
if converts:
    # unique offending characters across all fixable lines, sorted
    converts = sorted(set(''.join(converts)))
    print("-Offending characters:")
    print(converts)
    print("-Replace these characters with ascii?")
    if input(">type YES to proceed, anything else to quit: ").lower() == 'yes':
        # one translation table, built once, replaces every offender in a
        # single pass per line instead of one str.replace() per character
        table = str.maketrans({c: cleanse(c) for c in converts})
        fixable = set(targets)
        # only lines queued during the scan are touched; the rest are
        # written back exactly as they were read
        wlines = [line.translate(table) if i in fixable else line
                  for i, line in enumerate(rlines)]
        with open(file, newline='\n', mode='w', encoding='utf8') as f:
            f.writelines(wlines)
    else:
        print("-Aborting")
        exit(0)