Last active
May 23, 2023 22:47
-
-
Save burnash/d6d35fbabd2566f2b1b9 to your computer and use it in GitHub Desktop.
Find strings with non-latin characters in files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import codecs | |
import unicodedata as ud | |
# from http://stackoverflow.com/questions/3094498/how-can-i-check-if-a-python-unicode-string-contains-non-western-letters | |
# Memoization table for is_latin(): maps a single character to the boolean
# answer previously computed for it.
latin_letters = {}


def is_latin(uchr):
    """Return True if the Unicode name of *uchr* contains the word 'LATIN'.

    Results are cached in the module-level ``latin_letters`` dict so each
    distinct character is looked up in the Unicode database only once.

    Characters that have no Unicode name at all (for example control
    characters such as a newline) are reported as not Latin instead of
    raising ``ValueError``.

    :param uchr: a single Unicode character.
    :returns: ``True`` when 'LATIN' appears in the character's name,
        ``False`` otherwise.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # unicodedata.name() raises ValueError for unnamed code points unless
        # a default is supplied; an empty name can never contain 'LATIN'.
        result = 'LATIN' in ud.name(uchr, '')
        latin_letters[uchr] = result
        return result
def only_roman_chars(unistr):
    """Tell whether every alphabetic character of *unistr* is a Latin letter.

    Non-alphabetic characters (digits, punctuation, whitespace, ...) are
    ignored, so a string without any letters yields ``True``.

    :param unistr: the Unicode string to inspect.
    :returns: ``False`` as soon as an alphabetic character that ``is_latin``
        rejects is found, ``True`` otherwise.
    """
    for ch in unistr:
        # Only letters are checked (the isalpha filter was suggested by
        # John Machin); everything else is skipped.
        if ch.isalpha() and not is_latin(ch):
            return False
    return True
# Script driver: walk the directory tree given as the first command-line
# argument and print every line, in every UTF-8 decodable file, that contains
# an alphabetic character which is not a Latin letter.
if len(sys.argv) < 2:
    # Fail with a usage hint rather than a bare IndexError traceback.
    sys.exit('usage: %s PATH' % sys.argv[0])
path = sys.argv[1]
for dirpath, dirnames, filenames in os.walk(path):
    for fn in filenames:
        full_path = os.path.join(dirpath, fn)
        try:
            with codecs.open(full_path, encoding='utf-8') as f:
                for line in f:
                    if not only_roman_chars(line):
                        # Single-argument print(...) calls produce the same
                        # output under Python 2 and Python 3.
                        print('%s :' % full_path)
                        print(line)
        except UnicodeDecodeError:
            # The file is not valid UTF-8 (likely binary or another
            # encoding): stop reading it and move on to the next file.
            continue
        except Exception:
            # Report which file failed, then let the error propagate.
            # KeyboardInterrupt/SystemExit are deliberately not caught so
            # they are not misreported as read errors.
            print('Error while reading %s' % full_path)
            raise
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment