Last active
October 3, 2019 17:12
-
-
Save andreasvc/1cae9033fe6310bae9f45d3c0a8c3883 to your computer and use it in GitHub Desktop.
Apply polyglot language detection recursively
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Apply polyglot language detection to all .txt files under current directory | |
(searched recursively), write report in tab-separated file detectedlangs.tsv. | |
""" | |
import os | |
from glob import glob | |
from polyglot.detect import Detector | |
from polyglot.detect.base import UnknownLanguage | |
def main():
    """Write a language-detection report for all .txt files.

    Searches the current working directory recursively for ``*.txt`` files,
    runs polyglot's ``Detector`` on the text of each one, and writes one
    tab-separated row per file to ``detectedlangs.tsv`` in the current
    directory (the file is created or overwritten).

    Columns: ``filename``, ``lang``, ``confidence``, ``read_bytes``.
    When ``Detector`` raises ``UnknownLanguage`` the row holds ``unknown``,
    a confidence of ``0`` and the UTF-8 byte length of the file's text.
    """
    with open('detectedlangs.tsv', 'w', encoding='utf8') as out:
        print(
            'filename', 'lang', 'confidence', 'read_bytes',
            sep='\t', file=out)
        # glob returns matches in arbitrary order; sort so that repeated
        # runs over the same tree produce the same report.
        for fname in sorted(glob('**/*.txt', recursive=True)):
            # The pattern also matches directories whose name ends in
            # ".txt"; opening one would abort the whole run.
            if not os.path.isfile(fname):
                continue
            # errors='replace' keeps one badly encoded file from killing
            # the report; undecodable bytes become U+FFFD.
            with open(fname, encoding='utf8', errors='replace') as inp:
                text = inp.read()
            # Strip non-printable characters before detection (note that
            # this also removes newlines and tabs). Workaround from:
            # https://github.com/aboSamoor/polyglot/issues/71#issuecomment-445199949
            filteredtext = ''.join(x for x in text if x.isprintable())
            try:
                res = Detector(filteredtext)
            except UnknownLanguage:
                print(
                    fname,
                    'unknown',
                    0,
                    # The column is a byte count, so report bytes rather
                    # than the number of characters.
                    len(text.encode('utf8')),
                    sep='\t', file=out)
            else:
                print(
                    fname,
                    res.language.code,
                    res.language.confidence,
                    res.language.read_bytes,
                    sep='\t', file=out)
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment