Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active October 3, 2019 17:12
Show Gist options
  • Save andreasvc/1cae9033fe6310bae9f45d3c0a8c3883 to your computer and use it in GitHub Desktop.
Save andreasvc/1cae9033fe6310bae9f45d3c0a8c3883 to your computer and use it in GitHub Desktop.
Apply polyglot language detection recursively
"""Apply polyglot language detection to all .txt files under current directory
(searched recursively), write report in tab-separated file detectedlangs.tsv.
"""
import os
from glob import glob
from polyglot.detect import Detector
from polyglot.detect.base import UnknownLanguage
def main():
    """Write a language-detection report for all .txt files below the cwd.

    Every regular file matching ``**/*.txt`` (searched recursively from the
    current working directory) is read as UTF-8 and handed to polyglot's
    ``Detector``. One row per file is written to ``detectedlangs.tsv`` in
    the current directory, with the tab-separated columns ``filename``,
    ``lang``, ``confidence`` and ``read_bytes``. Files for which the
    detector raises ``UnknownLanguage`` are reported as ``unknown`` with
    confidence 0 and the length of the decoded text in the last column.
    """
    with open('detectedlangs.tsv', 'w', encoding='utf8') as out:
        print(
            'filename', 'lang', 'confidence', 'read_bytes',
            sep='\t', file=out)
        # glob returns paths in arbitrary order; sort them so that the
        # report is reproducible between runs.
        for fname in sorted(glob('**/*.txt', recursive=True)):
            # The pattern also matches directories (and broken symlinks)
            # named *.txt; opening one would raise and abort the whole run.
            if not os.path.isfile(fname):
                continue
            # errors='replace' keeps a single file that is not valid UTF-8
            # from aborting the run with UnicodeDecodeError.
            with open(fname, encoding='utf8', errors='replace') as inp:
                text = inp.read()
            # Non-printable characters are filtered out before detection, see
            # https://github.com/aboSamoor/polyglot/issues/71#issuecomment-445199949
            # They are replaced by a space instead of being dropped, because
            # str.isprintable() is also false for newlines, tabs and
            # non-ASCII spaces; deleting those would glue words together.
            filteredtext = ''.join(
                x if x.isprintable() else ' ' for x in text)
            try:
                res = Detector(filteredtext)
            except UnknownLanguage:
                row = (fname, 'unknown', 0, len(text))
            else:
                lang = res.language
                row = (fname, lang.code, lang.confidence, lang.read_bytes)
            print(*row, sep='\t', file=out)
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment