Last active
October 3, 2021 07:37
-
-
Save manifoldhiker/6aeca084e9f3419d9b180d16c34a0694 to your computer and use it in GitHub Desktop.
Language detection with fastText
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request
from pathlib import Path

import fasttext
import spacy
# Download location of the fastText language-identification model
# ("lid.176.bin"); fetched when no local model path is supplied.
FMODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
class FasttextLanguageDetector:
    """Detect the language of text with a fastText classification model.

    The fastText model is loaded from ``model_path``; when no path is given
    it is downloaded (once) into ``load_model_to_dir`` and reused from there
    afterwards. A spaCy pipeline (``en_core_web_sm``) is loaded as well and
    is used by :meth:`count_lang_chars` to split text into sentences.
    """

    def __init__(self, model_path=None, load_model_to_dir='/tmp/'):
        """Load the fastText model and the spaCy pipeline.

        Args:
            model_path: Path to a fastText model file. When ``None`` the
                model is fetched from ``FMODEL_URL`` (or taken from the
                cache left by an earlier download).
            load_model_to_dir: Directory in which the downloaded model is
                cached. Only used when ``model_path`` is ``None``.
        """
        if model_path is None:
            model_path = self._download_model(load_model_to_dir)
        self.fmodel = fasttext.load_model(model_path)
        self.nlp = spacy.load('en_core_web_sm')

    def detect_lang(self, text):
        """Return the top predicted label for ``text`` without its prefix.

        Newlines are replaced by spaces before prediction because fastText's
        ``predict`` works on single-line input. The ``__label__`` marker is
        stripped from the returned label (e.g. ``'__label__en'`` -> ``'en'``).
        """
        single_line = text.replace('\n', ' ')
        labels, _ = self.fmodel.predict([single_line])
        # One input text was passed, so take its first (top) label.
        return labels[0][0].replace('__label__', '')

    def count_lang_chars(self, text):
        """Count characters per detected language, sentence by sentence.

        ``text`` is split into sentences by the spaCy pipeline; each
        sentence is classified with :meth:`detect_lang` and its length in
        characters is added to that language's total.

        Returns:
            dict mapping language label to the summed sentence lengths.
            Empty when the pipeline yields no sentences.
        """
        dist = {}
        for sent in self.nlp(text).sents:
            sentence = str(sent)
            lang = self.detect_lang(sentence)
            dist[lang] = dist.get(lang, 0) + len(sentence)
        return dist

    def _download_model(self, load_model_to_dir, url=None):
        """Make sure the model file exists in ``load_model_to_dir``.

        The file is stored as ``fasttext_lang_detect_model.bin``. If it is
        already present nothing is downloaded. Otherwise the data is fetched
        into a temporary ``.part`` file and only renamed to the final name
        once the download has completed, so an interrupted download can
        never be mistaken for a usable cached model.

        Args:
            load_model_to_dir: Target directory; created if missing.
            url: Optional source URL; defaults to the module-level
                ``FMODEL_URL``. Ignored when the cached file already exists.

        Returns:
            The path of the model file as a string.
        """
        destination_path = (
            Path(load_model_to_dir) / 'fasttext_lang_detect_model.bin'
        )
        if destination_path.exists():
            return str(destination_path)

        destination_path.parent.mkdir(parents=True, exist_ok=True)
        partial_path = destination_path.with_name(
            destination_path.name + '.part'
        )
        source_url = FMODEL_URL if url is None else url
        try:
            urllib.request.urlretrieve(source_url, str(partial_path))
            # Rename within the same directory: the final name only ever
            # appears once the file is complete.
            partial_path.replace(destination_path)
        finally:
            # Still present only if the download or the rename failed.
            if partial_path.exists():
                partial_path.unlink()
        return str(destination_path)
def get_english_chars_count(d):
    """Return the value stored under ``'en'`` in ``d``, or 0 if absent.

    ``d`` is a per-language count mapping such as the one returned by
    ``FasttextLanguageDetector.count_lang_chars``.
    """
    return d.get('en', 0)
def get_nonenglish_chars_count(d):
    """Return the sum of all counts in ``d`` minus the ``'en'`` count.

    ``d`` is a per-language count mapping; a mapping without an ``'en'``
    entry simply yields the sum of all its values.
    """
    en_count = get_english_chars_count(d)
    all_count = sum(d.values())
    return all_count - en_count
Author
manifoldhiker
commented
Oct 3, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment