Last active
March 13, 2022 15:53
-
-
Save ymoslem/3bbd96fee23d3691b9b227f27fa2ab3e to your computer and use it in GitHub Desktop.
Runtime test of language detection libraries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# pip3 install gdown langdetect fasttext pycld2 py3langid | |
import gdown | |
from datetime import datetime | |
# Fetch both lid.176 fastText language-identification model files
# (.ftz and .bin); each is saved locally under its own file name.
model_downloads = (
    (
        "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz",
        "lid.176.ftz",
    ),
    (
        "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin",
        "lid.176.bin",
    ),
)
for url, output in model_downloads:
    gdown.download(url, output, quiet=False)

# Sample sentence passed to every detector below - swap in text from
# other languages to compare the libraries.
text = "वह जापान से होकर भारत गया"
# Test runtime of langdetect.
# The import sits inside the timed region, so the reported runtime covers
# library import as well as the detection call itself.
start = datetime.now()
from langdetect import DetectorFactory, detect

# langdetect's algorithm is randomised; fixing the seed makes the detected
# language reproducible from run to run (matters most for short/ambiguous text).
DetectorFactory.seed = 0
lang_code = detect(text)
print(lang_code)
end = datetime.now()
print(f"langdetect runtime is: {end - start} \n")
# Test runtime of fasttext bin model.
# The import and the model load sit inside the timed region, so the reported
# runtime covers them as well as the prediction.
start = datetime.now()
import fasttext

pretrained_lang_model = "lid.176.bin"
model = fasttext.load_model(pretrained_lang_model)
# predict() handles one line at a time, so newlines must go; replacing them
# with a space (not "") keeps the words on either side from being fused.
prediction = model.predict(text.replace("\n", " "), k=1)  # top 1 matching language
# Labels have the form "__label__<code>". Strip the prefix instead of taking
# the last two characters, which would truncate 3-letter codes such as "ceb".
lang_code = prediction[0][0].replace("__label__", "", 1)
print(lang_code)
end = datetime.now()
print(f"fasttext bin runtime is: {end - start} \n")
# Test runtime of fasttext ftz model.
# The import and the model load sit inside the timed region, so the reported
# runtime covers them as well as the prediction.
start = datetime.now()
import fasttext

pretrained_lang_model = "lid.176.ftz"
model = fasttext.load_model(pretrained_lang_model)
# predict() handles one line at a time, so newlines must go; replacing them
# with a space (not "") keeps the words on either side from being fused.
prediction = model.predict(text.replace("\n", " "), k=1)  # top 1 matching language
# Labels have the form "__label__<code>". Strip the prefix instead of taking
# the last two characters, which would truncate 3-letter codes such as "ceb".
lang_code = prediction[0][0].replace("__label__", "", 1)
print(lang_code)
end = datetime.now()
print(f"fasttext ftz runtime is: {end - start} \n")
# Benchmark pycld2; the timed region includes importing the library.
start = datetime.now()
import pycld2 as cld2

# detect() yields three values; only the third (the per-language details)
# is used here. Index 1 of its first entry is taken as the language code.
_is_reliable, _bytes_found, matches = cld2.detect(text)
top_match = matches[0]
lang_code = top_match[1]
print(lang_code)
end = datetime.now()
print(f"pycld2 runtime is: {end - start} \n")
# Benchmark py3langid; the timed region includes importing the library.
start = datetime.now()
import py3langid as langid

# The first element of classify()'s result is used as the language code.
classification = langid.classify(text)
lang_code = classification[0]
print(lang_code)
end = datetime.now()
print(f"py3langid runtime is: {end - start} \n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment