# --- Snippet 1: GSDMM short-text topic-modeling setup ---
# Importing libraries.
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess

# Cast the tweet-text column to a numpy array of documents.
# NOTE(review): `df` must already be loaded earlier in the post — confirm.
docs = df.tweet_text.to_numpy()
# create dictionary of all words in all documents
# --- Snippet 2: gensim LDA topic-modeling setup ---
# Importing libraries.
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, models

# Cast the tweet-text column to a numpy array of documents.
# NOTE(review): `df` must already be loaded earlier in the post — confirm.
docs = df.tweet_text.to_numpy()
# create dictionary of all words in all documents
# --- Snippet 3: sharing one MLEDisambiguator per dask worker ---
# NOTE(review): this snippet is truncated — the body of map_lemmas continues
# beyond the last visible line; do not assume behavior past it.
from once_per_worker import once_per_worker | |
from camel_tools.disambig.mle import MLEDisambiguator | |
# create dask.delayed object around Disambiguator | |
# once_per_worker presumably loads the pretrained model once per dask
# worker instead of once per partition — confirm against its docs.
loaded_disambiguator = once_per_worker(lambda: MLEDisambiguator.pretrained()) | |
# define mapping function with disambiguator as second argument | |
def map_lemmas(df, disambiguator): | |
def get_lemmas_nested(tokenized_text): | |
# Run the CAMeL MLE disambiguator over one pre-tokenized text.
disambig = disambiguator.disambiguate(tokenized_text) |
# --- Snippet 4: lemma-extraction function mapped across dask partitions ---
# NOTE(review): this snippet is truncated — the outer map_lemmas body
# continues past the nested helper; do not assume behavior past it.
# define function to map across dask partitions | |
def map_lemmas(df, mle): | |
def get_lemmas_nested(tokenized_text): | |
# Disambiguate one pre-tokenized text with the CAMeL MLE disambiguator.
disambig = mle.disambiguate(tokenized_text) | |
try: | |
# 'lex' is the lemma feature of the first analysis of each token
# (presumably the highest-scoring one — confirm in camel_tools docs).
lemmas = [d.analyses[0].analysis['lex'] for d in disambig] | |
return lemmas | |
# NOTE(review): bare `except:` swallows every error, including
# KeyboardInterrupt/SystemExit; `except (IndexError, KeyError):`
# would cover tokens with no analyses without hiding real bugs.
except: | |
# NaN marks rows that failed, presumably so they can be filtered
# downstream in pandas — confirm against the caller.
return np.nan |
# --- Snippet 5: comparing morphological tokenization schemes ---
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

# Build a morphological tokenizer on top of the MLE disambiguator and
# compare two output schemes on the first tweet.
# NOTE(review): `mle` and `df` must already be in scope (created in the
# earlier snippets) — confirm.

# atbseg scheme
tokenizer = MorphologicalTokenizer(mle, scheme='atbseg')
tokens = tokenizer.tokenize(df.tweet_text.iloc[0])
print(tokens)

# atbtok scheme
tokenizer = MorphologicalTokenizer(mle, scheme='atbtok')
tokens = tokenizer.tokenize(df.tweet_text.iloc[0])
# --- Snippet 6: extracting lemmas from disambiguated tokens ---
def get_lemmas(tokenized_text):
    """Return the lemma ('lex' feature) for each token in a tokenized text.

    Relies on the module-level ``mle`` MLEDisambiguator being in scope.
    Takes the first analysis of each token (presumably the highest-scoring
    one — confirm in camel_tools docs); a token with no analyses raises
    IndexError.
    """
    disambig = mle.disambiguate(tokenized_text)
    lemmas = [d.analyses[0].analysis['lex'] for d in disambig]
    return lemmas
# --- Snippet 7: instantiating and running the MLE disambiguator ---
from camel_tools.disambig.mle import MLEDisambiguator

# Instantiate the Maximum Likelihood Disambiguator from the pretrained model.
mle = MLEDisambiguator.pretrained()

# The disambiguator expects pre-tokenized text.
# NOTE(review): simple_word_tokenize is imported in a later snippet
# (from camel_tools.tokenizers.word); it must be in scope here — confirm.
sentence = simple_word_tokenize('نجح بايدن في الانتخابات')
disambig = mle.disambiguate(sentence)
# --- Snippet 8: out-of-context morphological analysis of a single word ---
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer

# Load the built-in morphology database and wrap it in an analyzer.
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

# Print every candidate analysis of the word (out of context).
analyses = analyzer.analyze('وبعقدنا')
for analysis in analyses:
    print(analysis, '\n')
# --- Snippet 9: tokenizing the tweet column ---
from camel_tools.tokenizers.word import simple_word_tokenize

# Tokenize every tweet in place; the column becomes lists of tokens,
# the form the disambiguator expects.
# NOTE(review): `df` must already be loaded — confirm.
df.tweet_text = df.tweet_text.apply(simple_word_tokenize)
# --- Snippet 10: removing Arabic diacritics from the tweet column ---
# Import the dediacritization tool.
from camel_tools.utils.dediac import dediac_ar

# Strip Arabic diacritics from every tweet in place.
# NOTE(review): `df` must already be loaded — confirm.
df.tweet_text = df.tweet_text.apply(dediac_ar)