Created
September 29, 2021 09:20
-
-
Save dayyass/45461622cde7346ce73cc60b7a115e10 to your computer and use it in GitHub Desktop.
How to use scikit-learn's TfidfVectorizer with a lemmatizer.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer

# pymorphy2 lemmatizer
class Lemmatizer:
    """Callable that maps a single token to its lemma via pymorphy2."""

    def __init__(self):
        """Create the pymorphy2 analyzer once so every call reuses it."""
        self.morph = pymorphy2.MorphAnalyzer()

    def __call__(self, x: str) -> str:
        """Return the normal form of the first parse of ``x``.

        Args:
            x: A single token (word) to lemmatize.

        Returns:
            The ``normal_form`` attribute of the first element returned by
            ``self.morph.parse(x)``.
        """
        # Only the first parse in the returned list is used.
        lemma = self.morph.parse(x)[0].normal_form
        return lemma
# Single shared lemmatizer; its analyzer is constructed once, here.
lemmatizer = Lemmatizer()

# data
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# tfidf default tokenizer: matches runs of two or more word characters
tokenizer = re.compile(r"(?u)\b\w\w+\b").findall


def lemmatize_tokens(sentence):
    """Tokenize ``sentence`` with ``tokenizer`` and lemmatize each token.

    Defined as a named module-level function rather than a lambda so that
    the fitted vectorizer holding a reference to it can be pickled.
    """
    return [lemmatizer(token) for token in tokenizer(sentence)]


# tfidf
vectorizer = TfidfVectorizer(
    tokenizer=lemmatize_tokens,
    # token_pattern is not used when a custom tokenizer is given; passing
    # None states that explicitly and avoids the unused-parameter warning.
    token_pattern=None,
)
X = vectorizer.fit_transform(corpus)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment