Skip to content

Instantly share code, notes, and snippets.

@dayyass
Created September 29, 2021 09:20
Show Gist options
  • Save dayyass/45461622cde7346ce73cc60b7a115e10 to your computer and use it in GitHub Desktop.
Save dayyass/45461622cde7346ce73cc60b7a115e10 to your computer and use it in GitHub Desktop.
How to use sklearn's TfidfVectorizer with a lemmatizer.
import re

from sklearn.feature_extraction.text import TfidfVectorizer

# pymorphy2 lemmatizer
import pymorphy2
class Lemmatizer:
    """Callable that maps a single token to its lemma via pymorphy2."""

    def __init__(self) -> None:
        # The analyzer is created once here and reused by every __call__.
        self.morph = pymorphy2.MorphAnalyzer()

    def __call__(self, x: str) -> str:
        """Return the lemma of ``x``.

        Args:
            x: A single token (word) to lemmatize.

        Returns:
            The ``normal_form`` of the first entry returned by
            ``self.morph.parse(x)``.
        """
        # Only the first parse is used; any further parses are ignored.
        return self.morph.parse(x)[0].normal_form
lemmatizer = Lemmatizer()

# data
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# tfidf default tokenizer: this regex is the default `token_pattern` of
# TfidfVectorizer (runs of two or more word characters).
tokenizer = re.compile(r"(?u)\b\w\w+\b").findall


def lemmatize_tokens(sentence: str) -> list:
    """Split ``sentence`` with ``tokenizer`` and lemmatize every token.

    A named module-level function is used instead of a lambda so that the
    vectorizer holding a reference to it can be pickled.

    Args:
        sentence: One document, as handed over by the vectorizer.

    Returns:
        A list with the lemma of each token, in order of appearance.
    """
    return [lemmatizer(token) for token in tokenizer(sentence)]


# tfidf
vectorizer = TfidfVectorizer(
    tokenizer=lemmatize_tokens,
    # `token_pattern` is not used once a custom tokenizer is supplied;
    # passing None avoids scikit-learn's "parameter will not be used" warning.
    token_pattern=None,
)
X = vectorizer.fit_transform(corpus)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment