Created
October 7, 2021 15:29
-
-
Save manifoldhiker/71e34758a98abd9b98e1dac3ba16c830 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.sparse import hstack
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
class WikiBotOnlineClassifer:
    """Incrementally trained bot/non-bot classifier for batches of edit events.

    Each batch is a DataFrame with a ``comment`` column and a ``user`` column
    (plus a ``bot`` label column when training).  Comments and user names are
    turned into TF-IDF features by two separate, already-fitted vectorizers;
    the two sparse feature matrices are stacked side by side and passed to a
    model that supports ``partial_fit`` / ``predict``.
    """

    def __init__(self, tfidf_vectorizer_comments, tfidf_vectorizer_username, model):
        """Store the fitted vectorizers and the incremental model.

        Args:
            tfidf_vectorizer_comments: Fitted vectorizer applied to the
                ``comment`` column (must expose ``transform``).
            tfidf_vectorizer_username: Fitted vectorizer applied to the
                ``user`` column (must expose ``transform``).
            model: Estimator exposing ``partial_fit`` and ``predict``.
        """
        self.tfidf_vectorizer_comments = tfidf_vectorizer_comments
        self.tfidf_vectorizer_username = tfidf_vectorizer_username
        self.model = model

    @staticmethod
    def _clean_comments(comments):
        """Return a copy of *comments* with missing values as empty strings.

        The vectorizer needs string documents; a missing comment carries no
        tokens, so an empty string is the neutral replacement.  The input
        Series is not modified.
        """
        return comments.fillna('').astype(str)

    @classmethod
    def init_from_stream_sample(cls, sample_df):
        """Build a classifier whose vectorizers are fitted on a sample.

        Args:
            sample_df: DataFrame with ``comment`` and ``user`` columns.  It is
                only read; missing comments are treated as empty strings.

        Returns:
            A new classifier holding the two fitted vectorizers and a fresh,
            untrained ``SGDClassifier``.
        """
        tfidf_vectorizer_comments = TfidfVectorizer()
        # User names are short single tokens, so character 2- and 3-grams are
        # used instead of word tokens.
        tfidf_vectorizer_username = TfidfVectorizer(ngram_range=(2, 3), analyzer='char')

        tfidf_vectorizer_comments.fit(cls._clean_comments(sample_df['comment']))
        tfidf_vectorizer_username.fit(sample_df['user'])

        model = SGDClassifier()
        return cls(tfidf_vectorizer_comments, tfidf_vectorizer_username, model)

    def _extract_X_features(self, batch_df):
        """Return the sparse feature matrix for *batch_df*.

        Comment features come first, user-name features second; the result is
        a CSC matrix with one row per row of *batch_df*.  Missing comments are
        handled the same way as when the vectorizers were fitted.
        """
        comments = self._clean_comments(batch_df['comment'])
        X_comment = self.tfidf_vectorizer_comments.transform(comments)
        X_username = self.tfidf_vectorizer_username.transform(batch_df['user'])
        return hstack([X_comment, X_username], format='csc')

    def partial_fit(self, batch_df):
        """Update the model with one labelled batch.

        Labels are read from the ``bot`` column.  The full label set is passed
        on every call so that a batch containing only one class is accepted.
        """
        X = self._extract_X_features(batch_df)
        y = batch_df['bot']
        self.model.partial_fit(X, y, classes=[0, 1])

    def predict(self, batch_df):
        """Return the model's predictions for every row of *batch_df*."""
        X = self._extract_X_features(batch_df)
        return self.model.predict(X)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment