Skip to content

Instantly share code, notes, and snippets.

@manifoldhiker
Created October 7, 2021 15:29
Show Gist options
  • Save manifoldhiker/71e34758a98abd9b98e1dac3ba16c830 to your computer and use it in GitHub Desktop.
Save manifoldhiker/71e34758a98abd9b98e1dac3ba16c830 to your computer and use it in GitHub Desktop.
from scipy.sparse import hstack
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
class WikiBotOnlineClassifer:
    """Incrementally trained bot classifier for batches of wiki edit events.

    Each batch is a DataFrame with a ``comment`` and a ``user`` text column
    (plus a ``bot`` label column when training). Both text columns are
    turned into TF-IDF features by pre-fitted vectorizers, stacked side by
    side, and fed to a model that supports ``partial_fit`` / ``predict``.
    """

    def __init__(self, tfidf_vectorizer_comments, tfidf_vectorizer_username, model):
        """Store the already-fitted vectorizers and the incremental model.

        Args:
            tfidf_vectorizer_comments: fitted vectorizer applied to the
                ``comment`` column.
            tfidf_vectorizer_username: fitted vectorizer applied to the
                ``user`` column.
            model: estimator exposing ``partial_fit`` and ``predict``.
        """
        self.tfidf_vectorizer_comments = tfidf_vectorizer_comments
        self.tfidf_vectorizer_username = tfidf_vectorizer_username
        self.model = model

    @staticmethod
    def _as_text(column):
        """Return *column* with missing values as ``''`` and all entries as str.

        The vectorizers only accept strings; a NaN (or a numeric fill value
        such as 0) makes them raise while lower-casing the document.
        """
        return column.fillna('').astype(str)

    @classmethod
    def init_from_stream_sample(cls, sample_df: pd.DataFrame) -> "WikiBotOnlineClassifer":
        """Build a classifier whose vectorizers are fitted on *sample_df*.

        Comments use word-level TF-IDF; usernames use character 2- and
        3-grams. The model is a fresh, untrained ``SGDClassifier``.

        Args:
            sample_df: sample of the stream with ``comment`` and ``user``
                columns. It is not modified.

        Returns:
            A new classifier instance ready for ``partial_fit``.
        """
        # Work on cleaned copies instead of writing back into the caller's
        # frame, and fill with '' (not 0) so the vectorizer sees only strings.
        comments = cls._as_text(sample_df['comment'])
        usernames = cls._as_text(sample_df['user'])

        tfidf_vectorizer_comments = TfidfVectorizer()
        tfidf_vectorizer_username = TfidfVectorizer(ngram_range=(2, 3), analyzer='char')
        tfidf_vectorizer_comments.fit(comments)
        tfidf_vectorizer_username.fit(usernames)

        model = SGDClassifier()
        return cls(tfidf_vectorizer_comments, tfidf_vectorizer_username, model)

    def _extract_X_features(self, batch_df: pd.DataFrame):
        """Return the sparse feature matrix (CSC) for *batch_df*.

        Comment features come first, username features second, one row per
        row of *batch_df*. Missing text is treated as an empty string, the
        same way it is treated when the vectorizers are fitted.
        """
        X_comment = self.tfidf_vectorizer_comments.transform(
            self._as_text(batch_df['comment'])
        )
        X_username = self.tfidf_vectorizer_username.transform(
            self._as_text(batch_df['user'])
        )
        return hstack([X_comment, X_username], format='csc')

    def partial_fit(self, batch_df: pd.DataFrame) -> None:
        """Update the model with one labelled batch.

        Args:
            batch_df: batch with ``comment``, ``user`` and ``bot`` columns;
                ``bot`` holds the binary label.
        """
        X = self._extract_X_features(batch_df)
        y = batch_df['bot']
        # The full label set is passed on every call so a batch containing
        # only one of the two classes is still accepted.
        self.model.partial_fit(X, y, classes=[0, 1])

    def predict(self, batch_df: pd.DataFrame):
        """Return the model's predictions for every row of *batch_df*."""
        X = self._extract_X_features(batch_df)
        return self.model.predict(X)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment