Skip to content

Instantly share code, notes, and snippets.

View FeryET's full-sized avatar

Farhood FeryET

View GitHub Profile
def foo(func, *args, **kwargs):
    """Call ``func`` and return the nested ``response`` entry of its result.

    Args:
        func: Callable invoked as ``func(*args, **kwargs)``.
        *args: Positional arguments forwarded to ``func`` unchanged.
        **kwargs: Keyword arguments forwarded to ``func`` unchanged.

    Returns:
        The value found at ``result["a key"]["in json"]["response"]``.

    Any exception raised by ``func`` or by the nested lookups propagates
    to the caller unchanged.
    """
    res = func(*args, **kwargs)
    return res["a key"]["in json"]["response"]
class A:
def __init__(self, api):
super().__init__()
self.api = api
def get_x(self):
@FeryET
FeryET / clean_text.py
Created August 26, 2020 10:06
Gists for Medium Article: Text Classification using LDA
# Clean the raw documents: build a SpacyCleaner with chunksize=1000 and
# `workers` workers, then run its transform over `raw_docs`.
# NOTE(review): `SpacyCleaner`, `workers` and `raw_docs` are not defined in
# the visible part of this file — confirm they are in scope before this runs.
processor = SpacyCleaner(chunksize=1000, workers=workers)
docs = processor.transform(raw_docs)
import logging
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_val_score, )
from sklearn.pipeline import Pipeline
# Restrict the 20 Newsgroups corpus to four "rec.*" categories.
labels = ["rec.autos", "rec.motorcycles", "rec.sport.baseball", "rec.sport.hockey"]
raw_docs, y = fetch_20newsgroups(subset='all', return_X_y=True, categories=labels)

# Clean the raw posts before splitting: `docs` has to exist before it is
# handed to train_test_split (the original order used it before assignment).
# NOTE(review): `SpacyCleaner` and `workers` are not defined in the visible
# part of this file — confirm they are in scope.
processor = SpacyCleaner(chunksize=1000, workers=workers)
docs = processor.transform(raw_docs)

# Hold out 10% of the cleaned documents (shuffled) as the test set.
docs_train, docs_test, y_train, y_test = train_test_split(docs, y, test_size=0.1, shuffle=True)
# Build the HDP topic model and register every training document with it.
# NOTE(review): `HDPModel`, `min_df`, `rm_top`, `workers` and `docs_train`
# are not defined in the visible part of this file — confirm they are in scope.
hdp_model = HDPModel(min_df=min_df, rm_top=rm_top)
hdp_model.optim_interval = 5
for d in docs_train:
    hdp_model.add_doc(d)

hdp_model.burn_in = 100
# A zero-iteration train call precedes the real training loop; presumably
# this only initialises the model — confirm against the tomotopy docs.
hdp_model.train(0, workers=workers)

# Train in 100 rounds of 10 iterations each, reporting progress after each
# round. `i` is the iteration count at the START of the round just finished.
for i in range(0, 1000, 10):
    hdp_model.train(10, workers=workers)
    print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, hdp_model.ll_per_word, hdp_model.live_k))
# Turn documents into topic features: fit the LDA vectorizer on the training
# documents only, then reuse the fitted vectorizer on the test documents.
# NOTE(review): `TomotopyLDAVectorizer`, `num_of_topics`, `workers`, `min_df`
# and `rm_top` are not defined in the visible part of this file.
vectorizer = TomotopyLDAVectorizer(
    num_of_topics=num_of_topics,
    workers=workers,
    min_df=min_df,
    rm_top=rm_top,
)
x_train = vectorizer.fit_transform(docs_train)
x_test = vectorizer.transform(docs_test)
def plot_topic_clusters(ax, x2d, y, labels):
    """Scatter-plot 2-D points on ``ax``, one colour and legend entry per label.

    Args:
        ax: Axes-like object to draw on; it is modified in place.
        x2d: Array indexed as ``x2d[mask, 0]`` / ``x2d[mask, 1]``, i.e. a
            2-D array whose first two columns are the plot coordinates.
        y: Array compared element-wise against each label's position in
            ``labels`` (``y == i``) to select that label's points.
            Assumes ``y`` holds integer class indices in the same order as
            ``labels`` — TODO confirm against the caller.
        labels: Sequence of legend names, one per class.

    Returns:
        The same ``ax`` object, for chaining.
    """
    # Equal aspect with a box adjustment, configured once up front.
    ax.set_aspect("equal", adjustable="box")
    # `matplotlib.cm.get_cmap` was removed in matplotlib 3.9; the pyplot
    # accessor takes the same (name, lut) arguments and is still available.
    colors = plt.get_cmap("Spectral", len(labels))
    for i, label in enumerate(labels):
        c = colors(i / len(labels))
        mask = y == i
        ax.scatter(x2d[mask, 0], x2d[mask, 1], color=c, label=label, alpha=0.7)
    ax.grid()
    ax.legend()
    return ax
# Cross-validation scheme: 10 stratified folds, repeated 10 times.
folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=10)

# LDA topic features (15 topics) -> standardisation -> SVM classifier.
# NOTE(review): `TomotopyLDAVectorizer`, `SVC`, `PCA`, `workers`, `min_df`
# and `rm_top` are not defined or imported in the visible part of this file.
vectorizer = TomotopyLDAVectorizer(
    num_of_topics=15,
    workers=workers,
    min_df=min_df,
    rm_top=rm_top,
)
clf = SVC()
# NOTE(review): `pca` is created here but is not one of the pipeline steps
# below — confirm whether it is used later or is a leftover.
pca = PCA(n_components=0.95)
pipe = Pipeline(
    [
        ("vectorizer", vectorizer),
        ("scalar", StandardScaler()),
        ("classifier", clf),
    ]
)
results = cross_val_score(pipe, docs, y_true, cv=folds, n_jobs=2, verbose=1,