@chi-feng, created May 16, 2019
Install Jupyter notebook extensions, gensim, and spaCy (with the small English model):
conda install -c conda-forge jupyter_contrib_nbextensions
conda install -c conda-forge jupyter_nbextensions_configurator
conda install -c conda-forge gensim
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
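
A quick sanity check that the installs worked: import each package and load the spaCy model downloaded above.

import gensim
import spacy

nlp = spacy.load('en_core_web_sm')  # the model downloaded above
doc = nlp('Topic modeling with gensim and spaCy.')
print([token.lemma_ for token in doc])  # lemmatized tokens
print('gensim', gensim.__version__)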
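The LDA step below assumes a gensim dictionary and bag-of-words corpus already exist; they are saved to and reloaded from disk further down. A minimal sketch of how they might be built, assuming tokenized_docs is a hypothetical list of token lists:

import pickle
import gensim

# tokenized_docs: list of token lists, e.g. [['stemmed', 'token', ...], ...] (assumed input)
dictionary = gensim.corpora.Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)  # drop very rare and very common terms
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# persist under the filenames the snippets below reload
dictionary.save('dictionary.gensim')
pickle.dump(corpus, open('corpus.pkl', 'wb'))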
import gensim

NUM_TOPICS = 20
# train a multicore LDA model with NUM_TOPICS topics
ldamodel = gensim.models.ldamulticore.LdaMulticore(
    corpus, num_topics=NUM_TOPICS, id2word=dictionary, workers=5, passes=25)
ldamodel.save('model.gensim')

# print the six highest-weighted words for each topic
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)
# reload the saved artifacts (e.g., in a fresh session) and visualize the topics
import pickle

dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')

import pyLDAvis.gensim

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)
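
Note: in pyLDAvis 3.x the gensim helper moved, so the import becomes import pyLDAvis.gensim_models (and prepare is called from that module); inside a notebook, calling pyLDAvis.enable_notebook() first makes the widget render inline.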
import collections

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans

# LSA parameters
max_features = 10**6
max_components = 400

# each status stores its stemmed tokens as a comma-separated string
documents = [status['stemmed_tokens'].replace(",", " ") for status in statuses]
vectorizer = TfidfVectorizer(ngram_range=(2, 3), max_features=max_features)
X = vectorizer.fit_transform(documents)

# truncated SVD (LSA) followed by L2 normalization, so k-means on the reduced
# vectors behaves like cosine/spherical clustering (note: this is not PCA,
# since TruncatedSVD does not center the data)
svd = TruncatedSVD(max_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
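
As a quick check on the reduction, TruncatedSVD exposes explained_variance_ratio_, so one line reports how much of the TF-IDF variance the 400 components keep:

explained = svd.explained_variance_ratio_.sum()
print('explained variance of the SVD step: {:.1f}%'.format(explained * 100))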

# k-means clustering parameters
n_clusters = 15
max_iter = 1000
n_init = 20
km = KMeans(n_clusters=n_clusters, init='k-means++',
            max_iter=max_iter, n_init=n_init, verbose=False)
km.fit(X)
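
The n_clusters = 15 above is a tunable guess; one hedged way to sanity-check it is a small silhouette-score sweep (this sketch assumes X fits in memory and re-runs k-means for each candidate k):

from sklearn.metrics import silhouette_score

for k in (5, 10, 15, 20, 25):
    labels = KMeans(n_clusters=k, init='k-means++', n_init=10).fit_predict(X)
    print(k, silhouette_score(X, labels))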

centers = km.cluster_centers_

# per-cluster center of mass in LSA space, computed directly from the members
com = np.zeros((n_clusters, np.shape(X)[1]))
for i in range(0, n_clusters):
    cluster_points = X[km.labels_ == i, :]
    cluster_mean = np.mean(cluster_points, axis=0)
    com[i, :] = cluster_mean

# map centroids back to term space so the top-weighted n-grams can serve as keywords
original_space_centroids = svd.inverse_transform(centers)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()  # use get_feature_names_out() on scikit-learn >= 1.0
article_counts = collections.Counter(km.labels_)

cluster_results = []
for i in range(n_clusters):
    # distances of every sample to centroid i, sorted ascending
    d = km.transform(X)[:, i]
    ind = np.argsort(d)

    cluster_result = {'cluster_id': i,
                      'article_count': article_counts[i],
                      # assumes every cluster has at least 10 members
                      'status_ids': [statuses[ind[j]]['id'] for j in range(10)],
                      # distances of the article_counts[i] samples nearest centroid i
                      'distances': [d[ind[j]] for j in range(article_counts[i])],
                      # the three statuses closest to the centroid
                      'best_articles': [statuses[ind[j]] for j in range(3)],
                      'keywords': [terms[idx] for idx in order_centroids[i, :10]]}
    cluster_results.append(cluster_result)

# rank clusters by mean distance to centroid; the [1:] slice drops cluster 0 (presumably a catch-all)
cluster_results = sorted(
    cluster_results[1:], key=lambda result: np.mean(np.array(result['distances'])), reverse=True)

from IPython.display import HTML, display

# build an HTML summary table: one row per cluster, then its three closest statuses
table = ['<table><tr><th>Cluster</th><th>Article Count</th><th style="text-align:left">Keywords</th></tr>']
for result in cluster_results:
    article_summaries = []
    for article in result['best_articles']:
        article_summaries.append('<tr><td colspan="2"></td><td style="text-align:left">%s</td></tr>' % (
                                 article['text']))

    table.append('<tr><td>%d</td><td>%d</td><td style="text-align:left"><strong>%s</strong></td></tr>%s' %
                 (
                     result['cluster_id'],
                     result['article_count'],
                     ', '.join(result['keywords']),
                     ''.join(article_summaries)  # join the rows; interpolating the list leaks its repr into the HTML
                 ))
table.append('</table>')

display(HTML(''.join(table)))
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE

# project the LSA vectors to 2-D for a visual check of cluster separation
tsne = TSNE(n_components=2, verbose=1, perplexity=40)
embedding = tsne.fit_transform(X)

plt.figure(figsize=(12, 10))
plt.scatter(embedding[:, 0], embedding[:, 1], c=km.labels_, cmap='rainbow')
plt.axis('off')
plt.show()
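
One caveat: t-SNE gets slow as the corpus grows, so with tens of thousands of statuses it is common to subsample rows of X (and the matching entries of km.labels_) before calling fit_transform; the qualitative cluster picture usually survives.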
