Skip to content

Instantly share code, notes, and snippets.

@bogsio
Created June 11, 2014 23:56
Show Gist options
  • Save bogsio/ce34d7f5583af8676aa9 to your computer and use it in GitHub Desktop.
Save bogsio/ce34d7f5583af8676aa9 to your computer and use it in GitHub Desktop.
import string
import collections
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
def process_text(text, stem=True):
    """Strip punctuation from *text*, tokenize it and optionally stem the tokens.

    Args:
        text: The string to process.
        stem: When true (the default), each token is passed through NLTK's
            ``PorterStemmer``.

    Returns:
        A list of token strings produced by ``nltk.word_tokenize``.
    """
    # str.translate() needs a mapping of code points, not a plain string of
    # characters: passing string.punctuation directly removes nothing and
    # rewrites control characters (e.g. '\n' -> '+'). Build a deletion table.
    punctuation_table = str.maketrans('', '', string.punctuation)
    text = text.translate(punctuation_table)

    tokens = word_tokenize(text)
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    return tokens
def cluster_texts(texts, clusters=3):
    """Group texts with K-Means over their Tf-Idf representation.

    Args:
        texts: The documents to cluster, passed to
            ``TfidfVectorizer.fit_transform``.
        clusters: Number of K-Means clusters to fit (``n_clusters``).

    Returns:
        A ``collections.defaultdict(list)`` mapping each K-Means label to the
        list of positions (indices into *texts*) assigned to that label.
    """
    tfidf = TfidfVectorizer(
        tokenizer=process_text,
        stop_words=stopwords.words('english'),
        max_df=0.5,
        min_df=0.1,
        lowercase=True,
    )
    matrix = tfidf.fit_transform(texts)

    # KMeans.fit() returns the fitted estimator, so the calls can be chained.
    kmeans = KMeans(n_clusters=clusters).fit(matrix)

    groups = collections.defaultdict(list)
    for position, cluster_id in enumerate(kmeans.labels_):
        groups[cluster_id].append(position)
    return groups
if __name__ == "__main__":
    # NOTE(review): `[...]` is a list holding the Ellipsis object; it looks
    # like a placeholder to be replaced with the article strings to cluster.
    articles = [...]
    # Cluster into 7 groups and print {label: [article indices]}.
    pprint(dict(cluster_texts(articles, 7)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment