@gaborvecsei · Created January 16, 2017 22:01
import re
import nltk
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
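# NLTK's sent_tokenize/word_tokenize rely on the 'punkt' tokenizer data;
# if it is not installed yet, download it once:
# nltk.download('punkt')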
# Load the movie metadata CSV and keep the first n_data rows
df = pd.read_csv("data/movie_metadata.csv", header=0)

n_data = 1000
titles = df['movie_title'].values.tolist()[:n_data]
keywords = df['plot_keywords'].values.tolist()[:n_data]

# Titles in this dataset end with a non-breaking space; strip it.
# Plot keywords are '|'-separated, so turn them into space-separated strings,
# e.g. 'word1|word2|word3' -> 'word1 word2 word3'.
# str(k) also turns missing (NaN) keyword cells into the literal string 'nan'.
titles = [t.replace('\xa0', '') for t in titles]
keywords = [str(k).replace('|', ' ') for k in keywords]
stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    # Sentence- then word-tokenize, keep only tokens that contain at least
    # one letter (drops punctuation and bare numbers), then stem each token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
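# Quick sanity check: the letter filter drops '42' and the punctuation, and
# the Snowball stemmer lowercases and stems the rest, e.g.
# tokenize_and_stem("Running dogs, 42 of them!") -> ['run', 'dog', 'of', 'them']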
# Uni- to tri-gram TF-IDF over the keyword strings; max_df/min_df drop terms
# that appear in more than 99% or fewer than 2% of the documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.99, max_features=100,
                                   min_df=0.02, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(keywords)
# get_feature_names() before scikit-learn 1.0
feature_names = tfidf_vectorizer.get_feature_names_out()
print "Feature names: {0}".format(feature_names)
# Cluster the TF-IDF vectors; DBSCAN needs no preset cluster count and
# labels points it considers noise as -1
d = DBSCAN()
d.fit(tfidf_matrix)
clusters = d.labels_
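# The defaults above use euclidean distance with eps=0.5. TfidfVectorizer
# L2-normalizes its rows, so euclidean and cosine distances are monotonically
# related here, but an explicit cosine metric is a common alternative for
# sparse text features; the eps below is an illustrative guess, not a tuned value:
# d = DBSCAN(metric='cosine', eps=0.3)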
# Noise points (label -1) do not count as a cluster
n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
print("Number of clusters: {0}".format(n_clusters))
# Collect the results in a DataFrame indexed by cluster label
data = {'title': titles, 'cluster': clusters, 'keywords': keywords}
frame = pd.DataFrame(data, index=clusters, columns=['title', 'cluster', 'keywords'])
print(frame.head())
print(frame['cluster'].value_counts())
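# Example: inspect the titles that fell into one cluster (the label 0 below
# is arbitrary; which labels exist depends on the DBSCAN run):
# print(frame[frame['cluster'] == 0]['title'])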