hierarchical clustering of documents (SHARE-PSI Best Practice wiki pages)
from __future__ import print_function

import os
import re

import requests
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from scipy.cluster.hierarchy import ward, dendrogram

#### adapted from http://brandonrose.org/clustering
uris = [
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Cross_Agency_Strategy',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/High_Level_Support',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Holistic_Metrics',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/User_engagement_and_collaboration_throughout_the_lifecycle',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Organisational-internal_engagement',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Human_Readability_and_Machine_Processing',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Cost_of_Publication',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Stakeholders%E2%80%99_Interests_and_Rights',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Feedback_to_Improve_Quality',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Optimization_for_Search_Engines',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Publication_with_Common_Metadata',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Catalogs_and_Indexes',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Encourage_crowdsourcing',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Publish_spatial_data_on_the_web',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Monitoring_and_Benchmarking',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Open_Data_quality_assessment',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Identifying_what_you_already_publish',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Make_the_data_available_in_the_language_people_want_it',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Management_Of_A_Wide_Public_Actors_Network',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Making_Research_Results_Open_For_The_Country',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Using_Business_Process_Paradigm_For_Open_Data_Lifecycle_Management',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Publishing_Statistical_Data_In_Linked_Data_Format',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Supervizor_-_An_Indispensable_Open_Government_Application_(Transparency_Of_Public_Spending)',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Civic_Use_Of_Open_Data',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Open_Data_Publication_Plan',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/A_Federation_Tool_For_Opendata_Portals',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Traffic_Light_System_For_Data_Sharing',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Open_Data_To_Improve_Sharing_And_Publication_Of_Information_Between_Public_Administrations',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Commercial_Considerations_in_Open_Data_Portal_Design',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Infomediary_Sector_Characteristics',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Open_Data_2.0_-_Changing_Perspectives',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Open_Data_Business_Model_Patterns_and_Open_Data_Business_Value_Disciplines',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/The_Central_Role_of_Location',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/An_ongoing_open_dialog_in_an_open_data_ecosystem',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Discover_published_information_by_site_scraping',
    'http://www.w3.org/2013/share-psi/wiki/Best_Practices/Free_our_maps']
replacements = ['=Title=',
                '=Short Description=',
                '=Overview=',
                '=Why=',
                '=Intended Outcome=',
                '=Life Cycle Stage=',
                '=Possible Approach=',
                '=How to Test=',
                '=Evidence=',
                '=Lifecycle Stage=',
                '=Audience=',
                '=Related Best Practices=',
                '=Tags=',
                '=Status=',
                '=Intended Audience=',
                'nowiki',
                'Name of the Share-PSI workshop:',
                'Title of the Best Practice:',
                'Outline of the best practice',
                'Management summary',
                'Challenge',
                'Solution.',
                'Best Practice Identification',
                'Why is this a Best Practice?',
                'What\'s the impact of the Best Practice?',
                'Link to the PSI Directive',
                'Why is there a need for this Best Practice?',
                'What do you need for this Best Practice?',
                'Applicability by other member states?',
                'Contact info - record of the person to be contacted for additional information or advice.']
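# the wiki pages are fetched below as raw MediaWiki markup (?action=raw);
# the section headings and boilerplate phrases listed above are stripped
# out before vectorising so they don't dominate the term statistics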
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, so that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
def tokenize_only(text):
    # first tokenize by sentence, then by word, so that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
debug = False
synopses = []
titles = []
for i in uris:
    titles.append(i.split('/')[-1])
    if debug: print(i + "?action=raw")
    text = requests.get(i + "?action=raw").text.lower()
    if debug: print(text)
    for q in replacements:
        text = text.replace(q, '')
    if debug: print(text)
    synopses.append(text)
# load nltk's English stopwords as a variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

# load nltk's SnowballStemmer as a variable called 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
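# NOTE: sent_tokenize/word_tokenize and the stopword list require the NLTK
# data packages 'punkt' and 'stopwords'; if they are missing, run once:
#   nltk.download('punkt')
#   nltk.download('stopwords')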
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    allwords_stemmed = tokenize_and_stem(i)  # for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed)  # extend the 'totalvocab_stemmed' list
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))
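# max_df=0.8 / min_df=0.2 discard terms that appear in more than 80% or fewer
# than 20% of the documents, so only mid-frequency terms survive; the custom
# tokenize_and_stem above replaces the vectorizer's built-in tokenizer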
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)  # fit the vectorizer to synopses
print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0

dist = 1 - cosine_similarity(tfidf_matrix)
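# 'dist' is a symmetric (n_docs, n_docs) matrix of cosine distances
# (1 - cosine similarity); it drives both the MDS projection and the
# Ward linkage further down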
num_clusters = 7
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
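# optional sanity check: how many documents landed in each cluster; for
# reproducible assignments, pass random_state to KMeans as is done for MDS below
print(pd.Series(clusters).value_counts().sort_index())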
BPs = {'title': titles, 'synopsis': synopses, 'cluster': clusters}
frame = pd.DataFrame(BPs, index=[clusters], columns=['title', 'cluster'])
print("Top terms per cluster:") | |
print() | |
#sort cluster centers by proximity to centroid | |
order_centroids = km.cluster_centers_.argsort()[:, ::-1] | |
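# argsort sorts ascending, so the [:, ::-1] flip leaves each row of
# order_centroids holding term indices from heaviest to lightest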
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :10]:  # top 10 terms per cluster; adjust the slice for more or fewer
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print()  # add whitespace
    print()  # add whitespace

    print("Cluster %d titles:" % i, end='')
    for title in frame.loc[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print()  # add whitespace
    print()  # add whitespace

print()
print()
# project the distance matrix onto two components, since we're plotting points
# in a two-dimensional plane; "precomputed" because we provide a distance
# matrix, and random_state is set so the plot is reproducible
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

print()
print()
# set up colors per cluster using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a',
                  4: '#66a61e', 5: '#111ddd', 6: '#444aaa'}

# set up cluster names using a dict
cluster_names = {0: 'Cl 0',
                 1: 'Cl 1',
                 2: 'Cl 2',
                 3: 'Cl 3',
                 4: 'Cl 4',
                 5: 'Cl 5',
                 6: 'Cl 6'}

# create a data frame with the MDS results plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))

# group by cluster
groups = df.groupby('label')
# set up the plot
fig, ax = plt.subplots(figsize=(17, 9))  # set size
ax.margins(0.10)  # optional, adds 10% padding to the autoscaling

# iterate through groups to layer the plot
# note that the cluster_names and cluster_colors dicts are keyed on the group
# 'name' to return the appropriate label/color
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=18,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',          # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left=False,        # ticks along the left edge are off
        right=False,       # ticks along the right edge are off
        labelleft=False)

ax.legend(numpoints=1, fontsize='x-small', loc='upper center',
          bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True,
          ncol=7)  # show legend with only 1 point per entry

# add a label at each x,y position with the document title as the text
for i in range(len(df)):
    ax.text(df.loc[i, 'x'], df.loc[i, 'y'], df.loc[i, 'title'], size=8)

# plt.show()  # uncomment to display the plot interactively
plt.savefig('clusters_small_noaxes.png', dpi=100)  # save the plot
linkage_matrix = ward(dist)  # define the linkage matrix using Ward clustering on the precomputed distances
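# NOTE: scipy's ward() treats a 2-D array as observation vectors, so here each
# document is effectively represented by its row of distances to every other
# document (as in the brandonrose.org tutorial). To cluster on the distances
# themselves, pass a condensed matrix instead, e.g.:
#   from scipy.spatial.distance import squareform
#   linkage_matrix = ward(squareform(dist, checks=False))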
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)

plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout()  # show plot with tight layout

plt.savefig('ward_clusters.png', dpi=100)  # save the figure as ward_clusters.png

print("Finished")