Skip to content

Instantly share code, notes, and snippets.

@gdbassett
Last active October 2, 2018 07:19
Show Gist options
  • Save gdbassett/4bc4e1e199baf49d8142 to your computer and use it in GitHub Desktop.
Basic script for text->vectorization->TF-IDF->canopies->kmeans->clusters. Initially tested on VCDB breach summaries.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# based on http://scikit-learn.org/stable/auto_examples/document_clustering.html
import glob
import json
import os
from collections import defaultdict
from time import time

import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
VCDB_DIR = '~/Documents/Development/VCDB/data/json'
# Canopy thresholds (cosine distances). Note T1 < T2 here: points within T2
# of a center join that canopy, while points within the tighter T1 are
# removed from the candidate pool — so points in the T1..T2 band may appear
# in several canopies (overlapping clusters).
T1 = .4  # Within this distance, don't allow to be in other clusters
T2 = .7  # Within this distance, create a cluster
# BUG FIX: glob.glob does not expand "~", so the original pattern never
# matched anything; expand the user directory first.
filemap = glob.glob(os.path.join(os.path.expanduser(VCDB_DIR), "*.json"))
def summ_iter(files):
    """Yield the non-empty 'summary' field from each JSON file in *files*.

    Files whose JSON has no 'summary' key, or an empty one, are skipped.
    NOTE(review): because entries are skipped, the yielded summaries do not
    align one-to-one with indices of *files* — confirm downstream code that
    maps rows back to filenames accounts for this.
    """
    for path in files:
        with open(path, 'r') as fp:
            record = json.load(fp)
        summary = record.get('summary')
        if summary is not None and len(summary) > 0:
            yield summary
def canopy(X, T1, T2, distance_metric='euclidean', filemap=None):
    """Canopy-cluster the rows of X.

    Parameters
    ----------
    X : matrix of shape (n_samples, n_features)
        Data to cluster; only pairwise row distances are used.
    T1 : float
        Tight threshold: points within T1 of a chosen center are removed
        from the candidate pool and cannot seed or be re-used by later
        canopies.
    T2 : float
        Loose threshold: points within T2 of a chosen center are recorded
        as members of that canopy. With T1 < T2, canopies may overlap.
    distance_metric : str
        Metric name forwarded to sklearn.metrics.pairwise_distances.
    filemap : sequence, optional
        If given (and non-empty), row indices in the result are translated
        through filemap (e.g. to filenames).

    Returns
    -------
    dict
        canopy_id -> {"c": center, "points": [members, ...]}.
    """
    canopies = dict()
    dist = pairwise_distances(X, metric=distance_metric)
    remaining = set(range(X.shape[0]))
    while remaining:
        center = remaining.pop()
        cid = len(canopies)
        canopies[cid] = {"c": center,
                         "points": list(np.where(dist[center] < T2)[0])}
        # Points tightly bound (< T1) to this center may not seed or join
        # any later canopy.
        remaining = remaining.difference(set(np.where(dist[center] < T1)[0]))
    if filemap:
        # BUG FIX: the original popped from `canopies` while iterating
        # canopies.keys(), which raises RuntimeError in Python 3
        # ("dictionary changed size during iteration"). Iterate a snapshot
        # of the keys instead. Also renamed the local that shadowed this
        # function's own name.
        for cid in list(canopies.keys()):
            raw = canopies.pop(cid)
            canopies[cid] = {"c": filemap[raw["c"]],
                             "points": [filemap[p] for p in raw["points"]]}
    return canopies
# TF-IDF vectorizer: drop English stop words, keep terms appearing in at
# least 2 documents but at most 50% of them, capped at 10k vocabulary terms.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=2,
    max_features=10000,
    stop_words='english',
    use_idf=True,
)
# Primary Execution #
print("Vectorizing Text")
X1 = vectorizer.fit_transform(summ_iter(filemap))

print("Performing canopy clustering.")
canopies = canopy(X1, T1, T2, distance_metric='cosine')
k = len(canopies)
print("{0} canopies found.".format(k))

# Collect the canopy centers as row indices into X1, then slice out their
# TF-IDF vectors to seed k-means.
c = [can['c'] for can in canopies.values()]
centroids = X1[c]

# Kmeans, seeded with the (densified) canopy-center vectors.
# BUG FIX: the original passed init=np.asarray(X1[centroids].todense()),
# indexing X1 a second time with an already-sliced matrix; `centroids` is
# itself the center matrix.
km = KMeans(n_clusters=k, init=np.asarray(centroids.todense()),
            max_iter=100, n_init=1, verbose=True)
print("Clustering sparse data with %s" % km)
t0 = time()
# BUG FIX: the original called km.fit(X) on an undefined name X.
km.fit(X1)
print("done in %0.3fs" % (time() - t0))
print()

# Output
print("Outputting Clusters")
# BUG FIX: the original did clusters[km.labels_].append(...), keying on the
# whole (unhashable) label array; key on the per-document label instead.
# NOTE(review): filemap[i] only matches document i if every file yielded a
# summary — summ_iter skips files without one, so indices can drift when
# some JSON files lack summaries; verify before trusting the mapping.
clusters = defaultdict(list)
for i, label in enumerate(km.labels_):
    clusters[label].append(filemap[i])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment