Last active
October 2, 2018 07:19
-
-
Save gdbassett/4bc4e1e199baf49d8142 to your computer and use it in GitHub Desktop.
Basic script for text->vectorization->TF-IDF->canopies->kmeans->clusters. Initially tested on VCDB breach summaries.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- encoding: utf-8 -*- | |
# based on http://scikit-learn.org/stable/auto_examples/document_clustering.html | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cluster import KMeans, MiniBatchKMeans | |
from sklearn.metrics.pairwise import pairwise_distances | |
import numpy as np | |
from time import time | |
from collections import defaultdict | |
# Directory holding the VCDB incident JSON files (one incident per file).
VCDB_DIR = '~/Documents/Development/VCDB/data/json'

# Canopy thresholds. NOTE(review): the comment in the original said
# "T1 > T2 for overlapping clusters", but as configured T1 < T2: points
# within T1 of a center are removed from the candidate pool (claimed
# exclusively), while points within T2 may belong to several canopies.
T1 = .4  # Within this distance, don't allow to be in other clusters
T2 = .7  # Within this distance, create a cluster

# BUG FIX: glob.glob() does not expand '~', so the original pattern matched
# nothing. Expand the user home directory before globbing.
filemap = glob.glob(os.path.join(os.path.expanduser(VCDB_DIR), "*.json"))
def summ_iter(files):
    """Yield the non-empty 'summary' string from each JSON file in *files*.

    Files whose JSON object lacks a 'summary' key, or whose summary is
    empty, are skipped silently.
    """
    for path in files:
        with open(path, 'r') as handle:
            record = json.load(handle)
        summary = record.get('summary', '')
        if len(summary) > 0:
            yield summary
def canopy(X, T1, T2, distance_metric='euclidean', filemap=None):
    """Canopy-cluster the rows of X.

    Repeatedly picks an unclaimed row as a canopy center: every row within
    distance T2 of the center joins that canopy (rows may join several
    canopies), and every row within distance T1 is removed from the pool of
    future centers/claimable points. Assumes T1 <= T2 as used by this script.

    Parameters:
        X               : matrix accepted by sklearn pairwise_distances;
                          rows are the points to cluster.
        T1              : tight threshold — points this close to a center
                          are claimed exclusively.
        T2              : loose threshold — points this close to a center
                          are members of the canopy.
        distance_metric : metric name passed to pairwise_distances.
        filemap         : optional sequence; if given, row indices in the
                          result are replaced by filemap[index].

    Returns:
        dict mapping canopy id -> {"c": center, "points": [members]}.
    """
    canopies = dict()
    # Full pairwise distance matrix; O(n^2) memory, fine for VCDB-sized data.
    X1_dist = pairwise_distances(X, metric=distance_metric)
    canopy_points = set(range(X.shape[0]))
    while canopy_points:
        point = canopy_points.pop()
        i = len(canopies)
        canopies[i] = {"c": point,
                       "points": list(np.where(X1_dist[point] < T2)[0])}
        # Points inside the tight threshold (including the center itself,
        # distance 0) can no longer seed or join other canopies.
        canopy_points.difference_update(np.where(X1_dist[point] < T1)[0])
    if filemap:
        # BUG FIX: the original popped and re-inserted entries while
        # iterating canopies.keys(), which raises RuntimeError ("dictionary
        # changed size during iteration") on Python 3. Rebuild the mapping
        # from a snapshot instead.
        canopies = {
            canopy_id: {
                "c": filemap[members["c"]],
                "points": [filemap[p] for p in members["points"]],
            }
            for canopy_id, members in list(canopies.items())
        }
    return canopies
# Build TF-IDF vectors from the breach summaries: drop terms appearing in
# more than half the documents (too common) or fewer than 2 (too rare), and
# cap the vocabulary at 10k terms.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)

# Primary Execution #
print("Vectorizing Text")
X1 = vectorizer.fit_transform(summ_iter(filemap))

print("Performing canopy clustering.")
canopies = canopy(X1, T1, T2, distance_metric='cosine')
k = len(canopies)

# Collect the row index of each canopy center; those rows of X1 become the
# initial k-means centroids.
c = [can['c'] for can in canopies.values()]
centroids = X1[c]
print("{0} canopies found.".format(k))

# K-means seeded with the canopy centers (n_init=1 because init is explicit).
# BUG FIX: the original passed X1[centroids] — indexing X1 a second time
# with a sparse matrix — and then called km.fit(X) on an undefined name X.
# Use the centroid rows and X1 directly.
km = KMeans(n_clusters=k, init=np.asarray(centroids.todense()),
            max_iter=100, n_init=1, verbose=True)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X1)
print("done in %0.3fs" % (time() - t0))
print()

# Output: group source filenames by their assigned cluster label.
# BUG FIX: the original used clusters[km.labels_] (the whole, unhashable
# label array) as the dict key instead of the i-th label.
print("Outputting Clusters")
clusters = defaultdict(list)
for i, label in enumerate(km.labels_):
    clusters[label].append(filemap[i])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment