Skip to content

Instantly share code, notes, and snippets.

@manuel-delverme
Last active August 12, 2016 15:21
Show Gist options
  • Select an option

  • Save manuel-delverme/e9a7a65146df4155bc9b822d1f5048f9 to your computer and use it in GitHub Desktop.

Select an option

Save manuel-delverme/e9a7a65146df4155bc9b822d1f5048f9 to your computer and use it in GitHub Desktop.
from pprint import pprint
import hdbscan as hdbscan
import numpy as np
import matplotlib.pyplot as plt
# import scipy.cluster.hierarchy as hcluster
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import sklearn.metrics.pairwise
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
import hdbscan
import math
import pickle
import collections
import xmltodict
import fireplace.utils
from fireplace import cards
from hearthstone.enums import CardClass
from adjustText import adjust_text, sys
def load_decks_from_file(file_name):
    """Read all deck records pickled back-to-back in ``file_name``.

    The file is consumed as a stream of pickled ``(deck, deck_class, name)``
    tuples until the end of the file is reached.  Records whose name
    contains "arena" (case-insensitive) are left out.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only feed this
    function files from a trusted source.

    Returns:
        A ``(decks, deck_classes, deck_names)`` tuple of parallel lists.
    """
    decks, deck_classes, deck_names = [], [], []
    with open(file_name, 'rb') as stream:
        while True:
            try:
                record = pickle.load(stream)
            except EOFError:
                # Running off the end of the stream is the normal exit.
                break
            cards, klass, title = record
            if "arena" in title.lower():
                continue
            deck_names.append(title)
            deck_classes.append(klass)
            decks.append(cards)
    return decks, deck_classes, deck_names
def import_decks_from_hdt(file_name):
    """Load decks from an XML deck export.

    The document is expected to look like
    ``<Decks><Deck><Name/><Class/><Cards><Card><Id/><Count/></Card>...``.
    NOTE(review): the function name suggests a Hearthstone Deck Tracker
    export; the schema above is taken from the keys this code reads.

    Args:
        file_name: path of the XML file to parse.

    Returns:
        A ``(decks, deck_classes, deck_names)`` tuple of parallel lists, where
        each deck is a ``collections.Counter`` mapping card id to an integer
        copy count.

    Raises:
        KeyError: if the root ``Decks`` element or a deck's ``Name`` /
            ``Class`` / card ``Id`` / ``Count`` entry is missing.
        ValueError: if a card's ``Count`` is not an integer literal.
    """
    def _as_list(node):
        # xmltodict gives a bare mapping (not a list) when an element occurs
        # only once, and None when the parent element is empty.
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    with open(file_name) as f:
        tree = xmltodict.parse(f.read())

    decks = []
    deck_names = []
    deck_classes = []
    root = tree['Decks'] or {}
    for entry in _as_list(root.get('Deck')):
        deck = collections.Counter()
        cards_node = entry.get('Cards') or {}
        for card in _as_list(cards_node.get('Card')):
            # Element text is parsed as str; a Counter needs numeric counts.
            deck[card['Id']] = int(card['Count'])
        decks.append(deck)
        deck_names.append(entry['Name'])
        deck_classes.append(entry['Class'])
    return decks, deck_classes, deck_names
def load_data_from_file(file_name):
    """Turn the decks stored in ``file_name`` into a PCA-reduced matrix.

    Each deck becomes one row with one column per distinct card seen across
    all decks; the cell holds ``deck[card]`` (0 when the deck lacks the
    card).  The matrix is then reduced to 50 principal components.

    Args:
        file_name: path handed to ``load_decks_from_file``.

    Returns:
        ``(data, deck_classes, deck_names)`` where ``data`` is the array
        returned by ``PCA(n_components=50).fit_transform``.
    """
    decks, deck_classes, deck_names = load_decks_from_file(file_name)
    lookup = list({card for deck in decks for card in deck})
    # Resolve card -> column once; calling lookup.index() for every card of
    # every deck made this loop quadratic in the number of distinct cards.
    column_of = {card: column for column, card in enumerate(lookup)}
    data = []
    for deck in decks:
        datapoint = [0] * len(lookup)
        for card in deck:
            datapoint[column_of[card]] = deck[card]
        data.append(datapoint)
    pca = PCA(n_components=50)
    data = pca.fit_transform(np.array(data))
    return data, deck_classes, deck_names
def flatten_data(data):
    """Project ``data`` to a low-dimensional embedding for plotting.

    The input is first reduced to 50 principal components, then passed
    through t-SNE with its default settings.

    Returns:
        The array produced by ``TSNE().fit_transform``.
    """
    reduced = PCA(n_components=50).fit_transform(data)
    return TSNE().fit_transform(reduced)
# Load the feature matrix from its pickle cache, rebuilding the cache from
# the raw deck file when it cannot be opened.
dataset = "100kdecks.pkl"
data_cache = "data_{}.pkl".format(dataset)
try:
    with open(data_cache, 'rb') as d:
        data, deck_classes, deck_names = pickle.load(d)
except IOError:
    data, deck_classes, deck_names = load_data_from_file(dataset)
    # Write to exactly the path that is read above.  Formatting the already
    # formatted name a second time could yield a different path (or raise)
    # if the dataset name ever contained braces.
    with open(data_cache, 'wb') as d:
        pickle.dump((data, deck_classes, deck_names), d)
print("loaded")
# embed = flatten_data(data)
# print("flattened")
def cluster(eps, samples, data, deck_names):
    """Cluster ``data`` with DBSCAN using the Manhattan metric.

    Args:
        eps: DBSCAN neighbourhood radius (original note: "a deck with a
            changed card").
        samples: DBSCAN ``min_samples`` (original note: "and 20 similar
            decks").
        data: feature matrix to cluster.
        deck_names: not used by this function; kept in the signature.

    Returns:
        ``(labels, db)``: the fitted estimator's ``labels_`` and the
        estimator itself.
    """
    # Observations recorded by the original author:
    #   0.0365 164 patron out of 4489
    #   0.0365 1574 patron out of 39912
    estimator = DBSCAN(eps=eps, min_samples=samples, metric="manhattan")
    estimator.fit(data)
    return estimator.labels_, estimator
# Cluster only when no `clusters` binding exists yet, so an earlier result
# left in the namespace is reused instead of being recomputed.
try:
    clusters
except NameError:
    # Import sys directly rather than relying on adjustText re-exporting it.
    import sys
    print("clustering")
    # eps is a distance threshold and may be fractional (the notes in
    # cluster() record 0.0365); int() would reject such an argument.
    eps = float(sys.argv[1])
    samples = int(sys.argv[2])
    clusters, db = cluster(eps, samples, data, deck_names)
print("clustered")
def print_data(data, deck_names, clusters):
    """Print the deck names grouped by cluster label, smallest group first.

    Each group is printed as its size, the list of names and a trailing
    newline string; a final line reports how many distinct labels occur in
    ``clusters``.

    Args:
        data: not used by this function; kept in the signature.
        deck_names: deck names, positionally aligned with ``clusters``.
        clusters: indexable sequence holding one cluster label per deck.
    """
    by_label = {}
    for position, deck_name in enumerate(deck_names):
        by_label.setdefault(clusters[position], []).append(deck_name)
    # sorted() is stable, so equally sized groups keep first-seen order.
    for members in sorted(by_label.values(), key=len):
        print(len(members), members, "\n")
    distinct_labels = set(clusters)
    print("found {} clusters".format(len(distinct_labels)))
# Report the clustering on stdout: one line per group of deck names, ordered
# by group size, then the number of distinct labels.
print_data(data, deck_names, clusters)
def plot(embed, deck_names, clusters, thresh):
    """Show two figures describing a DBSCAN clustering.

    Figure 1 plots the first two columns of the module-level ``data`` array:
    core samples with large markers, non-core samples with small ones, one
    colour per cluster and black for the noise label ``-1``.  One core sample
    per cluster is annotated with its deck name.

    Figure 2 scatters ``embed`` coloured by cluster label and places a text
    label (the name of the cluster's first member) on every cluster that has
    more than one member.

    This function reads the module-level ``db`` (the fitted DBSCAN estimator)
    and ``data`` in addition to its arguments.

    Args:
        embed: 2-D coordinates, one row per deck, used for figure 2.
        deck_names: deck names aligned with the rows of ``embed``.
        clusters: one cluster label per deck.
        thresh: value shown in the title of figure 2.
    """
    # The original referenced an undefined name `labels`; the labels are the
    # `clusters` argument.
    labels = np.asarray(clusters)
    unique_labels = sorted(set(labels.tolist()))
    notes = []

    plt.axis("equal")
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)

    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        # Black is reserved for noise; other clusters use their RGBA colour.
        col = 'k' if k == -1 else tuple(col)
        class_member_mask = labels == k
        core_members = class_member_mask & core_samples_mask

        xy = data[core_members]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)

        # Annotate the first core sample of the cluster.  `mask is True` is
        # always False for an array, which made argmax return 0 every time;
        # the noise group has no core sample and is skipped.
        if core_members.any():
            father = int(np.argmax(core_members))
            plt.annotate(deck_names[father], xy=data[father, 0:2],
                         xytext=(0, 0), textcoords='offset points')

        xy = data[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()

    for cluster_label in unique_labels:
        # A cluster label is not a row index: pick an actual member of the
        # cluster to supply the label text and its position.
        member_indices = np.flatnonzero(labels == cluster_label)
        representative = int(member_indices[0])
        point = embed[representative]
        label = deck_names[representative]
        if len(member_indices) > 1:
            print(label, len(member_indices))
            notes.append(plt.text(point[0], point[1], label[:15],
                                  bbox={'pad': 0, 'alpha': 0}, size=7))
        else:
            print("DROPPING", label, len(member_indices))

    plt.scatter(*np.transpose(embed), c=clusters)
    adjust_text(notes, arrowprops=dict(arrowstyle="->", color='r'),
                force_text=0.25, lim=10)
    plt.title("thres %f; clusters: %d" % (thresh, len(unique_labels)))
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment