Last active
August 12, 2016 15:21
-
-
Save manuel-delverme/e9a7a65146df4155bc9b822d1f5048f9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pprint import pprint | |
| import hdbscan as hdbscan | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| # import scipy.cluster.hierarchy as hcluster | |
| from sklearn.manifold import TSNE | |
| from sklearn.decomposition import PCA | |
| import sklearn.metrics.pairwise | |
| from sklearn.cluster import DBSCAN | |
| from sklearn import metrics | |
| from sklearn.datasets.samples_generator import make_blobs | |
| from sklearn.preprocessing import StandardScaler | |
| import hdbscan | |
| import math | |
| import pickle | |
| import collections | |
| import xmltodict | |
| import fireplace.utils | |
| from fireplace import cards | |
| from hearthstone.enums import CardClass | |
| from adjustText import adjust_text, sys | |
def load_decks_from_file(file_name):
    """Read every pickled ``(deck, deck_class, name)`` record from a file.

    The file is expected to hold a sequence of consecutive pickles, each a
    3-tuple. Records are read until the end of the stream. Any record whose
    name contains "arena" (case-insensitive) is skipped.

    Args:
        file_name: path of the pickle stream to read.

    Returns:
        A ``(decks, deck_classes, deck_names)`` tuple of three parallel
        lists, in file order.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file;
    only point this at deck dumps you trust.
    """
    kept = []
    with open(file_name, 'rb') as stream:
        while True:
            try:
                deck, deck_class, name = pickle.load(stream)
            except EOFError:
                # End of the pickle stream: nothing more to read.
                break
            if "arena" not in name.lower():
                kept.append((deck, deck_class, name))

    decks = [record[0] for record in kept]
    deck_classes = [record[1] for record in kept]
    deck_names = [record[2] for record in kept]
    return decks, deck_classes, deck_names
def _as_list(node):
    """Normalise an xmltodict child node to a list of nodes.

    xmltodict yields ``None`` for an empty element, a single mapping when a
    tag occurs once, and a list only when the tag repeats.
    """
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def import_decks_from_hdt(file_name):
    """Load decks from an XML export (presumably Hearthstone Deck Tracker).

    The document is expected to look like
    ``Decks > Deck* > (Name, Class, Cards > Card* > (Id, Count))``.

    Args:
        file_name: path of the XML file to parse.

    Returns:
        A ``(decks, deck_classes, deck_names)`` tuple of three parallel
        lists. Each deck is a ``collections.Counter`` mapping card id to an
        integer copy count.

    Raises:
        ValueError: if a ``Count`` value is not an integer literal.
    """
    with open(file_name) as f:
        tree = xmltodict.parse(f.read())

    decks = []
    deck_names = []
    deck_classes = []
    # An empty <Decks/> element parses to None, hence the `or {}` guards.
    for deck_node in _as_list((tree.get('Decks') or {}).get('Deck')):
        deck = collections.Counter()
        for card in _as_list((deck_node.get('Cards') or {}).get('Card')):
            # xmltodict returns element text as str; store real counts.
            deck[card['Id']] = int(card['Count'])
        decks.append(deck)
        deck_names.append(deck_node['Name'])
        deck_classes.append(deck_node['Class'])
    return decks, deck_classes, deck_names
def load_data_from_file(file_name, n_components=50):
    """Build a PCA-reduced card-count matrix from a pickled deck file.

    Each deck becomes one row whose columns are the distinct cards seen in
    any deck; a cell holds ``deck[card]`` (0 when the deck lacks the card).
    The matrix is then projected with PCA.

    Args:
        file_name: path passed to ``load_decks_from_file``.
        n_components: number of PCA components to keep (default 50, the
            value this script has always used).

    Returns:
        A ``(data, deck_classes, deck_names)`` tuple, where ``data`` is the
        array returned by ``PCA.fit_transform``.
    """
    decks, deck_classes, deck_names = load_decks_from_file(file_name)

    # Map each card to its column once, so placing a count is a dict lookup
    # instead of a linear ``list.index`` scan per card.
    column_of = {
        card: column
        for column, card in enumerate({card for deck in decks for card in deck})
    }

    rows = []
    for deck in decks:
        row = [0] * len(column_of)
        for card in deck:
            row[column_of[card]] = deck[card]
        rows.append(row)

    pca = PCA(n_components=n_components)
    data = pca.fit_transform(np.array(rows))
    return data, deck_classes, deck_names
def flatten_data(data):
    """Project ``data`` for plotting: PCA to 50 components, then t-SNE.

    Args:
        data: samples accepted by ``PCA.fit_transform``.

    Returns:
        The embedding produced by a default-configured ``TSNE`` fitted on
        the PCA output.
    """
    reduced = PCA(n_components=50).fit_transform(data)
    return TSNE().fit_transform(reduced)
# Load the reduced deck matrix, using an on-disk pickle cache so the
# expensive load + PCA step only runs the first time.
dataset = "100kdecks.pkl"
data_cache = "data_{}.pkl".format(dataset)
try:
    with open(data_cache, 'rb') as d:
        data, deck_classes, deck_names = pickle.load(d)
except IOError:
    # No readable cache yet: rebuild from the raw dataset and store it.
    data, deck_classes, deck_names = load_data_from_file(dataset)
    # Write to exactly the path read above; ``data_cache`` is already
    # formatted, so it must not be passed through ``format`` again.
    with open(data_cache, 'wb') as d:
        pickle.dump((data, deck_classes, deck_names), d)
print("loaded")
# embed = flatten_data(data)
# print("flattened")
def cluster(eps, samples, data, deck_names):
    """Cluster ``data`` with DBSCAN using the Manhattan metric.

    Args:
        eps: DBSCAN neighbourhood radius (the author's note: roughly "a
            deck with a changed card").
        samples: DBSCAN ``min_samples`` (the author's note: "and 20
            similar decks").
        data: samples to fit.
        deck_names: unused; kept so existing callers keep working.

    Returns:
        ``(labels, estimator)``: the fitted estimator's ``labels_`` and the
        estimator itself.
    """
    estimator = DBSCAN(
        eps=eps,
        min_samples=samples,
        metric="manhattan",
    )
    estimator.fit(data)
    return estimator.labels_, estimator
# ``sys`` was previously only reachable as a side effect of the adjustText
# import; import it explicitly so argument parsing does not depend on that.
import sys

# Only cluster when ``clusters`` is not already defined, so re-running the
# script inside a live interpreter session reuses the previous result.
try:
    clusters
except NameError:
    print("clustering")
    if len(sys.argv) < 3:
        raise SystemExit("usage: {} EPS MIN_SAMPLES".format(sys.argv[0]))
    # eps is a distance threshold and may be fractional (e.g. 0.0365).
    eps = float(sys.argv[1])
    samples = int(sys.argv[2])
    clusters, db = cluster(eps, samples, data, deck_names)
    print("clustered")
def print_data(data, deck_names, clusters):
    """Print the deck names of every cluster, smallest cluster first.

    Args:
        data: unused; kept so existing callers keep working.
        deck_names: sequence of deck names.
        clusters: cluster label for each deck, indexable in step with
            ``deck_names``. The label -1 is treated as noise.

    Prints one line per group (size, names, blank line), then a summary
    line with the number of clusters. The noise group is listed like any
    other group but is not counted in the summary.
    """
    members_by_label = collections.defaultdict(list)
    for i, name in enumerate(deck_names):
        members_by_label[clusters[i]].append(name)

    for group in sorted(members_by_label.values(), key=len):
        print(len(group), group, "\n")

    labels = set(clusters)
    # Label -1 marks unclustered (noise) points; it is not a cluster.
    n_clusters = len(labels) - (1 if -1 in labels else 0)
    print("found {} clusters".format(n_clusters))
# Report the deck names grouped by their cluster label.
print_data(data=data, deck_names=deck_names, clusters=clusters)
def plot(embed, deck_names, clusters, thresh):
    """Show two figures describing a clustering of the decks.

    Figure 1 plots the first two columns of the module-level ``data``
    array, one colour per cluster label (black for the noise label -1).
    Core samples, taken from the module-level fitted ``db`` estimator, are
    drawn larger than other members, and each cluster that has a core
    sample is annotated with the name of its first core deck.

    Figure 2 scatters ``embed`` coloured by label and tags every label that
    has more than one member with the first 15 characters of the name of
    its first member deck.

    Args:
        embed: per-deck 2-D coordinates; ``embed[i]`` must give ``(x, y)``.
        deck_names: deck names, aligned with the rows of ``embed``.
        clusters: cluster label per deck; -1 marks noise.
        thresh: value shown in the title of the second figure only.

    NOTE(review): this function reads the globals ``db`` and ``data``
    rather than taking them as parameters, so it only works after the
    module-level clustering step has run. It assumes ``clusters`` is the
    label array produced by that same ``db`` — confirm before reusing it.
    """
    labels = np.asarray(clusters)
    notes = []
    plt.axis("equal")

    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    unique_labels = set(labels)
    n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)

    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
        class_member_mask = (labels == k)
        core_members = class_member_mask & core_samples_mask

        xy = data[core_members]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)

        # Name the cluster after its first core deck. Noise has no core
        # samples, so it gets no annotation.
        if core_members.any():
            father = int(np.argmax(core_members))
            plt.annotate(deck_names[father], xy=data[father, 0:2], xytext=(0, 0), textcoords='offset points')

        xy = data[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()

    for cluster_label in unique_labels:
        # A label is not a row index: look up the decks that carry it and
        # use the first one as the cluster's representative.
        member_indices = np.flatnonzero(labels == cluster_label)
        representative = member_indices[0]
        point = embed[representative]
        label = deck_names[representative]
        if len(member_indices) > 1:
            print(label, len(member_indices))
            notes.append(plt.text(point[0], point[1], label[:15], bbox={'pad': 0, 'alpha': 0}, size=7))
        else:
            print("DROPPING", label, len(member_indices))

    plt.scatter(*np.transpose(embed), c=clusters)
    adjust_text(notes, arrowprops=dict(arrowstyle="->", color='r'), force_text=0.25, lim=10)
    plt.title("thres %f; clusters: %d" % (thresh, len(set(clusters))))
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment