A t-SNE inspired visualization of a BigML G-means clustering built from the UCI diabetes dataset. Mouse over a cluster to see the centroid.
Created
June 29, 2017 01:51
-
-
Save ashenfad/54726a70080f92de3e55e8c4866bd990 to your computer and use it in GitHub Desktop.
t-SNE Layout of Diabetes Clusters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Set#union: returns a new Set holding every element of this set plus every
 * element yielded by `setB`. Neither input is modified.
 *
 * Defined with Object.defineProperty so the method is non-enumerable and does
 * not show up in `for...in` loops over Set instances.
 *
 * @param {Iterable} setB - any iterable (a Set, an array, ...).
 * @returns {Set} a fresh set containing the combined elements.
 */
Object.defineProperty(Set.prototype, 'union', {
  value: function union(setB) {
    const combined = new Set(this);
    for (const elem of setB) {
      combined.add(elem);
    }
    return combined;
  },
  writable: true,
  configurable: true,
  enumerable: false
});
/**
 * Set#intersection: returns a new Set holding the elements yielded by `setB`
 * that are also members of this set, in `setB`'s iteration order.
 *
 * Defined with Object.defineProperty so the method is non-enumerable and does
 * not show up in `for...in` loops over Set instances.
 *
 * @param {Iterable} setB - any iterable (a Set, an array, ...).
 * @returns {Set} a fresh set containing the shared elements.
 */
Object.defineProperty(Set.prototype, 'intersection', {
  value: function intersection(setB) {
    const shared = new Set();
    for (const elem of setB) {
      if (this.has(elem)) {
        shared.add(elem);
      }
    }
    return shared;
  },
  writable: true,
  configurable: true,
  enumerable: false
});
/**
 * Distance between two term sets: 1 - |A ∩ B| / sqrt(|A| * |B|), i.e. one
 * minus the cosine similarity of the sets viewed as binary vectors.
 *
 * Two empty sets are at distance 0; an empty set is at distance 1 from any
 * non-empty set.
 *
 * The overlap is counted directly so this function does not depend on a
 * patched Set.prototype.intersection being installed.
 *
 * @param {Set} set1
 * @param {Set} set2
 * @returns {number} distance in [0, 1].
 */
function termDist(set1, set2) {
  if (set1.size === 0 && set2.size === 0) {
    return 0;
  }
  if (set1.size === 0 || set2.size === 0) {
    return 1;
  }
  let sharedCount = 0;
  for (const term of set2) {
    if (set1.has(term)) {
      sharedCount += 1;
    }
  }
  return 1 - sharedCount / Math.sqrt(set1.size * set2.size);
}
/**
 * Given a BigML cluster resource, returns a function that computes the
 * distance between two clusters (referenced by their index).
 *
 * Side effect: every center value whose `typeof` is 'object' (e.g. an array
 * of terms) is replaced IN PLACE by a Set built from it, so the resource's
 * centers are modified by this call.
 *
 * The returned function iterates over the fields of the first cluster's
 * center and accumulates, per field, (difference * scale)^2:
 *   - number: signed difference of the two values
 *   - string: 0 when equal, 1 otherwise
 *   - object: termDist of the two sets
 * and returns the square root of the total. Fields of any other type are
 * skipped so they can neither reuse a previous field's difference nor poison
 * the sum.
 *
 * @param {Object} resource - must expose `clusters.clusters[i].center` and a
 *     `scales` map keyed by field id.
 * @returns {function(number, number): number} distance between clusters a, b.
 */
function clusterDistFn(resource) {
  const clusters = resource.clusters.clusters;
  Object.keys(clusters).forEach(function (index) {
    const center = clusters[index].center;
    Object.keys(center).forEach(function (field) {
      if (typeof center[field] === 'object') {
        center[field] = new Set(center[field]);
      }
    });
  });

  const scales = resource.scales;

  return function (a, b) {
    const centerA = clusters[a].center;
    const centerB = clusters[b].center;
    let sum = 0;

    Object.keys(centerA).forEach(function (field) {
      const aVal = centerA[field];
      const bVal = centerB[field];
      let diff;
      switch (typeof aVal) {
        case 'number':
          diff = aVal - bVal;
          break;
        case 'string':
          diff = aVal === bVal ? 0 : 1;
          break;
        case 'object':
          diff = termDist(aVal, bVal);
          break;
        default:
          // Unsupported value type: contributes nothing to the distance.
          return;
      }
      diff *= scales[field];
      sum += diff * diff;
    });

    return Math.sqrt(sum);
  };
}
/**
 * Given a BigML cluster resource, returns a distance matrix for the clusters.
 *
 * The matrix is a list of rows (array of arrays) of size count x count, where
 * count is the number of entries in `resource.clusters.clusters`. The
 * diagonal is 0, entries above the diagonal come from clusterDistFn, and
 * entries below the diagonal are copied from their mirrored position, so each
 * pair is computed only once.
 *
 * @param {Object} resource - BigML cluster resource.
 * @returns {number[][]} symmetric distance matrix.
 */
function distMatrix(resource) {
  const clusterCount = Object.keys(resource.clusters.clusters).length;
  const distFn = clusterDistFn(resource);
  const matrix = [];
  for (let a = 0; a < clusterCount; a++) {
    const row = [];
    for (let b = 0; b < clusterCount; b++) {
      if (a === b) {
        row.push(0);
      } else if (a > b) {
        // Row b is already complete; reuse the mirrored value.
        row.push(matrix[b][a]);
      } else {
        row.push(distFn(a, b));
      }
    }
    matrix.push(row);
  }
  return matrix;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"balance_fields": true, "category": 0, "cluster_datasets": {}, "cluster_models": {}, "cluster_seed": "2c249dda00fbf54ab4cdd850532a584f286af5b6", "clusters": {"between_ss": 137.62817, "clusters": [{"center": {"000000": 2.66778, "000001": 159.08954, "000002": 74.08703, "000003": 33.30293, "000004": 293.04351, "000005": 37.22996, "000006": 0.564, "000007": 31.21674, "000008": "true"}, "count": 83, "distance": {"bins": [[0.18907, 1], [0.24593, 1], [0.26674, 1], [0.28445, 1], [0.29535, 2], [0.32734, 2], [0.34476, 6], [0.36607, 3], [0.38403, 3], [0.39602, 3], [0.40817, 2], [0.42088, 1], [0.44745, 8], [0.46526, 7], [0.48219, 4], [0.49548, 2], [0.50671, 5], [0.5266, 4], [0.56296, 4], [0.58193, 4], [0.60828, 4], [0.63979, 3], [0.66522, 1], [0.68245, 1], [0.72961, 2], [0.75079, 2], [0.77039, 1], [0.84335, 1], [0.90445, 1], [1.09559, 1], [1.20363, 1], [1.45601, 1]], "exact_histogram": {"populations": [1, 1, 4, 6, 11, 9, 15, 9, 8, 7, 2, 3, 2, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1], "start": 0.15, "width": 0.05}, "maximum": 1.45601, "mean": 0.5167, "median": 0.47881, "minimum": 0.18907, "population": 83, "standard_deviation": 0.19969, "sum": 42.88574, "sum_squares": 25.42885, "variance": 0.03988}, "id": "000000", "name": "Cluster 0"}, {"center": {"000000": 8.41857, "000001": 137.35357, "000002": 78.96286, "000003": 33.21143, "000004": 100.44143, "000005": 34.20136, "000006": 0.54056, "000007": 45.43643, "000008": "true"}, "count": 100, "distance": {"bins": [[0.23454, 3], [0.24514, 1], [0.28325, 2], [0.30894, 4], [0.3207, 2], [0.33297, 4], [0.35577, 3], [0.36589, 4], [0.37488, 1], [0.38499, 10], [0.4062, 4], [0.41947, 3], [0.43153, 7], [0.45085, 5], [0.46687, 4], [0.48319, 8], [0.49504, 1], [0.50637, 4], [0.52323, 1], [0.5329, 2], [0.55994, 2], [0.56944, 4], [0.58354, 5], [0.59458, 2], [0.60541, 2], [0.61543, 4], [0.63655, 2], [0.64533, 1], [0.67242, 2], [0.69382, 1], [0.72585, 1], [1.11165, 1]], "exact_histogram": {"populations": [4, 2, 10, 18, 17, 15, 7, 13, 9, 3, 1, 0, 0, 
0, 0, 0, 0, 0, 1], "start": 0.2, "width": 0.05}, "maximum": 1.11165, "mean": 0.46452, "median": 0.44912, "minimum": 0.23352, "population": 100, "standard_deviation": 0.13077, "sum": 46.4523, "sum_squares": 23.27123, "variance": 0.0171}, "id": "000001", "name": "Cluster 1"}, {"center": {"000000": 6.24365, "000001": 129.76257, "000002": 77.87351, "000003": 2.08657, "000004": 3.78175, "000005": 31.3212, "000006": 0.37859, "000007": 45.82167, "000008": "false"}, "count": 138, "distance": {"bins": [[0.20071, 2], [0.22872, 1], [0.23959, 1], [0.25527, 3], [0.27942, 1], [0.2916, 5], [0.30281, 4], [0.31483, 3], [0.34398, 7], [0.36323, 11], [0.38208, 3], [0.39464, 13], [0.40817, 2], [0.42063, 10], [0.43821, 5], [0.4675, 13], [0.49717, 5], [0.51423, 7], [0.53103, 7], [0.55152, 3], [0.56814, 6], [0.59638, 3], [0.6175, 10], [0.6486, 2], [0.66261, 1], [0.70003, 2], [0.72111, 1], [0.75462, 3], [0.76538, 1], [0.83764, 1], [0.92275, 1], [0.96815, 1]], "exact_histogram": {"populations": [1, 3, 10, 12, 26, 19, 16, 17, 11, 11, 3, 2, 4, 1, 0, 1, 1], "start": 0.15, "width": 0.05}, "maximum": 0.96815, "mean": 0.46499, "median": 0.43792, "minimum": 0.19684, "population": 138, "standard_deviation": 0.14081, "sum": 64.1688, "sum_squares": 32.55438, "variance": 0.01983}, "id": "000002", "name": "Cluster 2"}, {"center": {"000000": 2.26935, "000001": 106.10988, "000002": 70.76913, "000003": 33.39376, "000004": 81.75246, "000005": 36.05887, "000006": 0.39822, "000007": 27.47969, "000008": "false"}, "count": 165, "distance": {"bins": [[0.12604, 1], [0.15268, 1], [0.17683, 2], [0.20909, 9], [0.23226, 10], [0.24849, 15], [0.27213, 12], [0.29241, 10], [0.30902, 7], [0.32156, 11], [0.33547, 8], [0.3485, 5], [0.36254, 8], [0.37588, 9], [0.39201, 12], [0.40556, 6], [0.41987, 7], [0.43444, 3], [0.4519, 4], [0.47719, 3], [0.49619, 3], [0.50988, 3], [0.52571, 3], [0.5413, 3], [0.59302, 2], [0.60758, 1], [0.63971, 1], [0.65683, 1], [0.74412, 2], [0.81305, 1], [0.86617, 1], [1.01229, 1]], 
"exact_histogram": {"populations": [1, 3, 30, 26, 30, 30, 18, 8, 9, 2, 2, 1, 2, 0, 1, 1, 0, 0, 1], "start": 0.1, "width": 0.05}, "maximum": 1.01229, "mean": 0.35922, "median": 0.3362, "minimum": 0.12604, "population": 165, "standard_deviation": 0.13214, "sum": 59.27181, "sum_squares": 24.15551, "variance": 0.01746}, "id": "000003", "name": "Cluster 3"}, {"center": {"000000": 2.31424, "000001": 116.43465, "000002": 67.97651, "000003": 24.30103, "000004": 84.22173, "000005": 32.12937, "000006": 1.21622, "000007": 30.07195, "000008": "false"}, "count": 53, "distance": {"bins": [[0.2136, 1], [0.23012, 1], [0.31117, 3], [0.3323, 1], [0.33955, 2], [0.3643, 1], [0.37003, 1], [0.37721, 1], [0.38853, 4], [0.39595, 6], [0.40873, 1], [0.42737, 1], [0.43413, 2], [0.44055, 3], [0.45309, 2], [0.46676, 1], [0.47587, 4], [0.49615, 1], [0.50402, 1], [0.51232, 1], [0.52482, 2], [0.54835, 1], [0.56482, 1], [0.57462, 2], [0.61825, 1], [0.67822, 2], [0.75612, 1], [0.76271, 1], [0.7867, 1], [0.89168, 1], [0.959, 1], [1.30494, 1]], "exact_histogram": {"populations": [2, 0, 6, 13, 7, 8, 5, 3, 1, 2, 0, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1], "start": 0.2, "width": 0.05}, "maximum": 1.30494, "mean": 0.49043, "median": 0.44134, "minimum": 0.2136, "population": 53, "standard_deviation": 0.18952, "sum": 25.9928, "sum_squares": 14.61544, "variance": 0.03592}, "id": "000004", "name": "Cluster 4"}, {"center": {"000000": 2.08876, "000001": 103.80473, "000002": 67.14682, "000003": 12.87907, "000004": 41.0429, "000005": 26.62245, "000006": 0.35307, "000007": 25.14904, "000008": "false"}, "count": 193, "distance": {"bins": [[0.16476, 1], [0.18193, 4], [0.19549, 8], [0.20648, 1], [0.21661, 6], [0.23332, 13], [0.24713, 16], [0.25971, 4], [0.2695, 10], [0.28379, 20], [0.30181, 14], [0.31864, 16], [0.33221, 9], [0.34795, 10], [0.36433, 11], [0.37822, 7], [0.38753, 5], [0.40343, 7], [0.41678, 5], [0.4303, 4], [0.44382, 2], [0.45727, 2], [0.47276, 5], [0.51307, 2], [0.52655, 3], [0.545, 2], [0.5733, 1], 
[0.59883, 1], [0.65773, 1], [0.71396, 1], [0.75529, 1], [0.77176, 1]], "exact_histogram": {"populations": [2, 11, 6, 14, 18, 17, 21, 18, 15, 12, 14, 8, 11, 4, 4, 5, 0, 2, 3, 2, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1], "start": 0.16, "width": 0.02}, "maximum": 0.77176, "mean": 0.32802, "median": 0.30679, "minimum": 0.16476, "population": 193, "standard_deviation": 0.10352, "sum": 63.30696, "sum_squares": 22.82329, "variance": 0.01072}, "id": "000005", "name": "Cluster 5"}, {"center": {"000000": 3.55556, "000001": 117, "000002": 0.66667, "000003": 2, "000004": 0.69444, "000005": 25.76389, "000006": 0.39317, "000007": 30.44444, "000008": "false"}, "count": 36, "distance": {"bins": [[0.20446, 1], [0.23972, 1], [0.25833, 1], [0.26807, 1], [0.2777, 1], [0.29397, 1], [0.31328, 1], [0.32791, 1], [0.34103, 1], [0.35249, 1], [0.38296, 1], [0.40602, 1], [0.412, 1], [0.41987, 2], [0.45393, 1], [0.47468, 1], [0.51296, 1], [0.5202, 1], [0.52956, 1], [0.57105, 2], [0.58459, 2], [0.63655, 1], [0.71946, 1], [0.73256, 1], [0.74793, 1], [0.76154, 2], [0.77091, 1], [0.79969, 1], [0.82993, 1], [0.84152, 1], [0.87804, 1], [0.91919, 1]], "exact_histogram": {"populations": [2, 4, 3, 2, 4, 2, 3, 4, 1, 0, 3, 4, 2, 1, 1], "start": 0.2, "width": 0.05}, "maximum": 0.91919, "mean": 0.53109, "median": 0.51658, "minimum": 0.20446, "population": 36, "standard_deviation": 0.20936, "sum": 19.11915, "sum_squares": 11.68805, "variance": 0.04383}, "id": "000006", "name": "Cluster 6"}], "fields": {"000000": {"column_number": 0, "datatype": "int8", "name": "pregnancies", "optype": "numeric", "order": 0, "preferred": true, "summary": {"counts": [[0, 111], [1, 135], [2, 103], [3, 75], [4, 68], [5, 57], [6, 50], [7, 45], [8, 38], [9, 28], [10, 24], [11, 11], [12, 9], [13, 10], [14, 2], [15, 1], [17, 1]], "kurtosis": 0.15038, "maximum": 17, "mean": 3.84505, "median": 3, "minimum": 0, "missing_count": 0, "population": 768, "skewness": 0.89991, "standard_deviation": 3.36958, "sum": 2953, "sum_squares": 20063, 
"variance": 11.35406}}, "000001": {"column_number": 1, "datatype": "int16", "name": "plasma glucose", "optype": "numeric", "order": 1, "preferred": true, "summary": {"bins": [[0, 5], [44, 1], [56.66667, 3], [61.5, 2], [67.2, 5], [73.3125, 16], [79.79167, 24], [84.26923, 26], [87.95455, 22], [91.33333, 36], [95.7, 40], [100.97183, 71], [107.21739, 69], [113.18333, 60], [118.5641, 39], [123.93939, 66], [129.5, 42], [133.93333, 15], [137.82353, 34], [142.65217, 23], [146.5, 26], [151.33333, 15], [154.78571, 14], [158, 12], [161.84615, 13], [166.22222, 18], [172.5625, 16], [176.33333, 3], [180, 17], [183.5, 6], [188.23077, 13], [195.6875, 16]], "kurtosis": 0.62881, "maximum": 199, "mean": 120.89453, "median": 116.8391, "minimum": 0, "missing_count": 0, "population": 768, "skewness": 0.17341, "splits": [73.40028, 80.70976, 84.56046, 88.29697, 91.0653, 94.02324, 96.37393, 99.15167, 100.81422, 102.99865, 105.47912, 107.37394, 109.45106, 111.68975, 114.22117, 116.8391, 119.5, 122.06267, 124.31608, 126.65592, 129.19127, 132.64738, 136.8101, 140.43237, 144.72673, 148.62092, 155.01649, 161.85305, 168.92, 178.9276, 187.59806], "standard_deviation": 31.97262, "sum": 92847, "sum_squares": 12008759, "variance": 1022.24831}}, "000002": {"column_number": 2, "datatype": "int8", "name": "blood pressure", "optype": "numeric", "order": 2, "preferred": true, "summary": {"bins": [[0, 35], [24, 1], [30, 2], [39, 2], [44.66667, 6], [49.44444, 18], [52, 11], [55.04, 25], [58, 21], [60.95833, 72], [64.8375, 80], [68, 45], [70, 57], [72, 44], [74.86869, 99], [78, 45], [80, 40], [82, 30], [84.96, 50], [88, 25], [90, 22], [92, 8], [94.81818, 11], [98, 3], [100, 3], [102, 1], [104, 2], [106, 3], [108, 2], [110, 3], [114, 1], [122, 1]], "kurtosis": 5.13869, "maximum": 122, "mean": 69.10547, "median": 72, "minimum": 0, "missing_count": 0, "population": 768, "skewness": -1.84001, "splits": [4.95379, 47.79361, 52.45455, 56.31476, 58.78684, 60.12982, 61.72909, 62.88889, 64.03541, 65.06458, 66.38178, 
67.65685, 68.70061, 69.62414, 70.48722, 71.43285, 72.48913, 73.49, 74.23488, 75.36578, 76.27906, 77.43376, 78.51858, 79.65375, 80.9007, 82.4098, 84.16186, 85.97598, 88.04005, 90.09226, 94.7101], "standard_deviation": 19.35581, "sum": 53073, "sum_squares": 3954989, "variance": 374.64727}}, "000003": {"column_number": 3, "datatype": "int8", "name": "triceps skin thickness", "optype": "numeric", "order": 3, "preferred": true, "summary": {"bins": [[0, 227], [7.5, 4], [10.54545, 11], [12.61111, 18], [14.7, 20], [16.7, 20], [18.47368, 38], [20.43478, 23], [22.57895, 38], [24.57143, 28], [26.58974, 39], [28.45946, 37], [30.41304, 46], [32.39216, 51], [34.65217, 23], [36.53333, 30], [38.72, 25], [40.48387, 31], [42.35294, 17], [44.54545, 11], [46, 8], [47, 4], [48, 4], [49, 3], [50, 3], [51, 1], [52, 2], [54, 2], [56, 1], [60, 1], [63, 1], [99, 1]], "kurtosis": -0.52449, "maximum": 99, "mean": 20.53646, "median": 23, "minimum": 0, "missing_count": 0, "population": 768, "skewness": 0.10916, "splits": [0.46227, 13.14087, 22.81344, 29.82784, 36.68115], "standard_deviation": 15.95222, "sum": 15772, "sum_squares": 519082, "variance": 254.47325}}, "000004": {"column_number": 4, "datatype": "int16", "name": "insulin", "optype": "numeric", "order": 4, "preferred": true, "summary": {"bins": [[0, 374], [19.33333, 9], [43.69697, 33], [60.10811, 37], [76.77778, 36], [98.36842, 57], [122.59524, 42], [141.82759, 29], [162.35714, 28], [184.35294, 34], [208.57895, 19], [232, 11], [251.4, 5], [272.7, 10], [288.5, 6], [304.66667, 3], [324.75, 8], [338.5, 2], [368.33333, 3], [393.66667, 3], [415, 1], [440, 1], [465, 1], [479.4, 5], [495, 2], [510, 1], [542.66667, 3], [579, 1], [600, 1], [680, 1], [744, 1], [846, 1]], "kurtosis": 7.15957, "maximum": 846, "mean": 79.79948, "median": 30.5, "minimum": 0, "missing_count": 0, "population": 768, "skewness": 2.26781, "splits": [0.21803, 30.5, 127.38859], "standard_deviation": 115.244, "sum": 61286, "sum_squares": 15077256, "variance": 13281.18008}}, 
"000005": {"column_number": 5, "datatype": "double", "name": "bmi", "optype": "numeric", "order": 5, "preferred": true, "summary": {"bins": [[0, 11], [18.25, 4], [19.72308, 13], [21.02222, 9], [22.15909, 22], [23.46333, 30], [24.92034, 59], [26.32222, 36], [27.59787, 47], [28.70278, 36], [29.80851, 47], [30.83333, 45], [32.30286, 70], [33.46, 40], [34.47018, 57], [35.59167, 36], [36.508, 25], [37.41724, 29], [38.34, 25], [39.46061, 33], [40.90556, 18], [42.45789, 19], [43.70952, 21], [45.42727, 11], [46.41111, 9], [48.225, 4], [49.65, 4], [52.675, 4], [55, 1], [57.3, 1], [59.4, 1], [67.1, 1]], "kurtosis": 3.26126, "maximum": 67.1, "mean": 31.99258, "median": 32.11435, "minimum": 0, "missing_count": 0, "population": 768, "skewness": -0.42814, "splits": [19.91842, 22.15909, 23.38981, 24.26256, 24.99372, 25.65688, 26.38114, 27.23053, 27.74252, 28.5307, 29.12209, 29.75821, 30.26534, 30.81337, 31.42032, 32.11435, 32.62432, 33.09397, 33.6175, 34.121, 34.60968, 35.1546, 35.78179, 36.57558, 37.39384, 38.1764, 39.1192, 40.14221, 42.1375, 43.51924, 45.98461], "standard_deviation": 7.88416, "sum": 24570.3, "sum_squares": 833743.95, "variance": 62.15998}}, "000006": {"column_number": 6, "datatype": "double", "name": "diabetes pedigree", "optype": "numeric", "order": 6, "preferred": true, "summary": {"bins": [[0.096, 16], [0.1473, 71], [0.19455, 62], [0.24832, 117], [0.29244, 52], [0.34438, 73], [0.40872, 54], [0.45431, 29], [0.50663, 40], [0.55204, 28], [0.59478, 32], [0.64421, 24], [0.68819, 32], [0.74482, 33], [0.82673, 22], [0.88471, 14], [0.94765, 17], [1.015, 5], [1.08729, 7], [1.14571, 7], [1.20238, 8], [1.2702, 5], [1.33067, 3], [1.39375, 4], [1.45933, 3], [1.6, 1], [1.70933, 3], [1.781, 1], [1.893, 1], [2.137, 1], [2.3085, 2], [2.42, 1]], "kurtosis": 5.55079, "maximum": 2.42, "mean": 0.47188, "median": 0.37281, "minimum": 0.078, "missing_count": 0, "population": 768, "skewness": 1.91616, "splits": [0.12593, 0.14529, 0.16151, 0.17954, 0.19595, 0.21304, 0.22969, 0.24253, 
0.25308, 0.26263, 0.2742, 0.28946, 0.30592, 0.32835, 0.34889, 0.37281, 0.39834, 0.42312, 0.44939, 0.48737, 0.52039, 0.55142, 0.58659, 0.62493, 0.6685, 0.70158, 0.74241, 0.82117, 0.8975, 1.03122, 1.25279], "standard_deviation": 0.33133, "sum": 362.401, "sum_squares": 255.20866, "variance": 0.10978}}, "000007": {"column_number": 7, "datatype": "int8", "name": "age", "optype": "numeric", "order": 7, "preferred": true, "summary": {"bins": [[21.53333, 135], [23.54762, 84], [25.40741, 81], [27.52239, 67], [29.42, 50], [31.4, 40], [33.45161, 31], [35.61538, 26], [37.45714, 35], [39.52, 25], [41.45, 40], [43.38095, 21], [45.46429, 28], [47.45455, 11], [49.61538, 13], [51.5, 16], [53.54545, 11], [55.42857, 7], [57.58333, 12], [59.625, 8], [61, 2], [62, 4], [63, 4], [64, 1], [65, 3], [66, 4], [67, 3], [68, 1], [69, 2], [70, 1], [72, 1], [81, 1]], "kurtosis": 0.63118, "maximum": 81, "mean": 33.24089, "median": 29, "minimum": 21, "missing_count": 0, "population": 768, "skewness": 1.12739, "splits": [21.00793, 21.49815, 21.95822, 22.45075, 23.15535, 23.91238, 24.60085, 25.28338, 26.13665, 27.12428, 28.07187, 29.08726, 30.43864, 31.93845, 33.92911, 36.36267, 38.19211, 40.54937, 42.17071, 44.82629, 48.1, 52.72302, 59.43649], "standard_deviation": 11.76023, "sum": 25529, "sum_squares": 954685, "variance": 138.30305}}, "000008": {"column_number": 8, "datatype": "string", "name": "diabetes", "optype": "categorical", "order": 8, "preferred": true, "summary": {"categories": [["false", 500], ["true", 268]], "missing_count": 0}, "term_analysis": {"enabled": true}}}, "global": {"center": {"000000": 3.84505, "000001": 120.89453, "000002": 69.10547, "000003": 20.53646, "000004": 79.79948, "000005": 31.99258, "000006": 0.47188, "000007": 33.24089, "000008": "false"}, "distance": {"bins": [[0.2208, 3], [0.25075, 5], [0.28444, 12], [0.32053, 31], [0.35402, 52], [0.39408, 58], [0.42608, 73], [0.46072, 67], [0.48888, 46], [0.52234, 73], [0.56464, 57], [0.6125, 60], [0.66062, 40], [0.69534, 21], 
[0.72872, 29], [0.76962, 21], [0.81285, 23], [0.8573, 16], [0.8964, 22], [0.93227, 15], [0.96695, 13], [1.01147, 3], [1.03983, 2], [1.09701, 5], [1.15219, 6], [1.24281, 6], [1.28332, 2], [1.32619, 3], [1.42872, 1], [1.59306, 1], [1.66191, 1], [1.81357, 1]], "exact_histogram": {"open_max": 3, "populations": [6, 14, 50, 72, 98, 103, 82, 68, 55, 44, 35, 23, 25, 25, 24, 13, 5, 3, 5, 3, 3, 5, 3, 0, 1], "start": 0.2, "width": 0.05}, "maximum": 1.81357, "mean": 0.57692, "median": 0.52217, "minimum": 0.21119, "population": 768, "standard_deviation": 0.21827, "sum": 443.07838, "sum_squares": 292.16492, "variance": 0.04764}}, "ratio_ss": 0.47106, "total_ss": 292.16492, "within_ss": 154.53675}, "code": 200, "columns": 9, "configuration": null, "configuration_status": false, "created": "2017-06-29T00:07:23.266000", "credits": 0, "credits_per_prediction": 0.0, "critical_value": 5, "dataset": "dataset/572b9d1749c4a133a2008296", "dataset_field_types": {"categorical": 1, "datetime": 0, "effective_fields": 9, "items": 0, "numeric": 8, "preferred": 9, "text": 0, "total": 9}, "dataset_status": true, "dataset_type": 0, "description": "", "excluded_fields": [], "field_scales": {}, "fields_meta": {"count": 9, "limit": 1000, "offset": 0, "query_total": 9, "total": 9}, "input_fields": ["000000", "000001", "000002", "000003", "000004", "000005", "000006", "000007", "000008"], "k": 7, "locale": "en-us", "max_columns": 9, "max_rows": 768, "model_clusters": false, "name": "diabetes dataset's cluster", "number_of_batchcentroids": 0, "number_of_centroids": 0, "number_of_public_centroids": 0, "out_of_bag": false, "price": 0.0, "private": true, "project": "project/572a709f3bbd2130a2001a6c", "range": [1, 768], "replacement": false, "resource": "cluster/595444bb49c4a1364f0032e4", "rows": 768, "sample_rate": 1.0, "scales": {"000000": 0.06336138346167877, "000001": 0.006683258614263443, "000002": 0.011036666284803718, "000003": 0.013387896832249753, "000004": 0.0018538054418117465, "000005": 
0.027103747698761376, "000006": 0.6450418835540328, "000007": 0.018169273936616194, "000008": 0.21336208417578217}, "shared": false, "size": 26106, "source": "source/572b9d113bbd2130a1002c31", "source_status": true, "status": {"code": 5, "elapsed": 4444, "message": "The cluster has been created", "progress": 1.0}, "subscription": true, "summary_fields": [], "tags": [], "updated": "2017-06-29T00:07:32.983000", "white_box": false} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>t-SNE Layout of Diabetes Clusters</title>
<style>
/* Panel to the right of the chart listing the hovered cluster's centroid. */
#terms {
  position: absolute;
  top: 50px;
  left: 700px;
  display: flex;
  flex-direction: column;
}
</style>
<!-- Explicit https so the page also works when opened from file:// -->
<script src="https://d3js.org/d3.v3.min.js" charset="utf-8"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/seedrandom/2.4.0/seedrandom.min.js"></script>
<script src="cluster-dist.js" charset="utf-8"></script>
<script src="tsne.js" charset="utf-8"></script>
</head>
<body>
<svg width="850" height="500"></svg>
<div id="terms"></div>
<script> | |
// Pixel size of the drawing area; matches the <svg> element's attributes.
var width = 850;
var height = 500;

// Load the cluster resource, lay the clusters out with t-SNE and draw one
// circle per cluster. Hovering a circle lists that cluster's centroid.
d3.json("clustered-diabetes.json", function (error, resource) {
  if (error) {
    console.error("Unable to load clustered-diabetes.json", error);
    return;
  }

  const fields = resource.clusters.fields;
  // tsneLayout -> distMatrix -> clusterDistFn converts object-valued center
  // fields into Sets in place, which is why the hover handler below uses
  // Array.from on them.
  let coords = tsneLayout(resource, "defaultseed");
  const clusters = resource.clusters.clusters;
  const clusterCount = Object.keys(clusters).length;
  // The bound data is simply the list of cluster indices 0..count-1.
  const range = d3.range(clusterCount);

  // scale the tsne coordinates relative to the svg size
  coords = scaleDimension(width, width / 10, 0, coords);
  coords = scaleDimension(height, height / 10, 1, coords);

  const terms = d3.select("#terms");

  // Colour derived from the cluster's position: x and y are mapped onto the
  // a and b axes of Lab space (-100..100); lightness is chosen by the caller.
  const colorFn = function (i, lightness) {
    return d3.lab(lightness,
                  (200 * coords[i][0] / width) - 100,
                  (200 * coords[i][1] / height) - 100);
  };

  const svg = d3.select("svg");
  svg.style("width", width + "px");
  svg.style("height", height + "px");

  svg.selectAll(".dot").data(range).enter()
    .append("circle")
    .attr("class", "dot")
    .attr("r", function (i) {
      // circle area is proportional to the cluster's share of all rows
      const area = clusters[i].count / resource.rows;
      return Math.sqrt(area) * 60;
    })
    .on("mouseenter", function (i) {
      terms.selectAll(".term").remove();
      const cluster = clusters[i];
      Object.keys(cluster.center).forEach(function (k) {
        const value = cluster.center[k];
        let label = fields[k].name;
        let shown = value;
        if (typeof value === 'object') {
          label += "(tokens)";
          shown = Array.from(value);
        }
        terms.append("div")
          .attr("class", "term")
          .text(label + " : " + shown);
      });
      terms.append("div").attr("class", "term")
        .text("Cluster Size : " + cluster.count);
    })
    .on("mouseleave", function () {
      terms.selectAll(".term").remove();
    })
    .attr("cx", function (i) { return coords[i][0]; })
    .attr("cy", function (i) { return coords[i][1]; })
    .style("fill", function (i) { return colorFn(i, 80); })
    .style("stroke", function (i) { return colorFn(i, 65); });
});
/**
 * Rescales one coordinate of every point, in place, so that the values span
 * [buffer, dimSize - buffer].
 *
 * When every point shares the same value in that dimension (including the
 * single-point case) there is no range to stretch, so the points are placed
 * at the middle of the target span instead of dividing by zero. An empty
 * list is returned untouched.
 *
 * @param {number} dimSize - full size of the target axis.
 * @param {number} buffer - margin kept free at both ends of the axis.
 * @param {number} dimIndex - which coordinate of each point to rescale.
 * @param {Array<Array<number>>} coords - points; modified in place.
 * @returns {Array<Array<number>>} the same `coords` array.
 */
function scaleDimension(dimSize, buffer, dimIndex, coords) {
  if (coords.length === 0) {
    return coords;
  }

  let minDim = coords[0][dimIndex];
  let maxDim = coords[0][dimIndex];
  for (let i = 1; i < coords.length; i++) {
    minDim = Math.min(minDim, coords[i][dimIndex]);
    maxDim = Math.max(maxDim, coords[i][dimIndex]);
  }

  const span = maxDim - minDim;
  const usable = dimSize - 2 * buffer;
  for (let i = 0; i < coords.length; i++) {
    const fraction = span === 0 ? 0.5 : (coords[i][dimIndex] - minDim) / span;
    coords[i][dimIndex] = (fraction * usable) + buffer;
  }
  return coords;
}
/**
 * Computes a t-SNE embedding of the clusters in a BigML cluster resource.
 *
 * The random generator is seeded first (via seedrandom's Math.seedrandom) so
 * the same seed yields the same layout. Perplexity is one tenth of the
 * cluster count, rounded, but never below 2.
 *
 * @param {Object} resource - BigML cluster resource.
 * @param {string} seed - seed handed to Math.seedrandom.
 * @param {number} [iterations=300] - number of optimisation steps to run.
 * @returns {Array<Array<number>>} the solution reported by the t-SNE object,
 *     one coordinate list per cluster.
 */
function tsneLayout(resource, seed, iterations = 300) {
  Math.seedrandom(seed);
  // Object.keys keeps the count valid whether clusters is an array or a map,
  // matching how the rest of the code counts clusters.
  const clusterCount = Object.keys(resource.clusters.clusters).length;
  const perplexity = Math.max(Math.round(clusterCount / 10), 2);

  const tsne = new tsnejs.tSNE({epsilon: 10, perplexity: perplexity});
  tsne.initDataDist(distMatrix(resource));
  for (let k = 0; k < iterations; k++) {
    tsne.step();
  }
  return tsne.getSolution();
}
</script> | |
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// from https://github.com/karpathy/tsnejs
// create main global object (reused if an earlier script already defined it)
var tsnejs = tsnejs || { REVISION: 'ALPHA' };
(function(global) { | |
"use strict"; | |
// utility function: throws an Error carrying `message` (or a generic text)
// when `condition` is falsy. An Error object is thrown rather than a bare
// string so that a stack trace is available to whoever catches it.
var assert = function(condition, message) {
  if (!condition) { throw new Error(message || "Assertion failed"); }
};
// syntax sugar: returns opt[field] when `opt` has `field` as an own property,
// otherwise `defaultval`. hasOwnProperty is borrowed from Object.prototype so
// option objects without a prototype (Object.create(null)) work too.
var getopt = function(opt, field, defaultval) {
  if (Object.prototype.hasOwnProperty.call(opt, field)) {
    return opt[field];
  }
  return defaultval;
};
// return 0 mean unit standard deviation random number
//
// Each accepted (u, v) pair drawn from Math.random yields two values: u*c is
// returned immediately and v*c is cached in v_val (flagged by return_v) and
// handed out by the next call. Pairs with u*u + v*v equal to 0 or greater
// than 1 are rejected and redrawn.
var return_v = false;
var v_val = 0.0;
var gaussRandom = function() {
  if (return_v) {
    return_v = false;
    return v_val;
  }
  var u, v, r;
  do {
    u = 2 * Math.random() - 1;
    v = 2 * Math.random() - 1;
    r = u * u + v * v;
  } while (r === 0 || r > 1);
  var c = Math.sqrt(-2 * Math.log(r) / r);
  v_val = v * c; // cache this for next function call for efficiency
  return_v = true;
  return u * c;
};
// return random normal number: a gaussRandom() draw scaled by `std` and
// shifted by `mu`
var randn = function(mu, std) {
  return mu + gaussRandom() * std;
};
// utility that creates contiguous vector of zeros of size n
// Returns an empty plain array when n is undefined or NaN; otherwise a
// Float64Array, or a zero-filled plain array where ArrayBuffer is unavailable.
var zeros = function(n) {
  if (typeof(n) === 'undefined' || isNaN(n)) { return []; }
  if (typeof ArrayBuffer !== 'undefined') {
    return new Float64Array(n); // typed arrays are faster
  }
  // lacking browser support
  var arr = new Array(n);
  for (var i = 0; i < n; i++) { arr[i] = 0; }
  return arr;
};
// utility that returns 2d array (n rows of d entries) filled with random
// numbers drawn via randn(0.0, 1e-4), or with value s, if provided
var randn2d = function(n, d, s) {
  var useFill = typeof s !== 'undefined';
  var rows = [];
  for (var i = 0; i < n; i++) {
    var row = [];
    for (var j = 0; j < d; j++) {
      row.push(useFill ? s : randn(0.0, 1e-4));
    }
    rows.push(row);
  }
  return rows;
};
// compute the SQUARED L2 distance between two vectors: the sum of squared
// coordinate differences, with no square root taken. Only the first
// x1.length coordinates are compared.
var L2 = function(x1, x2) {
  var D = x1.length;
  var total = 0;
  for (var i = 0; i < D; i++) {
    var delta = x1[i] - x2[i];
    total += delta * delta;
  }
  return total;
};
// compute pairwise distance in all vectors in X
// Returns a flat, row-major N*N array (entry i*N+j) of the values produced by
// L2, i.e. squared distances. Each pair is computed once and mirrored; the
// diagonal keeps the zeros the array was created with.
var xtod = function(X) {
  var N = X.length;
  var dist = zeros(N * N); // allocate contiguous array
  for (var i = 0; i < N; i++) {
    for (var j = i + 1; j < N; j++) {
      var d = L2(X[i], X[j]);
      dist[i * N + j] = d;
      dist[j * N + i] = d;
    }
  }
  return dist;
};
// compute (p_{i|j} + p_{j|i})/(2n)
//
// D is a flat, row-major N*N array of pairwise distances (as produced by xtod
// or by initDataDist). For each row i a precision `beta` is searched so that
// the entropy of the row distribution exp(-D[i][j] * beta), normalised over
// j != i, matches log(perplexity) to within `tol`; the search gives up after
// 50 tries. The rows are then symmetrised, divided by 2N and floored at
// 1e-100. Returns a flat N*N array laid out like D.
var d2p = function(D, perplexity, tol) {
  var Nf = Math.sqrt(D.length); // this better be an integer
  var N = Math.floor(Nf);
  assert(N === Nf, "D should have square number of elements.");
  var Htarget = Math.log(perplexity); // target entropy of distribution
  var P = zeros(N * N); // temporary probability matrix
  var prow = zeros(N); // a temporary storage compartment
  var i, j, pj;

  for (i = 0; i < N; i++) {
    var betamin = -Infinity;
    var betamax = Infinity;
    var beta = 1; // initial value of precision
    var done = false;
    var maxtries = 50;

    // perform binary search to find a suitable precision beta
    // so that the entropy of the distribution is appropriate
    var num = 0;
    while (!done) {
      // compute entropy and kernel row with beta precision
      var psum = 0.0;
      for (j = 0; j < N; j++) {
        pj = Math.exp(- D[i * N + j] * beta);
        if (i === j) { pj = 0; } // we dont care about diagonals
        prow[j] = pj;
        psum += pj;
      }

      // normalize p and compute entropy
      var Hhere = 0.0;
      for (j = 0; j < N; j++) {
        pj = prow[j] / psum;
        prow[j] = pj;
        if (pj > 1e-7) Hhere -= pj * Math.log(pj);
      }

      // adjust beta based on result
      if (Hhere > Htarget) {
        // entropy was too high (distribution too diffuse)
        // so we need to increase the precision for more peaky distribution
        betamin = beta; // move up the bounds
        if (betamax === Infinity) { beta = beta * 2; }
        else { beta = (beta + betamax) / 2; }
      } else {
        // converse case. make distribution less peaky
        betamax = beta;
        if (betamin === -Infinity) { beta = beta / 2; }
        else { beta = (beta + betamin) / 2; }
      }

      // stopping conditions: too many tries or got a good precision
      num++;
      if (Math.abs(Hhere - Htarget) < tol) { done = true; }
      if (num >= maxtries) { done = true; }
    }

    // copy over the final prow to P at row i
    for (j = 0; j < N; j++) { P[i * N + j] = prow[j]; }
  } // end loop over examples i

  // symmetrize P and normalize it to sum to 1 over all ij
  var Pout = zeros(N * N);
  var N2 = N * 2;
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++) {
      Pout[i * N + j] = Math.max((P[i * N + j] + P[j * N + i]) / N2, 1e-100);
    }
  }
  return Pout;
};
// helper function: 1 for positive x, -1 for negative x, 0 otherwise
function sign(x) {
  if (x > 0) { return 1; }
  if (x < 0) { return -1; }
  return 0;
}
// t-SNE state object. `opt` is optional; recognised own properties are
// `perplexity` (default 30), `dim` (default 2) and `epsilon` (default 10).
var tSNE = function(opt) {
  opt = opt || {};
  this.perplexity = getopt(opt, "perplexity", 30); // effective number of nearest neighbors
  this.dim = getopt(opt, "dim", 2); // by default 2-D tSNE
  this.epsilon = getopt(opt, "epsilon", 10); // learning rate
  this.iter = 0;
};
tSNE.prototype = { | |
// this function takes a set of high-dimensional points | |
// and creates matrix P from them using gaussian kernel | |
initDataRaw: function(X) { | |
var N = X.length; | |
var D = X[0].length; | |
assert(N > 0, " X is empty? You must have some data!"); | |
assert(D > 0, " X[0] is empty? Where is the data?"); | |
var dists = xtod(X); // convert X to distances using gaussian kernel | |
this.P = d2p(dists, this.perplexity, 1e-4); // attach to object | |
this.N = N; // back up the size of the dataset | |
this.initSolution(); // refresh this | |
}, | |
// this function takes a given distance matrix and creates | |
// matrix P from them. | |
// D is assumed to be provided as a list of lists, and should be symmetric | |
initDataDist: function(D) { | |
var N = D.length; | |
assert(N > 0, " X is empty? You must have some data!"); | |
// convert D to a (fast) typed array version | |
var dists = zeros(N * N); // allocate contiguous array | |
for(var i=0;i<N;i++) { | |
for(var j=i+1;j<N;j++) { | |
var d = D[i][j]; | |
dists[i*N+j] = d; | |
dists[j*N+i] = d; | |
} | |
} | |
this.P = d2p(dists, this.perplexity, 1e-4); | |
this.N = N; | |
this.initSolution(); // refresh this | |
}, | |
// (re)initializes the solution to random | |
initSolution: function() { | |
// generate random solution to t-SNE | |
this.Y = randn2d(this.N, this.dim); // the solution | |
this.gains = randn2d(this.N, this.dim, 1.0); // step gains to accelerate progress in unchanging directions | |
this.ystep = randn2d(this.N, this.dim, 0.0); // momentum accumulator | |
this.iter = 0; | |
}, | |
// return pointer to current solution | |
getSolution: function() { | |
return this.Y; | |
}, | |
// perform a single step of optimization to improve the embedding | |
step: function() { | |
this.iter += 1; | |
var N = this.N; | |
var cg = this.costGrad(this.Y); // evaluate gradient | |
var cost = cg.cost; | |
var grad = cg.grad; | |
// perform gradient step | |
var ymean = zeros(this.dim); | |
for(var i=0;i<N;i++) { | |
for(var d=0;d<this.dim;d++) { | |
var gid = grad[i][d]; | |
var sid = this.ystep[i][d]; | |
var gainid = this.gains[i][d]; | |
// compute gain update | |
var newgain = sign(gid) === sign(sid) ? gainid * 0.8 : gainid + 0.2; | |
if(newgain < 0.01) newgain = 0.01; // clamp | |
this.gains[i][d] = newgain; // store for next turn | |
// compute momentum step direction | |
var momval = this.iter < 250 ? 0.5 : 0.8; | |
var newsid = momval * sid - this.epsilon * newgain * grad[i][d]; | |
this.ystep[i][d] = newsid; // remember the step we took | |
// step! | |
this.Y[i][d] += newsid; | |
ymean[d] += this.Y[i][d]; // accumulate mean so that we can center later | |
} | |
} | |
// reproject Y to be zero mean | |
for(var i=0;i<N;i++) { | |
for(var d=0;d<this.dim;d++) { | |
this.Y[i][d] -= ymean[d]/N; | |
} | |
} | |
//if(this.iter%100===0) console.log('iter ' + this.iter + ', cost: ' + cost); | |
return cost; // return current cost | |
}, | |
// for debugging: gradient check | |
debugGrad: function() { | |
var N = this.N; | |
var cg = this.costGrad(this.Y); // evaluate gradient | |
var cost = cg.cost; | |
var grad = cg.grad; | |
var e = 1e-5; | |
for(var i=0;i<N;i++) { | |
for(var d=0;d<this.dim;d++) { | |
var yold = this.Y[i][d]; | |
this.Y[i][d] = yold + e; | |
var cg0 = this.costGrad(this.Y); | |
this.Y[i][d] = yold - e; | |
var cg1 = this.costGrad(this.Y); | |
var analytic = grad[i][d]; | |
var numerical = (cg0.cost - cg1.cost) / ( 2 * e ); | |
console.log(i + ',' + d + ': gradcheck analytic: ' + analytic + ' vs. numerical: ' + numerical); | |
this.Y[i][d] = yold; | |
} | |
} | |
}, | |
// return cost and gradient, given an arrangement | |
costGrad: function(Y) { | |
var N = this.N; | |
var dim = this.dim; // dim of output space | |
var P = this.P; | |
var pmul = this.iter < 100 ? 4 : 1; // trick that helps with local optima | |
// compute current Q distribution, unnormalized first | |
var Qu = zeros(N * N); | |
var qsum = 0.0; | |
for(var i=0;i<N;i++) { | |
for(var j=i+1;j<N;j++) { | |
var dsum = 0.0; | |
for(var d=0;d<dim;d++) { | |
var dhere = Y[i][d] - Y[j][d]; | |
dsum += dhere * dhere; | |
} | |
var qu = 1.0 / (1.0 + dsum); // Student t-distribution | |
Qu[i*N+j] = qu; | |
Qu[j*N+i] = qu; | |
qsum += 2 * qu; | |
} | |
} | |
// normalize Q distribution to sum to 1 | |
var NN = N*N; | |
var Q = zeros(NN); | |
for(var q=0;q<NN;q++) { Q[q] = Math.max(Qu[q] / qsum, 1e-100); } | |
var cost = 0.0; | |
var grad = []; | |
for(var i=0;i<N;i++) { | |
var gsum = new Array(dim); // init grad for point i | |
for(var d=0;d<dim;d++) { gsum[d] = 0.0; } | |
for(var j=0;j<N;j++) { | |
cost += - P[i*N+j] * Math.log(Q[i*N+j]); // accumulate cost (the non-constant portion at least...) | |
var premult = 4 * (pmul * P[i*N+j] - Q[i*N+j]) * Qu[i*N+j]; | |
for(var d=0;d<dim;d++) { | |
gsum[d] += premult * (Y[i][d] - Y[j][d]); | |
} | |
} | |
grad.push(gsum); | |
} | |
return {cost: cost, grad: grad}; | |
} | |
} | |
global.tSNE = tSNE; // export tSNE class | |
})(tsnejs); | |
// export the library: assign it to module.exports when a CommonJS-style
// `module.exports` exists (nodejs), otherwise attach it to window (browser)
(function(lib) {
  "use strict";
  var hasModuleExports = typeof module !== "undefined" && typeof module.exports !== "undefined";
  if (hasModuleExports) {
    module.exports = lib; // in nodejs
  } else {
    window.tsnejs = lib; // in ordinary browser attach library to window
  }
})(tsnejs);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment