Skip to content

Instantly share code, notes, and snippets.

@ashenfad
Created June 29, 2017 01:51
Show Gist options
  • Save ashenfad/54726a70080f92de3e55e8c4866bd990 to your computer and use it in GitHub Desktop.
Save ashenfad/54726a70080f92de3e55e8c4866bd990 to your computer and use it in GitHub Desktop.
t-SNE Layout of Diabetes Clusters
// Returns a brand-new Set containing every element of this set followed by
// every element yielded by setB. Neither input is modified.
// Note: this assignment replaces whatever Set.prototype.union existed before.
Set.prototype.union = function(setB) {
  return new Set([...this, ...setB]);
}
// Returns a new Set with the elements of setB that are also members of this
// set. Elements appear in the order setB yields them.
// Note: this assignment replaces whatever Set.prototype.intersection existed before.
Set.prototype.intersection = function(setB) {
  const shared = [...setB].filter((candidate) => this.has(candidate));
  return new Set(shared);
}
// Distance between two term sets: 1 - |A ∩ B| / sqrt(|A| * |B|).
// Two empty sets are at distance 0; exactly one empty set gives distance 1.
// Relies on the Set.prototype.intersection method being available.
function termDist(set1, set2) {
  const sizeA = set1.size;
  const sizeB = set2.size;
  if (sizeA === 0 || sizeB === 0) {
    return sizeA === 0 && sizeB === 0 ? 0 : 1;
  }
  const sharedCount = set1.intersection(set2).size;
  return 1 - sharedCount / Math.sqrt(sizeA * sizeB);
}
/*
Given a BigML cluster resource, will return a function that computes
the distance between two clusters (referenced by their index).

Side effect: every object-valued entry of each cluster center inside
`resource` is replaced, in place, by a Set built from it.

The returned function(a, b) walks the fields of cluster a's center and
returns sqrt(sum((scale[field] * diff[field])^2)), where diff is
  - the plain difference for numbers,
  - 0 when equal / 1 when different for strings,
  - termDist() for Set (object) values.
Fields whose center value has any other type contribute nothing.
*/
function clusterDistFn(resource) {
  const clusters = resource.clusters.clusters;
  // Object.keys keeps this working whether the clusters are stored in an
  // array or in an index-keyed object.
  for (const index of Object.keys(clusters)) {
    const center = clusters[index].center;
    for (const field of Object.keys(center)) {
      if (typeof center[field] === 'object') {
        center[field] = new Set(center[field]);
      }
    }
  }
  const scales = resource.scales;
  return function (a, b) {
    const centerA = clusters[a].center;
    const centerB = clusters[b].center;
    let sum = 0;
    for (const field of Object.keys(centerA)) {
      const aVal = centerA[field];
      const bVal = centerB[field];
      // Declared per field so an unsupported type can never reuse the
      // previous field's difference.
      let diff;
      switch (typeof aVal) {
        case 'number':
          diff = aVal - bVal;
          break;
        case 'string':
          diff = aVal === bVal ? 0 : 1;
          break;
        case 'object':
          diff = termDist(aVal, bVal);
          break;
        default:
          continue;
      }
      diff *= scales[field];
      sum += diff * diff;
    }
    return Math.sqrt(sum);
  };
}
/*
Given a BigML cluster resource, returns a distance matrix for the
clusters: a square array of arrays with zeros on the diagonal. Each
pair is measured once (row < column) through clusterDistFn and the
value is copied to the mirrored cell. The matrix is also logged to
the console.
*/
function distMatrix (resource) {
  const size = Object.keys(resource.clusters.clusters).length;
  const distance = clusterDistFn(resource);
  const rows = [];
  for (let row = 0; row < size; row += 1) {
    const cells = new Array(size);
    for (let col = 0; col < size; col += 1) {
      if (col < row) {
        // already computed while filling the earlier row
        cells[col] = rows[col][row];
      } else if (col === row) {
        cells[col] = 0;
      } else {
        cells[col] = distance(row, col);
      }
    }
    rows.push(cells);
  }
  console.log(rows);
  return rows;
}
{"balance_fields": true, "category": 0, "cluster_datasets": {}, "cluster_models": {}, "cluster_seed": "2c249dda00fbf54ab4cdd850532a584f286af5b6", "clusters": {"between_ss": 137.62817, "clusters": [{"center": {"000000": 2.66778, "000001": 159.08954, "000002": 74.08703, "000003": 33.30293, "000004": 293.04351, "000005": 37.22996, "000006": 0.564, "000007": 31.21674, "000008": "true"}, "count": 83, "distance": {"bins": [[0.18907, 1], [0.24593, 1], [0.26674, 1], [0.28445, 1], [0.29535, 2], [0.32734, 2], [0.34476, 6], [0.36607, 3], [0.38403, 3], [0.39602, 3], [0.40817, 2], [0.42088, 1], [0.44745, 8], [0.46526, 7], [0.48219, 4], [0.49548, 2], [0.50671, 5], [0.5266, 4], [0.56296, 4], [0.58193, 4], [0.60828, 4], [0.63979, 3], [0.66522, 1], [0.68245, 1], [0.72961, 2], [0.75079, 2], [0.77039, 1], [0.84335, 1], [0.90445, 1], [1.09559, 1], [1.20363, 1], [1.45601, 1]], "exact_histogram": {"populations": [1, 1, 4, 6, 11, 9, 15, 9, 8, 7, 2, 3, 2, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1], "start": 0.15, "width": 0.05}, "maximum": 1.45601, "mean": 0.5167, "median": 0.47881, "minimum": 0.18907, "population": 83, "standard_deviation": 0.19969, "sum": 42.88574, "sum_squares": 25.42885, "variance": 0.03988}, "id": "000000", "name": "Cluster 0"}, {"center": {"000000": 8.41857, "000001": 137.35357, "000002": 78.96286, "000003": 33.21143, "000004": 100.44143, "000005": 34.20136, "000006": 0.54056, "000007": 45.43643, "000008": "true"}, "count": 100, "distance": {"bins": [[0.23454, 3], [0.24514, 1], [0.28325, 2], [0.30894, 4], [0.3207, 2], [0.33297, 4], [0.35577, 3], [0.36589, 4], [0.37488, 1], [0.38499, 10], [0.4062, 4], [0.41947, 3], [0.43153, 7], [0.45085, 5], [0.46687, 4], [0.48319, 8], [0.49504, 1], [0.50637, 4], [0.52323, 1], [0.5329, 2], [0.55994, 2], [0.56944, 4], [0.58354, 5], [0.59458, 2], [0.60541, 2], [0.61543, 4], [0.63655, 2], [0.64533, 1], [0.67242, 2], [0.69382, 1], [0.72585, 1], [1.11165, 1]], "exact_histogram": {"populations": [4, 2, 10, 18, 17, 15, 7, 13, 9, 3, 1, 0, 0, 
0, 0, 0, 0, 0, 1], "start": 0.2, "width": 0.05}, "maximum": 1.11165, "mean": 0.46452, "median": 0.44912, "minimum": 0.23352, "population": 100, "standard_deviation": 0.13077, "sum": 46.4523, "sum_squares": 23.27123, "variance": 0.0171}, "id": "000001", "name": "Cluster 1"}, {"center": {"000000": 6.24365, "000001": 129.76257, "000002": 77.87351, "000003": 2.08657, "000004": 3.78175, "000005": 31.3212, "000006": 0.37859, "000007": 45.82167, "000008": "false"}, "count": 138, "distance": {"bins": [[0.20071, 2], [0.22872, 1], [0.23959, 1], [0.25527, 3], [0.27942, 1], [0.2916, 5], [0.30281, 4], [0.31483, 3], [0.34398, 7], [0.36323, 11], [0.38208, 3], [0.39464, 13], [0.40817, 2], [0.42063, 10], [0.43821, 5], [0.4675, 13], [0.49717, 5], [0.51423, 7], [0.53103, 7], [0.55152, 3], [0.56814, 6], [0.59638, 3], [0.6175, 10], [0.6486, 2], [0.66261, 1], [0.70003, 2], [0.72111, 1], [0.75462, 3], [0.76538, 1], [0.83764, 1], [0.92275, 1], [0.96815, 1]], "exact_histogram": {"populations": [1, 3, 10, 12, 26, 19, 16, 17, 11, 11, 3, 2, 4, 1, 0, 1, 1], "start": 0.15, "width": 0.05}, "maximum": 0.96815, "mean": 0.46499, "median": 0.43792, "minimum": 0.19684, "population": 138, "standard_deviation": 0.14081, "sum": 64.1688, "sum_squares": 32.55438, "variance": 0.01983}, "id": "000002", "name": "Cluster 2"}, {"center": {"000000": 2.26935, "000001": 106.10988, "000002": 70.76913, "000003": 33.39376, "000004": 81.75246, "000005": 36.05887, "000006": 0.39822, "000007": 27.47969, "000008": "false"}, "count": 165, "distance": {"bins": [[0.12604, 1], [0.15268, 1], [0.17683, 2], [0.20909, 9], [0.23226, 10], [0.24849, 15], [0.27213, 12], [0.29241, 10], [0.30902, 7], [0.32156, 11], [0.33547, 8], [0.3485, 5], [0.36254, 8], [0.37588, 9], [0.39201, 12], [0.40556, 6], [0.41987, 7], [0.43444, 3], [0.4519, 4], [0.47719, 3], [0.49619, 3], [0.50988, 3], [0.52571, 3], [0.5413, 3], [0.59302, 2], [0.60758, 1], [0.63971, 1], [0.65683, 1], [0.74412, 2], [0.81305, 1], [0.86617, 1], [1.01229, 1]], 
"exact_histogram": {"populations": [1, 3, 30, 26, 30, 30, 18, 8, 9, 2, 2, 1, 2, 0, 1, 1, 0, 0, 1], "start": 0.1, "width": 0.05}, "maximum": 1.01229, "mean": 0.35922, "median": 0.3362, "minimum": 0.12604, "population": 165, "standard_deviation": 0.13214, "sum": 59.27181, "sum_squares": 24.15551, "variance": 0.01746}, "id": "000003", "name": "Cluster 3"}, {"center": {"000000": 2.31424, "000001": 116.43465, "000002": 67.97651, "000003": 24.30103, "000004": 84.22173, "000005": 32.12937, "000006": 1.21622, "000007": 30.07195, "000008": "false"}, "count": 53, "distance": {"bins": [[0.2136, 1], [0.23012, 1], [0.31117, 3], [0.3323, 1], [0.33955, 2], [0.3643, 1], [0.37003, 1], [0.37721, 1], [0.38853, 4], [0.39595, 6], [0.40873, 1], [0.42737, 1], [0.43413, 2], [0.44055, 3], [0.45309, 2], [0.46676, 1], [0.47587, 4], [0.49615, 1], [0.50402, 1], [0.51232, 1], [0.52482, 2], [0.54835, 1], [0.56482, 1], [0.57462, 2], [0.61825, 1], [0.67822, 2], [0.75612, 1], [0.76271, 1], [0.7867, 1], [0.89168, 1], [0.959, 1], [1.30494, 1]], "exact_histogram": {"populations": [2, 0, 6, 13, 7, 8, 5, 3, 1, 2, 0, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1], "start": 0.2, "width": 0.05}, "maximum": 1.30494, "mean": 0.49043, "median": 0.44134, "minimum": 0.2136, "population": 53, "standard_deviation": 0.18952, "sum": 25.9928, "sum_squares": 14.61544, "variance": 0.03592}, "id": "000004", "name": "Cluster 4"}, {"center": {"000000": 2.08876, "000001": 103.80473, "000002": 67.14682, "000003": 12.87907, "000004": 41.0429, "000005": 26.62245, "000006": 0.35307, "000007": 25.14904, "000008": "false"}, "count": 193, "distance": {"bins": [[0.16476, 1], [0.18193, 4], [0.19549, 8], [0.20648, 1], [0.21661, 6], [0.23332, 13], [0.24713, 16], [0.25971, 4], [0.2695, 10], [0.28379, 20], [0.30181, 14], [0.31864, 16], [0.33221, 9], [0.34795, 10], [0.36433, 11], [0.37822, 7], [0.38753, 5], [0.40343, 7], [0.41678, 5], [0.4303, 4], [0.44382, 2], [0.45727, 2], [0.47276, 5], [0.51307, 2], [0.52655, 3], [0.545, 2], [0.5733, 1], 
[0.59883, 1], [0.65773, 1], [0.71396, 1], [0.75529, 1], [0.77176, 1]], "exact_histogram": {"populations": [2, 11, 6, 14, 18, 17, 21, 18, 15, 12, 14, 8, 11, 4, 4, 5, 0, 2, 3, 2, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1], "start": 0.16, "width": 0.02}, "maximum": 0.77176, "mean": 0.32802, "median": 0.30679, "minimum": 0.16476, "population": 193, "standard_deviation": 0.10352, "sum": 63.30696, "sum_squares": 22.82329, "variance": 0.01072}, "id": "000005", "name": "Cluster 5"}, {"center": {"000000": 3.55556, "000001": 117, "000002": 0.66667, "000003": 2, "000004": 0.69444, "000005": 25.76389, "000006": 0.39317, "000007": 30.44444, "000008": "false"}, "count": 36, "distance": {"bins": [[0.20446, 1], [0.23972, 1], [0.25833, 1], [0.26807, 1], [0.2777, 1], [0.29397, 1], [0.31328, 1], [0.32791, 1], [0.34103, 1], [0.35249, 1], [0.38296, 1], [0.40602, 1], [0.412, 1], [0.41987, 2], [0.45393, 1], [0.47468, 1], [0.51296, 1], [0.5202, 1], [0.52956, 1], [0.57105, 2], [0.58459, 2], [0.63655, 1], [0.71946, 1], [0.73256, 1], [0.74793, 1], [0.76154, 2], [0.77091, 1], [0.79969, 1], [0.82993, 1], [0.84152, 1], [0.87804, 1], [0.91919, 1]], "exact_histogram": {"populations": [2, 4, 3, 2, 4, 2, 3, 4, 1, 0, 3, 4, 2, 1, 1], "start": 0.2, "width": 0.05}, "maximum": 0.91919, "mean": 0.53109, "median": 0.51658, "minimum": 0.20446, "population": 36, "standard_deviation": 0.20936, "sum": 19.11915, "sum_squares": 11.68805, "variance": 0.04383}, "id": "000006", "name": "Cluster 6"}], "fields": {"000000": {"column_number": 0, "datatype": "int8", "name": "pregnancies", "optype": "numeric", "order": 0, "preferred": true, "summary": {"counts": [[0, 111], [1, 135], [2, 103], [3, 75], [4, 68], [5, 57], [6, 50], [7, 45], [8, 38], [9, 28], [10, 24], [11, 11], [12, 9], [13, 10], [14, 2], [15, 1], [17, 1]], "kurtosis": 0.15038, "maximum": 17, "mean": 3.84505, "median": 3, "minimum": 0, "missing_count": 0, "population": 768, "skewness": 0.89991, "standard_deviation": 3.36958, "sum": 2953, "sum_squares": 20063, 
"variance": 11.35406}}, "000001": {"column_number": 1, "datatype": "int16", "name": "plasma glucose", "optype": "numeric", "order": 1, "preferred": true, "summary": {"bins": [[0, 5], [44, 1], [56.66667, 3], [61.5, 2], [67.2, 5], [73.3125, 16], [79.79167, 24], [84.26923, 26], [87.95455, 22], [91.33333, 36], [95.7, 40], [100.97183, 71], [107.21739, 69], [113.18333, 60], [118.5641, 39], [123.93939, 66], [129.5, 42], [133.93333, 15], [137.82353, 34], [142.65217, 23], [146.5, 26], [151.33333, 15], [154.78571, 14], [158, 12], [161.84615, 13], [166.22222, 18], [172.5625, 16], [176.33333, 3], [180, 17], [183.5, 6], [188.23077, 13], [195.6875, 16]], "kurtosis": 0.62881, "maximum": 199, "mean": 120.89453, "median": 116.8391, "minimum": 0, "missing_count": 0, "population": 768, "skewness": 0.17341, "splits": [73.40028, 80.70976, 84.56046, 88.29697, 91.0653, 94.02324, 96.37393, 99.15167, 100.81422, 102.99865, 105.47912, 107.37394, 109.45106, 111.68975, 114.22117, 116.8391, 119.5, 122.06267, 124.31608, 126.65592, 129.19127, 132.64738, 136.8101, 140.43237, 144.72673, 148.62092, 155.01649, 161.85305, 168.92, 178.9276, 187.59806], "standard_deviation": 31.97262, "sum": 92847, "sum_squares": 12008759, "variance": 1022.24831}}, "000002": {"column_number": 2, "datatype": "int8", "name": "blood pressure", "optype": "numeric", "order": 2, "preferred": true, "summary": {"bins": [[0, 35], [24, 1], [30, 2], [39, 2], [44.66667, 6], [49.44444, 18], [52, 11], [55.04, 25], [58, 21], [60.95833, 72], [64.8375, 80], [68, 45], [70, 57], [72, 44], [74.86869, 99], [78, 45], [80, 40], [82, 30], [84.96, 50], [88, 25], [90, 22], [92, 8], [94.81818, 11], [98, 3], [100, 3], [102, 1], [104, 2], [106, 3], [108, 2], [110, 3], [114, 1], [122, 1]], "kurtosis": 5.13869, "maximum": 122, "mean": 69.10547, "median": 72, "minimum": 0, "missing_count": 0, "population": 768, "skewness": -1.84001, "splits": [4.95379, 47.79361, 52.45455, 56.31476, 58.78684, 60.12982, 61.72909, 62.88889, 64.03541, 65.06458, 66.38178, 
67.65685, 68.70061, 69.62414, 70.48722, 71.43285, 72.48913, 73.49, 74.23488, 75.36578, 76.27906, 77.43376, 78.51858, 79.65375, 80.9007, 82.4098, 84.16186, 85.97598, 88.04005, 90.09226, 94.7101], "standard_deviation": 19.35581, "sum": 53073, "sum_squares": 3954989, "variance": 374.64727}}, "000003": {"column_number": 3, "datatype": "int8", "name": "triceps skin thickness", "optype": "numeric", "order": 3, "preferred": true, "summary": {"bins": [[0, 227], [7.5, 4], [10.54545, 11], [12.61111, 18], [14.7, 20], [16.7, 20], [18.47368, 38], [20.43478, 23], [22.57895, 38], [24.57143, 28], [26.58974, 39], [28.45946, 37], [30.41304, 46], [32.39216, 51], [34.65217, 23], [36.53333, 30], [38.72, 25], [40.48387, 31], [42.35294, 17], [44.54545, 11], [46, 8], [47, 4], [48, 4], [49, 3], [50, 3], [51, 1], [52, 2], [54, 2], [56, 1], [60, 1], [63, 1], [99, 1]], "kurtosis": -0.52449, "maximum": 99, "mean": 20.53646, "median": 23, "minimum": 0, "missing_count": 0, "population": 768, "skewness": 0.10916, "splits": [0.46227, 13.14087, 22.81344, 29.82784, 36.68115], "standard_deviation": 15.95222, "sum": 15772, "sum_squares": 519082, "variance": 254.47325}}, "000004": {"column_number": 4, "datatype": "int16", "name": "insulin", "optype": "numeric", "order": 4, "preferred": true, "summary": {"bins": [[0, 374], [19.33333, 9], [43.69697, 33], [60.10811, 37], [76.77778, 36], [98.36842, 57], [122.59524, 42], [141.82759, 29], [162.35714, 28], [184.35294, 34], [208.57895, 19], [232, 11], [251.4, 5], [272.7, 10], [288.5, 6], [304.66667, 3], [324.75, 8], [338.5, 2], [368.33333, 3], [393.66667, 3], [415, 1], [440, 1], [465, 1], [479.4, 5], [495, 2], [510, 1], [542.66667, 3], [579, 1], [600, 1], [680, 1], [744, 1], [846, 1]], "kurtosis": 7.15957, "maximum": 846, "mean": 79.79948, "median": 30.5, "minimum": 0, "missing_count": 0, "population": 768, "skewness": 2.26781, "splits": [0.21803, 30.5, 127.38859], "standard_deviation": 115.244, "sum": 61286, "sum_squares": 15077256, "variance": 13281.18008}}, 
"000005": {"column_number": 5, "datatype": "double", "name": "bmi", "optype": "numeric", "order": 5, "preferred": true, "summary": {"bins": [[0, 11], [18.25, 4], [19.72308, 13], [21.02222, 9], [22.15909, 22], [23.46333, 30], [24.92034, 59], [26.32222, 36], [27.59787, 47], [28.70278, 36], [29.80851, 47], [30.83333, 45], [32.30286, 70], [33.46, 40], [34.47018, 57], [35.59167, 36], [36.508, 25], [37.41724, 29], [38.34, 25], [39.46061, 33], [40.90556, 18], [42.45789, 19], [43.70952, 21], [45.42727, 11], [46.41111, 9], [48.225, 4], [49.65, 4], [52.675, 4], [55, 1], [57.3, 1], [59.4, 1], [67.1, 1]], "kurtosis": 3.26126, "maximum": 67.1, "mean": 31.99258, "median": 32.11435, "minimum": 0, "missing_count": 0, "population": 768, "skewness": -0.42814, "splits": [19.91842, 22.15909, 23.38981, 24.26256, 24.99372, 25.65688, 26.38114, 27.23053, 27.74252, 28.5307, 29.12209, 29.75821, 30.26534, 30.81337, 31.42032, 32.11435, 32.62432, 33.09397, 33.6175, 34.121, 34.60968, 35.1546, 35.78179, 36.57558, 37.39384, 38.1764, 39.1192, 40.14221, 42.1375, 43.51924, 45.98461], "standard_deviation": 7.88416, "sum": 24570.3, "sum_squares": 833743.95, "variance": 62.15998}}, "000006": {"column_number": 6, "datatype": "double", "name": "diabetes pedigree", "optype": "numeric", "order": 6, "preferred": true, "summary": {"bins": [[0.096, 16], [0.1473, 71], [0.19455, 62], [0.24832, 117], [0.29244, 52], [0.34438, 73], [0.40872, 54], [0.45431, 29], [0.50663, 40], [0.55204, 28], [0.59478, 32], [0.64421, 24], [0.68819, 32], [0.74482, 33], [0.82673, 22], [0.88471, 14], [0.94765, 17], [1.015, 5], [1.08729, 7], [1.14571, 7], [1.20238, 8], [1.2702, 5], [1.33067, 3], [1.39375, 4], [1.45933, 3], [1.6, 1], [1.70933, 3], [1.781, 1], [1.893, 1], [2.137, 1], [2.3085, 2], [2.42, 1]], "kurtosis": 5.55079, "maximum": 2.42, "mean": 0.47188, "median": 0.37281, "minimum": 0.078, "missing_count": 0, "population": 768, "skewness": 1.91616, "splits": [0.12593, 0.14529, 0.16151, 0.17954, 0.19595, 0.21304, 0.22969, 0.24253, 
0.25308, 0.26263, 0.2742, 0.28946, 0.30592, 0.32835, 0.34889, 0.37281, 0.39834, 0.42312, 0.44939, 0.48737, 0.52039, 0.55142, 0.58659, 0.62493, 0.6685, 0.70158, 0.74241, 0.82117, 0.8975, 1.03122, 1.25279], "standard_deviation": 0.33133, "sum": 362.401, "sum_squares": 255.20866, "variance": 0.10978}}, "000007": {"column_number": 7, "datatype": "int8", "name": "age", "optype": "numeric", "order": 7, "preferred": true, "summary": {"bins": [[21.53333, 135], [23.54762, 84], [25.40741, 81], [27.52239, 67], [29.42, 50], [31.4, 40], [33.45161, 31], [35.61538, 26], [37.45714, 35], [39.52, 25], [41.45, 40], [43.38095, 21], [45.46429, 28], [47.45455, 11], [49.61538, 13], [51.5, 16], [53.54545, 11], [55.42857, 7], [57.58333, 12], [59.625, 8], [61, 2], [62, 4], [63, 4], [64, 1], [65, 3], [66, 4], [67, 3], [68, 1], [69, 2], [70, 1], [72, 1], [81, 1]], "kurtosis": 0.63118, "maximum": 81, "mean": 33.24089, "median": 29, "minimum": 21, "missing_count": 0, "population": 768, "skewness": 1.12739, "splits": [21.00793, 21.49815, 21.95822, 22.45075, 23.15535, 23.91238, 24.60085, 25.28338, 26.13665, 27.12428, 28.07187, 29.08726, 30.43864, 31.93845, 33.92911, 36.36267, 38.19211, 40.54937, 42.17071, 44.82629, 48.1, 52.72302, 59.43649], "standard_deviation": 11.76023, "sum": 25529, "sum_squares": 954685, "variance": 138.30305}}, "000008": {"column_number": 8, "datatype": "string", "name": "diabetes", "optype": "categorical", "order": 8, "preferred": true, "summary": {"categories": [["false", 500], ["true", 268]], "missing_count": 0}, "term_analysis": {"enabled": true}}}, "global": {"center": {"000000": 3.84505, "000001": 120.89453, "000002": 69.10547, "000003": 20.53646, "000004": 79.79948, "000005": 31.99258, "000006": 0.47188, "000007": 33.24089, "000008": "false"}, "distance": {"bins": [[0.2208, 3], [0.25075, 5], [0.28444, 12], [0.32053, 31], [0.35402, 52], [0.39408, 58], [0.42608, 73], [0.46072, 67], [0.48888, 46], [0.52234, 73], [0.56464, 57], [0.6125, 60], [0.66062, 40], [0.69534, 21], 
[0.72872, 29], [0.76962, 21], [0.81285, 23], [0.8573, 16], [0.8964, 22], [0.93227, 15], [0.96695, 13], [1.01147, 3], [1.03983, 2], [1.09701, 5], [1.15219, 6], [1.24281, 6], [1.28332, 2], [1.32619, 3], [1.42872, 1], [1.59306, 1], [1.66191, 1], [1.81357, 1]], "exact_histogram": {"open_max": 3, "populations": [6, 14, 50, 72, 98, 103, 82, 68, 55, 44, 35, 23, 25, 25, 24, 13, 5, 3, 5, 3, 3, 5, 3, 0, 1], "start": 0.2, "width": 0.05}, "maximum": 1.81357, "mean": 0.57692, "median": 0.52217, "minimum": 0.21119, "population": 768, "standard_deviation": 0.21827, "sum": 443.07838, "sum_squares": 292.16492, "variance": 0.04764}}, "ratio_ss": 0.47106, "total_ss": 292.16492, "within_ss": 154.53675}, "code": 200, "columns": 9, "configuration": null, "configuration_status": false, "created": "2017-06-29T00:07:23.266000", "credits": 0, "credits_per_prediction": 0.0, "critical_value": 5, "dataset": "dataset/572b9d1749c4a133a2008296", "dataset_field_types": {"categorical": 1, "datetime": 0, "effective_fields": 9, "items": 0, "numeric": 8, "preferred": 9, "text": 0, "total": 9}, "dataset_status": true, "dataset_type": 0, "description": "", "excluded_fields": [], "field_scales": {}, "fields_meta": {"count": 9, "limit": 1000, "offset": 0, "query_total": 9, "total": 9}, "input_fields": ["000000", "000001", "000002", "000003", "000004", "000005", "000006", "000007", "000008"], "k": 7, "locale": "en-us", "max_columns": 9, "max_rows": 768, "model_clusters": false, "name": "diabetes dataset's cluster", "number_of_batchcentroids": 0, "number_of_centroids": 0, "number_of_public_centroids": 0, "out_of_bag": false, "price": 0.0, "private": true, "project": "project/572a709f3bbd2130a2001a6c", "range": [1, 768], "replacement": false, "resource": "cluster/595444bb49c4a1364f0032e4", "rows": 768, "sample_rate": 1.0, "scales": {"000000": 0.06336138346167877, "000001": 0.006683258614263443, "000002": 0.011036666284803718, "000003": 0.013387896832249753, "000004": 0.0018538054418117465, "000005": 
0.027103747698761376, "000006": 0.6450418835540328, "000007": 0.018169273936616194, "000008": 0.21336208417578217}, "shared": false, "size": 26106, "source": "source/572b9d113bbd2130a1002c31", "source_status": true, "status": {"code": 5, "elapsed": 4444, "message": "The cluster has been created", "progress": 1.0}, "subscription": true, "summary_fields": [], "tags": [], "updated": "2017-06-29T00:07:32.983000", "white_box": false}
<!DOCTYPE html>
<html>
<meta charset="utf-8">
<style>
#terms {
position: absolute;
top: 50px;
left: 700px;
display: flex;
flex-direction: column;
}
</style>
<head>
<script src="//d3js.org/d3.v3.min.js" charset="utf-8"></script>
<script src="//cdnjs.cloudflare.com/ajax/libs/seedrandom/2.4.0/seedrandom.min.js"></script>
<script src="cluster-dist.js" charset="utf-8"></script>
<script src="tsne.js" charset="utf-8"></script>
</head>
<body>
<svg width="850" height="500"></svg>
<div id="terms"></div>
<script>
var width = 850;
var height = 500;
var colors = d3.scale.category20();
// Load the cluster resource, lay the clusters out with t-SNE and draw one
// circle per cluster. Hovering a circle lists that cluster's center values
// and size inside #terms; leaving the circle clears the list.
d3.json("clustered-diabetes.json", function(error, resource) {
  if (error) {
    // Without this guard a failed request only shows up as a TypeError on
    // the undefined resource below.
    console.error(error);
    return;
  }
  const fields = resource.clusters.fields;
  let coords = tsneLayout(resource, "defaultseed");
  const clusters = resource.clusters.clusters;
  const clusterCount = Object.keys(clusters).length;
  // [0, 1, ..., clusterCount - 1]: each datum is a cluster index
  const range = Array.from({length: clusterCount}, function (_, i) { return i; });
  // scale the tsne coordinates relative to the svg size
  coords = scaleDimension(width, width/10, 0, coords);
  coords = scaleDimension(height, height/10, 1, coords);
  const terms = d3.select("#terms");
  // Lab color whose a/b channels follow the circle's x/y position, each
  // mapped from [0, size] onto [-100, 100].
  const colorFn = function (i, lightness) {
    return d3.lab(lightness,
                  (200 * coords[i][0] / width) - 100,
                  (200 * coords[i][1] / height) - 100);
  };
  const svg = d3.select("svg");
  // CSS lengths need a unit; a bare number is not a valid width/height value.
  svg.style("width", width + "px");
  svg.style("height", height + "px");
  svg.selectAll(".dot").data(range).enter()
    .append("circle")
    .attr("class", "dot")
    .attr("r", function (i) {
      // circle area is proportional to the cluster's share of all rows
      const area = clusters[i].count / resource.rows;
      return Math.sqrt(area) * 60;
    })
    .on("mouseenter", function (i) {
      terms.selectAll(".term").remove();
      const cluster = clusters[i];
      Object.keys(cluster.center).forEach(function (k) {
        let name = fields[k].name;
        let val;
        if (typeof cluster.center[k] === 'object') {
          name += "(tokens)";
          val = Array.from(cluster.center[k]);
        } else {
          val = cluster.center[k];
        }
        name += " : ";
        terms.append("text")
          .attr("class", "term")
          .text(name + val);
      });
      terms.append("text").attr("class", "term")
        .text("Cluster Size : " + cluster.count);
    })
    .on("mouseleave", function () {
      terms.selectAll(".term").remove();
    })
    .attr("cx", function (i) {return coords[i][0];})
    .attr("cy", function (i) {return coords[i][1];})
    .style("fill", function (i) { return colorFn(i, 80); })
    .style("stroke", function (i) { return colorFn(i, 65); });
});
// Rescales one coordinate of every point, in place, so the values span
// [buffer, dimSize - buffer].
//   dimSize  - total extent of the target dimension (e.g. svg width)
//   buffer   - margin kept free at both ends
//   dimIndex - which coordinate of each point to rescale (0 = x, 1 = y)
//   coords   - array of points (arrays of numbers); mutated and returned
// When every point has the same value in this dimension there is no range
// to stretch, so the points are placed in the middle of the target span
// instead of dividing by zero. An empty coords array is returned untouched.
function scaleDimension(dimSize, buffer, dimIndex, coords) {
  if (coords.length === 0) {
    return coords;
  }
  let minDim = coords[0][dimIndex];
  let maxDim = coords[0][dimIndex];
  for (const point of coords) {
    minDim = Math.min(minDim, point[dimIndex]);
    maxDim = Math.max(maxDim, point[dimIndex]);
  }
  const diff = maxDim - minDim;
  const dimScale = dimSize - 2 * buffer;
  for (const point of coords) {
    const fraction = diff === 0 ? 0.5 : (point[dimIndex] - minDim) / diff;
    point[dimIndex] = (fraction * dimScale) + buffer;
  }
  return coords;
}
// Runs t-SNE over the cluster distance matrix of `resource` and returns the
// solution reported by the tsnejs instance after 300 optimization steps.
// Math.seedrandom(seed) is called first. The perplexity is one tenth of the
// cluster count (rounded), but never below 2, and is logged to the console.
function tsneLayout(resource, seed) {
  Math.seedrandom(seed);
  const perplexity = Math.max(Math.round(resource.clusters.clusters.length / 10), 2);
  console.log(perplexity);
  const tsne = new tsnejs.tSNE({epsilon: 10, perplexity: perplexity});
  tsne.initDataDist(distMatrix(resource));
  let remaining = 300;
  while (remaining > 0) {
    tsne.step();
    remaining -= 1;
  }
  return tsne.getSolution();
}
</script>
</body>
</html>
// from https://github.com/karpathy/tsnejs
// create main global object
var tsnejs = tsnejs || { REVISION: 'ALPHA' };

(function(global) {
  "use strict";

  // utility function: throws an Error carrying `message` (or a default text)
  // when `condition` is falsy
  var assert = function(condition, message) {
    if (!condition) { throw new Error(message || "Assertion failed"); }
  };

  // syntax sugar: opt[field] when opt owns that property, else defaultval.
  // Goes through Object.prototype so option bags without a prototype work too.
  var getopt = function(opt, field, defaultval) {
    if (Object.prototype.hasOwnProperty.call(opt, field)) {
      return opt[field];
    }
    return defaultval;
  };

  // return 0 mean unit standard deviation random number (Marsaglia polar
  // method). Every accepted (u, v) pair yields two samples; the second one is
  // cached in v_val and handed out by the next call.
  var return_v = false;
  var v_val = 0.0;
  var gaussRandom = function() {
    if (return_v) {
      return_v = false;
      return v_val;
    }
    var u, v, r;
    do {
      // draw until the point falls inside the unit circle (origin excluded)
      u = 2 * Math.random() - 1;
      v = 2 * Math.random() - 1;
      r = u * u + v * v;
    } while (r === 0 || r > 1);
    var c = Math.sqrt(-2 * Math.log(r) / r);
    v_val = v * c; // cache this for next function call for efficiency
    return_v = true;
    return u * c;
  };

  // return random normal number with mean mu and standard deviation std
  var randn = function(mu, std) { return mu + gaussRandom() * std; };

  // utility that creates contiguous vector of zeros of size n
  // (an empty array when n is undefined or NaN)
  var zeros = function(n) {
    if (typeof(n) === 'undefined' || isNaN(n)) { return []; }
    if (typeof ArrayBuffer === 'undefined') {
      // lacking browser support
      var arr = new Array(n);
      for (var i = 0; i < n; i++) { arr[i] = 0; }
      return arr;
    }
    return new Float64Array(n); // typed arrays are faster
  };

  // utility that returns an n x d array of arrays filled with random numbers
  // (mean 0, std 1e-4), or with value s, if provided
  var randn2d = function(n, d, s) {
    var uses = typeof s !== 'undefined';
    var x = [];
    for (var i = 0; i < n; i++) {
      var xhere = [];
      for (var j = 0; j < d; j++) {
        xhere.push(uses ? s : randn(0.0, 1e-4));
      }
      x.push(xhere);
    }
    return x;
  };

  // compute the SQUARED L2 distance between two vectors (no square root is
  // taken); the length of x1 decides how many components are compared
  var L2 = function(x1, x2) {
    var D = x1.length;
    var d = 0;
    for (var i = 0; i < D; i++) {
      var delta = x1[i] - x2[i];
      d += delta * delta;
    }
    return d;
  };

  // compute pairwise L2() values for all vectors in X, returned as a flat
  // row-major N*N array with a zero diagonal
  var xtod = function(X) {
    var N = X.length;
    var dist = zeros(N * N); // allocate contiguous array
    for (var i = 0; i < N; i++) {
      for (var j = i + 1; j < N; j++) {
        var d = L2(X[i], X[j]);
        dist[i * N + j] = d;
        dist[j * N + i] = d;
      }
    }
    return dist;
  };

  // compute (p_{i|j} + p_{j|i})/(2n)
  //   D          - flat row-major N*N array of pairwise distances
  //   perplexity - target perplexity; log(perplexity) is the entropy every
  //                row is tuned towards
  //   tol        - entropy tolerance that ends the search for a row
  // Returns a flat N*N array; every entry is at least 1e-100.
  var d2p = function(D, perplexity, tol) {
    var Nf = Math.sqrt(D.length); // this better be an integer
    var N = Math.floor(Nf);
    assert(N === Nf, "D should have square number of elements.");
    var Htarget = Math.log(perplexity); // target entropy of distribution
    var P = zeros(N * N); // temporary probability matrix
    var prow = zeros(N); // a temporary storage compartment
    var i, j, pj;
    for (i = 0; i < N; i++) {
      var betamin = -Infinity;
      var betamax = Infinity;
      var beta = 1; // initial value of precision
      var done = false;
      var maxtries = 50;
      // perform binary search to find a suitable precision beta
      // so that the entropy of the distribution is appropriate
      var num = 0;
      while (!done) {
        // compute entropy and kernel row with beta precision
        var psum = 0.0;
        for (j = 0; j < N; j++) {
          pj = Math.exp(-D[i * N + j] * beta);
          if (i === j) { pj = 0; } // we dont care about diagonals
          prow[j] = pj;
          psum += pj;
        }
        // normalize p and compute entropy; an all-zero row (single point or
        // fully underflowed kernel) stays zero instead of turning into NaN
        var Hhere = 0.0;
        for (j = 0; j < N; j++) {
          pj = psum > 0 ? prow[j] / psum : 0;
          prow[j] = pj;
          if (pj > 1e-7) Hhere -= pj * Math.log(pj);
        }
        // adjust beta based on result
        if (Hhere > Htarget) {
          // entropy was too high (distribution too diffuse)
          // so we need to increase the precision for more peaky distribution
          betamin = beta; // move up the bounds
          if (betamax === Infinity) { beta = beta * 2; }
          else { beta = (beta + betamax) / 2; }
        } else {
          // converse case. make distribution less peaky
          betamax = beta;
          if (betamin === -Infinity) { beta = beta / 2; }
          else { beta = (beta + betamin) / 2; }
        }
        // stopping conditions: too many tries or got a good precision
        num++;
        if (Math.abs(Hhere - Htarget) < tol) { done = true; }
        if (num >= maxtries) { done = true; }
      }
      // copy over the final prow to P at row i
      for (j = 0; j < N; j++) { P[i * N + j] = prow[j]; }
    } // end loop over examples i
    // symmetrize P and normalize it to sum to 1 over all ij
    var Pout = zeros(N * N);
    var N2 = N * 2;
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++) {
        Pout[i * N + j] = Math.max((P[i * N + j] + P[j * N + i]) / N2, 1e-100);
      }
    }
    return Pout;
  };

  // helper function: -1, 0 or 1 depending on the sign of x
  function sign(x) { return x > 0 ? 1 : x < 0 ? -1 : 0; }

  // t-SNE optimizer. Recognized options (all optional):
  //   perplexity - default 30
  //   dim        - dimensionality of the embedding, default 2
  //   epsilon    - learning rate, default 10
  var tSNE = function(opt) {
    opt = opt || {};
    this.perplexity = getopt(opt, "perplexity", 30); // effective number of nearest neighbors
    this.dim = getopt(opt, "dim", 2); // by default 2-D tSNE
    this.epsilon = getopt(opt, "epsilon", 10); // learning rate
    this.iter = 0;
  };

  tSNE.prototype = {
    // this function takes a set of high-dimensional points (array of
    // equal-length numeric arrays), derives their pairwise squared distances
    // and creates matrix P from them
    initDataRaw: function(X) {
      var N = X.length;
      // check emptiness first: X[0] does not exist for an empty input
      assert(N > 0, " X is empty? You must have some data!");
      var D = X[0].length;
      assert(D > 0, " X[0] is empty? Where is the data?");
      var dists = xtod(X); // convert X to pairwise squared distances
      this.P = d2p(dists, this.perplexity, 1e-4); // attach to object
      this.N = N; // back up the size of the dataset
      this.initSolution(); // refresh this
    },

    // this function takes a given distance matrix and creates
    // matrix P from them.
    // D is assumed to be provided as a list of lists, and should be symmetric
    // (only the entries above the diagonal are read)
    initDataDist: function(D) {
      var N = D.length;
      assert(N > 0, " X is empty? You must have some data!");
      // convert D to a (fast) typed array version
      var dists = zeros(N * N); // allocate contiguous array
      for (var i = 0; i < N; i++) {
        for (var j = i + 1; j < N; j++) {
          var d = D[i][j];
          dists[i * N + j] = d;
          dists[j * N + i] = d;
        }
      }
      this.P = d2p(dists, this.perplexity, 1e-4);
      this.N = N;
      this.initSolution(); // refresh this
    },

    // (re)initializes the solution to random
    initSolution: function() {
      // generate random solution to t-SNE
      this.Y = randn2d(this.N, this.dim); // the solution
      this.gains = randn2d(this.N, this.dim, 1.0); // step gains to accelerate progress in unchanging directions
      this.ystep = randn2d(this.N, this.dim, 0.0); // momentum accumulator
      this.iter = 0;
    },

    // return pointer to current solution (N arrays of `dim` numbers; this is
    // the live internal array, not a copy)
    getSolution: function() {
      return this.Y;
    },

    // perform a single step of optimization to improve the embedding;
    // returns the cost evaluated before the step is applied
    step: function() {
      this.iter += 1;
      var N = this.N;
      var cg = this.costGrad(this.Y); // evaluate gradient
      var cost = cg.cost;
      var grad = cg.grad;
      // perform gradient step
      var ymean = zeros(this.dim);
      var i, d;
      for (i = 0; i < N; i++) {
        for (d = 0; d < this.dim; d++) {
          var gid = grad[i][d];
          var sid = this.ystep[i][d];
          var gainid = this.gains[i][d];
          // compute gain update: shrink when gradient and previous step share
          // a sign, grow otherwise
          var newgain = sign(gid) === sign(sid) ? gainid * 0.8 : gainid + 0.2;
          if (newgain < 0.01) newgain = 0.01; // clamp
          this.gains[i][d] = newgain; // store for next turn
          // compute momentum step direction
          var momval = this.iter < 250 ? 0.5 : 0.8;
          var newsid = momval * sid - this.epsilon * newgain * grad[i][d];
          this.ystep[i][d] = newsid; // remember the step we took
          // step!
          this.Y[i][d] += newsid;
          ymean[d] += this.Y[i][d]; // accumulate mean so that we can center later
        }
      }
      // reproject Y to be zero mean
      for (i = 0; i < N; i++) {
        for (d = 0; d < this.dim; d++) {
          this.Y[i][d] -= ymean[d] / N;
        }
      }
      return cost; // return current cost
    },

    // for debugging: gradient check. Logs the analytic gradient next to a
    // central finite difference for every coordinate; Y is restored afterwards
    debugGrad: function() {
      var N = this.N;
      var cg = this.costGrad(this.Y); // evaluate gradient
      var grad = cg.grad;
      var e = 1e-5;
      for (var i = 0; i < N; i++) {
        for (var d = 0; d < this.dim; d++) {
          var yold = this.Y[i][d];
          this.Y[i][d] = yold + e;
          var cg0 = this.costGrad(this.Y);
          this.Y[i][d] = yold - e;
          var cg1 = this.costGrad(this.Y);
          var analytic = grad[i][d];
          var numerical = (cg0.cost - cg1.cost) / (2 * e);
          console.log(i + ',' + d + ': gradcheck analytic: ' + analytic + ' vs. numerical: ' + numerical);
          this.Y[i][d] = yold;
        }
      }
    },

    // return cost and gradient, given an arrangement
    // ({cost: number, grad: N arrays of `dim` numbers})
    costGrad: function(Y) {
      var N = this.N;
      var dim = this.dim; // dim of output space
      var P = this.P;
      // P is multiplied by 4 during the first 100 iterations
      var pmul = this.iter < 100 ? 4 : 1; // trick that helps with local optima
      // compute current Q distribution, unnormalized first
      var Qu = zeros(N * N);
      var qsum = 0.0;
      var i, j, d;
      for (i = 0; i < N; i++) {
        for (j = i + 1; j < N; j++) {
          var dsum = 0.0;
          for (d = 0; d < dim; d++) {
            var dhere = Y[i][d] - Y[j][d];
            dsum += dhere * dhere;
          }
          var qu = 1.0 / (1.0 + dsum); // Student t-distribution
          Qu[i * N + j] = qu;
          Qu[j * N + i] = qu;
          qsum += 2 * qu;
        }
      }
      // normalize Q distribution to sum to 1; with a single point there are
      // no pairs (qsum is 0), so fall back to the floor value instead of NaN
      var NN = N * N;
      var Q = zeros(NN);
      for (var q = 0; q < NN; q++) {
        Q[q] = Math.max(qsum > 0 ? Qu[q] / qsum : 0, 1e-100);
      }
      var cost = 0.0;
      var grad = [];
      for (i = 0; i < N; i++) {
        var gsum = new Array(dim); // init grad for point i
        for (d = 0; d < dim; d++) { gsum[d] = 0.0; }
        for (j = 0; j < N; j++) {
          cost += -P[i * N + j] * Math.log(Q[i * N + j]); // accumulate cost (the non-constant portion at least...)
          var premult = 4 * (pmul * P[i * N + j] - Q[i * N + j]) * Qu[i * N + j];
          for (d = 0; d < dim; d++) {
            gsum[d] += premult * (Y[i][d] - Y[j][d]);
          }
        }
        grad.push(gsum);
      }
      return {cost: cost, grad: grad};
    }
  };

  global.tSNE = tSNE; // export tSNE class
})(tsnejs);
// export the library to module.exports when a CommonJS `module` exists,
// otherwise to the browser's window object
(function(lib) {
  "use strict";
  if (typeof module !== "undefined" && typeof module.exports !== "undefined") {
    module.exports = lib; // in nodejs
  } else if (typeof window !== "undefined") {
    window.tsnejs = lib; // in ordinary browser attach library to window
  } else if (typeof self !== "undefined") {
    // hosts with neither `module` nor `window` (e.g. web workers) expose the
    // global object as `self`; touching `window` there would throw
    self.tsnejs = lib;
  }
})(tsnejs);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment