-
-
Save suensummit/994c6a44166dbb28a19f216a1caa9ef5 to your computer and use it in GitHub Desktop.
kMeans using JavaScript UDFs in BigQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TEMPORARY FUNCTION | |
-- In this function, we're going to be working on arrays of values. | |
-- we're also going to define a set of functions 'inside' the kMeans. | |
-- *heavily borrowing from https://github.com/NathanEpstein/clusters* -- | |
kMeans(x ARRAY<FLOAT64>, -- ESR1 gene expression | |
y ARRAY<FLOAT64>, -- EGFR gene expression | |
iterations FLOAT64, -- the number of iterations | |
k FLOAT64) -- the number of clusters | |
RETURNS ARRAY<FLOAT64> -- returns the cluster assignments | |
LANGUAGE js AS """ | |
'use strict' | |
function sumOfSquareDiffs(oneVector, anotherVector) { | |
// the sum of squares error // | |
var squareDiffs = oneVector.map(function(component, i) { | |
return Math.pow(component - anotherVector[i], 2); | |
}); | |
return squareDiffs.reduce(function(a, b) { return a + b }, 0); | |
}; | |
function mindex(array) { | |
// returns the index to the minimum value in the array | |
var min = array.reduce(function(a, b) { | |
return Math.min(a, b); | |
}); | |
return array.indexOf(min); | |
}; | |
function sumVectors(a, b) { | |
// The map function gets used frequently in JavaScript | |
return a.map(function(val, i) { return val + b[i] }); | |
}; | |
function averageLocation(points) { | |
// Take all the points assigned to a cluster | |
// and find the averge center point. | |
// This gets used to update the cluster centroids. | |
var zeroVector = points[0].location.map(function() { return 0 }); | |
var locations = points.map(function(point) { return point.location }); | |
var vectorSum = locations.reduce(function(a, b) { return sumVectors(a, b) }, zeroVector); | |
return vectorSum.map(function(val) { return val / points.length }); | |
}; | |
function Point(location) { | |
// A point object, each sample is represented as a point // | |
var self = this; | |
this.location = location; | |
this.label = 1; | |
this.updateLabel = function(centroids) { | |
var distancesSquared = centroids.map(function(centroid) { | |
return sumOfSquareDiffs(self.location, centroid.location); | |
}); | |
self.label = mindex(distancesSquared); | |
}; | |
}; | |
function Centroid(initialLocation, label) { | |
// The cluster centroids // | |
var self = this; | |
this.location = initialLocation; | |
this.label = label; | |
this.updateLocation = function(points) { | |
var pointsWithThisCentroid = points.filter(function(point) { return point.label == self.label }); | |
if (pointsWithThisCentroid.length > 0) { | |
self.location = averageLocation(pointsWithThisCentroid); | |
} | |
}; | |
}; | |
var data = []; | |
// Our data list is list of lists. The small list being each (x,y) point | |
for (var i = 0; i < x.length; i++) { | |
data.push([x[i],y[i]]) | |
} | |
// initialize point objects with data | |
var points = data.map(function(vector) { return new Point(vector) }); | |
// intialize centroids | |
var centroids = []; | |
for (var i = 0; i < k; i++) { | |
centroids.push(new Centroid(points[i % points.length].location, i)); | |
}; | |
// update labels and centroid locations until convergence | |
for (var iter = 0; iter < iterations; iter++) { | |
points.forEach(function(point) { point.updateLabel(centroids) }); | |
centroids.forEach(function(centroid) { centroid.updateLocation(points) }); | |
}; | |
// return the cluster labels. | |
var labels = [] | |
for (var i = 0; i < points.length; i++) { | |
labels.push(points[i].label) | |
} | |
return labels; | |
"""; | |
-- | |
-- *** In this query, we create two subtables, one for each gene of | |
-- interest, then create a set of arrays in joining the two tables. | |
-- We call the UDF using the arrays, and get a result back | |
-- made of arrays. | |
-- | |
-- Due to a technical issue we save the table of arrays to | |
-- to a personal dataset, then unpack it. *** | |
-- | |
WITH | |
-- gene1, the first subtable | |
-- | |
gene1 AS ( | |
SELECT | |
ROW_NUMBER() OVER() row_number, | |
AliquotBarcode AS barcode1, | |
HGNC_gene_symbol AS gene_id1, | |
AVG(LOG(normalized_count+1, 2)) AS count1 | |
FROM | |
`isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM` | |
WHERE | |
Study = 'BRCA' | |
AND SampleTypeLetterCode = 'TP' | |
AND HGNC_gene_symbol = 'ESR1' | |
AND normalized_count >= 0 | |
GROUP BY | |
AliquotBarcode, | |
gene_id1), | |
-- | |
-- gene2, the second subtable | |
-- | |
gene2 AS ( | |
SELECT | |
AliquotBarcode AS barcode2, | |
HGNC_gene_symbol AS gene_id2, | |
AVG(LOG(normalized_count+1, 2)) AS count2 | |
FROM | |
`isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM` | |
WHERE | |
Study = 'BRCA' | |
AND SampleTypeLetterCode = 'TP' | |
AND HGNC_gene_symbol = 'EGFR' | |
AND normalized_count >= 0 | |
GROUP BY | |
AliquotBarcode, | |
HGNC_gene_symbol), | |
-- | |
-- Then we create a table of arrays | |
-- and join the two gene tables. | |
-- ** We need to make sure all the arrays are constructed using the same index. ** | |
-- | |
arrayTable AS ( | |
SELECT | |
ARRAY_AGG(m1.row_number ORDER BY m1.barcode1) AS arrayn, | |
ARRAY_AGG(m1.barcode1 ORDER BY m1.barcode1) AS barcode, | |
ARRAY_AGG(count1 ORDER BY m1.barcode1) AS esr1, | |
ARRAY_AGG(count2 ORDER BY m1.barcode1) AS egfr | |
FROM | |
gene1 AS m1 | |
JOIN | |
gene2 AS m2 | |
ON | |
m1.barcode1 = m2.barcode2 ) | |
-- | |
-- Now we call the k-means UDF. | |
-- | |
SELECT | |
arrayn, | |
barcode, | |
esr1, | |
egfr, | |
kMeans(esr1, egfr, 200.0, 2.0) AS cluster | |
FROM | |
arrayTable | |
-- | |
-- save the resulting table to a personal dataset |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment