Skip to content

Instantly share code, notes, and snippets.

@suensummit
Forked from smrgit/kMeans_in_BQ.sql
Created December 5, 2018 10:40
Show Gist options
  • Save suensummit/994c6a44166dbb28a19f216a1caa9ef5 to your computer and use it in GitHub Desktop.
Save suensummit/994c6a44166dbb28a19f216a1caa9ef5 to your computer and use it in GitHub Desktop.
kMeans using JavaScript UDFs in BigQuery
CREATE TEMPORARY FUNCTION
-- In this function, we're going to be working on arrays of values.
-- we're also going to define a set of functions 'inside' the kMeans.
-- *heavily borrowing from https://github.com/NathanEpstein/clusters* --
kMeans(x ARRAY<FLOAT64>, -- ESR1 gene expression
y ARRAY<FLOAT64>, -- EGFR gene expression
iterations FLOAT64, -- the number of iterations
k FLOAT64) -- the number of clusters
RETURNS ARRAY<FLOAT64> -- returns the cluster assignments
LANGUAGE js AS """
'use strict'
function sumOfSquareDiffs(oneVector, anotherVector) {
// the sum of squares error //
var squareDiffs = oneVector.map(function(component, i) {
return Math.pow(component - anotherVector[i], 2);
});
return squareDiffs.reduce(function(a, b) { return a + b }, 0);
};
function mindex(array) {
// returns the index to the minimum value in the array
var min = array.reduce(function(a, b) {
return Math.min(a, b);
});
return array.indexOf(min);
};
function sumVectors(a, b) {
// The map function gets used frequently in JavaScript
return a.map(function(val, i) { return val + b[i] });
};
function averageLocation(points) {
// Take all the points assigned to a cluster
// and find the averge center point.
// This gets used to update the cluster centroids.
var zeroVector = points[0].location.map(function() { return 0 });
var locations = points.map(function(point) { return point.location });
var vectorSum = locations.reduce(function(a, b) { return sumVectors(a, b) }, zeroVector);
return vectorSum.map(function(val) { return val / points.length });
};
function Point(location) {
// A point object, each sample is represented as a point //
var self = this;
this.location = location;
this.label = 1;
this.updateLabel = function(centroids) {
var distancesSquared = centroids.map(function(centroid) {
return sumOfSquareDiffs(self.location, centroid.location);
});
self.label = mindex(distancesSquared);
};
};
function Centroid(initialLocation, label) {
// The cluster centroids //
var self = this;
this.location = initialLocation;
this.label = label;
this.updateLocation = function(points) {
var pointsWithThisCentroid = points.filter(function(point) { return point.label == self.label });
if (pointsWithThisCentroid.length > 0) {
self.location = averageLocation(pointsWithThisCentroid);
}
};
};
var data = [];
// Our data list is list of lists. The small list being each (x,y) point
for (var i = 0; i < x.length; i++) {
data.push([x[i],y[i]])
}
// initialize point objects with data
var points = data.map(function(vector) { return new Point(vector) });
// intialize centroids
var centroids = [];
for (var i = 0; i < k; i++) {
centroids.push(new Centroid(points[i % points.length].location, i));
};
// update labels and centroid locations until convergence
for (var iter = 0; iter < iterations; iter++) {
points.forEach(function(point) { point.updateLabel(centroids) });
centroids.forEach(function(centroid) { centroid.updateLocation(points) });
};
// return the cluster labels.
var labels = []
for (var i = 0; i < points.length; i++) {
labels.push(points[i].label)
}
return labels;
""";
--
-- *** In this query, we create two subtables, one for each gene of
-- interest, then create a set of arrays in joining the two tables.
-- We call the UDF using the arrays, and get a result back
-- made of arrays.
--
-- Due to a technical issue we save the table of arrays to
-- to a personal dataset, then unpack it. ***
--
WITH
-- gene1, the first subtable
--
gene1 AS (
SELECT
ROW_NUMBER() OVER() row_number,
AliquotBarcode AS barcode1,
HGNC_gene_symbol AS gene_id1,
AVG(LOG(normalized_count+1, 2)) AS count1
FROM
`isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM`
WHERE
Study = 'BRCA'
AND SampleTypeLetterCode = 'TP'
AND HGNC_gene_symbol = 'ESR1'
AND normalized_count >= 0
GROUP BY
AliquotBarcode,
gene_id1),
--
-- gene2, the second subtable
--
gene2 AS (
SELECT
AliquotBarcode AS barcode2,
HGNC_gene_symbol AS gene_id2,
AVG(LOG(normalized_count+1, 2)) AS count2
FROM
`isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM`
WHERE
Study = 'BRCA'
AND SampleTypeLetterCode = 'TP'
AND HGNC_gene_symbol = 'EGFR'
AND normalized_count >= 0
GROUP BY
AliquotBarcode,
HGNC_gene_symbol),
--
-- Then we create a table of arrays
-- and join the two gene tables.
-- ** We need to make sure all the arrays are constructed using the same index. **
--
arrayTable AS (
SELECT
ARRAY_AGG(m1.row_number ORDER BY m1.barcode1) AS arrayn,
ARRAY_AGG(m1.barcode1 ORDER BY m1.barcode1) AS barcode,
ARRAY_AGG(count1 ORDER BY m1.barcode1) AS esr1,
ARRAY_AGG(count2 ORDER BY m1.barcode1) AS egfr
FROM
gene1 AS m1
JOIN
gene2 AS m2
ON
m1.barcode1 = m2.barcode2 )
--
-- Now we call the k-means UDF.
--
SELECT
arrayn,
barcode,
esr1,
egfr,
kMeans(esr1, egfr, 200.0, 2.0) AS cluster
FROM
arrayTable
--
-- save the resulting table to a personal dataset
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment