Last active
April 29, 2016 10:12
-
-
Save drincruz/38d5b49f8bb6b2cc26dba0ea3c255a47 to your computer and use it in GitHub Desktop.
Getting variance and standard deviation in BigQuery
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SELECT | |
variance, | |
standard_deviation, | |
input | |
FROM (standardDeviation( | |
SELECT | |
'[' + GROUP_CONCAT_UNQUOTED(STRING(score)) + ']' AS json_array_input | |
FROM ( | |
SELECT | |
score | |
FROM | |
[bigquery-public-data:hacker_news.full_201510] | |
WHERE | |
score IS NOT NULL | |
ORDER BY | |
id | |
LIMIT | |
10))) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SELECT | |
VARIANCE(score), | |
STDDEV(score), | |
AVG(score) | |
FROM | |
[bigquery-public-data:hacker_news.full_201510] | |
WHERE | |
score IS NOT NULL | |
LIMIT | |
10 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Gets the average of an array of numbers | |
* @param {array} arr - An array of numbers | |
* @return {number} The average | |
*/ | |
function average(arr) { | |
var sum = 0; | |
arr.forEach(function(element, index, array) { sum += element; }); | |
return sum * 1.0 / arr.length; | |
} | |
/** | |
* Subtract the mean from each element in an array, then square each | |
* Where x is an element in the array, the result is(x - mean)^2 | |
* @param {array} arr - An array of numbers | |
* @return {array} Array of modified numbers | |
*/ | |
function getDifferenceSquares(arr, mean) { | |
var ret = []; | |
arr.forEach(function(element, index, array) { | |
ret[index] = Math.pow((element - mean), 2); }); | |
return ret; | |
} | |
function standardDeviation(row, emit) { | |
var ints = JSON.parse(row.json_array_input); | |
var mean = average(ints); | |
var variance = average(getDifferenceSquares(ints, mean)); | |
emit({variance: variance, | |
standard_deviation: Math.sqrt(variance), | |
input: row.json_array_input}); | |
} | |
bigquery.defineFunction( | |
'standardDeviation', | |
['json_array_input'], | |
[{'name': 'variance', 'type': 'float'}, | |
{'name': 'standard_deviation', 'type': 'float'}, | |
{'name': 'input', 'type': 'string'}], | |
standardDeviation | |
); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment