Created
March 6, 2019 08:39
-
-
Save allenday/c8d52f06b68ef6c291d6309c2469b61b to your computer and use it in GitHub Desktop.
Rice 3K analysis 1: genetic variants are not uniformly distributed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Rice 3K analysis 1: per-sample variant density in 1 Mb bins, expressed as a
-- Z-score against the distribution of all samples for the same reference/bin.
--
-- Dialect: BigQuery Standard SQL.
-- Output: one row per (sample, ref, bin) that has at least one counted call,
-- with columns sample, ind_n, z, ref, bin, pop_mu, pop_sigma.
WITH
ind AS (
  -- Count calls for each sample/ref/bin.
  -- Each call is repeated once per alternate base that is not the '<*>'
  -- placeholder, so n counts (call, alt) pairs rather than distinct sites.
  SELECT
    call.name AS sample,
    v.reference_name AS ref,
    -- Floor to 1,000,000-position bins. A bare CAST(x / 1000000 AS INT64)
    -- rounds to the nearest integer, which would shift every bin boundary
    -- by half a bin and leave bin 0 at half width.
    CAST(FLOOR(v.start_position / 1000000) AS INT64) AS bin,
    COUNT(call.name) AS n
  FROM `bigquery-public-data.genomics_rice.Rice3K_DeepVariant_Os_Nipponbare_Reference_IRGSP_1_0` AS v
  CROSS JOIN UNNEST(v.call) AS call
  CROSS JOIN UNNEST(v.alternate_bases) AS alt
  WHERE alt.alt != '<*>'
  GROUP BY sample, ref, bin
),
pop AS (
  -- Mean and sample standard deviation of the per-sample counts for each
  -- ref/bin. Samples with no counted calls in a bin have no row in `ind`,
  -- so they do not contribute a zero to these statistics.
  SELECT
    ref,
    bin,
    AVG(n) AS pop_mu,
    STDDEV(n) AS pop_sigma
  FROM ind
  GROUP BY ref, bin
),
zscore AS (
  -- Z-score for each individual's bin vs. the population average of that bin.
  SELECT
    ind.sample,
    ind.n AS ind_n,
    -- SAFE_DIVIDE yields NULL instead of raising a division-by-zero error
    -- when every sample in the bin has the same count (pop_sigma = 0).
    -- pop_sigma is NULL (and so is z) when only one sample has the bin.
    SAFE_DIVIDE(ind.n - pop.pop_mu, pop.pop_sigma) AS z,
    pop.ref,
    pop.bin,
    pop.pop_mu,
    pop.pop_sigma
  FROM ind
  INNER JOIN pop
    ON ind.ref = pop.ref
    AND ind.bin = pop.bin
)
-- To list the most extreme bins first, order by ABS(z) DESC instead.
SELECT
  sample,
  ind_n,
  z,
  ref,
  bin,
  pop_mu,
  pop_sigma
FROM zscore
ORDER BY sample, ref, bin
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment