Created
March 6, 2019 10:52
-
-
Save allenday/422c983a13c110985c1a375fb7447016 to your computer and use it in GitHub Desktop.
Rice3K analysis 2: some specific regions are under selective pressure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#standardSQL | |
-- | |
-- The following query computes the Hardy-Weinberg equilibrium for variants. | |
-- | |
WITH variants AS ( | |
SELECT reference_name, start_position, end_position, reference_bases, alt, | |
SUM(HOM_REF) AS HOM_REF, | |
SUM(HOM_ALT) AS HOM_ALT, | |
SUM(HET) AS HET | |
FROM ( | |
SELECT | |
reference_name, | |
start_position, | |
end_position, | |
reference_bases, | |
alt.alt, | |
-- Within each variant count the number of HOM_REF/HOM_ALT/HET samples. | |
(SELECT SUM(CAST((SELECT LOGICAL_AND(gt = 0) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HOM_REF, | |
(SELECT SUM(CAST((SELECT LOGICAL_AND(gt = 1) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HOM_ALT, | |
(SELECT SUM(CAST((SELECT LOGICAL_OR(gt = 0) AND LOGICAL_OR(gt = 1) FROM UNNEST(call.genotype) gt) AS INT64)) FROM v.call) AS HET | |
FROM | |
`bigquery-public-data.genomics_rice.Rice3K_DeepVariant_Os_Nipponbare_Reference_IRGSP_1_0` v | |
JOIN UNNEST(alternate_bases) AS alt | |
WHERE | |
--reference_name = 'Chr9' AND start_position = 4691919 AND | |
# Only include biallelic snps. | |
reference_bases IN ('A','C','G','T') | |
AND ARRAY_LENGTH(alternate_bases) = 1 | |
AND alt.alt IN ('A','C','G','T') | |
) | |
AS x | |
GROUP BY reference_name, start_position, end_position, reference_bases, alt | |
), | |
observations AS ( | |
SELECT | |
reference_name, | |
start_position, | |
reference_bases, | |
alt, | |
HOM_REF AS OBS_HOM1, | |
HET AS OBS_HET, | |
HOM_ALT AS OBS_HOM2, | |
HOM_REF + HET + HOM_ALT AS SAMPLE_COUNT, | |
3024 - (HOM_REF + HET + HOM_ALT) AS MISSING_COUNT | |
FROM variants | |
), | |
expectations AS ( | |
SELECT | |
reference_name, | |
start_position, | |
reference_bases, | |
alt, | |
OBS_HOM1, | |
OBS_HET, | |
OBS_HOM2, | |
# we use F as an inbreeding coefficient. | |
# to revert to standard Hardy-Weiberg F=0, but for selfing/inbred rice population we need to use a higher value 0.95 | |
# Expected AA | |
# p^2 | |
# ((COUNT(AA) + (COUNT(Aa)/2) / SAMPLE_COUNT) ^ 2) * SAMPLE_COUNT | |
POW((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT), 2) * (SAMPLE_COUNT + MISSING_COUNT) | |
+ 0.95 * ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT)) * ((OBS_HOM2 + OBS_HET/2) / (SAMPLE_COUNT)) * (MISSING_COUNT + SAMPLE_COUNT) | |
AS E_HOM1, | |
# Expected Aa | |
# 2pq | |
# 2 * (COUNT(AA) + (COUNT(Aa)/2) / SAMPLE_COUNT) * (COUNT(aa) + (COUNT(Aa)/2) / SAMPLE_COUNT) * SAMPLE_COUNT | |
(1.0 - 0.95) * 2 | |
* ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT)) | |
* ((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT) * SAMPLE_COUNT AS E_HET, | |
# Expected aa | |
# q^2 | |
# (COUNT(aa) + (COUNT(Aa)/2) / SAMPLE_COUNT) ^ 2 * SAMPLE_COUNT | |
POW((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT, 2) * SAMPLE_COUNT | |
+ 0.95 | |
* ((MISSING_COUNT + OBS_HOM1 + OBS_HET/2) / (MISSING_COUNT + SAMPLE_COUNT)) | |
* ((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT) * SAMPLE_COUNT | |
AS E_HOM2 | |
FROM observations | |
WHERE SAMPLE_COUNT > 0 | |
), | |
stats AS ( | |
SELECT | |
reference_name, | |
start_position, | |
reference_bases, | |
alt, | |
OBS_HOM1, | |
OBS_HET, | |
OBS_HOM2, | |
E_HOM1, | |
E_HET, | |
E_HOM2, | |
# Chi Squared Calculation | |
# SUM(((Observed - Expected)^2) / Expected ) | |
ROUND((POW(OBS_HOM1 - E_HOM1, 2) / E_HOM1) | |
+ (POW(OBS_HET - E_HET, 2) / E_HET) | |
+ (POW(OBS_HOM2 - E_HOM2, 2) / E_HOM2), 6) | |
AS ChiSq, | |
# Determine if Chi Sq value is significant | |
# The chi-squared score corresponding to a nominal P-value of 0.05 | |
# for a table with 2 degrees of freedom is 5.991. | |
IF((POW(OBS_HOM1 - E_HOM1, 2) / E_HOM1) | |
+ (POW(OBS_HET - E_HET, 2) / E_HET) | |
+ (POW(OBS_HOM2 - E_HOM2, 2) / E_HOM2) | |
> 5.991, TRUE, FALSE) AS PVALUE_SIG | |
FROM expectations | |
WHERE | |
E_HOM1 > 0 AND E_HET > 0 AND E_HOM2 > 0 | |
-- Optionally add a clause here to constrain the results. | |
) | |
SELECT pt.ref,pt.bin,pt.n AS t,pf.n AS f,pt.n/(pt.n+pf.n) AS pct_t,pf.n/(pt.n+pf.n) AS pct_f | |
FROM | |
(SELECT reference_name AS ref, CAST(start_position/10000 AS INT64) AS bin, COUNT(1) AS n | |
FROM stats | |
WHERE PVALUE_SIG IS TRUE | |
GROUP BY ref, bin | |
) AS pt, | |
(SELECT reference_name AS ref, CAST(start_position/10000 AS INT64) AS bin, COUNT(1) AS n | |
FROM stats | |
WHERE PVALUE_SIG IS FALSE | |
GROUP BY ref, bin | |
) AS pf | |
WHERE pt.ref = pf.ref AND pt.bin = pf.bin | |
ORDER BY ref, bin |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment