Created
March 21, 2023 19:12
-
-
Save kieranrcampbell/0d198f5f17aca34278bce49dc390281c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Going to take non tumour cells as just fibroblast and T/NK | |
## as these are (i) abundant, and (ii) sufficiently distinct transcriptionally | |
## from tumour that we're unlikely to confuse them | |
non_tumour_cells <- c("Fibroblast", "T/NK") | |
get_baf_df <- function(sce) { | |
message(paste0("[Running] Sample ", sce$id[1])) | |
colnames(sce) <- paste0(colData(sce)$id, "-", colData(sce)$barcode) | |
sce$cell_type <- cell_annots[colnames(sce)] | |
## First, define a set of sites that we're really confident | |
## are heterozygous in normal tissue. To do this we take only | |
## normal cells, and sum the ad/dp per cell, then perform a | |
## binomial test (2 tailed) to test whether the fractions | |
## are significantly different from 0.5 | |
sce_normal <- sce[, sce$cell_type %in% non_tumour_cells] | |
ad_sum <- rowSums(assay(sce_normal, 'ad')) | |
dp_sum <- rowSums(assay(sce_normal, 'dp')) | |
# if we can't compute the p value, reject it has having p < 0.05 | |
pvals <- rep(0, length(dp_sum)) | |
for(i in seq_along(ad_sum)) { | |
if(dp_sum[i] > 0) { | |
test <- binom.test(ad_sum[i], dp_sum[i], p=0.5) | |
pvals[i] <- c(test$p.value) | |
} | |
} | |
## Here we accept heterozygous sites as having a p-value > 0.05 | |
## (i.e. we can't reject the null that BAF=0.5), and filter | |
## on DP sum > 20 to avoid sites with very few reads | |
highconf_hzyg_sites <- dp_sum > 20 & pvals > 0.05 | |
## Subset to heterozygous sites | |
sce_hc <- sce[highconf_hzyg_sites,] | |
## Boiler plate setting up stuff we want to return | |
all_cell_types = unique(sce$cell_type) | |
all_cell_types <- all_cell_types[!is.na(all_cell_types)] | |
bafs = rep(NA, length(all_cell_types)) | |
names(bafs) <- all_cell_types | |
dp_sums = rep(NA, length(all_cell_types)) | |
names(dp_sums) <- all_cell_types | |
pvals = rep(NA, length(all_cell_types)) | |
names(pvals) <- all_cell_types | |
## Iterate over each cell type, looking at high confidence | |
## heterozygous sites | |
for(cell_type in all_cell_types) { | |
is_cell_type <- which(sce_hc$cell_type ==cell_type) | |
sce_hc_ct <- sce_hc[, is_cell_type] | |
dp_vec <- as.vector(assay(sce_hc_ct, 'dp')) | |
ad_vec <- as.vector(assay(sce_hc_ct, 'ad')) | |
ad_sum <- sum(ad_vec)#[dp_vec > 5]) | |
dp_sum <- sum(dp_vec)#[dp_vec > 5]) | |
## Compute BAFs, DP sums over all cells and positions | |
avg_baf <- ad_sum / dp_sum | |
bafs[cell_type] <- avg_baf | |
dp_sums[cell_type] <- dp_sum | |
if(dp_sum > 0) { | |
## Finally, we want to test if the BAF is significantly | |
## different to 0.5, using a binomial test, but this | |
## time interested in the case of p being < 0.05 | |
test <- binom.test(ad_sum, dp_sum) | |
pvals[cell_type] <- test$p.value | |
} | |
} | |
## Flip the BAFs to always be < 0.5 | |
for(i in seq_along(bafs)) { | |
if(!is.nan(bafs[i]) && !is.na(bafs[i])) { | |
if(bafs[i] > 0.5) { | |
bafs[i] <- 1 - bafs[i] | |
} | |
} | |
} | |
tibble(cell_type = all_cell_types, baf = bafs, dp_sum = dp_sums, pval = pvals, id = sce$id[1]) | |
} | |
df_baf <- bind_rows( | |
lapply(sce_gt, get_baf_df) | |
) | |
df_baf$p_adj <- p.adjust(df_baf$pval, method="BH") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment