library(tidyverse)
## read in the mutect files
mix.files<- as.list(dir(".", pattern= "*.tsv"))
## need to add the file name into a column
mix_mutect_datlist <- lapply(mix.files, function(f) {
dat = read.table(f, header =T, sep ="\t", quote = "\"")
dat$sample = gsub("_mutect_annotated.tsv", "", f)
return(dat)
})
## combine to a single dataframe
mix_mutect<- do.call(rbind, mix_mutect_datlist)
## select out the columns and add a column for nt_change
mix_mutect<- mix_mutect %>% select(chr, start, end, ref_allele, alt_allele, t_lod_fstar, sample, t_ref_count, t_alt_count, n_ref_count, n_alt_count, strand1_sample, strand2_sample, strand1_ref, strand2_ref) %>%
mutate(nt_change = paste(ref_allele, alt_allele, sep = ">"))
## define lod cut-offs
lods<- seq(5,100, by = 5)
## two functions for calculating the percentage
calculate_G2A_rate_per_sample<- function(lod, df){
df %>% filter(t_lod_fstar > lod) %>% group_by(sample) %>% summarise(G2A_percent = sum(nt_change == "G>A")/n())
}
calculate_C2T_rate_per_sample<- function(lod, df){
df %>% filter(t_lod_fstar > lod) %>% group_by(sample) %>% summarise(C2T_percent = sum(nt_change == "C>T")/n())
}
## now calculate for each lod-cutoff
mix_G2A_per_sample<- lapply(lods, calculate_G2A_rate_per_sample, df = mix_mutect)
names(mix_G2A_per_sample)<- lods
mix_C2T_per_sample<- lapply(lods, calculate_C2T_rate_per_sample, df = mix_mutect)
names(mix_C2T_per_sample)<- lods
## collapse list of dataframe and put the name of the dataframe as a new column!
mix_C2T_per_sample_df<- bind_rows(mix_C2T_per_sample, .id = "lod_cutoff")
mix_G2A_per_sample_df<- bind_rows(mix_G2A_per_sample, .id = "lod_cutoff")
mix_C2T_per_sample_df$lod_cutoff<- as.numeric(mix_C2T_per_sample_df$lod_cutoff)
mix_G2A_per_sample_df$lod_cutoff<- as.numeric(mix_G2A_per_sample_df$lod_cutoff)
mix_lod_df_per_sample<- inner_join(mix_C2T_per_sample_df, mix_G2A_per_sample_df)
mix_lod_per_sample<- gather(mix_lod_df_per_sample, "type", "percent", 3:4)
## plot
ggplot(mix_lod_per_sample, aes(x = lod_cutoff, y = percent * 100)) +
geom_point(aes(color = type)) +
geom_line(aes(color = type)) +
ggtitle("mixing histology lod cutoff per sample") +
ylab("percentage") +
theme_bw(base_size = 14) +
facet_wrap(~sample)
Last active
January 17, 2017 17:05
-
-
Save crazyhottommy/90256b4d0660d52b6207350e9bf45fb2 to your computer and use it in GitHub Desktop.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment