Skip to content

Instantly share code, notes, and snippets.

@FloWuenne
Created August 2, 2017 14:23
Show Gist options
  • Save FloWuenne/656aad5b050d0cb5c15ad8dace8d90f1 to your computer and use it in GitHub Desktop.
Save FloWuenne/656aad5b050d0cb5c15ad8dace8d90f1 to your computer and use it in GitHub Desktop.
Visualize the output from GatherMolecularBarcodeDistributionByGene to assess UMI duplication rates
library(stringr)
## Look at the number of duplicated and unique UMIs per Cell
umi_by_gene <- read.table("UMI_by_gene_dist.tab",sep="\t",header=T)
## Count number of duplicated UMIs per cell
num_of_duplicated_umis_per_cell <- umi_by_gene %>%
group_by(Cell.Barcode) %>%
subset(Num_Obs > 1) %>%
count()
colnames(num_of_duplicated_umis_per_cell) <- c("Cell.Barcode","Num_duplicated_UMIs")
## Count number of unique UMIs per cell
unique_umis_per_cell <- umi_by_gene %>%
group_by(Cell.Barcode) %>%
subset(Num_Obs == 1) %>%
count()
colnames(unique_umis_per_cell) <- c("Cell.Barcode","Num_of_unique_UMIs")
## Calculate the mean number of duplications per duplicated UMI
mean_duplications_per_cell <- umi_by_gene %>%
group_by(Cell.Barcode) %>%
subset(Num_Obs > 1) %>%
summarise(Mean_of_duplications = mean(Num_Obs))
## Join unique and duplicated UMI counts
joined_umi_counts <- full_join(unique_umis_per_cell,num_of_duplicated_umis_per_cell,by="Cell.Barcode")
joined_umi_counts <- full_join(joined_umi_counts,mean_duplications_per_cell,by="Cell.Barcode")
## Calculate the percentage of UMI duplication
## Calculate the total number of UMIs as well as the percent of duplicated UMIs
joined_umi_counts <- joined_umi_counts %>%
group_by(Cell.Barcode) %>%
mutate(Total_number_individual_UMIs_detected = sum(Num_duplicated_UMIs,Num_of_unique_UMIs)) %>%
mutate(Percent_duplicate_UMIs = Num_duplicated_UMIs/Total_number_individual_UMIs_detected*100) %>%
mutate(Percent_unique_UMIs = Num_of_unique_UMIs/Total_number_individual_UMIs_detected*100)
## Plot correlation between duplicated and unique UMIs per cell
unique_vs_duplicated_Umis <- ggplot(joined_umi_counts,aes(Num_duplicated_UMIs,Num_of_unique_UMIs,text=paste("Mean_UMI_duplications=",round(Mean_of_duplications,2),sep=""))) +
geom_point(size=2,fill="grey",colour="darkgrey",pch=21) +
theme_light() +
labs(title="Duplicated vs unique UMIs per Cell",
x = "Number of duplicated UMIs",
y = "Number of unique UMIs",
subtitle = "Every point represents a Cell! Coloured by Percentage of duplicated UMIs for Cell barcode") +
theme(plot.title = element_text(lineheight=.8, face="bold"))
ggplotly(unique_vs_duplicated_Umis)
```
### **Duplicated UMIs vs unique UMIs per Gene**
```{r}
## Look at the number of duplicated and unique UMIs per Gene
## Count number of duplicated UMIs per cell
num_of_duplicated_umis_per_gene <- umi_by_gene %>%
group_by(Gene) %>%
subset(Num_Obs > 1) %>%
count()
colnames(num_of_duplicated_umis_per_gene) <- c("Gene","Num_duplicated_UMIs")
## Count number of unique UMIs per cell
unique_umis_per_gene <- umi_by_gene %>%
group_by(Gene) %>%
subset(Num_Obs == 1) %>%
count()
colnames(unique_umis_per_gene) <- c("Gene","Num_of_unique_UMIs")
## Calculate the mean number of duplications per duplicated UMI
mean_duplications_per_cell <- umi_by_gene %>%
group_by(Gene) %>%
subset(Num_Obs > 1) %>%
summarise(Mean_of_duplications = mean(Num_Obs))
## Join duplicated and unique counts into one tibble
joined_umi_gene_counts <- full_join(num_of_duplicated_umis_per_gene,unique_umis_per_gene,by="Gene")
joined_umi_gene_counts <- full_join(joined_umi_gene_counts,mean_duplications_per_cell,by="Gene")
## Calculate the total number of UMIs as well as the percent of duplicated UMIs
joined_umi_gene_counts <- joined_umi_gene_counts %>%
group_by(Gene) %>%
mutate(Total_number_individual_UMIs_detected = sum(Num_duplicated_UMIs,Num_of_unique_UMIs)) %>%
mutate(Percent_duplicate_UMIs = Num_duplicated_UMIs/Total_number_individual_UMIs_detected*100) %>%
mutate(Percent_unique_UMIs = Num_of_unique_UMIs/Total_number_individual_UMIs_detected*100)
## Sort table based on Total number of UMIs per Cells
joined_umi_gene_counts <- joined_umi_gene_counts %>%
arrange(desc(Num_of_unique_UMIs))
umi_duplication_by_gene <- ggplot(joined_umi_gene_counts,aes(Num_duplicated_UMIs,Num_of_unique_UMIs,text=paste("Gene = ",Gene,sep=""))) +
geom_point(size=2,fill="grey",colour="darkgrey",pch=21) +
theme_light() +
labs(title="Duplicated vs Unique UMIs per Gene",
x = "Number of duplicated UMIs",
y = "Number of unique UMIs",
subtitle = "Every represents is a Gene!") +
theme(plot.title = element_text(lineheight=.8, face="bold"))
ggplotly(umi_duplication_by_gene)
```
Column {data-width=600}
-------------------------------------
### **Distribution of duplicated UMIs per cell**
```{r,echo=FALSE}
ggplot(joined_umi_counts,aes(log10(Mean_of_duplications))) +
geom_histogram(colour="black",fill="grey50") +
theme_light() +
labs(title="Distribution of mean UMI duplication per Cell",
subtitle = "For all duplicated UMIs, mean of duplication events was calculated per Cell.",
x = "log10 of mean UMI duplications events per Cell)",
y = "Number of cells") +
theme(plot.title = element_text(lineheight=.8, face="bold"))
```
### **Distribution of duplicate UMIs per cell**
```{r,echo=FALSE}
ggplot(joined_umi_counts,aes(Percent_duplicate_UMIs)) +
geom_histogram(colour="black",fill="grey50") +
theme_light() +
labs(title="Percentage of duplicated UMIs per cell",
subtitle = "Distribution of percentage of duplicated UMIs",
x = "Duplicate UMIs (%)",
y = "Number of cells") +
theme(plot.title = element_text(lineheight=.8, face="bold"))
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment