Last active
July 20, 2018 15:35
-
-
Save FloWuenne/ff003a56a50d34acedaaa6512b6cb960 to your computer and use it in GitHub Desktop.
Analyze and fix .gtf file for Drop-seq pipeline to remove warnings and skipping for genes with issues
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = TRUE) | |
``` | |
# Load libraries | |
```{r} | |
library(refGenome) | |
library(dplyr) | |
library(tidyr) | |
``` | |
# Read in original gtf | |
```{r} | |
# create ensemblGenome object for storing Ensembl genomic annotation data | |
ens <- ensemblGenome() | |
read.gtf(ens, "./mm10_with_Adamts19_tm4a_tm4b_allele.gtf") | |
``` | |
```{r} | |
# create table of genes | |
my_gene <- getGenePositions(ens) | |
``` | |
```{r} | |
## Get all genes that are found more than once | |
skipped_genes <- my_gene %>% | |
count(gene_name) %>% | |
subset(n > 1) | |
``` | |
# Test if all skipped genes are missing in modified REF and are present in standard REF | |
```{r} | |
good_DGE <- read.table("DGE_to_test_skipped_genes/Florian_170814_R1_1200STAMPS_E14_5.aligned.tagged.cleaned.selected_STAMPS.DGE.txt", | |
sep="\t", | |
header=T, | |
row.names =1, | |
quote="") | |
``` | |
```{r} | |
bad_DGE <- read.table("DGE_to_test_skipped_genes/E14.5_r3.DGE.txt", | |
sep="\t", | |
header=T, | |
row.names =1, | |
quote="") | |
``` | |
```{r} | |
good_genes <- rownames(good_DGE) | |
bad_genes <- rownames(bad_DGE) | |
``` | |
```{r} | |
## Check how many genes are found in alignment against reference GSE_mm10 and against reference + lacZ | |
good_genes_intersect <- intersect(skipped_genes$gene_name,good_genes) | |
bad_genes_intersect <- intersect(skipped_genes$gene_name,bad_genes) | |
## Check how many of these are found in both | |
overlap_bad_good <- intersect(good_genes_intersect,bad_genes_intersect) | |
``` | |
```{r} | |
## Subset gtf for gene of interest | |
subset(my_gene,gene_name=="1700040F15Rik") | |
``` | |
# Get most recent ensemble IDs using biomart | |
```{r} | |
library("biomaRt") | |
ensembl=useMart("ensembl") | |
ensembl = useDataset("mmusculus_gene_ensembl",mart=ensembl) | |
``` | |
```{r} | |
attributes = listAttributes(ensembl) | |
ensemble_gene_IDS <- getBM(attributes=c('ensembl_gene_id', 'ensembl_gene_id_version','ensembl_transcript_id','chromosome_name'), | |
mart = ensembl) | |
``` | |
# Check which gene IDS overlap between genes with multiple IDs from gtf and current ensemble IDs | |
```{r} | |
## Get all gene IDs for genes that have more than 1 | |
skipped_genes_IDs <- subset(my_gene,gene_name %in%skipped_genes$gene_name) | |
overlap_skipped_ensemble <- intersect(ensemble_gene_IDS$ensembl_gene_id,skipped_genes_IDs$gene_id) | |
``` | |
```{r} | |
## Get the genes that only had 1 gene ID | |
skipped_genes_IDs_overlap <- subset(skipped_genes_IDs,gene_id %in% overlap_skipped_ensemble) | |
## Get the gene ID that were not in ensemble for filtering later on | |
to_filter_for_multiple_gene_IDs <- subset(skipped_genes_IDs,!gene_id %in% overlap_skipped_ensemble) | |
more_than_two_geneID <- to_filter_for_multiple_gene_IDs %>% | |
count(gene_name) %>% | |
arrange(desc(n)) | |
## Count number of genes that still have more than 1 gene_ID | |
multiple_gene_IDs_table <- skipped_genes_IDs_overlap %>% | |
count(gene_name) %>% | |
subset(n > 1) %>% | |
arrange(desc(n)) | |
## Genes that were filtered for multiple gene IDs | |
multiple_gene_IDs <- multiple_gene_IDs_table$gene_name | |
## filter my_gene subset for the remaining genes | |
skipped_genes_IDs_overlap_geneIDs <- subset(skipped_genes_IDs_overlap,gene_name %in% multiple_gene_IDs_table$gene_name) | |
## Find chromosomal disagreement genes | |
chromosomal_disagreement <- skipped_genes_IDs_overlap_geneIDs %>% | |
group_by(gene_name) %>% | |
count(seqid) %>% | |
count(gene_name) %>% | |
arrange(desc(nn)) %>% | |
subset(nn > 1) | |
chromosomal_disagreement_genes <- chromosomal_disagreement$gene_name | |
## Get info for genes with chromosomal disagreement | |
chromosomal_disagreement_genes_table <- subset(skipped_genes_IDs_overlap_geneIDs,gene_name %in% chromosomal_disagreement$gene_name) | |
## Genes that are not filtered by duplicate Gene ID NOR chromosomal disagreement | |
count_skipped_genes_IDs_left <- skipped_genes_IDs_overlap_geneIDs %>% | |
group_by(gene_name) %>% | |
count(seqid) %>% | |
count(gene_name) %>% | |
arrange(desc(nn)) %>% | |
subset(nn == 1) | |
``` | |
```{r} | |
## Get genes that have strand disagreement | |
skipped_genes_IDs_overlap_geneIDs <- subset(skipped_genes_IDs_overlap_geneIDs,!gene_name %in% chromosomal_disagreement$gene_name) | |
skipped_genes_IDs_overlap_geneIDs_chrom_strand <- subset(skipped_genes_IDs_overlap_geneIDs,gene_name %in% count_skipped_genes_IDs_left$gene_name) | |
## Find genes that have more than one strand | |
skipped_genes_IDs_overlap_geneIDs_chrom_strand_sum <- skipped_genes_IDs_overlap_geneIDs_chrom_strand %>% | |
group_by(gene_name) %>% | |
count(strand) %>% | |
subset(n == 1) | |
skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes <- unique(skipped_genes_IDs_overlap_geneIDs_chrom_strand_sum$gene_name) | |
skipped_genes_IDs_overlap_geneIDs_chrom_strand_complicated <- subset(skipped_genes_IDs_overlap_geneIDs_chrom_strand, | |
!gene_name %in% skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes) | |
``` | |
## Finally, filter out all these gene IDs that fall into one of the four categories that cause problems | |
```{r} | |
## Multiple gene IDs | |
to_filter_for_multiple_gene_IDs | |
## Chromosomal disagreement | |
chromosomal_disagreement_genes | |
## Strand disagreement | |
skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes | |
## Complicated reasons | |
filtered_complicated_cases <- unique(skipped_genes_IDs_overlap_geneIDs_chrom_strand_complicated$gene_name) | |
## Sanity checks to make sure all genes included | |
length(intersect(skipped_genes_IDs$gene_name,to_filter_for_multiple_gene_IDs$gene_name)) | |
length(intersect(skipped_genes_IDs$gene_name,chromosomal_disagreement_genes)) | |
length(intersect(skipped_genes_IDs$gene_name,skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes)) | |
## Make a table with all gene IDs and their properties | |
multiple_geneIDs_genenames <- unique(to_filter_for_multiple_gene_IDs$gene_name) | |
chromosomal_disagreement_genes <- unique(chromosomal_disagreement_genes) | |
skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes <- unique(skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes) | |
filtered_complicated_cases <- unique(filtered_complicated_cases) | |
multiple_gene_ids_table <- data.frame("gene_name"=multiple_geneIDs_genenames, | |
"issue"=replicate(length(multiple_geneIDs_genenames),"Multiple_gene_IDs")) | |
chromosomal_disagreements_table <- data.frame("gene_name"=chromosomal_disagreement_genes, | |
"issue"=replicate(length(chromosomal_disagreement_genes),"Chromosomal_disagreement")) | |
strand_disagreements_table <- data.frame("gene_name"=skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes, | |
"issue"=replicate(length(skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes),"Strand_disagreement")) | |
complicated_cases_table <- data.frame("gene_name"=filtered_complicated_cases, | |
"issue"=replicate(length(filtered_complicated_cases),"Complicated_case")) | |
full_issues_table <- rbind(multiple_gene_ids_table, | |
chromosomal_disagreements_table, | |
strand_disagreements_table, | |
complicated_cases_table) | |
write.table(full_issues_table, | |
file="./gtf_issues_genes.tsv", | |
sep="\t", | |
row.names=FALSE, | |
col.names=TRUE, | |
quote=FALSE) | |
``` | |
# Finally clean up the gtf from all the genes that cause warnings | |
```{r} | |
original_gtf <- ens@ev$gtf | |
## First we remove the duplicate gene IDs that are not the current ones | |
filtered_gtf <- subset(original_gtf,!gene_id %in% to_filter_for_multiple_gene_IDs$gene_id) | |
## Then filter out the genes that have chromosome and strand disagreements | |
filtered_gtf <- subset(filtered_gtf,!gene_name %in% chromosomal_disagreement_genes) | |
filtered_gtf <- subset(filtered_gtf,!gene_name %in% skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes) | |
## Finally, for now, filter out the complicated gene cases as well | |
filtered_gtf <- subset(filtered_gtf,!gene_name %in% filtered_complicated_cases) | |
## Save a table that contains information | |
``` | |
```{r} | |
library(tidyr) | |
library(dplyr) | |
## Export the gtf in the original formatting | |
filtered_gtf_formatted <- filtered_gtf | |
## Use apply to add the last column to the gtf | |
## This will effectively loose the protein_id and exon_id entries but we don't need these in the Drop-seq pipe | |
filtered_gtf_formatted$last_column <- paste("gene_id \"",filtered_gtf_formatted[,13],"\"; ", | |
"transcript_id \"",filtered_gtf_formatted[,15],"\"; ", | |
"exon_number \"",filtered_gtf_formatted[,17],"\"; ", | |
"gene_name \"",filtered_gtf_formatted[,16],"\"; ", | |
"gene_biotype \"",filtered_gtf_formatted[,11],"\"; ", | |
"transcript_name \"",filtered_gtf_formatted[,12],"\";", | |
sep="") | |
filtered_gtf_formatted <- filtered_gtf_formatted %>% | |
dplyr::select(-c(id,protein_id,gene_biotype,transcript_name,gene_id,exon_id,transcript_id,gene_name,gene_name,exon_number)) | |
``` | |
```{r} | |
write.table(filtered_gtf_formatted, | |
file="./mm10_with_Adamts19_tm4a_tm4b_allele.fixed.gtf", | |
sep="\t", | |
row.names=FALSE, | |
col.names=FALSE, | |
quote=FALSE) | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment