Skip to content

Instantly share code, notes, and snippets.

@coleoguy
Last active January 4, 2016 09:28
Show Gist options
  • Save coleoguy/8601904 to your computer and use it in GitHub Desktop.
Save coleoguy/8601904 to your computer and use it in GitHub Desktop.
creates a fasta file with 300bp chunks from the start of all exons larger than 300 bp
setwd("~/Desktop/retrogenes/data/")
gff <- read.csv("exons.csv", header = F, as.is=T) #This is a parsed GFF3 file that contains the lines for exons
# lets get big ones >300 bp
gff <- gff[gff[, 3] - gff[, 2] > 300, ]
setwd("~/Desktop/retrogenes/data/chromosomes") #This is the directory that contains the chromosomes of the genome
library(ape)
seq <- name <- list()
LGS <- totals <- vector()
counter <- 0
fileName <- "NONE"
fileName <- paste(gff[1, 1], "-2.fa", sep="")
chromosome <- read.dna(fileName, format = 'fasta', as.character = T) #This opens the first chromosome
for(i in 1:nrow(gff)){
if(gff[i, 1] != substr(fileName, 1, stop=nchar(fileName) - 5)){
LGS <- c(LGS, fileName)
totals <- c(totals, counter)
fileName <- paste(gff[i, 1], "-2.fa", sep="")
chromosome <- read.dna(fileName, format = 'fasta', as.character = T)
print(paste("Working on:", fileName))
counter <- 0
}
counter <- counter + 1
print(paste("exon:", counter))
seq[[i]] <- paste(chromosome[gff[i, 2]:(gff[i, 2]+300)], collapse="")
name[[i]] <- paste(gff[i, 1], gff[i,2])
fileName <- paste(gff[i, 1], "-2.fa", sep="")
}
LGS <- c(LGS, fileName)
totals <- c(totals, counter)
names(totals) <- LGS
totals
names(seq) <- name
write.dna(seq, format = "fasta", file = "300+bpExons.fa", colw=300) #This saves the final file
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment