Last active
March 13, 2017 22:39
-
-
Save johnsolk/5375428dba77362c2a7875e9cf27e870 to your computer and use it in GitHub Desktop.
dammit_annotations_parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##gff-version 3.2.1 | |
Transcript_0 transdecoder CDS 1664 2062 . + . ID=cds.Transcript_0|m.1;Parent=Transcript_0|m.1 | |
Transcript_0 transdecoder exon 1 2064 . + . ID=Transcript_0|m.1.exon1;Parent=Transcript_0|m.1 | |
Transcript_0 transdecoder five_prime_UTR 1 1663 . + . ID=Transcript_0|m.1.utr5p1;Parent=Transcript_0|m.1 | |
Transcript_0 transdecoder gene 1 2064 . + . ID=Transcript_0|g.1;Name=ORF%20Transcript_0%7Cg.1%20Transcript_0%7Cm.1%20type%3A3prime_partial%20len%3A134%20%28%2B%29 | |
Transcript_0 transdecoder mRNA 1 2064 . + . ID=Transcript_0|m.1;Parent=Transcript_0|g.1;Name=ORF%20Transcript_0%7Cg.1%20Transcript_0%7Cm.1%20type%3A3prime_partial%20len%3A134%20%28%2B%29 | |
Transcript_0 transdecoder three_prime_UTR 2063 2064 . + . ID=Transcript_0|m.1.utr3p1;Parent=Transcript_0|m.1 | |
Transcript_100002 transdecoder CDS 1 1047 . + . ID=cds.Transcript_100002|m.114344;Parent=Transcript_100002|m.114344 | |
Transcript_100002 transdecoder CDS 1411 1722 . + . ID=cds.Transcript_100002|m.114346;Parent=Transcript_100002|m.114346 | |
Transcript_100002 transdecoder CDS 936 1316 . - . ID=cds.Transcript_100002|m.114345;Parent=Transcript_100002|m.114345 | |
Transcript_100002 transdecoder exon 1 1767 . + . ID=Transcript_100002|m.114344.exon1;Parent=Transcript_100002|m.114344 | |
Transcript_100002 transdecoder exon 1 1767 . - . ID=Transcript_100002|m.114345.exon1;Parent=Transcript_100002|m.114345 | |
Transcript_100002 transdecoder exon 1 1767 . + . ID=Transcript_100002|m.114346.exon1;Parent=Transcript_100002|m.114346 | |
Transcript_100002 transdecoder five_prime_UTR 1 1410 . + . ID=Transcript_100002|m.114346.utr5p1;Parent=Transcript_100002|m.114346 | |
Transcript_100002 transdecoder five_prime_UTR 1317 1767 . - . ID=Transcript_100002|m.114345.utr5p1;Parent=Transcript_100002|m.114345 | |
Transcript_100002 transdecoder gene 1 1767 . + . ID=Transcript_100002|g.114344;Name=ORF%20Transcript_100002%7Cg.114344%20Transcript_100002%7Cm.114344%20type%3A5prime_partial%20len%3A349%20%28%2B%29 | |
Transcript_100002 transdecoder gene 1 1767 . - . ID=Transcript_100002|g.114345;Name=ORF%20Transcript_100002%7Cg.114345%20Transcript_100002%7Cm.114345%20type%3Acomplete%20len%3A127%20%28-%29 | |
Transcript_100002 transdecoder gene 1 1767 . + . ID=Transcript_100002|g.114346;Name=ORF%20Transcript_100002%7Cg.114346%20Transcript_100002%7Cm.114346%20type%3Acomplete%20len%3A104%20%28%2B%29 | |
Transcript_100002 transdecoder mRNA 1 1767 . + . ID=Transcript_100002|m.114344;Parent=Transcript_100002|g.114344;Name=ORF%20Transcript_100002%7Cg.114344%20Transcript_100002%7Cm.114344%20type%3A5prime_partial%20len%3A349%20%28%2B%29 | |
Transcript_100002 transdecoder mRNA 1 1767 . - . ID=Transcript_100002|m.114345;Parent=Transcript_100002|g.114345;Name=ORF%20Transcript_100002%7Cg.114345%20Transcript_100002%7Cm.114345%20type%3Acomplete%20len%3A127%20%28-%29 | |
Transcript_100002 transdecoder mRNA 1 1767 . + . ID=Transcript_100002|m.114346;Parent=Transcript_100002|g.114346;Name=ORF%20Transcript_100002%7Cg.114346%20Transcript_100002%7Cm.114346%20type%3Acomplete%20len%3A104%20%28%2B%29 | |
Transcript_100002 transdecoder three_prime_UTR 1048 1767 . + . ID=Transcript_100002|m.114344.utr3p1;Parent=Transcript_100002|m.114344 | |
Transcript_100002 transdecoder three_prime_UTR 1723 1767 . + . ID=Transcript_100002|m.114346.utr3p1;Parent=Transcript_100002|m.114346 | |
Transcript_100002 transdecoder three_prime_UTR 1 935 . - . ID=Transcript_100002|m.114345.utr3p1;Parent=Transcript_100002|m.114345 | |
Transcript_100003 shmlast.LAST conditional_reciprocal_best_LAST 48 538 8.600000e-184 + . ID=homology:230094;Name=gi|768937493|ref|XP_011608972.1| PREDICTED: uncharacterized protein LOC105417370 [Takifugu rubripes];Target=gi|768937493|ref|XP_011608972.1| PREDICTED: uncharacterized protein LOC105417370 [Takifugu rubripes] 5 498 +;database=protein.fa |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# loops through directories with annotations from multiple species
# sorts each by ID
# drops all rows not annotated to custom reference protein database, startswith("gi")
# returns transcript ID and gene name
# writes to .csv file
import pandas as pd | |
import os | |
# requires dammit env: | |
# source activate py3.dammit | |
from dammit.fileio.gff3 import GFF3Parser | |
# Batch driver: for every dammit output directory under `dammit_dir`, read the
# species' GFF3 annotation table and write its (seqid, Name) columns, sorted
# by seqid, to a per-species CSV in `out_dir`.
dammit_dir = '/home/ljcohen/osmotic_damit/'
out_dir = '/home/ljcohen/osmotic_assemblies_farm/'
gff_suffix = ".trinity_out.Trinity.fasta.dammit.gff3"

# When True, additionally write the subset of annotations whose Name starts
# with "gi" (hits against the custom reference protein database) to
# <species>.trinity_out.Trinity.fasta.Fundulus.genenames.csv.
# Off by default: the original script had this output commented out.
write_fundulus_names = False

dammit_dirs = os.listdir(dammit_dir)
print(dammit_dirs)
for dammit_dirname in dammit_dirs:
    # "sbatch_files" is excluded explicitly, exactly as before
    # (presumably it holds job scripts rather than dammit output).
    if dammit_dirname == "sbatch_files":
        continue

    # The species label is everything before the first "." of the entry name.
    genus_species = dammit_dirname.split(".")[0]
    dammit_gff = os.path.join(dammit_dir, dammit_dirname, genus_species + gff_suffix)

    # os.listdir returns every entry, including plain files and directories
    # from unfinished runs; skip those instead of aborting the whole batch.
    if not os.path.isfile(dammit_gff):
        print("Skipping (no GFF3 found):", dammit_gff)
        continue
    print(dammit_gff)

    annotations = GFF3Parser(filename=dammit_gff).read()

    # All records, including those without a Name, ordered by transcript ID.
    all_names = annotations.sort_values(by=['seqid'], ascending=True)[['seqid', 'Name']]
    all_names_out = out_dir + genus_species + '.trinity_out.Trinity.fasta.all_gene_names.csv'
    all_names.to_csv(all_names_out)
    print("Written:", all_names_out)

    if write_fundulus_names:
        # Rows without a Name cannot be prefix-matched, so drop them first.
        named = annotations.dropna(subset=['Name'])
        fund = named[named['Name'].str.startswith("gi")]
        names = fund.sort_values(by=['seqid'], ascending=True)[['seqid', 'Name']]
        names_out = out_dir + genus_species + '.trinity_out.Trinity.fasta.Fundulus.genenames.csv'
        names.to_csv(names_out)
        print("Written:", names_out)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# takes the dammit .gff3 annotation of a Trinity .fasta assembly
# sorts by ID and e-value
# checks that score < 1e-05
# drops duplicates (picks lowest e-value)
# returns transcript ID and gene name
# writes to .csv file
import pandas as pd | |
from dammit.fileio.gff3 import GFF3Parser | |
# Load every annotation record from the dammit GFF3 output.
annotations = GFF3Parser(filename='mahi.trinity_out.Trinity.fasta.dammit.gff3').read()

# Discard the rows whose source is transdecoder; everything else is kept.
alignments = annotations.query('source != "transdecoder"')

# Order by transcript, then by ascending score, so the lowest score of each
# transcript comes first within its group.
ranked = alignments.sort_values(by=['seqid', 'score'], ascending=True)

# Apply the score cutoff.
significant = ranked.query('score < 1e-05')

# drop_duplicates keeps the first row per seqid, i.e. the lowest-scoring one.
best_hits = significant.drop_duplicates(subset='seqid')

names = best_hits[['seqid', 'Name']]
names.to_csv('genenames.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment