Created
February 27, 2012 00:10
-
-
Save zachcp/1919989 to your computer and use it in GitHub Desktop.
Batch upload of GenBank files to a BioSQL database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Load multiple genbankfiles into BioSQL | |
You will need to adjust the database parameters and have a BioSQL database set | |
up. See: http://biopython.org/wiki/BioSQL | |
adapted from bchapmanns github script | |
https://github.com/chapmanb/bcbb > gff > Scripts > gff > gff_to_biosql.py | |
and from single upload script: | |
https://gist.github.com/1919511 | |
Usage: | |
python batch_gb_to_biosql.py | |
""" | |
from __future__ import with_statement | |
import os, sys | |
from BioSQL import BioSeqDatabase | |
from Bio import SeqIO | |
# Connection settings for the BioSQL database.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into (and committed with) this file;
# when a variable is unset the original hard-coded default is used.
driver = os.environ.get("BIOSQL_DRIVER", "MySQLdb")  # DB-API driver module name
user = os.environ.get("BIOSQL_USER", "root")
passwd = os.environ.get("BIOSQL_PASSWD", "")
host = os.environ.get("BIOSQL_HOST", "localhost")
dbname = os.environ.get("BIOSQL_DBNAME", "bioseqdb")
def main():
    """Load every GenBank file in the current working directory into BioSQL.

    A directory entry is selected when it is a regular file whose extension
    is ``.gb`` or ``.gbk`` (compared case-insensitively).  Matching on the
    real extension, rather than on a substring of the name, keeps files such
    as ``notes.gb.txt`` or ``x.gbk.bak`` out of the batch.  Files are
    processed in sorted name order so that runs are reproducible.

    Each selected file is handed to ``genbanktoBioSql``.
    """
    genbank_extensions = (".gb", ".gbk")
    inputfiles = sorted(
        name for name in os.listdir(os.getcwd())
        # Skip directories: they cannot be opened as sequence files.
        if os.path.isfile(name)
        and os.path.splitext(name)[1].lower() in genbank_extensions
    )
    print("Starting Batch Processing ..... ")
    for inputfile in inputfiles:
        genbanktoBioSql(inputfile)
def genbanktoBioSql(gbfile, length=10000):
    """Load the long records of one GenBank file into a BioSQL sub-database.

    The sub-database is named after ``gbfile`` with its extension removed.
    If a sub-database of that name already exists it is removed and
    recreated, so re-running the script replaces earlier data.

    Parameters:
        gbfile: path of the GenBank file to load.
        length: only records strictly longer than this are loaded.

    Raises:
        Any exception raised while writing to the database is re-raised
        after the open transaction has been rolled back.
    """
    print("Parsing Genbank file sequence file....")
    with open(gbfile) as gb_handle:
        records = list(SeqIO.parse(gb_handle, "genbank"))

    print("Sorting by size and name.......")
    longrecords = [record for record in records if len(record) > length]
    longrecords.sort(key=lambda x: x.name)  # sort by name

    # Use the record name as the id to avoid duplicate entry names.
    for record in longrecords:
        record.id = "{0}".format(record.name)
        print(record.id)

    # The renamed records are written to a temporary GenBank file and parsed
    # back before loading.  The file is removed even when writing or parsing
    # fails, so a stale temp.gbk is never left in the working directory.
    temp_path = "temp.gbk"
    try:
        SeqIO.write(longrecords, temp_path, "gb")
        with open(temp_path) as gb_handle:
            records = list(SeqIO.parse(gb_handle, "genbank"))
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

    # splitext handles both ".gb" and ".gbk"; slicing off a fixed four
    # characters mangled the name of ".gb" files.
    subdb_name = os.path.splitext(gbfile)[0]

    print("Writing to BioSQL database...")
    server = BioSeqDatabase.open_database(driver=driver, user=user,
                                          passwd=passwd, host=host, db=dbname)
    try:
        if subdb_name in server.keys():
            # Replace an existing sub-database of the same name.
            server.remove_database(subdb_name)
            server.adaptor.commit()
        server.new_database(subdb_name)
        db = server[subdb_name]
        db.load(records)
        server.adaptor.commit()
    except:
        # Deliberately bare: roll back on any interruption, then re-raise.
        server.adaptor.rollback()
        raise
    finally:
        # One connection is opened per input file; release it when done.
        server.close()
if __name__ == "__main__":
    # The script takes no command-line arguments: if any are supplied,
    # show the module usage text and stop instead of running the batch.
    extra_args = sys.argv[1:]
    if extra_args:
        print(__doc__)
        sys.exit()
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment