Skip to content

Instantly share code, notes, and snippets.

@zachcp
Created February 27, 2012 00:10
Show Gist options
  • Save zachcp/1919989 to your computer and use it in GitHub Desktop.
Save zachcp/1919989 to your computer and use it in GitHub Desktop.
Batch upload of GenBank files to a BioSQL database
#!/usr/bin/env python
"""Load multiple genbankfiles into BioSQL
You will need to adjust the database parameters below and have a BioSQL
database set up. See: http://biopython.org/wiki/BioSQL

Adapted from chapmanb's GitHub script:
https://github.com/chapmanb/bcbb > gff > Scripts > gff > gff_to_biosql.py
and from the single-upload script:
https://gist.github.com/1919511
Usage:
python batch_gb_to_biosql.py
"""
from __future__ import with_statement

import os, sys
import tempfile

from Bio import SeqIO
from BioSQL import BioSeqDatabase
# Connection settings handed to BioSeqDatabase.open_database() when a file is
# loaded. Adjust them to match your own BioSQL installation before running.
driver = "MySQLdb"  # driver name passed as open_database(driver=...)
user = "root"  # database user name
passwd = ""  # password for that user (empty in this template)
host = "localhost"  # host the database server runs on
dbname = "bioseqdb"  # database to connect to, passed as open_database(db=...)
def main():
    """Load every GenBank file in the current working directory into BioSQL.

    A directory entry is selected when it is a regular file whose extension
    is ``.gb`` or ``.gbk`` (compared case-insensitively). The selected files
    are handed to ``genbanktoBioSql`` one at a time, in sorted name order so
    that repeated runs process them in the same sequence.
    """
    # Compare the real extension instead of searching the whole name, so
    # that files such as "notes.gb.txt" or "genome.gbk.bak" are not picked up.
    genbank_extensions = (".gb", ".gbk")
    inputfiles = sorted(
        name for name in os.listdir(os.getcwd())
        if os.path.isfile(name)
        and os.path.splitext(name)[1].lower() in genbank_extensions
    )
    print("Starting Batch Processing ..... ")
    for inputfile in inputfiles:
        genbanktoBioSql(inputfile)
def genbanktoBioSql(gbfile, length=10000):
    """Load the long records of one GenBank file into its own BioSQL sub-database.

    Records longer than ``length`` are kept, sorted by name, given their
    name as identifier and loaded into a sub-database named after the file
    (its base name without the extension). If a sub-database of that name
    already exists it is removed and recreated before loading.

    Args:
        gbfile: Path of the GenBank file to load.
        length: Only records whose length is greater than this are loaded.

    Returns:
        Whatever ``db.load`` returns for the loaded records (Biopython
        documents this as the number of records loaded).

    Raises:
        Any exception raised while parsing or loading is propagated; pending
        database changes are rolled back before it is re-raised.
    """
    print("Parsing Genbank file sequence file....")
    with open(gbfile) as gb_handle:
        records = list(SeqIO.parse(gb_handle, "genbank"))

    print("Sorting by size and name.......")
    longrecords = [record for record in records if len(record) > length]
    longrecords.sort(key=lambda record: record.name)

    # Use the record name as identifier to avoid duplicate entry names.
    for record in longrecords:
        record.id = record.name
        print(record.id)

    # The renamed records are written out and parsed again, as the original
    # workflow did. NOTE(review): presumably this makes the re-parsed records
    # carry the new identifier consistently -- confirm whether it is needed.
    # A unique temporary file is used so that a real "temp.gbk" in the
    # working directory is never overwritten or deleted, and the file is
    # removed even when writing or parsing fails.
    temp_fd, temp_path = tempfile.mkstemp(suffix=".gbk")
    os.close(temp_fd)
    try:
        SeqIO.write(longrecords, temp_path, "gb")
        with open(temp_path) as gb_handle:
            records = list(SeqIO.parse(gb_handle, "genbank"))
    finally:
        os.remove(temp_path)

    # Strip the actual extension: slicing off four characters would also
    # remove the last letter of the name for ".gb" files.
    subdb_name = os.path.splitext(os.path.basename(gbfile))[0]

    print("Writing to BioSQL database...")
    server = BioSeqDatabase.open_database(driver=driver, user=user,
                                          passwd=passwd, host=host, db=dbname)
    try:
        if subdb_name in server.keys():
            # Replace an existing sub-database instead of appending to it.
            server.remove_database(subdb_name)
            server.adaptor.commit()
        server.new_database(subdb_name)
        db = server[subdb_name]
        loaded = db.load(records)
        server.adaptor.commit()
    except BaseException:
        # Roll back on any failure (including interrupts), then re-raise.
        server.adaptor.rollback()
        raise
    finally:
        # Release the connection; one is opened for every file processed.
        server.close()
    return loaded
if __name__ == "__main__":
    # The script takes no arguments: if any are supplied, show the usage
    # text from the module docstring and exit without loading anything.
    if sys.argv[1:]:
        print(__doc__)
        sys.exit()
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment