Last active
September 20, 2017 19:11
-
-
Save jrjhealey/06a7fbdfe495bc5a8824ed152dff7919 to your computer and use it in GitHub Desktop.
Creating subsetted operons/gene genbank files from a 'parent' sequence!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# This script is designed to take a genbank file and 'slice out'/'subset' | |
# regions (genes/operons etc.) and produce a separate file. This can be | |
# done explicitly by telling the script which base sites to use, or can | |
# 'decide' for itself by blasting a fasta of the sequence you're inter- | |
# ed in against the Genbank you want to slice a record out of. | |
# Note, the script (obviously) does not preseve the index number of the | |
# bases from the original | |
# Based upon the tutorial at: | |
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc44 | |
# This script depends on BLAST and having the makeblastdb functionality | |
# available if BLAST_MODE is active. It also depends on Biopython. | |
# Set up and handle arguments: | |
import warnings | |
import sys | |
import subprocess | |
import os | |
import time | |
from Bio import SeqIO | |
__author__ = "Joe R. J. Healey" | |
__version__ = "1.1.0" | |
__title__ = "Genbank_slicer" | |
__license__ = "GPLv3" | |
__author_email__ = "[email protected]" | |
# Import SearchIO and suppress experimental warning | |
from Bio import BiopythonExperimentalWarning | |
with warnings.catch_warnings(): | |
warnings.simplefilter('ignore', BiopythonExperimentalWarning) | |
from Bio import SearchIO | |
def convert(basename, genbank): | |
'''Convert the provided genbank to a fasta to BLAST.''' | |
refFasta = "{}.fasta.tmp".format(basename) | |
SeqIO.convert(genbank, 'genbank', refFasta, 'fasta') | |
return refFasta | |
def runBlast(basename, refFasta, fasta, verbose): | |
'''Synthesise BLAST commands and make system calls''' | |
resultHandle = "{}.blastout.tmp".format(basename) | |
blastdb_cmd = 'makeblastdb -in {0} -dbtype nucl -title temp_blastdb'.format(refFasta) | |
blastn_cmd = 'blastn -query {0} -strand both -task blastn -db {1} -perc_identity 100 -outfmt 6 -out {2} -max_target_seqs 1'.format(fasta, refFasta, resultHandle) | |
print("Constructing BLAST Database: " + '\n' + blastdb_cmd) | |
print("BLASTing: " + '\n' + blastn_cmd) | |
DB_process = subprocess.Popen(blastdb_cmd, | |
shell=True, | |
stdin=subprocess.PIPE, | |
stdout=subprocess.PIPE, | |
stderr=subprocess.PIPE) | |
DB_process.wait() | |
BLAST_process = subprocess.Popen(blastn_cmd, | |
shell=True, | |
stdin=subprocess.PIPE, | |
stdout=subprocess.PIPE, | |
stderr=subprocess.PIPE) | |
BLAST_process.wait() | |
return resultHandle | |
def getIndices(resultHandle): | |
'''If not provided directly by the user, this function retrieves the best BLAST hit's indices.''' | |
blast_result = SearchIO.read(resultHandle, 'blast-tab') | |
print(blast_result[0][0]) | |
start = blast_result[0][0].hit_start | |
end = blast_result[0][0].hit_end | |
return start, end | |
def slice(start, end, genbank, FPoffset, TPoffset): | |
'''Subset the provided genbank to return the sub record.''' | |
seqObj = SeqIO.read(genbank, 'genbank') | |
subRecord = seqObj[start-FPoffset:end+TPoffset] | |
return subRecord | |
def main(): | |
################################################################################################### | |
# Parse command line arguments | |
import argparse | |
try: | |
parser = argparse.ArgumentParser( | |
description='This script slices entries such as genes or operons out of a genbank, subsetting them as their own file.') | |
parser.add_argument( | |
'-g', | |
'--genbank', | |
action='store', | |
required=True, | |
help='The genbank file you wish to subset.') | |
parser.add_argument( | |
'-o', | |
'--outfile', | |
action='store', | |
help='If specifed, the script will write a file, otherwise redirect STDOUT for pipelines.') | |
parser.add_argument( | |
'-s', | |
'--start', | |
type=int, | |
help='Integer base index to slice from.') | |
parser.add_argument( | |
'-e', | |
'--end', | |
type=int, | |
default=0, | |
help='Integer base index to slice to.') | |
parser.add_argument( | |
'-F', | |
'--FPoffset', | |
action='store', | |
type=int, | |
default=0, | |
help='If you want to capture regions around the target site, specify the 5\' offset.') | |
parser.add_argument( | |
'-T', | |
'--TPoffset', | |
action='store', | |
type=int, | |
default=0, | |
help='If you want to capture regions around the target site, specify the 3\' offset.') | |
parser.add_argument( | |
'-b', | |
'--blastmode', | |
action='store_true', | |
help='If flag is set, provide an input fasta (-f | --fasta), and BLAST will be called to find your sequence indices in the original genbank.') | |
parser.add_argument( | |
'-f', | |
'--fasta', | |
action='store', | |
help='The operon fasta to pull annotations from the provided genbank.') | |
parser.add_argument( | |
'-t', | |
'--tidy', | |
action='store_true', | |
help='Tell the script whether or not to remove the temporary files it generated during processing. Off by default. WARNING: removes files based on the "tmp" string.') | |
parser.add_argument( | |
'-m', | |
'--meta', | |
action='store', | |
default=None, | |
help='Specify a string to use as the Genbank meta-data fields if the parent one doesn\'t contain anything. Else inherits from parent sequence.') | |
parser.add_argument( | |
'-v', | |
'--verbose', | |
action='store_true', | |
help='Verbose behaviour. Shows progress of BLAST etc. if appropriate.') | |
args = parser.parse_args() | |
except NameError: | |
print "An exception occured with argument parsing. Check your provided options." | |
genbank = args.genbank | |
fasta = args.fasta | |
split = os.path.splitext(args.genbank) | |
basename = os.path.basename(split[0]) | |
start = args.start | |
end = args.end | |
FPoffset = args.FPoffset | |
TPoffset = args.TPoffset | |
blastMode = args.blastmode | |
outfile = args.outfile | |
fasta = args.fasta | |
tidy = args.tidy | |
meta = args.meta | |
verbose = args.verbose | |
# Main code: | |
if blastMode is not False and fasta is not None: | |
refFasta = convert(basename,genbank) | |
resultHandle = runBlast(basename, refFasta, fasta, verbose) | |
start, end = getIndices(resultHandle) | |
else: | |
if fasta is None: | |
print("No fasta was provided so BLAST mode cannot be used.") | |
if start is None or end is None: | |
print('No slice indices have been specified or retrieved from blastout') | |
sys.exit(1) | |
subRecord = slice(start, end, genbank, FPoffset, TPoffset) | |
# Populate the metadata of the genbank | |
if meta is not None: | |
subRecord.id = meta | |
subRecord.locus = meta + 'subregion' | |
subRecord.accession = meta | |
subRecord.name = meta | |
subRecord.annotations["date"] = time.strftime("%d-%b-%Y").upper() | |
# Other field options include: | |
# subRecord.annotations["source"] | |
# ["taxonomy"] | |
# ["keywords"] | |
# ["references"] | |
# ["accessions"] | |
# ["data_file_division"] e.g. BCT, PLN, UNK | |
# ["organism"] | |
# ["topology"] | |
if outfile is not None: | |
SeqIO.write(subRecord, outfile, "genbank") | |
else: | |
print(subRecord.format('genbank')) | |
if tidy is True: | |
if verbose is True: | |
rm="rm -v ./*tmp*" | |
else: | |
rm="rm ./*tmp*" | |
subprocess.Popen(rm,shell=True) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment