Last active
August 29, 2015 14:01
-
-
Save LeeBergstrand/3811b40a7f3d0b8c6080 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Created by: Lee Bergstrand | |
# Descript: A simple program takes a list of protein accessions and an amino acide fasta file | |
# and extracts protiens that cluster around those clusters according to clustering | |
# generated by CD-HIT. | |
# | |
# Requirements: - This script requires the Biopython module: http://biopython.org/wiki/Download | |
# | |
# Usage: CDHITtoFASTA.py <sequenceList.txt> <sequences.faa> <cdhit.clstr> | |
# Example: CDHITtoFASTA.py sequenceList.txt sequences.faa cdhit.clstr | |
#---------------------------------------------------------------------------------------- | |
#=========================================================================================================== | |
#Imports: | |
import sys | |
import re | |
from Bio import SeqIO | |
#=========================================================================================================== | |
# Functions: | |
#----------------------------------------------------------------------------------------------------------- | |
# 1: Checks if in proper number of arguments are passed gives instructions on proper use. | |
def argsCheck(numArgs): | |
if len(sys.argv) < numArgs or len(sys.argv) > numArgs: | |
print "Sequence Downloader" | |
print "By Lee Bergstrand\n" | |
print "Usage: " + sys.argv[0] + " <sequenceList.txt> <sequences.faa> <cdhit.clstr>" | |
print "Examples: " + sys.argv[0] + " sequenceList.txt sequences.faa cdhit.clstr " | |
sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) | |
#----------------------------------------------------------------------------------------------------------- | |
# 2: Returns a list containing each cluster in the cd hit-file as a list of accession strings. | |
def createClusters(cdHitClustering): | |
clusterList = cdHitClustering.split("Cluster") | |
del clusterList[0] # The first cluster in the file is split into an empty empty element and and the first cluster | |
# Del removes this empty element. | |
for x in xrange(len(clusterList)): | |
accessions = AccessionExtractRegex.findall(clusterList[x]) | |
for i in xrange(len(accessions)): | |
accessions[i] = accessions[i].lstrip('>').rstrip('.') # Strips accession to bare id number. | |
clusterList[x] = accessions | |
return clusterList | |
#----------------------------------------------------------------------------------------------------------- | |
# 3: Extracts the accessions of each member of the CD-HIT cluster that contains a sequence from the sequence list file. | |
def getClusterAccessions(seqList, clusterList): | |
clustersToCreateIntoFASTA = [] | |
for seq in seqList: | |
for cluster in clusterList: | |
if seq in cluster: | |
if cluster not in clustersToCreateIntoFASTA: | |
clustersToCreateIntoFASTA.append(cluster) | |
break | |
return clustersToCreateIntoFASTA | |
#----------------------------------------------------------------------------------------------------------- | |
# 4: Extracts the FASTA formatted sequece of each member of the CD-HIT cluster. | |
def getClusterFASTAs(clustersToCreateIntoFASTA, seqRecordDict): | |
FASTAsToWrite = [] | |
for cluster in clustersToCreateIntoFASTA: | |
ClusterFASTA = [] | |
for sequence in cluster: | |
try: | |
FASTA = seqRecordDict[sequence].format("fasta") | |
except KeyError: | |
print "Error: " + sequence + " could not be found in input FASTA file!" | |
continue | |
ClusterFASTA.append(FASTA) | |
ClusterFASTAString = "".join(ClusterFASTA) | |
FASTAsToWrite.append(ClusterFASTAString) | |
return FASTAsToWrite | |
#=========================================================================================================== | |
# Main program code: | |
# House keeping... | |
argsCheck(4) # Checks if the number of arguments are correct. | |
AccessionExtractRegex = re.compile(">.*\.\.\.") | |
# Stores file one for input checking. | |
seqListFile = sys.argv[1] | |
seqFile = sys.argv[2] | |
clusterFile = sys.argv[3] | |
# File extension checks. | |
if not seqListFile.endswith(".txt"): | |
print "[Warning] " + seqListFile + " may not be a txt file!" | |
if not seqFile.endswith(".faa"): | |
print "[Warning] " + seqFile + " may not be a FASTA file!" | |
if not clusterFile.endswith(".clstr"): | |
print "[Warning] " + clusterFile + " may not be a CD-HIT file!" | |
# Reads sequence file list and stores it as a list strings. Safely closes file: | |
try: | |
print ">> Opening sequence list..." | |
with open(seqListFile,"rU") as newFile: | |
sequences = newFile.read() | |
seqList = sequences.splitlines() # Splits string into a list. Each element is a single line from the string. | |
newFile.close() | |
print ">> Done." | |
except IOError: | |
print "Failed to open " + seqListFile | |
sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) | |
print ">> You have listed", len(seqList), "sequences. They are:" | |
print "\n" + sequences | |
# Reads a FASTA file and stores sequences as a dictionary of sequence record objects. Safely closes file: | |
try: | |
print ">> Opening FASTA file..." | |
handle = open(seqFile, "rU") | |
seqRecords = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) | |
handle.close() | |
print ">> Done." | |
except IOError: | |
print "Failed to open " + seqFile | |
sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) | |
# Reads a CD-HIT CLuster file and reads it into memory. Safely closes file: | |
try: | |
print ">> Opening CD-HIT Clustering..." | |
with open(clusterFile,"rU") as newFile: | |
clustering = newFile.read() | |
clusterList = createClusters(clustering) | |
print ">> Done." | |
except IOError: | |
print "Failed to open " + clusterFile | |
sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) | |
print ">> Extracting clusters which contain sequences from the sequence list." | |
ClusterAccessions = getClusterAccessions(seqList, clusterList) | |
print ">> Extracting clusters from the input FASTA file." | |
FASTAsToWrite = getClusterFASTAs(ClusterAccessions, seqRecords) | |
# Writes clusters to file. | |
print ">> Writing clusters to file in FASTA format." | |
ClusterCount = 1 | |
for FASTA in FASTAsToWrite: | |
OutFile = seqFile.rstrip(".faa") + "Cluster" + str(ClusterCount) + ".faa" | |
print ">> Writing Cluster " + str(ClusterCount) + " to file..." | |
try: | |
with open(OutFile,"w") as newFile: | |
newFile.write(FASTA) | |
newFile.close() | |
print ">> Done." | |
except IOError: | |
print "Failed to write to " + OutFile | |
sys.exit(1) # Aborts program. (exit(1) indicates that an error occurred) | |
ClusterCount += 1 | |
print ">> All Done." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment