Last active
August 20, 2022 08:23
-
-
Save terrymun/74b89de2116b52f06e2917d9ec8ce0ad to your computer and use it in GitHub Desktop.
Exploding multi-sequence FASTA file into individual sequence files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#=====================================================# | |
# Script for exploding FASTA files into individual | |
# FASTA files | |
# | |
# Author: Terry Mun <[email protected]> | |
# Date: November 9, 2016 | |
#=====================================================# | |
# Load libraries | |
import os, sys, time, re | |
from Bio import SeqIO | |
# File path | |
currentDir = os.path.dirname(os.path.realpath(__file__)) | |
# Configuration | |
minSeqLength = 80 | |
# Start reading/writing | |
with \ | |
open(currentDir + '/badRecords.txt', 'w') as badRecordsFile, \ | |
open(currentDir + '/correctedRecords.txt', 'w') as correctedRecordsFile, \ | |
open(currentDir + '/shortRecords.txt', 'w') as shortRecordsFile, \ | |
open(currentDir + '/example.fa', 'r') as fasta: | |
count = 0 | |
records = {'corrected': [], 'bad': [], 'tooShort': []} | |
# Make output directory if not present | |
if not os.path.exists(currentDir + '/seqs'): | |
os.makedirs(currentDir + '/seqs') | |
# Iterate through all records | |
for record in SeqIO.parse(fasta, 'fasta'): | |
count += 1 | |
# Only process if sequence is longer than 80 amino acids | |
if len(str(record.seq)) >= minSeqLength: | |
if re.match(r'(.*)\*$', str(record.seq)): | |
records['corrected'].append(record.id) | |
correctedRecordsFile.write(record.id+'\n') | |
sequence = re.sub(r'(.*)\*$', r'\1', str(record.seq)) | |
if '*' in sequence: | |
records['bad'].append(record.id) | |
badRecordsFile.write(record.id+'\n') | |
print(str(record.id) + ' Warning: Entry contains non-trailing stop codon.') | |
else: | |
with open(currentDir + '/seqs/' + record.id + '.fa', 'w') as entry: | |
entry.write('>'+str(record.id)+'\n') | |
entry.write(sequence+'\n') | |
if count%1000 == 0: | |
print('Records processed: '+str(count)) | |
else: | |
records['tooShort'].append(record.id) | |
shortRecordsFile.write(record.id+'\n') | |
print(str(record.id) + ' Warning: Entry is shorter than minimum sequence length of '+str(minSeqLength)) | |
print('====================================') | |
print('End of file') | |
print('====================================') | |
print('Total records processed: '+str(count)) | |
print('Corrected records (all is good): '+str(len(records['corrected']))) | |
print('Problematic records (requires attention): '+str(len(records['bad']))) | |
print('Short records (not written): '+str(len(records['tooShort']))) | |
print('====================================') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment