Last active
April 16, 2025 04:38
-
-
Save ammaraziz/6afc8ecce51ee320f9ac77e800f0e98b to your computer and use it in GitHub Desktop.
convert bioedit project binary file to fasta
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from pathlib import Path | |
import argparse | |
# Command-line interface definition. Built at import time so that main()
# only has to call parser.parse_args().
parser = argparse.ArgumentParser(
    description='Convert BioEdit Project binary file to fasta',
    prog='bio2fasta',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# input options
parser.add_argument(
    '-f',
    '--files',
    nargs='+',
    help='One or more bioedit files. Extension does not matter. Accepts bash expansion eg *.bio',
    required=True,
)

# Output naming: either derive the name from a regex over the file name, or
# reuse the input file name. The group is not required so that the default
# --filename-pattern is actually usable when neither option is given.
rename_group = parser.add_mutually_exclusive_group(required=False)
rename_group.add_argument(
    '-p',
    '--filename-pattern',
    help="Regex pattern to match the file name, used in naming output. Must include named regex group",
    default=r"(?P<id>^\d+)_(?P<name>\w+)_(?P<other>\w+)",
    required=False,
)
rename_group.add_argument(
    '-r',
    '--use-input-name',
    help='Use file input name as output name',
    action='store_true',
)

# Planned option, not wired into process() yet.
# parser.add_argument(
#     '-n',
#     '--output-name-pattern',
#     help="Must match at least one of --filename-pattern groups. Do not add extension",
#     default="{id}_{name}",
#     required=False,
# )

# NOTE: action='store_false' means args.skip_reference is True unless the
# flag is given, i.e. passing -s turns skipping OFF. The behaviour is kept
# as-is; only the help text was corrected to describe what really happens.
parser.add_argument(
    '-s',
    '--skip-reference',
    help="The first sequence (reference) is skipped by default; pass this flag to keep it in the output",
    action='store_false',
)
parser.add_argument(
    '-e',
    '--encoding',
    help="What format to expect for decoding the binary? default is latin1 to support accented characters. Choose ascii if unsure",
    default="latin1",
)
## functions | |
def parse_filename(filename: str, pattern = r"(?P<id>^\d+)_(?P<name>\w+)_(?P<other>\w+)"):
    '''
    build an output name from the named regex groups found in a file name

    the pattern is applied with re.match (anchored at the start of filename)
    and the values of its named groups are joined with "_" in the order the
    groups appear in the pattern. named groups that did not participate in
    the match are left out.

    raises ValueError if filename does not match the pattern or if the
    pattern yields no named group values.
    '''
    match = re.match(pattern, filename)
    if match is None:
        raise ValueError(
            f"File name {filename!r} does not match pattern {pattern!r}"
        )
    # Optional groups that did not match are reported as None by groupdict();
    # str.join would fail on them, so drop them.
    parts = [value for value in match.groupdict().values() if value is not None]
    if not parts:
        raise ValueError(
            f"Pattern {pattern!r} produced no named groups for file name {filename!r}"
        )
    return "_".join(parts)
def read_bio_binary(path: Path, encoding: str = "latin1") -> str:
    '''
    load a bioedit project file and return its content as text

    the file is read as raw bytes and decoded in one go with the given
    encoding. latin1 is the default because bioedit supports the extended
    ascii range; plain ascii can work as well.
    '''
    raw_bytes = Path(path).read_bytes()
    return raw_bytes.decode(encoding)
def match_n_clean(ascii_data: str, pattern: str = '\x04\x00\x00[A-Za-z]+', escape_len: int = 3):
    '''
    use regex pattern to match and clean the nucleotide sequence

    every non-overlapping match of pattern in ascii_data is collected and the
    first escape_len characters of each match are stripped. with the default
    pattern those are the three marker characters (\\x04\\x00\\x00) in front
    of the run of letters.

    returns a list of the cleaned matches in order of appearance; the list is
    empty when nothing matches.
    '''
    compiled = re.compile(pattern)
    # A single pass is enough: the previous append loop built the same list
    # that was then immediately rebuilt by a comprehension.
    return [hit[escape_len:] for hit in compiled.findall(ascii_data)]
def write_fasta(sequences : list, fasta_header: str, outfile: Path, skip_reference = True) -> None:
    '''
    write sequences to outfile in fasta format

    when skip_reference is true the first entry of sequences is dropped.
    the first record written uses fasta_header as its header line; each
    following record uses fasta_header + "_" + its position (1, 2, ...).
    outfile is opened in write mode, so it is created (or truncated) even
    when there is nothing to write.
    '''
    records = sequences[1:] if skip_reference else sequences
    with open(outfile, 'w') as handle:
        for index, seq in enumerate(records):
            # Only records after the first get a numeric suffix.
            title = fasta_header if index == 0 else fasta_header + "_" + str(index)
            handle.write(">" + title)
            handle.write("\n")
            handle.write(seq)
            handle.write("\n")
def process(pattern: str, files: list, use_file_name = True, skip_reference = True, encoding = "latin1"):
    '''
    convert each bioedit project file in files to a fasta file

    for every input file the binary content is decoded with encoding, the
    sequences are extracted with match_n_clean and written with write_fasta.

    the fasta header (and output file name, with ".fasta" appended) is the
    input file's stem when use_file_name is true; otherwise it is built by
    parse_filename from the file name and pattern. the output path is
    relative, so files land in the current working directory.

    skip_reference is passed through to write_fasta.
    '''
    for entry in files:
        source = Path(entry)
        data = read_bio_binary(source, encoding = encoding)
        matches = match_n_clean(data)

        if use_file_name:
            fasta_header = source.stem
        else:
            # Path.name is a property (a str), not a method.
            fasta_header = parse_filename(filename = source.name, pattern = pattern)
        outfile_name = fasta_header + ".fasta"

        write_fasta(
            sequences = matches,
            fasta_header = fasta_header,
            outfile = outfile_name,
            skip_reference = skip_reference,
        )
def main():
    '''
    script entry point: parse the command line and run the conversion
    '''
    cli = parser.parse_args()
    process(
        files=cli.files,
        pattern=cli.filename_pattern,
        use_file_name=cli.use_input_name,
        skip_reference=cli.skip_reference,
        encoding=cli.encoding,
    )


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment