Skip to content

Instantly share code, notes, and snippets.

@ammaraziz
Last active April 16, 2025 04:38
Show Gist options
  • Save ammaraziz/6afc8ecce51ee320f9ac77e800f0e98b to your computer and use it in GitHub Desktop.
Save ammaraziz/6afc8ecce51ee320f9ac77e800f0e98b to your computer and use it in GitHub Desktop.
Convert a BioEdit project binary file to FASTA
import re
from pathlib import Path
import argparse
# Command-line interface for bio2fasta. The parser is built at import time and
# consumed by main() through parser.parse_args().
parser = argparse.ArgumentParser(
    description='Convert BioEdit Project binary file to fasta',
    prog='bio2fasta',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# input options
parser.add_argument(
    '-f',
    '--files',
    nargs='+',
    help='One or more bioedit files. Extension does not matter. Accepts bash expansion eg *.bio',
    required=True,
)

# Output naming: either derive the name from a regex applied to the input file
# name (-p) or reuse the input file name as-is (-r). The group is optional so
# that the default --filename-pattern is actually usable when neither flag is
# given; passing both is still rejected by argparse.
rename_group = parser.add_mutually_exclusive_group(required=False)
rename_group.add_argument(
    '-p',
    '--filename-pattern',
    help="Regex pattern to match the file name, used in naming output. Must include named regex group",
    default=r"(?P<id>^\d+)_(?P<name>\w+)_(?P<other>\w+)",
    required=False,
)
rename_group.add_argument(
    '-r',
    '--use-input-name',
    help='Use file input name as output name',
    action='store_true',
)

# parser.add_argument(
# '-n',
# '--output-name-pattern',
# help="Must match at least one of --filename-pattern groups. Do not add extension",
# default="{id}_{name}",
# required=False,
# )

# NOTE: this is a store_false flag, so args.skip_reference is True (skip the
# first sequence) unless the flag is given. The historical spelling
# -s/--skip-reference therefore *disables* skipping; --keep-reference is an
# alias whose name matches what the flag really does.
parser.add_argument(
    '-s',
    '--skip-reference',
    '--keep-reference',
    dest='skip_reference',
    help="The first sequence (the reference) is skipped by default; pass this flag to keep it",
    action='store_false',
)
parser.add_argument(
    '-e',
    '--encoding',
    help="Encoding used to decode the binary file. The default, latin1, supports accented characters; choose ascii if unsure",
    default="latin1",
)
## functions
def parse_filename(filename: str, pattern = r"(?P<id>^\d+)_(?P<name>\w+)_(?P<other>\w+)") -> str:
    '''
    build an output name from the named regex groups found in a file name

    pattern is applied with re.match (anchored at the start of filename) and
    the values of its named groups are joined with "_" in the order the
    groups appear in the pattern.

    raises ValueError when filename does not match pattern, or when the
    pattern yields no named group values to build a name from.
    '''
    matches = re.match(pattern, filename)
    if matches is None:
        # re.match returns None on failure; fail with a message that names the
        # offending file instead of an AttributeError on None.
        raise ValueError(
            f"file name {filename!r} does not match pattern {pattern!r}"
        )
    # Optional groups that did not participate in the match are None and
    # cannot be joined, so they are left out of the name.
    parts = [value for value in matches.groupdict().values() if value is not None]
    if not parts:
        raise ValueError(
            f"pattern {pattern!r} captured no named groups from {filename!r}"
        )
    return "_".join(parts)
def read_bio_binary(path: Path, encoding: str = "latin1") -> str:
    '''
    load a bioedit project file and return its content as text

    the file is opened in binary mode and every byte is decoded with
    `encoding`. latin1 is the default because bioedit supports extended
    ascii; plain ascii can also work.
    '''
    with open(path, mode="rb") as handle:
        raw_bytes = handle.read()
    text = raw_bytes.decode(encoding)
    return text
def match_n_clean(ascii_data: str, pattern: str = '\x04\x00\x00[A-Za-z]+', escape_len: int = 3):
    '''
    use regex pattern to match and clean the nucleotide sequences

    every non-overlapping match of pattern in ascii_data is collected and the
    first escape_len characters of each match are dropped. with the defaults
    that removes the three marker characters (0x04 0x00 0x00) in front of
    each run of letters.

    returns the cleaned matches as a list, in order of appearance; the list
    is empty when nothing matches.

    NOTE: escape_len must agree with the length of the prefix that pattern
    matches; the slicing assumes pattern has no capturing groups (findall
    would then return the groups rather than the whole match).
    '''
    compiled = re.compile(pattern)
    # One pass is enough: the previous loop-and-append result was discarded
    # and rebuilt by an identical comprehension.
    return [hit[escape_len:] for hit in compiled.findall(ascii_data)]
def write_fasta(sequences : list, fasta_header: str, outfile: Path, skip_reference = True) -> None:
    '''
    write sequences to outfile as fasta records

    sequences: sequence strings, in the order they should be written
    fasta_header: header of the first record; every later record gets
        fasta_header + "_" + its position (1, 2, ...)
    outfile: path of the file to create; an existing file is overwritten
    skip_reference: when true, the first element of sequences is dropped
        before anything is written

    the file is created even when there is nothing left to write.
    '''
    if skip_reference:
        sequences = sequences[1:]
    with open(outfile, 'w') as file:
        for pos, seq in enumerate(sequences):
            # Only the first written record carries the bare header; the rest
            # are suffixed with their index so headers stay unique.
            header = fasta_header if pos == 0 else f"{fasta_header}_{pos}"
            file.write(f">{header}\n{seq}\n")
def process(pattern: str, files: list, use_file_name = True, skip_reference = True, encoding = "latin1"):
    '''
    convert each bioedit project file in files to a fasta file

    pattern: regex with named groups, handed to parse_filename to build the
        output name; only used when use_file_name is false
    files: paths of the input files
    use_file_name: when true, the input file name without its extension is
        used as fasta header and output name
    skip_reference: passed to write_fasta; when true the first sequence of
        each file is not written
    encoding: passed to read_bio_binary to decode the binary file

    each output is named <header>.fasta and, being a bare file name, is
    created relative to the current working directory.
    '''
    # process input files
    for entry in files:
        path = Path(entry)
        data = read_bio_binary(path, encoding = encoding)
        matches = match_n_clean(data)
        # echoes the extracted sequences to stdout
        print(matches)
        if use_file_name:
            fasta_header = path.stem
        else:
            # Path.name is a property (a str), not a method.
            fasta_header = parse_filename(filename = path.name, pattern = pattern)
        outfile_name = fasta_header + ".fasta"
        write_fasta(sequences = matches, fasta_header = fasta_header, outfile = outfile_name, skip_reference = skip_reference)
def main():
    '''
    script entry point: read the command line and hand the options to process
    '''
    cli = parser.parse_args()
    # Map the argparse attribute names onto the keyword names process expects.
    options = {
        "pattern": cli.filename_pattern,
        "files": cli.files,
        "skip_reference": cli.skip_reference,
        "use_file_name": cli.use_input_name,
        "encoding": cli.encoding,
    }
    process(**options)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment