ammaraziz · April 16, 2025 04:38
diff --git a/bio2fasta.py b/bio2fasta.py
 import re
 from pathlib import Path
 import argparse

 # Command-line interface of the converter. Built at import time so that
 # main() can call parser.parse_args().
 parser = argparse.ArgumentParser(
     description='Convert BioEdit Project binary file to fasta',
     prog='bio2fasta',
     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 )

 # input options
 parser.add_argument(
     '-f',
     '--files',
     nargs='+',
     help='One or more bioedit files. Extension does not matter. Accepts bash expansion eg *.bio',
     required=True,
 )

 # output naming: exactly one of -p / -r has to be given
 rename_group = parser.add_mutually_exclusive_group(required=True)
 rename_group.add_argument(
     '-p',
     '--filename-pattern',
     help="Regex pattern to match the file name, used in naming output. Must include named regex group",
     default=r"(?P<id>^\d+)_(?P<name>\w+)_(?P<other>\w+)",
     required=False,
 )
 rename_group.add_argument(
     '-r',
     '--use-input-name',
     help='Use file input name as output name',
     action='store_true',
 )

 # parser.add_argument(
 #     '-n',
 #     '--output-name-pattern',
 #     help="Must match at least one of --filename-pattern groups. Do not add extension",
 #     default="{id}_{name}",
 #     required=False,
 # )

 # store_false: args.skip_reference is True unless -s is passed, so the flag
 # turns skipping OFF. The help text says so explicitly.
 parser.add_argument(
     '-s',
     '--skip-reference',
     help="Keep the first (reference) sequence in the output. Without this flag the first sequence is skipped",
     action='store_false',
 )

 parser.add_argument(
     '-e',
     '--encoding',
     help="What format to expect for decoding the binary? default is latin1 to support accented characters. Choose ascii if unsure",
     default="latin1",
 )

 ## functions

 def parse_filename(filename: str, pattern = r"(?P<id>^\d+)_(?P<name>\w+)_(?P<other>\w+)"):
     '''
     build a name from the named regex groups matched at the start of filename

     filename: text to match (re.match, so the pattern is anchored at the start)
     pattern: regex with named groups; the captured values are joined
         with "_" in the order the groups appear in the pattern
     returns: the joined string
     raises: ValueError when filename does not match pattern
     '''
     matches = re.match(pattern, filename)
     if matches is None:
         raise ValueError(
             "file name {!r} does not match pattern {!r}".format(filename, pattern)
         )
     # a named group that did not take part in the match is None; str.join
     # would fail on it, so leave it out
     parts = [value for value in matches.groupdict().values() if value is not None]
     return "_".join(parts)

 def read_bio_binary(path: Path, encoding: str = "latin1") -> str:
     '''
     load a bioedit project file and return its content as text

     the file is opened in binary mode and all of its bytes are decoded
     with `encoding` (latin1 unless the caller passes something else,
     e.g. ascii)
     '''
     with open(path, mode="rb") as handle:
         raw_bytes = handle.read()
     return raw_bytes.decode(encoding)

 def match_n_clean(ascii_data: str, pattern: str = '\x04\x00\x00[A-Za-z]+', escape_len: int = 3):
     '''
     use regex pattern to match and clean the nucleotide sequence

     ascii_data: decoded file content to search
     pattern: regex for one record; the default is the three characters
         \\x04 \\x00 \\x00 followed by one or more ASCII letters
     escape_len: number of leading characters removed from every match
         (3 drops the marker of the default pattern)
     returns: list with one cleaned string per match, in order of appearance
     '''
     compiled = re.compile(pattern)
     # single pass: strip the leading marker from each match
     return [hit[escape_len:] for hit in compiled.findall(ascii_data)]

 def write_fasta(sequences : list, fasta_header: str, outfile: Path, skip_reference = True) -> None:
     '''
     write sequences to outfile as fasta records

     sequences: sequence strings, one record is written per item
     fasta_header: name of the first record written; every later record is
         named fasta_header + "_" + its 0-based position in the written
         list (so the second record gets "_1")
     outfile: path of the file to create or overwrite
     skip_reference: when true, sequences[0] is dropped before writing
     '''
     if skip_reference:
         sequences = sequences[1:]
     with open(outfile, 'w') as file:
         for pos, seq in enumerate(sequences):
             # only records after the first carry a numeric suffix
             name = fasta_header if pos == 0 else fasta_header + "_" + str(pos)
             file.write(">" + name + "\n" + seq + "\n")

 def process(pattern: str, files: list, use_file_name = True, skip_reference = True, encoding = "latin1"):
     '''
     convert every bioedit file in files to a fasta file

     pattern: regex handed to parse_filename when use_file_name is false
     files: paths of the input files
     use_file_name: when true the input file stem is used as fasta header
         and output name; otherwise both come from parse_filename
     skip_reference: forwarded to write_fasta
     encoding: forwarded to read_bio_binary

     each output is written to "<header>.fasta"; the name carries no
     directory part
     '''
     # process input files
     for f in files:
         f = Path(f)
         data = read_bio_binary(f, encoding = encoding)
         matches = match_n_clean(data)
         # NOTE(review): looks like leftover debug output - confirm before removing
         print(matches)

         if use_file_name:
             fasta_header = f.stem
         else:
             # Path.name is a property (a str), it must not be called
             fasta_header = parse_filename(filename = f.name, pattern = pattern)
         outfile_name = fasta_header + ".fasta"

         write_fasta(sequences = matches, fasta_header = fasta_header, outfile = outfile_name, skip_reference = skip_reference)

 def main():
     '''
     parse the command line and pass the options on to process
     '''
     cli = parser.parse_args()
     options = {
         "pattern": cli.filename_pattern,
         "files": cli.files,
         "skip_reference": cli.skip_reference,
         "use_file_name": cli.use_input_name,
         "encoding": cli.encoding,
     }
     process(**options)

 # run the conversion only when the file is executed as a script
 if __name__ == "__main__":
     main()
	import re
	from pathlib import Path
	import argparse

	# Command-line interface of the converter. Built at import time so that
	# main() can call parser.parse_args().
	parser = argparse.ArgumentParser(
	    description='Convert BioEdit Project binary file to fasta',
	    prog='bio2fasta',
	    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	)

	# input options
	parser.add_argument(
	    '-f',
	    '--files',
	    nargs='+',
	    help='One or more bioedit files. Extension does not matter. Accepts bash expansion eg *.bio',
	    required=True,
	)

	# output naming: exactly one of -p / -r has to be given
	rename_group = parser.add_mutually_exclusive_group(required=True)
	rename_group.add_argument(
	    '-p',
	    '--filename-pattern',
	    help="Regex pattern to match the file name, used in naming output. Must include named regex group",
	    default=r"(?P<id>^\d+)_(?P<name>\w+)_(?P<other>\w+)",
	    required=False,
	)
	rename_group.add_argument(
	    '-r',
	    '--use-input-name',
	    help='Use file input name as output name',
	    action='store_true',
	)

	# parser.add_argument(
	#     '-n',
	#     '--output-name-pattern',
	#     help="Must match at least one of --filename-pattern groups. Do not add extension",
	#     default="{id}_{name}",
	#     required=False,
	# )

	# store_false: args.skip_reference is True unless -s is passed, so the flag
	# turns skipping OFF. The help text says so explicitly.
	parser.add_argument(
	    '-s',
	    '--skip-reference',
	    help="Keep the first (reference) sequence in the output. Without this flag the first sequence is skipped",
	    action='store_false',
	)

	parser.add_argument(
	    '-e',
	    '--encoding',
	    help="What format to expect for decoding the binary? default is latin1 to support accented characters. Choose ascii if unsure",
	    default="latin1",
	)

	## functions

	def parse_filename(filename: str, pattern = r"(?P<id>^\d+)_(?P<name>\w+)_(?P<other>\w+)"):
	    '''
	    build a name from the named regex groups matched at the start of filename

	    filename: text to match (re.match, so the pattern is anchored at the start)
	    pattern: regex with named groups; the captured values are joined
	        with "_" in the order the groups appear in the pattern
	    returns: the joined string
	    raises: ValueError when filename does not match pattern
	    '''
	    matches = re.match(pattern, filename)
	    if matches is None:
	        raise ValueError(
	            "file name {!r} does not match pattern {!r}".format(filename, pattern)
	        )
	    # a named group that did not take part in the match is None; str.join
	    # would fail on it, so leave it out
	    parts = [value for value in matches.groupdict().values() if value is not None]
	    return "_".join(parts)

	def read_bio_binary(path: Path, encoding: str = "latin1") -> str:
	    '''
	    load a bioedit project file and return its content as text

	    the file is opened in binary mode and all of its bytes are decoded
	    with `encoding` (latin1 unless the caller passes something else,
	    e.g. ascii)
	    '''
	    with open(path, mode="rb") as handle:
	        raw_bytes = handle.read()
	    return raw_bytes.decode(encoding)

	def match_n_clean(ascii_data: str, pattern: str = '\x04\x00\x00[A-Za-z]+', escape_len: int = 3):
	    '''
	    use regex pattern to match and clean the nucleotide sequence

	    ascii_data: decoded file content to search
	    pattern: regex for one record; the default is the three characters
	        \\x04 \\x00 \\x00 followed by one or more ASCII letters
	    escape_len: number of leading characters removed from every match
	        (3 drops the marker of the default pattern)
	    returns: list with one cleaned string per match, in order of appearance
	    '''
	    compiled = re.compile(pattern)
	    # single pass: strip the leading marker from each match
	    return [hit[escape_len:] for hit in compiled.findall(ascii_data)]

	def write_fasta(sequences : list, fasta_header: str, outfile: Path, skip_reference = True) -> None:
	    '''
	    write sequences to outfile as fasta records

	    sequences: sequence strings, one record is written per item
	    fasta_header: name of the first record written; every later record is
	        named fasta_header + "_" + its 0-based position in the written
	        list (so the second record gets "_1")
	    outfile: path of the file to create or overwrite
	    skip_reference: when true, sequences[0] is dropped before writing
	    '''
	    if skip_reference:
	        sequences = sequences[1:]
	    with open(outfile, 'w') as file:
	        for pos, seq in enumerate(sequences):
	            # only records after the first carry a numeric suffix
	            name = fasta_header if pos == 0 else fasta_header + "_" + str(pos)
	            file.write(">" + name + "\n" + seq + "\n")

	def process(pattern: str, files: list, use_file_name = True, skip_reference = True, encoding = "latin1"):
	    '''
	    convert every bioedit file in files to a fasta file

	    pattern: regex handed to parse_filename when use_file_name is false
	    files: paths of the input files
	    use_file_name: when true the input file stem is used as fasta header
	        and output name; otherwise both come from parse_filename
	    skip_reference: forwarded to write_fasta
	    encoding: forwarded to read_bio_binary

	    each output is written to "<header>.fasta"; the name carries no
	    directory part
	    '''
	    # process input files
	    for f in files:
	        f = Path(f)
	        data = read_bio_binary(f, encoding = encoding)
	        matches = match_n_clean(data)
	        # NOTE(review): looks like leftover debug output - confirm before removing
	        print(matches)

	        if use_file_name:
	            fasta_header = f.stem
	        else:
	            # Path.name is a property (a str), it must not be called
	            fasta_header = parse_filename(filename = f.name, pattern = pattern)
	        outfile_name = fasta_header + ".fasta"

	        write_fasta(sequences = matches, fasta_header = fasta_header, outfile = outfile_name, skip_reference = skip_reference)

	def main():
	    '''
	    parse the command line and pass the options on to process
	    '''
	    args = parser.parse_args()

	    process(
	        pattern=args.filename_pattern,
	        files=args.files,
	        skip_reference=args.skip_reference,
	        use_file_name=args.use_input_name,
	        encoding=args.encoding,
	    )

	# run the conversion only when the file is executed as a script
	if __name__ == "__main__":
	    main()