Created
December 21, 2013 21:29
-
-
Save michaelenger/8075359 to your computer and use it in GitHub Desktop.
Example of how to use regular expressions to modify FASTA headers. See: https://en.wikipedia.org/wiki/FASTA_format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# We're using the sys and re modules | |
import sys | |
import re | |
# This is the function which will do our replacing | |
def modify_fasta_headers(inpath, outpath):
    """Copy ``inpath`` to ``outpath``, shortening pipe-delimited FASTA headers.

    A line of the form ``>a|b|c|refnum|name`` is rewritten as
    ``>refnum_name`` where:

    * ``refnum`` has its trailing ``.suffix`` removed when it has one, and
    * ``name`` is reduced to the text inside its first ``[...]`` pair when
      one is present; spaces in the result become underscores.

    Every line that does not match that header shape is copied unchanged.

    Args:
        inpath: Path of the FASTA file to read.
        outpath: Path of the file to write (created or overwritten).

    Raises:
        IOError / OSError: If either file cannot be opened.
    """
    # Header shape: ">field|field|field|refnum|name".
    header = re.compile(
        r"^>[^|]+\|[^|]+\|[^|]+\|(?P<refnum>[^|]+)\|(?P<name>.+)$"
    )
    # Compiled once here instead of being looked up for every header line.
    dot_suffix = re.compile(r"^(.+)\..+$")
    bracketed = re.compile(r"^.*?\[(.+?)\].*$")

    # "with" guarantees both handles are closed even if reading or writing
    # raises part-way through the file.
    with open(inpath, "r") as infile, open(outpath, "w") as outfile:
        # Iterate the file object directly so large inputs are streamed
        # line by line rather than loaded into memory all at once.
        for line in infile:
            match = header.match(line)
            if not match:
                # Sequence data and unrecognised headers pass through as-is.
                outfile.write(line)
                continue

            refnum = dot_suffix.sub(r"\1", match.group("refnum"))
            name = bracketed.sub(r"\1", match.group("name")).replace(" ", "_")
            outfile.write(">" + refnum + "_" + name + "\n")
# Check if this is the main script | |
if __name__ == "__main__":
    # With no arguments sys.argv holds only the script name, so show the
    # usage text and exit with a non-zero status.
    if len(sys.argv) == 1:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Usage: python modifyFastaHeaders.py infile [outfile]")
        sys.exit(1)

    # The first argument is always the input path.
    infile = sys.argv[1]

    if len(sys.argv) > 2:
        # An explicit output path was supplied as the second argument.
        outfile = sys.argv[2]
    else:
        # Derive the output name by inserting "_converted" before the
        # extension, for example: example.fasta => example_converted.fasta
        outfile = re.sub(r"(\.\w+)$", r"_converted\1", infile)
        if outfile == infile:
            # No extension to anchor on. Without this the output path would
            # equal the input path, and opening it for writing would
            # truncate the input before it is read.
            outfile = infile + "_converted"

    # Run the conversion.
    modify_fasta_headers(infile, outfile)

    # Let the user know what has happened.
    print(infile + " converted to: " + outfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment