Last active
May 16, 2018 12:22
-
-
Save Tabea-K/b5324eeca96de9289be82c637405827c to your computer and use it in GitHub Desktop.
Orders a multi fasta genome file by chromosome name in karyotypic order.
I.e., after chr1 follows chr2, and not chr10.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/local/bin/python | |
| # File created by Tabea Kischka at Thu May 19 15:24:59 CEST 2016 | |
| # This script orders the sequence in a multi fasta file by the chromosome name in | |
| # karyotypic oder, and not lexicographically. | |
| # I.e., it creates this order: chr1, chr2, chr10, ... | |
| # instead of the order chr1, chr10, chr2 | |
| import sys | |
| def yield_fasta(fh): | |
| """ | |
| Yields a tuple of the name | |
| and sequence of each entry | |
| """ | |
| from itertools import groupby | |
| faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">")) | |
| for header in faiter: | |
| # drop the ">" | |
| header = header.next()[1:].strip() | |
| # join all sequence lines to one. | |
| seq = "".join(s.strip() for s in faiter.next()) | |
| yield header, seq | |
| fh = open(sys.argv[1], 'r') | |
| seqs = {} | |
| for header, seq in yield_fasta(fh): | |
| seqs[header.replace(">chr","")] = seq | |
| karyo_ordered = {} | |
| unordered = seqs.keys() | |
| unordered_temp = [] | |
| max_chrom = 0 | |
| for i, elem in enumerate(seqs.keys()): | |
| elemshort = elem.replace("chr", "") | |
| try: | |
| karyo_ordered[elem]=int(elemshort) | |
| if int(elemshort) > max_chrom: max_chrom=int(elemshort) | |
| except ValueError: | |
| continue | |
| for i, elem in enumerate(seqs.keys()): | |
| if elem in karyo_ordered: continue | |
| elemshort = elem.split('_')[0].replace("chr","") | |
| try: | |
| karyo_ordered[elem]=int(elemshort)+max_chrom | |
| except ValueError: | |
| continue | |
| karyo_ordered_sorted = sorted(karyo_ordered.items(), key=operator.itemgetter(1)) | |
| for fullname, order in karyo_ordered_sorted: | |
| print ">%s\n%s" % (fullname, seqs[fullname]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment