Skip to content

Instantly share code, notes, and snippets.

@Tabea-K
Last active May 16, 2018 12:22
Show Gist options
  • Select an option

  • Save Tabea-K/b5324eeca96de9289be82c637405827c to your computer and use it in GitHub Desktop.

Select an option

Save Tabea-K/b5324eeca96de9289be82c637405827c to your computer and use it in GitHub Desktop.
Orders a multi fasta genome file by chromosome name in karyotypic order. I.e., after chr1 follows chr2, and not chr10.
#!/usr/local/bin/python
# File created by Tabea Kischka at Thu May 19 15:24:59 CEST 2016
# This script orders the sequence in a multi fasta file by the chromosome name in
# karyotypic oder, and not lexicographically.
# I.e., it creates this order: chr1, chr2, chr10, ...
# instead of the order chr1, chr10, chr2
import sys
def yield_fasta(fh):
"""
Yields a tuple of the name
and sequence of each entry
"""
from itertools import groupby
faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
for header in faiter:
# drop the ">"
header = header.next()[1:].strip()
# join all sequence lines to one.
seq = "".join(s.strip() for s in faiter.next())
yield header, seq
fh = open(sys.argv[1], 'r')
seqs = {}
for header, seq in yield_fasta(fh):
seqs[header.replace(">chr","")] = seq
karyo_ordered = {}
unordered = seqs.keys()
unordered_temp = []
max_chrom = 0
for i, elem in enumerate(seqs.keys()):
elemshort = elem.replace("chr", "")
try:
karyo_ordered[elem]=int(elemshort)
if int(elemshort) > max_chrom: max_chrom=int(elemshort)
except ValueError:
continue
for i, elem in enumerate(seqs.keys()):
if elem in karyo_ordered: continue
elemshort = elem.split('_')[0].replace("chr","")
try:
karyo_ordered[elem]=int(elemshort)+max_chrom
except ValueError:
continue
karyo_ordered_sorted = sorted(karyo_ordered.items(), key=operator.itemgetter(1))
for fullname, order in karyo_ordered_sorted:
print ">%s\n%s" % (fullname, seqs[fullname])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment