Skip to content

Instantly share code, notes, and snippets.

@celestialphineas
Created March 19, 2019 18:04
Show Gist options
  • Save celestialphineas/ee78c55acbd170437cc0784ec0c0c8e6 to your computer and use it in GitHub Desktop.
Save celestialphineas/ee78c55acbd170437cc0784ec0c0c8e6 to your computer and use it in GitHub Desktop.
A program for counting dinucleotide frequency for each contig in a .FASTA file
#!/usr/bin/python
import sys
from Bio import SeqIO
from itertools import groupby, product
def canonical(x):
rules = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
paired = ''.join(map(lambda x: rules[x], list(x[::-1])))
return paired if paired < x else x
def dnf(x):
symbols = sorted(list(set(map(canonical, [''.join(i) for i in product('ACGT',repeat=2)]))))
splitted = [canonical(x[i:i+2]) for i in range(0, len(x) - 1)]
counts = [(i, len(list(c))) for i, c in groupby(sorted(splitted))]
count_dict = dict((x, y) for x, y in counts)
return map(lambda x: count_dict[x] if x in count_dict else 0, symbols)
with open(sys.argv[2], 'w') as output:
for seq_record in SeqIO.parse(sys.argv[1], "fasta"):
result = reduce(lambda x, y: str(x) + '\t' + str(y), dnf(str(seq_record.seq))) + '\n'
output.write(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment