Created
March 19, 2019 18:04
-
-
Save celestialphineas/ee78c55acbd170437cc0784ec0c0c8e6 to your computer and use it in GitHub Desktop.
A program for counting dinucleotide frequency for each contig in a .FASTA file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| import sys | |
| from Bio import SeqIO | |
| from itertools import groupby, product | |
| def canonical(x): | |
| rules = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} | |
| paired = ''.join(map(lambda x: rules[x], list(x[::-1]))) | |
| return paired if paired < x else x | |
| def dnf(x): | |
| symbols = sorted(list(set(map(canonical, [''.join(i) for i in product('ACGT',repeat=2)])))) | |
| splitted = [canonical(x[i:i+2]) for i in range(0, len(x) - 1)] | |
| counts = [(i, len(list(c))) for i, c in groupby(sorted(splitted))] | |
| count_dict = dict((x, y) for x, y in counts) | |
| return map(lambda x: count_dict[x] if x in count_dict else 0, symbols) | |
| with open(sys.argv[2], 'w') as output: | |
| for seq_record in SeqIO.parse(sys.argv[1], "fasta"): | |
| result = reduce(lambda x, y: str(x) + '\t' + str(y), dnf(str(seq_record.seq))) + '\n' | |
| output.write(result) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment