Last active
June 30, 2022 21:45
-
-
Save YiweiNiu/a6fc41ce0aa62be011f84454d3e549ba to your computer and use it in GitHub Desktop.
Calculate some basic assembly stats using Python2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' | |
Purpose: calculate some basic stats of an assembly. | |
Note: most of the ideas were from the web. | |
Usage: python2 assembly_stats.py xxx.fa | |
''' | |
import sys | |
from collections import OrderedDict | |
import numpy as np | |
def assembly_stats(file=None): | |
''' | |
given an assembly (fasta), return some basic stats (total base, %GC, N50, number, min, max, avg, N90, %N or %gap) | |
''' | |
stats_dict = OrderedDict([ | |
("Size_includeN", 0), | |
("Size_withoutN", 0), | |
("Seq_Num", 0), | |
("Mean_Size", 0), | |
("Median_Size", 0), | |
("Longest_Seq", 0), | |
("Shortest_Seq", 0), | |
("GC_Content (%)", 0), | |
("N50", 0), | |
('L50', 0), | |
("N90", 0), | |
("Gap (%)", 0) | |
]) | |
seqLen = [] | |
no_c, no_g ,no_a ,no_t ,no_n = 0, 0, 0, 0, 0 | |
tmp_len = 0 | |
with open(file, 'r') as fin: | |
for line in fin: | |
line = line.strip() | |
if line.startswith('>'): | |
if tmp_len == 0: | |
continue | |
else: | |
seqLen.append(tmp_len) | |
tmp_len = 0 | |
continue | |
else: | |
tmp_len += len(line) | |
line = line.lower() | |
no_c += line.count('c') | |
no_g += line.count('g') | |
no_a += line.count('a') | |
no_t += line.count('t') | |
no_n += line.count('n') | |
seqLen.append(tmp_len) | |
seqLen = sorted(seqLen, reverse=True) | |
N50_pos = sum(seqLen) * 0.5 | |
N90_pos = sum(seqLen) * 0.9 | |
tmp_sum = 0 | |
L50 = 0 | |
for value in seqLen: | |
tmp_sum += value | |
L50 += 1 | |
if N50_pos <= tmp_sum: | |
N50 = value | |
break | |
tmp_sum = 0 | |
for value in seqLen: | |
tmp_sum += value | |
if N90_pos <= tmp_sum: | |
N90 = value | |
break | |
stats_dict["Size_includeN"] = sum(seqLen) | |
stats_dict["Size_withoutN"] = sum(seqLen) - no_n | |
stats_dict["Seq_Num"] = len(seqLen) | |
stats_dict["Mean_Size"] = int(np.mean(seqLen)) | |
stats_dict["Median_Size"] = int(np.median(seqLen)) | |
stats_dict["Longest_Seq"] = max(seqLen) | |
stats_dict["Shortest_Seq"] = min(seqLen) | |
stats_dict["GC_Content (%)"] = round(float((no_g + no_c)*100)/float(no_t + no_a + no_g + no_c), 2) | |
stats_dict["N50"] = int(N50) | |
stats_dict['L50'] = int(L50) | |
stats_dict["N90"] = int(N90) | |
stats_dict["Gap (%)"] = round(float(no_n * 100)/sum(seqLen), 2) | |
for key in stats_dict: | |
print key + '\t' + str(stats_dict[key]) | |
if __name__ == '__main__': | |
assembly_stats(sys.argv[1]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment