Skip to content

Instantly share code, notes, and snippets.

@YiweiNiu
Last active June 30, 2022 21:45
Show Gist options
  • Save YiweiNiu/a6fc41ce0aa62be011f84454d3e549ba to your computer and use it in GitHub Desktop.
Save YiweiNiu/a6fc41ce0aa62be011f84454d3e549ba to your computer and use it in GitHub Desktop.
Calculate some basic assembly stats using Python2
#!/usr/bin/env python
'''
Purpose: calculate some basic stats of an assembly.
Note: most of the ideas were from the web.
Usage: python2 assembly_stats.py xxx.fa
'''
import sys
from collections import OrderedDict
import numpy as np
def assembly_stats(file=None):
'''
given an assembly (fasta), return some basic stats (total base, %GC, N50, number, min, max, avg, N90, %N or %gap)
'''
stats_dict = OrderedDict([
("Size_includeN", 0),
("Size_withoutN", 0),
("Seq_Num", 0),
("Mean_Size", 0),
("Median_Size", 0),
("Longest_Seq", 0),
("Shortest_Seq", 0),
("GC_Content (%)", 0),
("N50", 0),
('L50', 0),
("N90", 0),
("Gap (%)", 0)
])
seqLen = []
no_c, no_g ,no_a ,no_t ,no_n = 0, 0, 0, 0, 0
tmp_len = 0
with open(file, 'r') as fin:
for line in fin:
line = line.strip()
if line.startswith('>'):
if tmp_len == 0:
continue
else:
seqLen.append(tmp_len)
tmp_len = 0
continue
else:
tmp_len += len(line)
line = line.lower()
no_c += line.count('c')
no_g += line.count('g')
no_a += line.count('a')
no_t += line.count('t')
no_n += line.count('n')
seqLen.append(tmp_len)
seqLen = sorted(seqLen, reverse=True)
N50_pos = sum(seqLen) * 0.5
N90_pos = sum(seqLen) * 0.9
tmp_sum = 0
L50 = 0
for value in seqLen:
tmp_sum += value
L50 += 1
if N50_pos <= tmp_sum:
N50 = value
break
tmp_sum = 0
for value in seqLen:
tmp_sum += value
if N90_pos <= tmp_sum:
N90 = value
break
stats_dict["Size_includeN"] = sum(seqLen)
stats_dict["Size_withoutN"] = sum(seqLen) - no_n
stats_dict["Seq_Num"] = len(seqLen)
stats_dict["Mean_Size"] = int(np.mean(seqLen))
stats_dict["Median_Size"] = int(np.median(seqLen))
stats_dict["Longest_Seq"] = max(seqLen)
stats_dict["Shortest_Seq"] = min(seqLen)
stats_dict["GC_Content (%)"] = round(float((no_g + no_c)*100)/float(no_t + no_a + no_g + no_c), 2)
stats_dict["N50"] = int(N50)
stats_dict['L50'] = int(L50)
stats_dict["N90"] = int(N90)
stats_dict["Gap (%)"] = round(float(no_n * 100)/sum(seqLen), 2)
for key in stats_dict:
print key + '\t' + str(stats_dict[key])
if __name__ == '__main__':
assembly_stats(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment