Skip to content

Instantly share code, notes, and snippets.

@vogdb
Created September 4, 2018 18:22
Show Gist options
  • Save vogdb/cf03492f062efe5587f74adae25f5406 to your computer and use it in GitHub Desktop.
Save vogdb/cf03492f062efe5587f74adae25f5406 to your computer and use it in GitHub Desktop.
Allele frequencies count
# Usage example:
# > python count_allele_frq.py -f out.frq
import csv
import re
import argparse
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
from scipy.interpolate import interp1d
class FrqList:
def __init__(self):
self._list = []
self._count = {}
# to make plots nicer
self._count[-0.01] = 0
self._count[0.509] = 0
def add(self, frq):
self._list.append(frq)
if frq in self._count:
self._count[frq] += 1
else:
self._count[frq] = 1
def get_splitted_count(self):
'''
:return:
(list of frq values, list of frq values counts)
both of lists are ordered in the same order, so you have to frq_set[i], frq_counts[i] to access the i-th.
'''
frq_set = sorted(list(map(float, self._count.keys())))
frq_counts = [self._count[frq] for frq in frq_set]
return frq_set, frq_counts
def get_plain(self):
return self._list
def get_count(self):
return self._list
def calc_frq(frq_filepath):
frq_list = FrqList()
with open(frq_filepath) as frq_file:
frq_reader = csv.reader(frq_file, delimiter='\t')
# skip header
next(frq_file)
allele_pattern = re.compile(r'[ACGT]+:([\d.]+)')
for row_idx, row in enumerate(frq_reader):
if len(row) < 6:
print('WARNING. Non paired value in the 5th column of raw {},\n{}'.format(row_idx, row))
row_frq_list = []
for allele in row[4:]:
if allele_pattern.match(allele):
row_frq_list.append(float(allele.split(':')[1]))
else:
print('WARNING. Incorrect allele in raw {},\n{}'.format(row_idx, row))
frq = sorted(row_frq_list)[0]
frq_list.add(frq)
return frq_list
def write(frq_list):
with open('frq_list.txt', 'w') as f:
for item in frq_list.get_plain():
f.write("%s\n" % item)
frq_set, frq_counts = frq_list.get_splitted_count()
with open('frq_count.txt', 'w') as f:
for i in range(len(frq_set)):
f.write('{}:{}\n'.format(frq_set[i], frq_counts[i]))
def plot_plain(frq_list):
frq_set, frq_counts = frq_list.get_splitted_count()
fig, ax = plt.subplots()
# plot histogram
ax.hist(frq_list.get_plain(), len(frq_set) - 1)
# plot density
frq_range = np.linspace(frq_set[0], frq_set[-1], num=1000, endpoint=True)
frq_count_spline = interp1d(frq_set, frq_counts, kind='cubic')
# to get above the histogram
y_shift = 10
ax.plot(frq_range, frq_count_spline(frq_range) + y_shift, '-g', linewidth=1)
# layout format
count_all = sum(frq_counts)
percent_formatter = ticker.FuncFormatter(lambda count, y: int(100 * count / count_all))
ax.yaxis.set_major_formatter(percent_formatter)
ax.set_xlabel('Minor allele frequency')
ax.set_ylabel('Total SNVs, %')
plt.savefig('frq_count.svg')
plt.savefig('frq_count.pdf')
# Invalid for now. Alternative KDE plot
def plot_kde(frq_list):
frq_set, frq_counts = frq_list.get_splitted_count()
import seaborn as sns
ax = sns.distplot(frq_list.get_plain(), bins=len(frq_set) - 1, norm_hist=True)
# 0 - 16.65
# 0.5 - 109.8
# count_formatter = ticker.FuncFormatter(lambda count, y: count * 10)
# ax.yaxis.set_major_formatter(count_formatter)
plt.show()
def main():
parser = argparse.ArgumentParser(description='script params')
parser.add_argument('-f', '--file', default='out_10000.frq', help='filepath to the allele file')
args = parser.parse_args()
frq_list = calc_frq(args.file)
write(frq_list)
plot_plain(frq_list)
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment