Created
September 4, 2018 18:22
-
-
Save vogdb/cf03492f062efe5587f74adae25f5406 to your computer and use it in GitHub Desktop.
Allele frequencies count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage example: | |
# > python count_allele_frq.py -f out.frq | |
import csv | |
import re | |
import argparse | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from matplotlib import ticker | |
from scipy.interpolate import interp1d | |
class FrqList: | |
def __init__(self): | |
self._list = [] | |
self._count = {} | |
# to make plots nicer | |
self._count[-0.01] = 0 | |
self._count[0.509] = 0 | |
def add(self, frq): | |
self._list.append(frq) | |
if frq in self._count: | |
self._count[frq] += 1 | |
else: | |
self._count[frq] = 1 | |
def get_splitted_count(self): | |
''' | |
:return: | |
(list of frq values, list of frq values counts) | |
both of lists are ordered in the same order, so you have to frq_set[i], frq_counts[i] to access the i-th. | |
''' | |
frq_set = sorted(list(map(float, self._count.keys()))) | |
frq_counts = [self._count[frq] for frq in frq_set] | |
return frq_set, frq_counts | |
def get_plain(self): | |
return self._list | |
def get_count(self): | |
return self._list | |
def calc_frq(frq_filepath): | |
frq_list = FrqList() | |
with open(frq_filepath) as frq_file: | |
frq_reader = csv.reader(frq_file, delimiter='\t') | |
# skip header | |
next(frq_file) | |
allele_pattern = re.compile(r'[ACGT]+:([\d.]+)') | |
for row_idx, row in enumerate(frq_reader): | |
if len(row) < 6: | |
print('WARNING. Non paired value in the 5th column of raw {},\n{}'.format(row_idx, row)) | |
row_frq_list = [] | |
for allele in row[4:]: | |
if allele_pattern.match(allele): | |
row_frq_list.append(float(allele.split(':')[1])) | |
else: | |
print('WARNING. Incorrect allele in raw {},\n{}'.format(row_idx, row)) | |
frq = sorted(row_frq_list)[0] | |
frq_list.add(frq) | |
return frq_list | |
def write(frq_list): | |
with open('frq_list.txt', 'w') as f: | |
for item in frq_list.get_plain(): | |
f.write("%s\n" % item) | |
frq_set, frq_counts = frq_list.get_splitted_count() | |
with open('frq_count.txt', 'w') as f: | |
for i in range(len(frq_set)): | |
f.write('{}:{}\n'.format(frq_set[i], frq_counts[i])) | |
def plot_plain(frq_list): | |
frq_set, frq_counts = frq_list.get_splitted_count() | |
fig, ax = plt.subplots() | |
# plot histogram | |
ax.hist(frq_list.get_plain(), len(frq_set) - 1) | |
# plot density | |
frq_range = np.linspace(frq_set[0], frq_set[-1], num=1000, endpoint=True) | |
frq_count_spline = interp1d(frq_set, frq_counts, kind='cubic') | |
# to get above the histogram | |
y_shift = 10 | |
ax.plot(frq_range, frq_count_spline(frq_range) + y_shift, '-g', linewidth=1) | |
# layout format | |
count_all = sum(frq_counts) | |
percent_formatter = ticker.FuncFormatter(lambda count, y: int(100 * count / count_all)) | |
ax.yaxis.set_major_formatter(percent_formatter) | |
ax.set_xlabel('Minor allele frequency') | |
ax.set_ylabel('Total SNVs, %') | |
plt.savefig('frq_count.svg') | |
plt.savefig('frq_count.pdf') | |
# Invalid for now. Alternative KDE plot | |
def plot_kde(frq_list): | |
frq_set, frq_counts = frq_list.get_splitted_count() | |
import seaborn as sns | |
ax = sns.distplot(frq_list.get_plain(), bins=len(frq_set) - 1, norm_hist=True) | |
# 0 - 16.65 | |
# 0.5 - 109.8 | |
# count_formatter = ticker.FuncFormatter(lambda count, y: count * 10) | |
# ax.yaxis.set_major_formatter(count_formatter) | |
plt.show() | |
def main(): | |
parser = argparse.ArgumentParser(description='script params') | |
parser.add_argument('-f', '--file', default='out_10000.frq', help='filepath to the allele file') | |
args = parser.parse_args() | |
frq_list = calc_frq(args.file) | |
write(frq_list) | |
plot_plain(frq_list) | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment