vogdb · September 4, 2018 18:22
diff --git a/count_allele_frq.py b/count_allele_frq.py
 # Usage example:
 # > python count_allele_frq.py -f out.frq

 import csv
 import re
 import argparse
 import numpy as np
 import matplotlib.pyplot as plt

 from matplotlib import ticker
 from scipy.interpolate import interp1d


 class FrqList:

    def __init__(self):
        self._list = []
        self._count = {}

        # to make plots nicer
        self._count[-0.01] = 0
        self._count[0.509] = 0

    def add(self, frq):
        self._list.append(frq)
        if frq in self._count:
            self._count[frq] += 1
        else:
            self._count[frq] = 1

    def get_splitted_count(self):
        '''
        :return:
            (list of frq values, list of frq values counts)
            both of lists are ordered in the same order, so you have to frq_set[i], frq_counts[i] to access the i-th.
        '''
        frq_set = sorted(list(map(float, self._count.keys())))
        frq_counts = [self._count[frq] for frq in frq_set]
        return frq_set, frq_counts

    def get_plain(self):
        return self._list

    def get_count(self):
        return self._list


 def calc_frq(frq_filepath):
    frq_list = FrqList()
    with open(frq_filepath) as frq_file:
        frq_reader = csv.reader(frq_file, delimiter='\t')
        # skip header
        next(frq_file)
        allele_pattern = re.compile(r'[ACGT]+:([\d.]+)')
        for row_idx, row in enumerate(frq_reader):
            if len(row) < 6:
                print('WARNING. Non paired value in the 5th column of raw {},\n{}'.format(row_idx, row))
            row_frq_list = []
            for allele in row[4:]:
                if allele_pattern.match(allele):
                    row_frq_list.append(float(allele.split(':')[1]))
                else:
                    print('WARNING. Incorrect allele in raw {},\n{}'.format(row_idx, row))
            frq = sorted(row_frq_list)[0]
            frq_list.add(frq)
    return frq_list


 def write(frq_list):
    with open('frq_list.txt', 'w') as f:
        for item in frq_list.get_plain():
            f.write("%s\n" % item)

    frq_set, frq_counts = frq_list.get_splitted_count()
    with open('frq_count.txt', 'w') as f:
        for i in range(len(frq_set)):
            f.write('{}:{}\n'.format(frq_set[i], frq_counts[i]))


 def plot_plain(frq_list):
    frq_set, frq_counts = frq_list.get_splitted_count()
    fig, ax = plt.subplots()

    # plot histogram
    ax.hist(frq_list.get_plain(), len(frq_set) - 1)
    # plot density
    frq_range = np.linspace(frq_set[0], frq_set[-1], num=1000, endpoint=True)
    frq_count_spline = interp1d(frq_set, frq_counts, kind='cubic')
    # to get above the histogram
    y_shift = 10
    ax.plot(frq_range, frq_count_spline(frq_range) + y_shift, '-g', linewidth=1)

    # layout format
    count_all = sum(frq_counts)
    percent_formatter = ticker.FuncFormatter(lambda count, y: int(100 * count / count_all))
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.set_xlabel('Minor allele frequency')
    ax.set_ylabel('Total SNVs, %')

    plt.savefig('frq_count.svg')
    plt.savefig('frq_count.pdf')


 # Invalid for now. Alternative KDE plot
 def plot_kde(frq_list):
    frq_set, frq_counts = frq_list.get_splitted_count()
    import seaborn as sns

    ax = sns.distplot(frq_list.get_plain(), bins=len(frq_set) - 1, norm_hist=True)
    # 0 - 16.65
    # 0.5 - 109.8
    # count_formatter = ticker.FuncFormatter(lambda count, y: count * 10)
    # ax.yaxis.set_major_formatter(count_formatter)
    plt.show()


 def main():
    parser = argparse.ArgumentParser(description='script params')
    parser.add_argument('-f', '--file', default='out_10000.frq', help='filepath to the allele file')
    args = parser.parse_args()

    frq_list = calc_frq(args.file)
    write(frq_list)
    plot_plain(frq_list)


 main()
	# Usage example:
	# > python count_allele_frq.py -f out.frq

	import csv
	import re
	import argparse
	import numpy as np
	import matplotlib.pyplot as plt

	from matplotlib import ticker
	from scipy.interpolate import interp1d


	class FrqList:

	def __init__(self):
	self._list = []
	self._count = {}

	# to make plots nicer
	self._count[-0.01] = 0
	self._count[0.509] = 0

	def add(self, frq):
	self._list.append(frq)
	if frq in self._count:
	self._count[frq] += 1
	else:
	self._count[frq] = 1

	def get_splitted_count(self):
	'''
	:return:
	(list of frq values, list of frq values counts)
	both of lists are ordered in the same order, so you have to frq_set[i], frq_counts[i] to access the i-th.
	'''
	frq_set = sorted(list(map(float, self._count.keys())))
	frq_counts = [self._count[frq] for frq in frq_set]
	return frq_set, frq_counts

	def get_plain(self):
	return self._list

	def get_count(self):
	return self._list


	def calc_frq(frq_filepath):
	frq_list = FrqList()
	with open(frq_filepath) as frq_file:
	frq_reader = csv.reader(frq_file, delimiter='\t')
	# skip header
	next(frq_file)
	allele_pattern = re.compile(r'[ACGT]+:([\d.]+)')
	for row_idx, row in enumerate(frq_reader):
	if len(row) < 6:
	print('WARNING. Non paired value in the 5th column of raw {},\n{}'.format(row_idx, row))
	row_frq_list = []
	for allele in row[4:]:
	if allele_pattern.match(allele):
	row_frq_list.append(float(allele.split(':')[1]))
	else:
	print('WARNING. Incorrect allele in raw {},\n{}'.format(row_idx, row))
	frq = sorted(row_frq_list)[0]
	frq_list.add(frq)
	return frq_list


	def write(frq_list):
	with open('frq_list.txt', 'w') as f:
	for item in frq_list.get_plain():
	f.write("%s\n" % item)

	frq_set, frq_counts = frq_list.get_splitted_count()
	with open('frq_count.txt', 'w') as f:
	for i in range(len(frq_set)):
	f.write('{}:{}\n'.format(frq_set[i], frq_counts[i]))


	def plot_plain(frq_list):
	frq_set, frq_counts = frq_list.get_splitted_count()
	fig, ax = plt.subplots()

	# plot histogram
	ax.hist(frq_list.get_plain(), len(frq_set) - 1)
	# plot density
	frq_range = np.linspace(frq_set[0], frq_set[-1], num=1000, endpoint=True)
	frq_count_spline = interp1d(frq_set, frq_counts, kind='cubic')
	# to get above the histogram
	y_shift = 10
	ax.plot(frq_range, frq_count_spline(frq_range) + y_shift, '-g', linewidth=1)

	# layout format
	count_all = sum(frq_counts)
	percent_formatter = ticker.FuncFormatter(lambda count, y: int(100 * count / count_all))
	ax.yaxis.set_major_formatter(percent_formatter)
	ax.set_xlabel('Minor allele frequency')
	ax.set_ylabel('Total SNVs, %')

	plt.savefig('frq_count.svg')
	plt.savefig('frq_count.pdf')


	# Invalid for now. Alternative KDE plot
	def plot_kde(frq_list):
	frq_set, frq_counts = frq_list.get_splitted_count()
	import seaborn as sns

	ax = sns.distplot(frq_list.get_plain(), bins=len(frq_set) - 1, norm_hist=True)
	# 0 - 16.65
	# 0.5 - 109.8
	# count_formatter = ticker.FuncFormatter(lambda count, y: count * 10)
	# ax.yaxis.set_major_formatter(count_formatter)
	plt.show()


	def main():
	parser = argparse.ArgumentParser(description='script params')
	parser.add_argument('-f', '--file', default='out_10000.frq', help='filepath to the allele file')
	args = parser.parse_args()

	frq_list = calc_frq(args.file)
	write(frq_list)
	plot_plain(frq_list)


	main()