Last active
January 4, 2022 16:30
-
-
Save Phlya/9af1ffde527afe51e0558eb35e0025c7 to your computer and use it in GitHub Desktop.
Saves a few useful plots to visually check quality of Hi-C libraries based on *.stats files from distiller (pairtools stats output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Created on Tue Sep 4 11:54:20 2018 | |
@author: Ilya Flyamer | |
""" | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from cooltools.sandbox.pairs_scaling_functions import contact_areas | |
#import cooler | |
import glob | |
from functools import reduce | |
#from adjustText import adjust_text | |
import seaborn as sns | |
import os | |
def get_areas(d, chroms): | |
bins = np.append(np.unique(d['Start'].values), [np.inf]) | |
areas = np.zeros(len(bins)-1) | |
for i in range(len(chroms)): | |
areas = areas + contact_areas(bins, (0, chroms['length'][i]), (0, chroms['length'][i])) | |
return areas | |
def get_scaling_by_orienation(stats): | |
d = stats.loc[stats.index.str.startswith('dist_freq')] | |
d = d.reset_index() | |
d[['Distance', 'Orientation']] = d['index'].str.split('/', expand=True).iloc[:, 1:] | |
d = d.drop('index', axis=1) | |
d[['Start', 'End']] = d['Distance'].str.split('-', expand=True).apply(lambda x: [int(i.replace('+', '')) if i is not None else np.inf for i in x]) | |
d['Distance'] = np.exp((np.log(d['Start'])+np.log(d['End']))/2) | |
d.columns = ['Value'] + list(d.columns)[1:] | |
d = d[['Orientation', 'Distance', 'Start', 'End', 'Value']] | |
return d | |
def plot_scaling_by_orientation(stats, ax, chroms): | |
d = get_scaling_by_orienation(stats) | |
areas = get_areas(d, chroms) | |
for orientation in '+-', '-+', '++', '--': | |
sd = d[d['Orientation']==orientation] | |
sd['Value'] /= areas*d['Value'].sum() | |
sd = sd.dropna() | |
pos = np.logical_and(sd['Value']>0, sd['Distance']>1) | |
pos = np.logical_and(pos, sd['Distance']<=2*10**4) | |
# sd['Value'][pos] /= np.trapz(sd['Value'][pos], sd['Distance'][pos]) | |
ax.plot(sd['Distance'][pos], sd['Value'][pos], 'o-', | |
label=orientation[0]+' '+orientation[1]) | |
def get_scaling(stats, chroms, xmin=1000): | |
d = get_scaling_by_orienation(stats).groupby('Distance').mean() | |
# d[np.isinf(d)]=np.nan | |
d = d.reset_index() | |
d = d[d['Distance']>=xmin].reset_index(drop=True) | |
areas = get_areas(d, chroms) | |
d['Value'] = d['Value']/areas/4 | |
d = d.dropna() | |
d['Value'] /= np.trapz(d['Value'], d['Distance']) | |
return d | |
def plot_scaling(stats, ax, chroms, xmin=1000, *args, **kwargs): | |
d = get_scaling(stats, chroms=chroms, xmin=xmin) | |
# print(d) | |
ax.plot(d['Distance'], d['Value'], *args, **kwargs) | |
if __name__ == "__main__": | |
import argparse | |
parser = argparse.ArgumentParser() | |
parser.add_argument("folder", type=str, | |
help='Folder with .stats files') | |
parser.add_argument("chromsizes", type=argparse.FileType(), | |
help='File with .sizes of chromosomes') | |
parser.add_argument("--order", type=str, default=None, required=False, | |
help='Order of samples to plot, comma-separated') | |
parser.add_argument("--folder2", default=None, type=str, required=False, | |
help='Second folder with .stats files') | |
args = parser.parse_args() | |
chroms = pd.read_table(args.chromsizes, names=['chrom', 'length']) | |
chroms = chroms[~chroms['chrom'].isin(['chrY', 'chrM'])].dropna().reset_index(drop=True) | |
statfiles = sorted(glob.glob('%s*.stats' % args.folder)) | |
stats = [pd.read_table(f, index_col=0, names=[f.split('/')[-1].split('.')[0]]) for f in statfiles] | |
stats = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True), stats).T | |
if args.order is not None: | |
order = args.order.split(',') | |
print(order) | |
stats = stats.loc[order, :] | |
print(stats.index) | |
if args.folder2 is not None: | |
statfiles2 = glob.glob('%s*.stats' % args.folder2) | |
stats2 = [pd.read_table(f, index_col=0, names=[f.split('/')[-1].split('.')[0]]) for f in statfiles2] | |
stats2 = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True), stats2).T | |
else: | |
stats2=False | |
sns.set_context('talk') | |
sns.set_style('whitegrid') | |
f, ax, = plt.subplots(figsize=(8, 8)) | |
plt.loglog() | |
p = sns.palettes.color_palette('tab20', len(stats.index)) | |
if stats2 is not False: | |
for name in sorted(stats2.index): | |
plot_scaling(stats2.T[name], ax, chroms=chroms, label='_', c='grey', alpha=0.5, lw=1.5) | |
for i, name in enumerate(stats.index): | |
plot_scaling(stats.T[name], ax, chroms=chroms, label=name, c=p[i], lw=1.5) | |
ax.grid(True) | |
ax.set_aspect('equal') | |
ax.set_xlabel('Distance') | |
ax.set_ylabel('Normalized contact frequency') | |
ax.legend() | |
plt.savefig(os.path.join(args.folder, 'Scalings.pdf'), dpi=300, | |
bbox_inches='tight') | |
width = int(np.ceil(np.sqrt(stats.shape[0]))) | |
if width**2-width > stats.shape[0]: | |
height = width-1 | |
else: | |
height=width | |
f, axarr = plt.subplots(height, width, | |
sharex=True, sharey=True, | |
figsize=(3*stats.shape[0], | |
3*stats.shape[0]+1)) | |
axarr = axarr.flatten() | |
i = 0 | |
for name in stats.index: | |
plot_scaling_by_orientation(stats.T[name], axarr[i], chroms=chroms) | |
plt.loglog() | |
axarr[i].set_title(name) | |
axarr[i].grid(True) | |
# axarr[i].set_aspect('equal') | |
if i%width==0: | |
axarr[i].set_ylabel('Contact frequency') | |
if i>=(height-1)*width or height==1: | |
axarr[i].set_xlabel('Distance') | |
i+=1 | |
for j in range(i, width*height): | |
axarr[j].remove() | |
axarr[0].set_xlim(left=10**1, right=10**4) | |
# axarr[0].set_ylim(bottom=10**-17) | |
axarr[0].legend() | |
plt.savefig(os.path.join(args.folder, 'Orientation_scaling.pdf'), dpi=300, | |
bbox_inches='tight') | |
plt.close() | |
f, ax = plt.subplots(figsize=(8, 8)) | |
axarr = axarr.flatten() | |
for name in stats.index: | |
data = get_scaling_by_orienation(stats.T[name]) | |
std = data.groupby(['Distance']).agg(np.std) | |
mean = data.groupby(['Distance']).agg(np.mean) | |
cv = (std/mean)['Value'].dropna() | |
plt.loglog() | |
ax.plot(cv, '-o', label=name) | |
if stats2 is not False: | |
for name in stats2.index: | |
data = get_scaling_by_orienation(stats.T[name]) | |
std = data.groupby(['Distance']).agg(np.std) | |
mean = data.groupby(['Distance']).agg(np.mean) | |
cv = (std/mean)['Value'].dropna() | |
plt.loglog() | |
ax.plot(cv, c='grey', alpha=0.5) | |
ax.set_ylabel('CV') | |
ax.set_xlabel('Distance') | |
plt.legend() | |
ax.set_xlim(left=10**1, right=10**5) | |
plt.savefig(os.path.join(args.folder, 'Orientation_CV.pdf'), dpi=300, | |
bbox_inches='tight') | |
plt.close() | |
f, ax = plt.subplots(figsize=(stats.shape[0]*2, 6)) | |
if stats2 is not False: | |
cis2 = (stats2['cis']/stats2['total_nodups']) | |
for val in cis2: | |
plt.axhline(val, c='grey', alpha=0.5) | |
cis = (stats['trans']/(stats['trans']+stats['cis_1kb+'])) | |
plt.bar(stats.index, cis, width=0.8) | |
for x, val in zip(stats.index, cis): | |
plt.text(x, val, round(val, 3), ha='center', va='bottom') | |
plt.ylabel('Trans fraction') | |
plt.ylim(0) | |
plt.xticks(rotation=0) | |
plt.savefig(os.path.join(args.folder, 'Trans_fraction.pdf'), dpi=300, | |
bbox_inches='tight') | |
plt.close() | |
f, ax = plt.subplots(figsize=(10, 8)) | |
stats[['total', 'total_mapped','total_nodups', 'cis', 'cis_1kb+', 'trans']].plot(kind='bar', ax=ax) | |
ax.legend_.remove() | |
plt.legend(loc=(1.04,0)) | |
plt.xticks(rotation=0) | |
ax.xaxis.grid(False) | |
plt.savefig(os.path.join(args.folder, 'Total_contacts.pdf'), dpi=300, | |
bbox_inches='tight') | |
plt.close() | |
f, ax = plt.subplots(figsize=(stats.shape[0]*2, 6)) | |
if stats2 is not False: | |
dups2 = (stats2['total_dups']/stats2['total_mapped']) | |
for val in dups2: | |
plt.axhline(val, c='grey', alpha=0.5) | |
dups = stats['total_dups']/stats['total_mapped'] | |
plt.bar(stats.index, dups, width=0.8) | |
for x, val in zip(stats.index, dups): | |
plt.text(x, val, round(val, 2), ha='center', va='bottom') | |
plt.ylabel('Duplicate fraction') | |
plt.xticks(rotation=0) | |
plt.savefig(os.path.join(args.folder, 'Duplicate_fraction.pdf'), dpi=300, | |
bbox_inches='tight') | |
plt.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment