Skip to content

Instantly share code, notes, and snippets.

@Phlya
Last active January 4, 2022 16:30
Show Gist options
  • Save Phlya/9af1ffde527afe51e0558eb35e0025c7 to your computer and use it in GitHub Desktop.
Save Phlya/9af1ffde527afe51e0558eb35e0025c7 to your computer and use it in GitHub Desktop.
Saves a few useful plots to visually check quality of Hi-C libraries based on *.stats files from distiller (pairtools stats output)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 4 11:54:20 2018
@author: Ilya Flyamer
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from cooltools.sandbox.pairs_scaling_functions import contact_areas
#import cooler
import glob
from functools import reduce
#from adjustText import adjust_text
import seaborn as sns
import os
def get_areas(d, chroms):
bins = np.append(np.unique(d['Start'].values), [np.inf])
areas = np.zeros(len(bins)-1)
for i in range(len(chroms)):
areas = areas + contact_areas(bins, (0, chroms['length'][i]), (0, chroms['length'][i]))
return areas
def get_scaling_by_orienation(stats):
d = stats.loc[stats.index.str.startswith('dist_freq')]
d = d.reset_index()
d[['Distance', 'Orientation']] = d['index'].str.split('/', expand=True).iloc[:, 1:]
d = d.drop('index', axis=1)
d[['Start', 'End']] = d['Distance'].str.split('-', expand=True).apply(lambda x: [int(i.replace('+', '')) if i is not None else np.inf for i in x])
d['Distance'] = np.exp((np.log(d['Start'])+np.log(d['End']))/2)
d.columns = ['Value'] + list(d.columns)[1:]
d = d[['Orientation', 'Distance', 'Start', 'End', 'Value']]
return d
def plot_scaling_by_orientation(stats, ax, chroms):
d = get_scaling_by_orienation(stats)
areas = get_areas(d, chroms)
for orientation in '+-', '-+', '++', '--':
sd = d[d['Orientation']==orientation]
sd['Value'] /= areas*d['Value'].sum()
sd = sd.dropna()
pos = np.logical_and(sd['Value']>0, sd['Distance']>1)
pos = np.logical_and(pos, sd['Distance']<=2*10**4)
# sd['Value'][pos] /= np.trapz(sd['Value'][pos], sd['Distance'][pos])
ax.plot(sd['Distance'][pos], sd['Value'][pos], 'o-',
label=orientation[0]+' '+orientation[1])
def get_scaling(stats, chroms, xmin=1000):
d = get_scaling_by_orienation(stats).groupby('Distance').mean()
# d[np.isinf(d)]=np.nan
d = d.reset_index()
d = d[d['Distance']>=xmin].reset_index(drop=True)
areas = get_areas(d, chroms)
d['Value'] = d['Value']/areas/4
d = d.dropna()
d['Value'] /= np.trapz(d['Value'], d['Distance'])
return d
def plot_scaling(stats, ax, chroms, xmin=1000, *args, **kwargs):
d = get_scaling(stats, chroms=chroms, xmin=xmin)
# print(d)
ax.plot(d['Distance'], d['Value'], *args, **kwargs)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("folder", type=str,
help='Folder with .stats files')
parser.add_argument("chromsizes", type=argparse.FileType(),
help='File with .sizes of chromosomes')
parser.add_argument("--order", type=str, default=None, required=False,
help='Order of samples to plot, comma-separated')
parser.add_argument("--folder2", default=None, type=str, required=False,
help='Second folder with .stats files')
args = parser.parse_args()
chroms = pd.read_table(args.chromsizes, names=['chrom', 'length'])
chroms = chroms[~chroms['chrom'].isin(['chrY', 'chrM'])].dropna().reset_index(drop=True)
statfiles = sorted(glob.glob('%s*.stats' % args.folder))
stats = [pd.read_table(f, index_col=0, names=[f.split('/')[-1].split('.')[0]]) for f in statfiles]
stats = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True), stats).T
if args.order is not None:
order = args.order.split(',')
print(order)
stats = stats.loc[order, :]
print(stats.index)
if args.folder2 is not None:
statfiles2 = glob.glob('%s*.stats' % args.folder2)
stats2 = [pd.read_table(f, index_col=0, names=[f.split('/')[-1].split('.')[0]]) for f in statfiles2]
stats2 = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True), stats2).T
else:
stats2=False
sns.set_context('talk')
sns.set_style('whitegrid')
f, ax, = plt.subplots(figsize=(8, 8))
plt.loglog()
p = sns.palettes.color_palette('tab20', len(stats.index))
if stats2 is not False:
for name in sorted(stats2.index):
plot_scaling(stats2.T[name], ax, chroms=chroms, label='_', c='grey', alpha=0.5, lw=1.5)
for i, name in enumerate(stats.index):
plot_scaling(stats.T[name], ax, chroms=chroms, label=name, c=p[i], lw=1.5)
ax.grid(True)
ax.set_aspect('equal')
ax.set_xlabel('Distance')
ax.set_ylabel('Normalized contact frequency')
ax.legend()
plt.savefig(os.path.join(args.folder, 'Scalings.pdf'), dpi=300,
bbox_inches='tight')
width = int(np.ceil(np.sqrt(stats.shape[0])))
if width**2-width > stats.shape[0]:
height = width-1
else:
height=width
f, axarr = plt.subplots(height, width,
sharex=True, sharey=True,
figsize=(3*stats.shape[0],
3*stats.shape[0]+1))
axarr = axarr.flatten()
i = 0
for name in stats.index:
plot_scaling_by_orientation(stats.T[name], axarr[i], chroms=chroms)
plt.loglog()
axarr[i].set_title(name)
axarr[i].grid(True)
# axarr[i].set_aspect('equal')
if i%width==0:
axarr[i].set_ylabel('Contact frequency')
if i>=(height-1)*width or height==1:
axarr[i].set_xlabel('Distance')
i+=1
for j in range(i, width*height):
axarr[j].remove()
axarr[0].set_xlim(left=10**1, right=10**4)
# axarr[0].set_ylim(bottom=10**-17)
axarr[0].legend()
plt.savefig(os.path.join(args.folder, 'Orientation_scaling.pdf'), dpi=300,
bbox_inches='tight')
plt.close()
f, ax = plt.subplots(figsize=(8, 8))
axarr = axarr.flatten()
for name in stats.index:
data = get_scaling_by_orienation(stats.T[name])
std = data.groupby(['Distance']).agg(np.std)
mean = data.groupby(['Distance']).agg(np.mean)
cv = (std/mean)['Value'].dropna()
plt.loglog()
ax.plot(cv, '-o', label=name)
if stats2 is not False:
for name in stats2.index:
data = get_scaling_by_orienation(stats.T[name])
std = data.groupby(['Distance']).agg(np.std)
mean = data.groupby(['Distance']).agg(np.mean)
cv = (std/mean)['Value'].dropna()
plt.loglog()
ax.plot(cv, c='grey', alpha=0.5)
ax.set_ylabel('CV')
ax.set_xlabel('Distance')
plt.legend()
ax.set_xlim(left=10**1, right=10**5)
plt.savefig(os.path.join(args.folder, 'Orientation_CV.pdf'), dpi=300,
bbox_inches='tight')
plt.close()
f, ax = plt.subplots(figsize=(stats.shape[0]*2, 6))
if stats2 is not False:
cis2 = (stats2['cis']/stats2['total_nodups'])
for val in cis2:
plt.axhline(val, c='grey', alpha=0.5)
cis = (stats['trans']/(stats['trans']+stats['cis_1kb+']))
plt.bar(stats.index, cis, width=0.8)
for x, val in zip(stats.index, cis):
plt.text(x, val, round(val, 3), ha='center', va='bottom')
plt.ylabel('Trans fraction')
plt.ylim(0)
plt.xticks(rotation=0)
plt.savefig(os.path.join(args.folder, 'Trans_fraction.pdf'), dpi=300,
bbox_inches='tight')
plt.close()
f, ax = plt.subplots(figsize=(10, 8))
stats[['total', 'total_mapped','total_nodups', 'cis', 'cis_1kb+', 'trans']].plot(kind='bar', ax=ax)
ax.legend_.remove()
plt.legend(loc=(1.04,0))
plt.xticks(rotation=0)
ax.xaxis.grid(False)
plt.savefig(os.path.join(args.folder, 'Total_contacts.pdf'), dpi=300,
bbox_inches='tight')
plt.close()
f, ax = plt.subplots(figsize=(stats.shape[0]*2, 6))
if stats2 is not False:
dups2 = (stats2['total_dups']/stats2['total_mapped'])
for val in dups2:
plt.axhline(val, c='grey', alpha=0.5)
dups = stats['total_dups']/stats['total_mapped']
plt.bar(stats.index, dups, width=0.8)
for x, val in zip(stats.index, dups):
plt.text(x, val, round(val, 2), ha='center', va='bottom')
plt.ylabel('Duplicate fraction')
plt.xticks(rotation=0)
plt.savefig(os.path.join(args.folder, 'Duplicate_fraction.pdf'), dpi=300,
bbox_inches='tight')
plt.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment