Skip to content

Instantly share code, notes, and snippets.

@lukauskas
Last active November 12, 2020 19:15
Show Gist options
  • Save lukauskas/a7c28d8a96b141d6c1b343783d0cdbd1 to your computer and use it in GitHub Desktop.
Save lukauskas/a7c28d8a96b141d6c1b343783d0cdbd1 to your computer and use it in GitHub Desktop.
Converting deeptools matrix to a pandas dataframe

Converting deeptools matrix to pd.DataFrame

import matplotlib

# Deeptools will mess with matplotlib settings, fix that:
# capture the backend that is configured right now, before deeptools is imported ...
_mpl_backend = matplotlib.rcParams['backend']
from deeptools.heatmapper import heatmapper as deeptools_heatmapper
# ... and select that same backend again once the import has run.
matplotlib.use(_mpl_backend)
# The saved value was only needed for the restore above; drop the temporary name.
del _mpl_backend

import pandas as pd
import numpy as np

def read_deeptools_matrix_to_dataframe(matrix_filename):
    """Read a deeptools matrix file and return it as a ``pandas.DataFrame``.

    The file is loaded through the deeptools heatmapper; the values of the
    resulting matrix become the frame's data.

    Rows carry a two-level index ``(region_group, region)``.  The ``region``
    label is ``chrom:start-end[,start-end...]``, followed by ``|<region[2]>``
    when the region record has at least three fields and ``|<region[4]>``
    when it has at least five (the code treats these as name and strand).

    Columns carry a two-level index ``(sample_label, bin)``; bins are
    numbered from 0 within each sample, using consecutive pairs of
    ``sample_boundaries`` to determine how many bins a sample has.
    """
    heatmap = deeptools_heatmapper()
    heatmap.read_matrix_file(matrix_filename)

    dt_matrix = heatmap.matrix
    values = dt_matrix.matrix

    def region_label(region):
        # Build the "chrom:coords|name|strand" label for a single region.
        chrom = region[0]
        spans = ','.join('{}-{}'.format(*span) for span in region[1])
        parts = [f'{chrom}:{spans}']
        if len(region) > 2:
            parts.append(region[2])  # name
        # region[3] is intentionally not part of the label.
        if len(region) > 4:
            parts.append(region[4])  # strand
        return '|'.join(parts)

    # One (group, label) row entry per region, in group order.
    row_tuples = [
        (group, region_label(region))
        for group, group_regions in zip(dt_matrix.group_labels,
                                        dt_matrix.get_regions())
        for region in group_regions
    ]
    row_index = pd.MultiIndex.from_tuples(row_tuples,
                                          names=['region_group', 'region'])

    # One (sample, bin) column entry per bin; each sample spans the columns
    # between two consecutive boundaries.
    labels = dt_matrix.sample_labels
    bounds = dt_matrix.sample_boundaries
    col_tuples = [
        (sample, bin_no)
        for sample, (first, last) in zip(labels, zip(bounds, bounds[1:]))
        for bin_no in np.arange(0, last - first, dtype=int)
    ]
    col_index = pd.MultiIndex.from_tuples(col_tuples,
                                          names=['sample_label', 'bin'])

    return pd.DataFrame(values, index=row_index, columns=col_index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment