Last active
June 16, 2020 22:10
-
-
Save nvictus/3a012429fc24dbea83fe93f163a0fa73 to your computer and use it in GitHub Desktop.
motif PFM/PWM dataframe reader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
def _probability_mat_to_information_mat(prob_df, bg_df): | |
""" | |
Converts a probability matrix to an information matrix. | |
Taken from logomaker (https://github.com/jbkinney/logomaker). | |
""" | |
SMALL = np.finfo(float).tiny | |
fg_vals = prob_df.values | |
bg_vals = bg_df.values | |
tmp_vals = fg_vals * (np.log2(fg_vals + SMALL) - np.log2(bg_vals + SMALL)) | |
info_vec = tmp_vals.sum(axis=1) | |
info_df = prob_df.copy() | |
info_df.loc[:, :] = fg_vals * info_vec[:, np.newaxis] | |
return info_df | |
def _read_motif_matrix(motif, type, **kwargs): | |
if type == 'counts': | |
dct = motif.counts | |
elif type in {'probability', 'pfm'}: | |
dct = motif.pwm | |
elif type in {'weight', 'pwm', 'pssm'}: | |
dct = motif.pssm | |
elif type == 'background': | |
dct = { | |
letter: [freq] * len(motif) | |
for letter, freq in motif.background.items() | |
} | |
elif type == 'information': | |
fg = pd.DataFrame(motif.pwm) | |
bg = pd.DataFrame({ | |
letter: [freq] * len(motif) | |
for letter, freq in motif.background.items() | |
}) | |
return _probability_mat_to_information_mat(fg, bg) | |
else: | |
raise ValueError("Unknown Position-Specific Matrix type: '{}'".format(type)) | |
return pd.DataFrame(dct, **kwargs) | |
def read_motif_matrix(filepath_or, format, type='counts', all=False, **kwargs): | |
""" | |
Read a position-specific matrix of frequencies or weights. | |
Parameters | |
---------- | |
filepath_or : str or file-like | |
File containing a position frequency or weight matrix. | |
format : str | |
A format that Biopython understands. Case-insensitive. | |
- AlignAce: AlignAce output file format | |
- ClusterBuster: Cluster Buster position frequency matrix format | |
- XMS: XMS matrix format | |
- MEME: MEME output file motif | |
- MINIMAL: MINIMAL MEME output file motif | |
- MAST: MAST output file motif | |
- TRANSFAC: TRANSFAC database file format | |
- pfm-four-columns: Generic position-frequency matrix format with | |
four columns. (cisbp, homer, hocomoco, neph, | |
tiffin) | |
- pfm-four-rows: Generic position-frequency matrix format with | |
four row. (scertf, yetfasco, hdpi, idmmpmm, | |
flyfactor survey) | |
- pfm: JASPAR-style position-frequency matrix | |
- jaspar: JASPAR-style multiple PFM format | |
- sites: JASPAR-style sites file | |
type : {'counts', 'probability', 'weight', 'information', 'background'} | |
The type of position-specific matrix. | |
- counts: | |
Counts/frequencies for each position. | |
- probability, pfm: | |
Normalized counts/frequencies for each position. | |
- weight, pwm, pssm: | |
Compute log odds (observed vs background) scores. | |
- information: | |
Compute information content (observed vs background). | |
- background: | |
Return the background frequencies. Background is assumed to be | |
uniform if not included in the file. | |
all : bool, optional | |
Return a dictionary of all motifs if more than one exists in the file. | |
By default, only the first motif found is returned. | |
Returns | |
------- | |
DataFrame (n, alphabet_size) | |
""" | |
import Bio.motifs as bmo | |
format = format.lower() | |
type = type.lower() | |
own_fh = False | |
if isinstance(filepath_or, str): | |
fh = open(filepath_or) | |
own_fh = True | |
try: | |
motifs = bmo.parse(fh, format, strict=False) | |
except ValueError as e: | |
if format == 'meme' and 'TRAINING SET' in str(e): | |
fh.seek(0) | |
motifs = bmo.parse(fh, 'MINIMAL', strict=False) | |
else: | |
raise e | |
finally: | |
if own_fh: | |
fh.close() | |
if len(motifs) == 0: | |
raise ValueError("No motifs found") | |
if not all: | |
return _read_motif_matrix(motifs[0], type, **kwargs) | |
else: | |
keys = [motif.name or motif.consensus for motif in motifs] | |
values = [_read_motif_matrix(motif, type, **kwargs) for motif in motifs] | |
return dict(zip(keys, values)) | |
def revcomp(mat_df): | |
return ( | |
mat_df | |
.iloc[::-1].reset_index(drop=True) # reverse | |
.iloc[:, ::-1] | |
.rename(columns={'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}) # complement | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment