Skip to content

Instantly share code, notes, and snippets.

@nvictus
Last active June 16, 2020 22:10
Show Gist options
  • Save nvictus/3a012429fc24dbea83fe93f163a0fa73 to your computer and use it in GitHub Desktop.
Save nvictus/3a012429fc24dbea83fe93f163a0fa73 to your computer and use it in GitHub Desktop.
motif PFM/PWM dataframe reader
import numpy as np
import pandas as pd
def _probability_mat_to_information_mat(prob_df, bg_df):
"""
Converts a probability matrix to an information matrix.
Taken from logomaker (https://github.com/jbkinney/logomaker).
"""
SMALL = np.finfo(float).tiny
fg_vals = prob_df.values
bg_vals = bg_df.values
tmp_vals = fg_vals * (np.log2(fg_vals + SMALL) - np.log2(bg_vals + SMALL))
info_vec = tmp_vals.sum(axis=1)
info_df = prob_df.copy()
info_df.loc[:, :] = fg_vals * info_vec[:, np.newaxis]
return info_df
def _read_motif_matrix(motif, type, **kwargs):
if type == 'counts':
dct = motif.counts
elif type in {'probability', 'pfm'}:
dct = motif.pwm
elif type in {'weight', 'pwm', 'pssm'}:
dct = motif.pssm
elif type == 'background':
dct = {
letter: [freq] * len(motif)
for letter, freq in motif.background.items()
}
elif type == 'information':
fg = pd.DataFrame(motif.pwm)
bg = pd.DataFrame({
letter: [freq] * len(motif)
for letter, freq in motif.background.items()
})
return _probability_mat_to_information_mat(fg, bg)
else:
raise ValueError("Unknown Position-Specific Matrix type: '{}'".format(type))
return pd.DataFrame(dct, **kwargs)
def read_motif_matrix(filepath_or, format, type='counts', all=False, **kwargs):
"""
Read a position-specific matrix of frequencies or weights.
Parameters
----------
filepath_or : str or file-like
File containing a position frequency or weight matrix.
format : str
A format that Biopython understands. Case-insensitive.
- AlignAce: AlignAce output file format
- ClusterBuster: Cluster Buster position frequency matrix format
- XMS: XMS matrix format
- MEME: MEME output file motif
- MINIMAL: MINIMAL MEME output file motif
- MAST: MAST output file motif
- TRANSFAC: TRANSFAC database file format
- pfm-four-columns: Generic position-frequency matrix format with
four columns. (cisbp, homer, hocomoco, neph,
tiffin)
- pfm-four-rows: Generic position-frequency matrix format with
four row. (scertf, yetfasco, hdpi, idmmpmm,
flyfactor survey)
- pfm: JASPAR-style position-frequency matrix
- jaspar: JASPAR-style multiple PFM format
- sites: JASPAR-style sites file
type : {'counts', 'probability', 'weight', 'information', 'background'}
The type of position-specific matrix.
- counts:
Counts/frequencies for each position.
- probability, pfm:
Normalized counts/frequencies for each position.
- weight, pwm, pssm:
Compute log odds (observed vs background) scores.
- information:
Compute information content (observed vs background).
- background:
Return the background frequencies. Background is assumed to be
uniform if not included in the file.
all : bool, optional
Return a dictionary of all motifs if more than one exists in the file.
By default, only the first motif found is returned.
Returns
-------
DataFrame (n, alphabet_size)
"""
import Bio.motifs as bmo
format = format.lower()
type = type.lower()
own_fh = False
if isinstance(filepath_or, str):
fh = open(filepath_or)
own_fh = True
try:
motifs = bmo.parse(fh, format, strict=False)
except ValueError as e:
if format == 'meme' and 'TRAINING SET' in str(e):
fh.seek(0)
motifs = bmo.parse(fh, 'MINIMAL', strict=False)
else:
raise e
finally:
if own_fh:
fh.close()
if len(motifs) == 0:
raise ValueError("No motifs found")
if not all:
return _read_motif_matrix(motifs[0], type, **kwargs)
else:
keys = [motif.name or motif.consensus for motif in motifs]
values = [_read_motif_matrix(motif, type, **kwargs) for motif in motifs]
return dict(zip(keys, values))
def revcomp(mat_df):
return (
mat_df
.iloc[::-1].reset_index(drop=True) # reverse
.iloc[:, ::-1]
.rename(columns={'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}) # complement
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment