Skip to content

Instantly share code, notes, and snippets.

@dela3499
Last active January 12, 2016 01:02
Show Gist options
  • Save dela3499/ec0a045bbdb8b5dd07f0 to your computer and use it in GitHub Desktop.
Save dela3499/ec0a045bbdb8b5dd07f0 to your computer and use it in GitHub Desktop.
CMAP KS-test Implementation
# type PrerankedProfile = Series(Index = Integer, Probe_ID = String, Rank = Integer)
# DataFrame -> List PrerankedProfile
def cmap2sortedseries(df):
return [series.order(ascending = True).reset_index() for series in df2series(df)]
# PrerankedProfile -> List a -> Float
def gsea_preranked(sorted_series, tag_list):
""" Apply KS test.
ex: scores = map(lambda profile: gsea_preranked(profile, up_probes), profiles)
"""
n_tags = float(len(tag_list))
relative_ranks = (sorted_series[sorted_series.ix[:,0].isin(tag_list)].index + 1)/ float(len(sorted_series))
relative_tag_ranks = np.linspace(0, 1, n_tags + 1)
a = max(relative_tag_ranks[1:] - relative_ranks)
b = max(relative_ranks - relative_tag_ranks[:-1])
return if_(a > b, a, -b)
# Example
cmap = pd.read_csv('/path/to/rankMatrix.txt', sep = '\t', index_col = 0)
instances = cmap2sortedseries(cmap)
list_of_probes = ['207199_at', '201058_s_at'] # example probe list to find enrichment for
enrichment_value = gsea_preranked(instances[0], list_of_probes)
# Faster version (about 160 times faster than above) - vectorized
# Matrix -> Matrix
sort_columns = lambda mat: np.apply_along_axis(np.sort, 0, mat)
cmap_relative_ranks = cmap.values / float(cmap.shape[0])
# PrerankedProfile -> List a -> Float
def gsea_preranked2(tag_indices):
""" Apply KS test.
ex: scores = map(lambda profile: gsea_preranked(profile, up_probes), profiles)
"""
n_tags = float(len(tag_indices))
sorted_relative_ranks = sort_columns(cmap_relative_ranks[tag_indices,:])
relative_tag_ranks = list2col(np.linspace(0, 1, n_tags + 1))
a = np.max(relative_tag_ranks[1:] - sorted_relative_ranks, axis = 0)
b = np.max(sorted_relative_ranks - relative_tag_ranks[:-1], axis = 0)
return tz.thread_last(
[a,b],
np.vstack,
(np.apply_along_axis, lambda (ai,bi): if_(ai > bi, ai, -bi), 0))
# DataFrame[Row = Probe, Column = Profile]
def get_cmap():
""" Return dataframe with cmap data. """
return pd.read_csv('/home/bioage/scoring/CMAP test info/rankMatrix.txt', sep = '\t', index_col = 0).dropna(how = 'all', axis = 1)
# DataFrame[Row = Probe, Column = Profile]
def get_relative_cmap():
""" Return dataframe with cmap data as relative ranks. """
cmap = pd.read_csv('/home/bioage/scoring/CMAP test info/rankMatrix.txt', sep = '\t', index_col = 0).dropna(how = 'all', axis = 1)
return cmap / float(cmap.shape[0])
# DataFrame
def get_cmap_metadata():
""" Return DataFrame with CMAP metadata for each instance. """
return pd.read_csv('/home/bioage/scoring/CMAP test info/cmap_metadata.csv', delimiter = '\t', index_col = 0).dropna(how = 'all', axis = 0)
# String -> Dict
def read_msigdb_probesets(filename = '/home/bioage/scoring/msigdb_v5.0/probeset.json'):
""" Read newline-delimited json file with probesets into a
dictionary, where each record is a two-element list. """
return tz.thread_last(
filename,
read_file,
split_lines,
(map, json.loads),
dict)
# Matrix -> Matrix
sort_columns = lambda mat: np.apply_along_axis(np.sort, 0, mat)
# DataFrame[row = probe, col = drug, val = relative rank] -> List String -> Float
def gsea_preranked(cmap_relative_ranks, probes):
""" Apply KS test. Provide cmap containing relative ranks.
ex: scores = map(lambda profile: gsea_preranked(cmap, up_probe_indices), profiles)
"""
n_probes = float(len(probes))
sorted_relative_ranks = sort_columns(cmap_relative_ranks.loc[probes].values)
relative_tag_ranks = list2col(np.linspace(0, 1, n_probes + 1))
a = np.max(relative_tag_ranks[1:] - sorted_relative_ranks, axis = 0)
b = np.max(sorted_relative_ranks - relative_tag_ranks[:-1], axis = 0)
return tz.thread_last(
[a,b],
np.vstack,
(np.apply_along_axis, lambda (ai,bi): if_(ai > bi, ai, -bi), 0)) # I can vectorize this better.
# Ex.
cmap = get_relative_cmap() # 30s
sigs = read_msigdb_probesets()
senescence_signatures = sorted([(name, probes) for name, probes in sigs.items() if 'senesc' in name.lower()], key = lambda (_, b): len(b)) # 7 signatures
# Step 2: Get enrichment scores for each signature across all CMAP instances
senescence_drug_scores = [gsea_preranked(cmap, probes) for _, probes in senescence_signatures]
senescence_results = pd.DataFrame(
senescence_drug_scores,
index = map(head, senescence_signatures),
columns = cmap.columns)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment