Last active
January 12, 2016 01:02
-
-
Save dela3499/ec0a045bbdb8b5dd07f0 to your computer and use it in GitHub Desktop.
CMAP KS-test Implementation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# type PrerankedProfile = Series(Index = Integer, Probe_ID = String, Rank = Integer) | |
# DataFrame -> List PrerankedProfile | |
def cmap2sortedseries(df): | |
return [series.order(ascending = True).reset_index() for series in df2series(df)] | |
# PrerankedProfile -> List a -> Float | |
def gsea_preranked(sorted_series, tag_list): | |
""" Apply KS test. | |
ex: scores = map(lambda profile: gsea_preranked(profile, up_probes), profiles) | |
""" | |
n_tags = float(len(tag_list)) | |
relative_ranks = (sorted_series[sorted_series.ix[:,0].isin(tag_list)].index + 1)/ float(len(sorted_series)) | |
relative_tag_ranks = np.linspace(0, 1, n_tags + 1) | |
a = max(relative_tag_ranks[1:] - relative_ranks) | |
b = max(relative_ranks - relative_tag_ranks[:-1]) | |
return if_(a > b, a, -b) | |
# Example | |
cmap = pd.read_csv('/path/to/rankMatrix.txt', sep = '\t', index_col = 0) | |
instances = cmap2sortedseries(cmap) | |
list_of_probes = ['207199_at', '201058_s_at'] # example probe list to find enrichment for | |
enrichment_value = gsea_preranked(instances[0], list_of_probes) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Faster version (about 160 times faster than above) - vectorized | |
# Matrix -> Matrix | |
sort_columns = lambda mat: np.apply_along_axis(np.sort, 0, mat) | |
cmap_relative_ranks = cmap.values / float(cmap.shape[0]) | |
# PrerankedProfile -> List a -> Float | |
def gsea_preranked2(tag_indices): | |
""" Apply KS test. | |
ex: scores = map(lambda profile: gsea_preranked(profile, up_probes), profiles) | |
""" | |
n_tags = float(len(tag_indices)) | |
sorted_relative_ranks = sort_columns(cmap_relative_ranks[tag_indices,:]) | |
relative_tag_ranks = list2col(np.linspace(0, 1, n_tags + 1)) | |
a = np.max(relative_tag_ranks[1:] - sorted_relative_ranks, axis = 0) | |
b = np.max(sorted_relative_ranks - relative_tag_ranks[:-1], axis = 0) | |
return tz.thread_last( | |
[a,b], | |
np.vstack, | |
(np.apply_along_axis, lambda (ai,bi): if_(ai > bi, ai, -bi), 0)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# DataFrame[Row = Probe, Column = Profile] | |
def get_cmap(): | |
""" Return dataframe with cmap data. """ | |
return pd.read_csv('/home/bioage/scoring/CMAP test info/rankMatrix.txt', sep = '\t', index_col = 0).dropna(how = 'all', axis = 1) | |
# DataFrame[Row = Probe, Column = Profile] | |
def get_relative_cmap(): | |
""" Return dataframe with cmap data as relative ranks. """ | |
cmap = pd.read_csv('/home/bioage/scoring/CMAP test info/rankMatrix.txt', sep = '\t', index_col = 0).dropna(how = 'all', axis = 1) | |
return cmap / float(cmap.shape[0]) | |
# DataFrame | |
def get_cmap_metadata(): | |
""" Return DataFrame with CMAP metadata for each instance. """ | |
return pd.read_csv('/home/bioage/scoring/CMAP test info/cmap_metadata.csv', delimiter = '\t', index_col = 0).dropna(how = 'all', axis = 0) | |
# String -> Dict | |
def read_msigdb_probesets(filename = '/home/bioage/scoring/msigdb_v5.0/probeset.json'): | |
""" Read newline-delimited json file with probesets into a | |
dictionary, where each record is a two-element list. """ | |
return tz.thread_last( | |
filename, | |
read_file, | |
split_lines, | |
(map, json.loads), | |
dict) | |
# Matrix -> Matrix | |
sort_columns = lambda mat: np.apply_along_axis(np.sort, 0, mat) | |
# DataFrame[row = probe, col = drug, val = relative rank] -> List String -> Float | |
def gsea_preranked(cmap_relative_ranks, probes): | |
""" Apply KS test. Provide cmap containing relative ranks. | |
ex: scores = map(lambda profile: gsea_preranked(cmap, up_probe_indices), profiles) | |
""" | |
n_probes = float(len(probes)) | |
sorted_relative_ranks = sort_columns(cmap_relative_ranks.loc[probes].values) | |
relative_tag_ranks = list2col(np.linspace(0, 1, n_probes + 1)) | |
a = np.max(relative_tag_ranks[1:] - sorted_relative_ranks, axis = 0) | |
b = np.max(sorted_relative_ranks - relative_tag_ranks[:-1], axis = 0) | |
return tz.thread_last( | |
[a,b], | |
np.vstack, | |
(np.apply_along_axis, lambda (ai,bi): if_(ai > bi, ai, -bi), 0)) # I can vectorize this better. | |
# Ex. | |
cmap = get_relative_cmap() # 30s | |
sigs = read_msigdb_probesets() | |
senescence_signatures = sorted([(name, probes) for name, probes in sigs.items() if 'senesc' in name.lower()], key = lambda (_, b): len(b)) # 7 signatures | |
# Step 2: Get enrichment scores for each signature across all CMAP instances | |
senescence_drug_scores = [gsea_preranked(cmap, probes) for _, probes in senescence_signatures] | |
senescence_results = pd.DataFrame( | |
senescence_drug_scores, | |
index = map(head, senescence_signatures), | |
columns = cmap.columns) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment