Skip to content

Instantly share code, notes, and snippets.

@pgm
Created September 22, 2020 02:30
Show Gist options
  • Save pgm/ac2ac4c664ef81200ce49133cc4cee02 to your computer and use it in GitHub Desktop.
Save pgm/ac2ac4c664ef81200ce49133cc4cee02 to your computer and use it in GitHub Desktop.
Script based on code from the DepMap portal code to compute the top N pearson correlations and write to a CSV file
import argparse
import sqlite3
import sys
import numpy as np
import pandas as pd
def main():
parser = argparse.ArgumentParser()
parser.add_argument("in_csv_0")
parser.add_argument("in_csv_1")
parser.add_argument("--batchsize", type=int, default=500)
parser.add_argument("--limit", help="Top n correlates to keep", type=int, default=100)
parser.add_argument("output_file")
args = parser.parse_args()
in_0_df = pd.read_csv(args.in_csv_0, index_col="DepMap_ID")
in_1_df = pd.read_csv(args.in_csv_1, index_col="DepMap_ID")
in_0_df, in_1_df = with_shared_cell_lines(in_0_df, in_1_df)
in_0_cols = in_0_df.columns
in_1_cols = in_1_df.columns
correlations_df = create_correlations_df(in_0_df, in_1_df, args.batchsize, args.limit)
correlations_df["dim_0"] = [in_0_cols[i] for i in correlations_df["dim_0"]]
correlations_df["dim_1"] = [in_1_cols[i] for i in correlations_df["dim_1"]]
correlations_df.to_csv(args.output_file, index=False)
def with_shared_cell_lines(dep_df, biomarker_df):
shared_cell_lines = np.intersect1d(dep_df.index, biomarker_df.index)
dep_df = dep_df.loc[shared_cell_lines]
biomarker_df = biomarker_df.loc[shared_cell_lines]
return dep_df, biomarker_df
def create_correlations_df(dep_df, biomarker_df, batchsize, limit):
# assumes rows have already been aligned
biomarker_df.columns = list(range(len(biomarker_df.columns)))
partial_dfs = [
create_correlations_df_partial(dep_df, biomarker_df_partial, limit)
# for slices of 500 columns at a time from biomarker_df
for _, biomarker_df_partial in biomarker_df.groupby(
np.arange(len(biomarker_df.columns)) // batchsize, axis=1
)
]
return concat_dfs_and_filter(partial_dfs, limit)
def create_correlations_df_partial(dep_df, biomarker_df_partial, limit):
correlations = fast_cor_with_missing(dep_df.values, biomarker_df_partial.values)
(
top_ranked_cols_per_row,
top_ranked_rows_per_col,
row_indexes,
col_indexes,
) = top_ranked_indexes_per_row_and_col(-np.abs(correlations), limit)
df = pd.DataFrame(
{
"cor": np.hstack(
(
correlations[row_indexes, top_ranked_cols_per_row],
correlations[top_ranked_rows_per_col, col_indexes],
)
),
"dim_0": list(row_indexes) + list(top_ranked_rows_per_col),
"dim_1": biomarker_df_partial.columns[
list(top_ranked_cols_per_row) + list(col_indexes)
],
},
columns=["dim_0", "dim_1", "cor"],
)
return df.drop_duplicates(["dim_0", "dim_1"])
def top_ranked_indexes_per_row_and_col(matrix, limit):
"""Gets the coordinates for the largest `LIMIT` values, by row and by column."""
num_rows, num_cols = matrix.shape
limit_per_col = min(num_rows, limit)
limit_per_row = min(num_cols, limit)
top_ranked_cols_per_row = np.argpartition(matrix, limit_per_row - 1, axis=1)[
:, :limit_per_row
].flatten()
top_ranked_rows_per_col = np.argpartition(matrix, limit_per_col - 1, axis=0)[
:limit_per_col
].flatten()
row_indexes = np.repeat(range(num_rows), limit_per_row)
col_indexes = np.tile(range(num_cols), limit_per_col)
return top_ranked_cols_per_row, top_ranked_rows_per_col, row_indexes, col_indexes
def fast_cor_with_missing(x, y):
# preallocate storage for the result
result = np.zeros(shape=(x.shape[1], y.shape[1]))
x_groups = group_cols_with_same_mask(x)
y_groups = group_cols_with_same_mask(y)
for x_mask, x_columns in x_groups:
for y_mask, y_columns in y_groups:
# print(x_mask, x_columns, y_mask, y_columns)
combined_mask = x_mask & y_mask
# not sure if this is the fastest way to slice out the relevant subset
x_without_holes = x[:, x_columns][combined_mask, :]
y_without_holes = y[:, y_columns][combined_mask, :]
c = np_pearson_cor(x_without_holes, y_without_holes)
# update result with these correlations
result[np.ix_(x_columns, y_columns)] = c
return result
def group_cols_with_same_mask(x):
"""
Group columns with the same indexes of NAN values.
Return a sequence of tuples (mask, columns) where columns are the column indices
in x which all have the mask.
"""
per_mask = {}
for i in range(x.shape[1]):
o_mask = np.isfinite(x[:, i])
o_mask_b = np.packbits(o_mask).tobytes()
if o_mask_b not in per_mask:
per_mask[o_mask_b] = [o_mask, []]
per_mask[o_mask_b][1].append(i)
return per_mask.values()
def np_pearson_cor(x, y):
"""Full column-wise Pearson correlations of two matrices."""
xv = x - x.mean(axis=0)
yv = y - y.mean(axis=0)
xvss = (xv * xv).sum(axis=0)
yvss = (yv * yv).sum(axis=0)
# print(xvss, yvss)
# print(np.matmul(xv.transpose(), yv) , np.sqrt(np.outer(xvss, yvss)))
result = np.matmul(xv.transpose(), yv) / np.sqrt(np.outer(xvss, yvss))
return np.maximum(np.minimum(result, 1.0), -1.0)
def concat_dfs_and_filter(dfs, limit):
df = pd.concat(dfs, ignore_index=True, sort=False)
df["cor_abs"] = df["cor"].abs()
df["dim_0_rank"] = df.groupby("dim_1")["cor_abs"].rank(ascending=False)
df["dim_1_rank"] = df.groupby("dim_0")["cor_abs"].rank(ascending=False)
df = df[(df["dim_0_rank"] <= limit) | (df["dim_1_rank"] <= limit)]
del df["cor_abs"]
del df["dim_0_rank"]
del df["dim_1_rank"]
return df
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment