We will get the cosine similarity of genes starting with CDK and PPARG using pure duckdb.
Get the ids
CREATE OR REPLACE TABLE jcp_symbol AS (SELECT Metadata_JCP2022,Metadata_Symbol FROM read_csv(['https://github.com/jump-cellpainting/datasets/raw/99c34b66f51c5971c85417c02e191f43057c22a8/metadata/crispr.csv.gz', 'https://github.com/jump-cellpainting/datasets/raw/99c34b66f51c5971c85417c02e191f43057c22a8/metadata/orf.csv.gz']) WHERE starts_with(Metadata_Symbol, 'CDK') OR starts_with(Metadata_Symbol, 'PPARG'));
SELECT * FROM jcp_symbol LIMIT 2;
SELECT COUNT(*) AS nrows FROM jcp_symbol;
CREATE TABLE IF NOT EXISTS subset_features AS (SELECT * FROM jcp_symbol NATURAL JOIN (SELECT * FROM read_parquet('https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/ALL/v1.0b/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony.parquet')));
SELECT * FROM subset_features LIMIT 2;
SELECT COUNT(*) AS nrows FROM subset_features; -- nrows
CREATE OR REPLACE TABLE consensus AS (SELECT Metadata_Symbol,MEAN(COLUMNS('X_*')) FROM subset_features GROUP BY Metadata_Symbol);
SELECT * FROM consensus LIMIT 2;
SELECT COUNT(*) AS nrows FROM consensus;
Transpose the table, because duckdb performs much better aggregating columns.
CREATE OR REPLACE TABLE transposed AS PIVOT(UNPIVOT consensus ON COLUMNS('X_*')) ON Metadata_Symbol USING first(value);
SELECT * FROM transposed LIMIT 2;
SELECT COUNT(*) AS nrows FROM transposed;
Create vector arrays to calculate distance between them (they ought to be on the same column)
CREATE OR REPLACE TABLE lists AS UNPIVOT (SELECT LIST(COLUMNS(* EXCLUDE('name'))) FROM transposed) ON *;
CREATE OR REPLACE TABLE vec AS (SELECT name,value::Float[614] AS vec FROM lists);
SELECT * FROM vec LIMIT 2
Based on this example of vector search, we calculate the cross JOIN
CREATE OR REPLACE TABLE cosim AS (SELECT x.name as gene_1,
y.name as gene_2,
array_cosine_similarity(x.vec, y.vec) AS similarity_metric
FROM vec AS x
CROSS JOIN vec AS y);
SELECT * FROM cosim LIMIT 5;