We will compute the cosine similarity between genes whose symbols start with CDK or PPARG, using pure DuckDB.
First, get the JCP2022 IDs and gene symbols of the matching genes.
-- Build jcp_symbol: the (Metadata_JCP2022, Metadata_Symbol) pairs for every
-- row whose symbol starts with 'CDK' or 'PPARG', read from the CRISPR and ORF
-- metadata files pinned at a fixed commit of jump-cellpainting/datasets.
-- Rows with a NULL Metadata_Symbol are dropped, since starts_with() on NULL
-- is not true.
CREATE OR REPLACE TABLE jcp_symbol AS (
    SELECT
        Metadata_JCP2022,
        Metadata_Symbol
    FROM read_csv(
        [
            'https://github.com/jump-cellpainting/datasets/raw/99c34b66f51c5971c85417c02e191f43057c22a8/metadata/crispr.csv.gz',
            'https://github.com/jump-cellpainting/datasets/raw/99c34b66f51c5971c85417c02e191f43057c22a8/metadata/orf.csv.gz'
        ],
        -- Combine the two files by column name instead of by position, so the
        -- selected columns resolve correctly even if the files do not share
        -- the same column layout. NOTE(review): the layouts are not visible
        -- here; with identical layouts this option does not change the result.
        union_by_name = true
    )
    WHERE starts_with(Metadata_Symbol, 'CDK')
        OR starts_with(Metadata_Symbol, 'PPARG')
);
-- Preview two rows of jcp_symbol. The ORDER BY makes the preview repeatable:
-- LIMIT without an ordering may return any two rows.
SELECT
    Metadata_JCP2022,
    Metadata_Symbol
FROM jcp_symbol
ORDER BY
    Metadata_Symbol,
    Metadata_JCP2022
LIMIT 2;
-- Report how many rows jcp_symbol holds (single output column: nrows).
SELECT
    COUNT(*) AS nrows
FROM jcp_symbol;