Skip to content

Instantly share code, notes, and snippets.

View lwaldron's full-sized avatar

Levi Waldron lwaldron

View GitHub Profile
@lwaldron
lwaldron / create_test_parquets.sql
Last active July 30, 2025 13:43
Create parquet files from only the first two pMD tsv files for each file type
-- Install and load the httpfs extension so remote files (here, on GCS) can be read.
-- `install` fetches the extension; `load` activates it for the current session.
install httpfs;
load httpfs;
-- Create a named secret (metagenomics_mac) for accessing Google Cloud Storage
-- using a JSON key file. The path below is a placeholder and must be replaced
-- with the location of a real credentials file before running.
-- NOTE(review): DuckDB's documented GCS secrets use KEY_ID / SECRET (HMAC keys);
-- confirm that KEY_FILE is accepted by the extension version in use.
CREATE SECRET metagenomics_mac (
TYPE GCS,
KEY_FILE '/path/to/your/gcp-credentials.json'
);
# Title: Benchmark Query Performance on Sorted vs. Unsorted Parquet Files
# Description: This script first creates a new Parquet file sorted by the
# '# Gene Family' column. It then uses the 'microbenchmark'
# package to compare the query speed for a specific gene
# family between the original, unsorted file and the new,
# sorted file.
# --- 1. SETUP: Load necessary libraries ---
# Ensure you have these packages installed:
# install.packages(c("duckdb", "dplyr", "arrow", "microbenchmark"))
@lwaldron
lwaldron / FprausnitziiPD.Rmd
Created July 19, 2025 16:10
F. prausnitzii in PD
---
title: "F. prausnitzii in PD"
author: "Levi Waldron"
date: "`r Sys.Date()`"
output: html_document
---
```{r setup, include=FALSE}
# Default chunk option for the whole document: echo the source code of every
# chunk in the rendered output. This chunk itself is hidden (include=FALSE).
knitr::opts_chunk$set(echo = TRUE)
```
@lwaldron
lwaldron / pMDstudies.Rmd
Created July 19, 2025 14:15
Parkinson's curated metagenomic studies.
---
title: "pMDstudies"
author: "Levi Waldron"
date: "`r Sys.Date()`"
output: html_document
---
```{r, message=FALSE, warning=FALSE, echo=FALSE}
library(parkinsonsMetagenomicData)
library(dplyr)
suppressPackageStartupMessages(library(parkinsonsMetagenomicData))
#> Set default bucket name to 'metagenomics-mac'
packageVersion("parkinsonsMetagenomicData")
#> [1] '0.0.0.9000'
colnames(sampleMetadata) |> sort()
#> [1] "Adhesive removal (s)"
#> [2] "age"
#> [3] "Age"
#> [4] "AGE"
#> [5] "age_group"
@lwaldron
lwaldron / cmd_giant_table.R
Created July 11, 2024 11:19
Summarize curatedMetagenomicData studies in one giant Epi Table 1
library(curatedMetagenomicData)
library(dplyr)
library(table1)
# Keep only the columns that will be summarized in the table.
dat <- select(
  sampleMetadata,
  study_name, body_site, study_condition, age_category, age, BMI
)
# Labels are optional; they only give the columns nicer display names.
label(dat[["body_site"]]) <- "Body Site"
label(dat[["study_condition"]]) <- "Study Condition"
@lwaldron
lwaldron / knn-matching.R
Created June 25, 2024 18:45
One way to age match using k-nearest neighbors
library(nabor)
# Example input: two vectors of propensity scores.
propensity_scores1 <- c(0.1, 0.2, 0.3, 0.4, 0.5) # more controls
propensity_scores2 <- c(0.15, 0.25, 0.35) # fewer cases
# For each score in propensity_scores1 (the query points), look up the index
# of its single nearest neighbour (k = 1) among propensity_scores2 (the data
# points) with nabor::knn. Both vectors are passed as one-column matrices.
knn_result <- nabor::knn(
  data = matrix(propensity_scores2),
  query = matrix(propensity_scores1),
  k = 1
)
matches <- knn_result$nn.idx
# print the matches
@lwaldron
lwaldron / cmd_healthycontrols.R
Created May 3, 2024 14:30
curatedMetagenomicData healthy control samples: relative abundance + metadata CSV file per age category
library(curatedMetagenomicData)
library(dplyr)
# Distinct, non-missing age categories present in the sample metadata.
agecats <- na.omit(unique(sampleMetadata$age_category))
# Healthy control stool samples that have a recorded age category.
# Multiple conditions in one filter() call are combined with AND.
sm <- filter(
  sampleMetadata,
  study_condition == "control",
  disease == "healthy",
  body_site == "stool",
  !is.na(age_category)
)
for (agecat in agecats){
sm1 <- filter(sm, age_category == agecat)
@lwaldron
lwaldron / lefser_pathwayab.R
Last active March 18, 2024 10:34
lefser on pathway abundances using ZellerG_2014 from cMD
suppressPackageStartupMessages({
library(lefser)
library(curatedMetagenomicData)
})
# Fetch the ZellerG_2014 pathway abundance data as counts; the call returns a
# list, of which the first element is used.
zeller_list <- curatedMetagenomicData(
  "ZellerG_2014.pathway_abundance",
  dryrun = FALSE,
  counts = TRUE
)
zeller <- zeller_list[[1]]
# Drop the columns (samples) whose study condition is "adenoma".
keep_samples <- zeller$study_condition != "adenoma"
zeller <- zeller[, keep_samples]
# Convert to relative abundances with relativeAb().
zeller <- relativeAb(zeller)
@lwaldron
lwaldron / gist:edea48dfda3c9db34b80a326f50fc5d1
Last active February 24, 2024 21:23
Select some UniRef IDs from curatedMetagenomicData studies, join, write to file
suppressPackageStartupMessages({
library(curatedMetagenomicData)
library(mia)
library(dplyr)
library(purrr)
})
datasets <- sampleMetadata |>
group_by(study_name) |>
count() |>