Skip to content

Instantly share code, notes, and snippets.

@lwaldron
Last active July 30, 2025 13:43
Show Gist options
  • Save lwaldron/f4fcc974ced551d7901420e26e61a4e8 to your computer and use it in GitHub Desktop.
Save lwaldron/f4fcc974ced551d7901420e26e61a4e8 to your computer and use it in GitHub Desktop.
Create parquet files from only the first two pMD tsv files for each file type
-- Install and load the httpfs extension to read from GCS
install httpfs;
load httpfs;
-- Create a secret for accessing Google Cloud Storage using a JSON key file
CREATE SECRET metagenomics_mac (
TYPE GCS,
KEY_FILE '/path/to/your/gcp-credentials.json'
);
-- Define a variable for the local output directory to make the script easier to manage.
SET local_path = '~/Downloads';
-- NOTE: Each of the following COPY statements has been modified to process only
-- the first 2 matching files from the GCS bucket for testing purposes.
-- You can change `LIMIT 2` to any number of files you'd like to include.
-- MetaPhlAn
-- Create a small test parquet file for viral clusters from the first 2 TSV files found.
copy (
select * from read_csv(
(select file from glob('gs://metagenomics-mac/results/cMDv4/*/metaphlan_lists/metaphlan_viruses_list.tsv.gz') limit 2),
filename=True,
delim='\t',
comment='#',
union_by_name=True
)
) to local_path || 'viral_clusters.parquet' (format parquet, compression 'zstd');
-- Create a small test parquet file for relative abundance from the first 2 TSV files found.
copy (
select * from read_csv(
(select file from glob('gs://metagenomics-mac/results/cMDv4/*/metaphlan_lists/metaphlan_unknown_list.tsv.gz') limit 2),
filename=True,
delim='\t',
comment='#',
union_by_name=True
)
) to local_path || 'relative_abundance.parquet' (format parquet, compression 'zstd');
-- Create a small test parquet file for marker abundance from the first 2 TSV files found.
copy (
select * from read_csv_auto(
(select file from glob('gs://metagenomics-mac/results/cMDv4/*/metaphlan_markers/marker_abundance.tsv.gz') limit 2),
filename=True,
delim = '\t',
comment='#',
union_by_name=True
)
) to local_path || 'marker_abundance.parquet' (format parquet, compression 'zstd');
-- Create a small test parquet file for marker presence from the first 2 TSV files found.
copy (
select * from read_csv_auto(
(select file from glob('gs://metagenomics-mac/results/cMDv4/*/metaphlan_markers/marker_presence.tsv.gz') limit 2),
filename=True,
delim = '\t',
comment='#',
union_by_name=True
)
) to local_path || 'marker_presence.parquet' (format parquet, compression 'zstd');
-- HUMAnN
-- The pattern is repeated for all HUMAnN files, using the local_path variable.
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_cpm.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_cpm.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_relab.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_relab.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_stratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_unstratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_cpm_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_cpm_stratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_relab_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_relab_stratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_cpm_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_cpm_unstratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_relab_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_relab_unstratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_cpm.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_cpm.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_relab.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_relab.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_stratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_unstratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_cpm_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_cpm_stratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_relab_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_relab_stratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_cpm_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_cpm_unstratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_relab_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_relab_unstratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathcoverage_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathcoverage_unstratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathcoverage_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathcoverage_stratified.parquet' (format parquet, compression 'zstd');
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathcoverage.tsv.gz') limit 2), filename=True)) to local_path || 'pathcoverage.parquet' (format parquet, compression 'zstd');
@lwaldron
Copy link
Author

lwaldron commented Jul 30, 2025

Run from the command-line: duckdb < create_test_parquets.sql

See instructions for obtaining your Google key ID and secret in the Google Cloud Storage vignette

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment