Last active
July 30, 2025 13:43
-
-
Save lwaldron/f4fcc974ced551d7901420e26e61a4e8 to your computer and use it in GitHub Desktop.
Create parquet files from only the first two pMD tsv files for each file type
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Install and load the httpfs extension to read from GCS | |
install httpfs; | |
load httpfs; | |
-- Create a secret for accessing Google Cloud Storage using a JSON key file | |
CREATE SECRET metagenomics_mac ( | |
TYPE GCS, | |
KEY_FILE '/path/to/your/gcp-credentials.json' | |
); | |
-- Define a variable for the local output directory to make the script easier to manage. | |
SET local_path = '~/Downloads'; | |
-- NOTE: Each of the following COPY statements has been modified to process only | |
-- the first 2 matching files from the GCS bucket for testing purposes. | |
-- You can change `LIMIT 2` to any number of files you'd like to include. | |
-- MetaPhlAn | |
-- Create a small test parquet file for viral clusters from the first 2 TSV files found. | |
copy ( | |
select * from read_csv( | |
(select file from glob('gs://metagenomics-mac/results/cMDv4/*/metaphlan_lists/metaphlan_viruses_list.tsv.gz') limit 2), | |
filename=True, | |
delim='\t', | |
comment='#', | |
union_by_name=True | |
) | |
) to local_path || 'viral_clusters.parquet' (format parquet, compression 'zstd'); | |
-- Create a small test parquet file for relative abundance from the first 2 TSV files found. | |
copy ( | |
select * from read_csv( | |
(select file from glob('gs://metagenomics-mac/results/cMDv4/*/metaphlan_lists/metaphlan_unknown_list.tsv.gz') limit 2), | |
filename=True, | |
delim='\t', | |
comment='#', | |
union_by_name=True | |
) | |
) to local_path || 'relative_abundance.parquet' (format parquet, compression 'zstd'); | |
-- Create a small test parquet file for marker abundance from the first 2 TSV files found. | |
copy ( | |
select * from read_csv_auto( | |
(select file from glob('gs://metagenomics-mac/results/cMDv4/*/metaphlan_markers/marker_abundance.tsv.gz') limit 2), | |
filename=True, | |
delim = '\t', | |
comment='#', | |
union_by_name=True | |
) | |
) to local_path || 'marker_abundance.parquet' (format parquet, compression 'zstd'); | |
-- Create a small test parquet file for marker presence from the first 2 TSV files found. | |
copy ( | |
select * from read_csv_auto( | |
(select file from glob('gs://metagenomics-mac/results/cMDv4/*/metaphlan_markers/marker_presence.tsv.gz') limit 2), | |
filename=True, | |
delim = '\t', | |
comment='#', | |
union_by_name=True | |
) | |
) to local_path || 'marker_presence.parquet' (format parquet, compression 'zstd'); | |
-- HUMAnN | |
-- The pattern is repeated for all HUMAnN files, using the local_path variable. | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_cpm.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_cpm.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_relab.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_relab.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_stratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_unstratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_cpm_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_cpm_stratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_relab_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_relab_stratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_cpm_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_cpm_unstratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_genefamilies_relab_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'genefamilies_relab_unstratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_cpm.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_cpm.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_relab.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_relab.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_stratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_unstratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_cpm_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_cpm_stratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_relab_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_relab_stratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_cpm_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_cpm_unstratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathabundance_relab_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathabundance_relab_unstratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathcoverage_unstratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathcoverage_unstratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathcoverage_stratified.tsv.gz') limit 2), filename=True)) to local_path || 'pathcoverage_stratified.parquet' (format parquet, compression 'zstd'); | |
copy (select * from read_csv_auto((select file from glob('gs://metagenomics-mac/results/cMDv4/*/humann/out_pathcoverage.tsv.gz') limit 2), filename=True)) to local_path || 'pathcoverage.parquet' (format parquet, compression 'zstd'); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Run from the command-line:
duckdb < create_test_parquets.sql
See instructions for obtaining your Google key ID and secret in the Google Cloud Storage vignette