# Source: GitHub gist dwinston/b16922ff304c59b3c8ba675b9069b61a.
# GitHub notice: this file contains hidden or bidirectional Unicode text that may
# be interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
from pymongo import MongoClient

# No connection arguments are given, so pymongo's defaults apply
# (localhost:27017 according to the pymongo documentation).
client = MongoClient()
# Base stages of the denormalization pipeline. Further stages are appended to
# this list before it is executed against biosample_set.
aggregation = [
    # Parse the raw collection date string into a date value stored beside it.
    # NOTE(review): no onError/onNull is supplied; per the MongoDB docs an
    # unparseable string makes $dateFromString raise — confirm every
    # has_raw_value is parseable or add onError.
    {
        "$set": {
            "collection_date.has_date_value": {
                "$dateFromString": {
                    "dateString": "$collection_date.has_raw_value",
                },
            },
        },
    },
    # Join the study referenced by part_of.0 into a "study" array.
    {
        "$lookup": {
            "from": "study_set",
            "localField": "part_of.0",
            "foreignField": "id",
            "as": "study",
        },
    },
    # Join every omics_processing record whose has_input.0 is this sample's id.
    {
        "$lookup": {
            "from": "omics_processing_set",
            "localField": "id",
            "foreignField": "has_input.0",
            "as": "omics_processing",
        },
    },
    # Derive a sorted list of the omics types present for this sample.
    {
        "$set": {
            "multiomics": {
                "$sortArray": {
                    "input": {
                        # This set difference removes duplicate omics types and removes lipidomics
                        "$setDifference": [
                            "$omics_processing.omics_type.has_raw_value",
                            ["Lipidomics"],
                        ],
                    },
                    "sortBy": 1,
                },
            },
        },
    },
]
# Maps each activity collection prefix (the collection queried is
# "<key>_set") to the type label stamped onto that activity's data objects.
# Insertion order matters: it fixes the order in which the per-type arrays
# are concatenated later in the pipeline.
activity_types = {
    "mags_activity": "nmdc:MAGsAnalysisActivity",
    "metabolomics_analysis_activity": "nmdc:MetabolomicsAnalysisActivity",
    "metagenome_annotation_activity": "nmdc:MetagenomeAnnotation",
    "metagenome_assembly": "nmdc:MetagenomeAssembly",
    "metaproteomics_analysis_activity": "nmdc:MetaProteomicAnalysis",
    "metatranscriptome_activity": "nmdc:metaT",
    "nom_analysis_activity": "nmdc:NomAnalysisActivity",
}
# For every activity type, append two stages: one joining the activities
# informed by this sample's omics_processing records, and one joining the
# data objects those activities output.
for activity_type, type_label in activity_types.items():
    # Pull in activities and data_objects associated with each omics_processing
    aggregation.extend([
        {
            "$lookup": {
                "from": f"{activity_type}_set",
                "localField": "omics_processing.id",
                "foreignField": "was_informed_by",
                "as": activity_type,
            },
        },
        {
            "$lookup": {
                "from": "data_object_set",
                "localField": f"{activity_type}.has_output",
                "foreignField": "id",
                "as": f"{activity_type}_data_object",
                # Tag each joined data object with the type of activity that
                # produced it, since the per-type arrays are merged later.
                "pipeline": [
                    {"$set": {"activity_type": type_label}},
                ],
            },
        },
    ])
# Append the stages that attach gene-function annotations, merge the per-type
# activity arrays into one, and add a per-sample omics_processing count.
aggregation.extend([
    # Lookup metagenome annotations
    {
        "$lookup": {
            "from": "functional_annotation_agg",
            "localField": "metagenome_annotation_activity.id",
            "foreignField": "metagenome_annotation_id",
            "as": "metagenome_annotation",
            # Rename the joined fields to the common id / activity_id shape.
            "pipeline": [
                {
                    "$set": {
                        "id": "$gene_function_id",
                        "activity_id": "$metagenome_annotation_id",
                    },
                },
                {"$unset": ["_id", "metagenome_annotation_id", "gene_function_id"]},
            ],
        },
    },
    # Lookup metaproteomics annotations
    {
        "$lookup": {
            "from": "metap_gene_function_aggregation",
            "localField": "metaproteomics_analysis_activity.id",
            "foreignField": "metaproteomic_analysis_id",
            "as": "metaproteomics_annotation",
            # Same renaming as above so both annotation sources line up.
            "pipeline": [
                {
                    "$set": {
                        "id": "$gene_function_id",
                        "activity_id": "$metaproteomic_analysis_id",
                    },
                },
                {"$unset": ["_id", "metaproteomic_analysis_id", "gene_function_id"]},
            ],
        },
    },
    # Combine annotations into a single annotation array
    {
        "$set": {
            "gene_function": {
                "$concatArrays": ["$metagenome_annotation", "$metaproteomics_annotation"],
            },
        },
    },
    {
        "$unset": ["metagenome_annotation", "metaproteomics_annotation"],
    },
    # Combine all analyses into a single activity array
    {
        "$set": {
            "activity": {
                "$concatArrays": [f"${activity_type}" for activity_type in activity_types],
            },
        },
    },
    # Remove the monstrous has_peptide_quantifications array
    {
        "$set": {
            "activity": {
                "$map": {
                    "input": "$activity",
                    "as": "d",
                    "in": {
                        # Setting a field to $$REMOVE drops it from the element.
                        "$setField": {
                            "field": "has_peptide_quantifications",
                            "value": "$$REMOVE",
                            "input": "$$d",
                        },
                    },
                },
            },
        },
    },
    # We are done with the separate activity types since they are all in the activity array now
    {
        "$unset": list(activity_types),
    },
    # Add a count so we can sort by the number of analyses each sample has
    {
        "$set": {
            "omics_processing_count": {
                "$size": "$omics_processing",
            },
        },
    },
])
# Append the final stages: merge the per-type data object arrays into one,
# drop the per-type arrays, and write the result to a collection.
aggregation.extend([
    # Combine every "<activity_type>_data_object" array into a single array.
    {
        "$set": {
            "data_object": {
                "$concatArrays": [
                    f"${activity_type}_data_object" for activity_type in activity_types
                ],
            },
        },
    },
    # The per-type arrays are redundant once merged.
    {
        "$unset": [f"{activity_type}_data_object" for activity_type in activity_types],
    },
    # Write the pipeline result to the "denormalized" collection.
    {
        "$out": "denormalized",
    },
])
# Run the pipeline on nmdc.biosample_set. The pipeline ends in a $out stage,
# so the results are written to the "denormalized" collection on the server;
# per the MongoDB docs the returned cursor then yields no documents.
q = client.nmdc.biosample_set.aggregate(aggregation)
# GitHub page footer (not part of the script): sign up or sign in on GitHub to comment on this gist.