Created
October 28, 2024 21:25
-
-
Save tsibley/c1913925e085d20c645ccad6754c5eaf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml | |
index 7bd0362..dfddd22 100644 | |
--- a/ingest/defaults/config.yaml | |
+++ b/ingest/defaults/config.yaml | |
@@ -39,9 +39,11 @@ curate: | |
# For the Nextstrain team, this is currently | |
# "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" | |
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv" | |
- # The path to the local geolocation rules within the pathogen repo | |
+ # General global defaults are sourced from the geolocation_rules_url URL above. | |
+ # Pathogen-specific defaults are sourced from defaults/geolocation_rules.tsv. | |
+ # Local customizations are optional and are sourced from the path named here (~ means none). | |
# The path should be relative to the ingest directory. | |
- local_geolocation_rules: "defaults/geolocation_rules.tsv" | |
+ local_geolocation_rules: ~ | |
# List of field names to change where the key is the original field name and the value is the new field name | |
# The original field names should match the ncbi_datasets_fields provided above. | |
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names | |
@@ -93,10 +95,12 @@ curate: | |
authors_default_value: "?" | |
# Name to use for the generated abbreviated authors field | |
abbr_authors_field: "abbr_authors" | |
- # Path to the manual annotations file | |
- # The path should be relative to the ingest directory | |
- annotations: "defaults/annotations.tsv" | |
+ # Pathogen-specific default annotations are sourced from defaults/annotations.tsv. | |
+ # Local customizations are optional and are sourced from the path named here (~ means none). | |
+ # The path should be relative to the ingest directory. | |
+ annotations: ~ | |
# The ID field in the metadata to use to merge the manual annotations | |
+ # XXX FIXME: the default and local annotations files are concatenated into one file but may be keyed on different ID fields; this needs to be addressed | |
annotations_id: "accession" | |
# The ID field in the metadata to use as the sequence id in the output FASTA file | |
output_id_field: "accession" | |
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk | |
index 71ffa18..bfa088e 100644 | |
--- a/ingest/rules/curate.smk | |
+++ b/ingest/rules/curate.smk | |
@@ -13,10 +13,11 @@ OUTPUTS: | |
""" | |
-# The following two rules can be ignored if you choose not to use the | |
+# The following rule can be ignored if you choose not to use the | |
# generalized geolocation rules that are shared across pathogens. | |
# The Nextstrain team will try to maintain a generalized set of geolocation | |
-# rules that can then be overridden by local geolocation rules per pathogen repo. | |
+# rules that can then be overridden by additional geolocation rules per | |
+# pathogen repo. | |
rule fetch_general_geolocation_rules: | |
output: | |
general_geolocation_rules="data/general-geolocation-rules.tsv", | |
@@ -30,13 +31,28 @@ rule fetch_general_geolocation_rules: | |
rule concat_geolocation_rules: | |
input: | |
- general_geolocation_rules="data/general-geolocation-rules.tsv", | |
- local_geolocation_rules=config["curate"]["local_geolocation_rules"], | |
- output: | |
- all_geolocation_rules="data/all-geolocation-rules.tsv", | |
+ *filter(None, [ | |
+ "data/general-geolocation-rules.tsv", # Remove if not using above rule | |
+ "defaults/geolocation_rules.tsv", | |
+ config["curate"]["local_geolocation_rules"], | |
+ ]) | |
+ output: "data/all-geolocation-rules.tsv", | |
shell: | |
""" | |
- cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules} | |
+ cat {input:q} > {output:q} | |
+ """ | |
+ | |
+ | |
+rule concat_annotations: | |
+ input: | |
+ *filter(None, [ | |
+ "defaults/annotations.tsv", | |
+ config["curate"]["annotations"], | |
+ ]) | |
+ output: "data/all-annotations.tsv", | |
+ shell: | |
+ """ | |
+ cat {input:q} > {output:q} | |
""" | |
@@ -57,9 +73,8 @@ def format_field_map(field_map: dict[str, str]) -> str: | |
rule curate: | |
input: | |
sequences_ndjson="data/ncbi.ndjson", | |
- # Change the geolocation_rules input path if you are removing the above two rules | |
all_geolocation_rules="data/all-geolocation-rules.tsv", | |
- annotations=config["curate"]["annotations"], | |
+ annotations="data/all-annotations.tsv", | |
output: | |
metadata="data/all_metadata.tsv", | |
sequences="results/sequences.fasta", |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment