Skip to content

Instantly share code, notes, and snippets.

@tsibley
Created October 28, 2024 21:25
Show Gist options
  • Save tsibley/c1913925e085d20c645ccad6754c5eaf to your computer and use it in GitHub Desktop.
Save tsibley/c1913925e085d20c645ccad6754c5eaf to your computer and use it in GitHub Desktop.
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 7bd0362..dfddd22 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -39,9 +39,11 @@ curate:
# For the Nextstrain team, this is currently
# "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
- # The path to the local geolocation rules within the pathogen repo
+ # General global defaults are sourced from the geolocation_rules_url above.
+ # Pathogen-specific defaults are sourced from defaults/geolocation_rules.tsv.
+ # Local customizations are sourced from the path named here; leave as ~ (null) for none.
# The path should be relative to the ingest directory.
- local_geolocation_rules: "defaults/geolocation_rules.tsv"
+ local_geolocation_rules: ~
# List of field names to change where the key is the original field name and the value is the new field name
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
@@ -93,10 +95,12 @@ curate:
authors_default_value: "?"
# Name to use for the generated abbreviated authors field
abbr_authors_field: "abbr_authors"
- # Path to the manual annotations file
- # The path should be relative to the ingest directory
- annotations: "defaults/annotations.tsv"
+ # Pathogen-specific default annotations are sourced from defaults/annotations.tsv.
+ # Local customizations are sourced from the path named here; leave as ~ (null) for none.
+ # The path should be relative to the ingest directory.
+ annotations: ~
# The ID field in the metadata to use to merge the manual annotations
+ # XXX FIXME: defaults and local annotations are concatenated and merged on this one ID field; address differing id fields between defaults and local
annotations_id: "accession"
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: "accession"
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index 71ffa18..bfa088e 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -13,10 +13,11 @@ OUTPUTS:
"""
-# The following two rules can be ignored if you choose not to use the
+# The following rule can be ignored if you choose not to use the
# generalized geolocation rules that are shared across pathogens.
# The Nextstrain team will try to maintain a generalized set of geolocation
-# rules that can then be overridden by local geolocation rules per pathogen repo.
+# rules that can then be overridden by additional geolocation rules per
+# pathogen repo.
rule fetch_general_geolocation_rules:
output:
general_geolocation_rules="data/general-geolocation-rules.tsv",
@@ -30,13 +31,28 @@ rule fetch_general_geolocation_rules:
rule concat_geolocation_rules:
input:
- general_geolocation_rules="data/general-geolocation-rules.tsv",
- local_geolocation_rules=config["curate"]["local_geolocation_rules"],
- output:
- all_geolocation_rules="data/all-geolocation-rules.tsv",
+ *filter(None, [
+ "data/general-geolocation-rules.tsv", # Remove if not using above rule
+ "defaults/geolocation_rules.tsv",
+ config["curate"]["local_geolocation_rules"],
+ ])
+ output: "data/all-geolocation-rules.tsv",
shell:
"""
- cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules}
+ cat {input:q} > {output:q}
+ """
+
+
+rule concat_annotations:
+ input:
+ *filter(None, [
+ "defaults/annotations.tsv",
+ config["curate"]["annotations"],
+ ])
+ output: "data/all-annotations.tsv",
+ shell:
+ """
+ cat {input:q} > {output:q}
"""
@@ -57,9 +73,8 @@ def format_field_map(field_map: dict[str, str]) -> str:
rule curate:
input:
sequences_ndjson="data/ncbi.ndjson",
- # Change the geolocation_rules input path if you are removing the above two rules
all_geolocation_rules="data/all-geolocation-rules.tsv",
- annotations=config["curate"]["annotations"],
+ annotations="data/all-annotations.tsv",
output:
metadata="data/all_metadata.tsv",
sequences="results/sequences.fasta",
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment