Add support for nextstrain run

joverlee521 · joverlee521 · commit 386f5d916fc3 · 2025-09-22T17:06:32.000-07:00
Workflow changes to support running the workflows with the `nextstrain run` command. I specifically chose _not_ to declare compatibility for `nextstrain run` in the `nextstrain-pathogen.yaml` because the phylogenetic and nextclade workflows are incomplete in this guide. Once workflow-specific compatibility is supported,¹ we can declare compatibility for the ingest workflow. ¹ <nextstrain/cli#472>
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -2,11 +2,14 @@
 This is the main ingest Snakefile that orchestrates the full ingest workflow
 and defines its default outputs.
 """
-# The workflow filepaths are written relative to this Snakefile's base directory
-workdir: workflow.current_basedir
 
-# Use default configuration values. Override with Snakemake's --configfile/--config options.
-configfile: "defaults/config.yaml"
+# Use default configuration values. Extend with Snakemake's --configfile/--config options.
+configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
+
+# Use custom configuration from analysis directory (i.e. working dir), if any.
+if os.path.exists("config.yaml"):
+    configfile: "config.yaml"
+
 
 # This is the default rule that Snakemake will run when there are no specified targets.
 # The default output of the ingest workflow is usually the curated metadata and sequences.
@@ -21,6 +24,10 @@ rule all:
         "results/metadata.tsv",
 
 
+# Shared Snakemake files with generic functions are shared across pathogens
+# Use `resolve_config_path` to resolve file paths for config files
+include: "../shared/vendored/snakemake/config.smk"
+
 # Note that only PATHOGEN-level customizations should be added to these
 # core steps, meaning they are custom rules necessary for all builds of the pathogen.
 # If there are build-specific customizations, they should be added with the
@@ -70,4 +77,10 @@ else:
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
 
-        include: rule_file
+        # Relative custom rule paths in the config are relative to the analysis
+        # directory (i.e. the current working directory, or workdir, usually
+        # given by --directory), but the "include" directive treats relative
+        # paths as relative to the workflow (e.g. workflow.current_basedir).
+        # Convert to an absolute path based on the analysis/current directory
+        # to avoid this mismatch of expectations.
+        include: os.path.join(os.getcwd(), rule_file)
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -35,9 +35,10 @@ ncbi_datasets_fields:
 
 # Config parameters related to the curate pipeline
 curate:
-  # The path to the local geolocation rules within the pathogen repo
-  # The path should be relative to the ingest directory.
-  local_geolocation_rules: "defaults/geolocation_rules.tsv"
+  # The path to the local geolocation rules for this pathogen.
+  # The path should be relative to the working directory (e.g. --directory).
+  # If the path doesn't exist in the working directory, the file in the workflow's defaults/ directory is used instead (if it exists).
+  local_geolocation_rules: "geolocation_rules.tsv"
   # List of field names to change where the key is the original field name and the value is the new field name
   # The original field names should match the ncbi_datasets_fields provided above.
   # This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
@@ -90,8 +91,9 @@ curate:
   # Name to use for the generated abbreviated authors field
   abbr_authors_field: "authors"
   # Path to the manual annotations file
-  # The path should be relative to the ingest directory
-  annotations: "defaults/annotations.tsv"
+  # The path should be relative to the working directory (e.g. --directory).
+  # If the path doesn't exist in the working directory, the file in the workflow's defaults/ directory is used instead (if it exists).
+  annotations: "annotations.tsv"
   # The ID field in the metadata to use to merge the manual annotations
   annotations_id: "accession"
   # The ID field in the metadata to use as the sequence id in the output FASTA file
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -32,8 +32,8 @@ def format_field_map(field_map: dict[str, str]) -> list[str]:
 rule curate:
     input:
         sequences_ndjson="data/ncbi.ndjson",
-        geolocation_rules=config["curate"]["local_geolocation_rules"],
-        annotations=config["curate"]["annotations"],
+        geolocation_rules=resolve_config_path(config["curate"]["local_geolocation_rules"]),
+        annotations=resolve_config_path(config["curate"]["annotations"]),
     output:
         metadata="data/all_metadata.tsv",
         sequences="results/sequences.fasta",
diff --git a/nextclade/Snakefile b/nextclade/Snakefile
@@ -2,11 +2,12 @@
 This is the main Nextclade Snakefile that orchestrates the workflow to produce
 a Nextclade dataset.
 """
-# The workflow filepaths are written relative to this Snakefile's base directory
-workdir: workflow.current_basedir
+# Use default configuration values. Extend with Snakemake's --configfile/--config options.
+configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
 
-# Use default configuration values. Override with Snakemake's --configfile/--config options.
-configfile: "defaults/config.yaml"
+# Use custom configuration from analysis directory (i.e. working dir), if any.
+if os.path.exists("config.yaml"):
+    configfile: "config.yaml"
 
 # This is the default rule that Snakemake will run when there are no specified targets.
 # The default output of the Nextclade workflow is usually the produced Nextclade dataset.
@@ -17,6 +18,10 @@ rule all:
         # Fill in paths to the final exported Nextclade dataset.
 
 
+# Shared Snakemake files with generic functions are shared across pathogens
+# Use `resolve_config_path` to resolve file paths for config files
+include: "../shared/vendored/snakemake/config.smk"
+
 # These rules are imported in the order that they are expected to run.
 # Each Snakefile will have documented inputs and outputs that should be kept as
 # consistent interfaces across pathogen repos. This allows us to define typical
@@ -45,4 +50,10 @@ include: "rules/export.smk"
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
 
-        include: rule_file
+        # Relative custom rule paths in the config are relative to the analysis
+        # directory (i.e. the current working directory, or workdir, usually
+        # given by --directory), but the "include" directive treats relative
+        # paths as relative to the workflow (e.g. workflow.current_basedir).
+        # Convert to an absolute path based on the analysis/current directory
+        # to avoid this mismatch of expectations.
+        include: os.path.join(os.getcwd(), rule_file)
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
@@ -2,11 +2,12 @@
 This is the main phylogenetic Snakefile that orchestrates the full phylogenetic
 workflow and defines its default output(s).
 """
-# The workflow filepaths are written relative to this Snakefile's base directory
-workdir: workflow.current_basedir
+# Use default configuration values. Extend with Snakemake's --configfile/--config options.
+configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
 
-# Use default configuration values. Override with Snakemake's --configfile/--config options.
-configfile: "defaults/config.yaml"
+# Use custom configuration from analysis directory (i.e. working dir), if any.
+if os.path.exists("config.yaml"):
+    configfile: "config.yaml"
 
 
 # This is the default rule that Snakemake will run when there are no specified targets.
@@ -21,6 +22,7 @@ rule all:
 
 
 # Shared Snakemake files with generic functions are shared across pathogens
+# Use `resolve_config_path` to resolve file paths for config files
 include: "../shared/vendored/snakemake/config.smk"
 include: "../shared/vendored/snakemake/remote_files.smk"
 
@@ -53,4 +55,10 @@ include: "rules/export.smk"
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
 
-        include: rule_file
+        # Relative custom rule paths in the config are relative to the analysis
+        # directory (i.e. the current working directory, or workdir, usually
+        # given by --directory), but the "include" directive treats relative
+        # paths as relative to the workflow (e.g. workflow.current_basedir).
+        # Convert to an absolute path based on the analysis/current directory
+        # to avoid this mismatch of expectations.
+        include: os.path.join(os.getcwd(), rule_file)