Skip to content

Commit 6b49522

Browse files
Add Sensitivity workflow step and Monte Carlo parallelism improvements
- Add `type: Sensitivity` YAML workflow step for bottleneck analysis, calling FailureManager.run_sensitivity_monte_carlo() and storing baseline, flow_results, component_scores, and context in results - Default Monte Carlo parallelism to "auto" (all CPU cores) instead of 1 - Add Monte Carlo sensitivity docs and examples to api.md Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 222b0b5 commit 6b49522

5 files changed

Lines changed: 658 additions & 8 deletions

File tree

docs/reference/api.md

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,33 @@ for pair, edge_impacts in sensitivity.items():
333333
print(f" {edge_key}: -{flow_reduction:.2f}")
334334
```
335335

336+
### Monte Carlo Sensitivity Analysis

Run sensitivity analysis across random failure scenarios to identify which
components are most critical under realistic failure conditions:

```python
from ngraph import FailureManager

# fm = FailureManager(network=network, ...)  # see FailureManager section below

# parallelism defaults to "auto" (all CPU cores)
results = fm.run_sensitivity_monte_carlo(
    source="^metro1/.*",
    target="^metro5/.*",
    mode="combine",
    iterations=100,
    seed=42,
)

# Aggregated component scores across all failure scenarios
for flow_key, components in results["component_scores"].items():
    sorted_comps = sorted(components.items(), key=lambda x: -x[1]["mean"])
    print(f"Flow: {flow_key}")
    for comp_key, stats in sorted_comps[:5]:
        print(f"  {comp_key}: mean={stats['mean']:.1f}, count={stats['count']:.0f}")
```
362+
336363
## 5. Monte Carlo Analysis
337364

338365
Probabilistic failure analysis using FailureManager.
@@ -366,13 +393,12 @@ fm = FailureManager(
366393
policy_name="single_link"
367394
)
368395

369-
# Run max-flow Monte Carlo analysis
396+
# Run max-flow Monte Carlo analysis (parallelism defaults to "auto")
370397
results = fm.run_max_flow_monte_carlo(
371398
source="^A$",
372399
target="^C$",
373400
mode="combine",
374401
iterations=100,
375-
parallelism=1,
376402
seed=42 # For reproducibility
377403
)
378404

@@ -384,9 +410,16 @@ for iter_result in results["results"]:
384410
**Key Methods:**
385411

386412
- `run_max_flow_monte_carlo(...)` - Max-flow capacity analysis under failures
413+
- `run_sensitivity_monte_carlo(...)` - Component criticality analysis under failures
387414
- `run_demand_placement_monte_carlo(...)` - Traffic demand placement under failures
388415
- `run_monte_carlo_analysis(analysis_func, ...)` - Generic Monte Carlo with custom function
389416

417+
**Performance note:** All Monte Carlo convenience methods default to `parallelism="auto"`
(all CPU cores). Set `parallelism=1` to force serial execution if needed. The C++ Core
backend releases the GIL during computation, enabling true parallelism with threads.
Sensitivity analysis is significantly more expensive per iteration than max-flow
(~1-2s vs ~0.002s per iteration on a 1,280-node network), so parallelism provides
substantial speedup.
422+
390423
## 6. Workflow Steps
391424

392425
Pre-built analysis steps for YAML-driven workflows.

ngraph/analysis/failure_manager.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,22 @@ def _create_cache_key(
9494
return base_key + (tuple(hashable_kwargs),)
9595

9696

97+
def _resolve_parallelism(parallelism: int | str) -> int:
98+
"""Resolve parallelism setting to a concrete worker count.
99+
100+
Args:
101+
parallelism: Either an integer worker count or "auto" for CPU count.
102+
103+
Returns:
104+
Positive integer worker count (minimum 1).
105+
"""
106+
if isinstance(parallelism, str):
107+
if parallelism != "auto":
108+
raise ValueError("parallelism must be an integer or 'auto'")
109+
return max(1, int(os.cpu_count() or 1))
110+
return max(1, int(parallelism))
111+
112+
97113
def _auto_adjust_parallelism(parallelism: int, analysis_func: Any) -> int:
98114
"""Adjust parallelism based on function characteristics.
99115
@@ -759,7 +775,7 @@ def run_max_flow_monte_carlo(
759775
target: str | dict[str, Any],
760776
mode: str = "combine",
761777
iterations: int = 100,
762-
parallelism: int = 1,
778+
parallelism: int | str = "auto",
763779
shortest_path: bool = False,
764780
require_capacity: bool = True,
765781
flow_placement: FlowPlacement | str = FlowPlacement.PROPORTIONAL,
@@ -781,7 +797,9 @@ def run_max_flow_monte_carlo(
781797
target: Target node selector (string path or selector dict).
782798
mode: "combine" (aggregate) or "pairwise" (individual flows).
783799
iterations: Number of failure scenarios to simulate.
784-
parallelism: Number of parallel workers (auto-adjusted if needed).
800+
parallelism: Number of parallel workers. Defaults to ``"auto"``
801+
(use all CPU cores). Set to ``1`` for serial execution. The C++ Core
802+
backend releases the GIL, enabling true parallelism with threads.
785803
shortest_path: Whether to use shortest paths only.
786804
require_capacity: If True (default), path selection considers available
787805
capacity. If False, path selection is cost-only (true IP/IGP semantics).
@@ -800,6 +818,9 @@ def run_max_flow_monte_carlo(
800818
"""
801819
from ngraph.analysis.functions import max_flow_analysis
802820

821+
# Resolve "auto" parallelism to CPU count
822+
parallelism = _resolve_parallelism(parallelism)
823+
803824
# Convert string flow_placement to enum if needed
804825
if isinstance(flow_placement, str):
805826
flow_placement = FlowPlacement.from_string(flow_placement)
@@ -884,7 +905,7 @@ def run_demand_placement_monte_carlo(
884905
demands_config: list[dict[str, Any]]
885906
| Any, # List of demand configs or DemandSet
886907
iterations: int = 100,
887-
parallelism: int = 1,
908+
parallelism: int | str = "auto",
888909
placement_rounds: int | str = "auto",
889910
seed: int | None = None,
890911
store_failure_patterns: bool = False,
@@ -901,7 +922,9 @@ def run_demand_placement_monte_carlo(
901922
Args:
902923
demands_config: List of demand configs or DemandSet object.
903924
iterations: Number of failure scenarios to simulate.
904-
parallelism: Number of parallel workers (auto-adjusted if needed).
925+
parallelism: Number of parallel workers. Defaults to ``"auto"``
926+
(use all CPU cores). Set to ``1`` for serial execution. The C++ Core
927+
backend releases the GIL, enabling true parallelism with threads.
905928
placement_rounds: Optimization rounds for demand placement.
906929
seed: Optional seed for reproducible results.
907930
store_failure_patterns: Whether to store failure trace on results.
@@ -917,6 +940,9 @@ def run_demand_placement_monte_carlo(
917940
"""
918941
from ngraph.analysis.functions import demand_placement_analysis
919942

943+
# Resolve "auto" parallelism to CPU count
944+
parallelism = _resolve_parallelism(parallelism)
945+
920946
# If caller passed a sequence of TrafficDemand objects, convert to dicts
921947
if not isinstance(demands_config, list):
922948
# Accept DemandSet or any container providing get_all_demands()
@@ -962,7 +988,7 @@ def run_sensitivity_monte_carlo(
962988
target: str | dict[str, Any],
963989
mode: str = "combine",
964990
iterations: int = 100,
965-
parallelism: int = 1,
991+
parallelism: int | str = "auto",
966992
shortest_path: bool = False,
967993
flow_placement: FlowPlacement | str = FlowPlacement.PROPORTIONAL,
968994
seed: int | None = None,
@@ -976,12 +1002,21 @@ def run_sensitivity_monte_carlo(
9761002
9771003
Baseline (no failures) is always run first as a separate reference.
9781004
1005+
.. note::
1006+
1007+
Sensitivity analysis is significantly more expensive per iteration than
1008+
plain max-flow (~1-2s vs ~0.002s per iteration on a 1,280-node network).
1009+
Multi-threaded execution (the default ``"auto"`` parallelism) provides
1010+
significant speedup. The C++ Core backend releases the GIL during
1011+
computation, enabling true parallelism with threads.
1012+
9791013
Args:
9801014
source: Source node selector (string path or selector dict).
9811015
target: Target node selector (string path or selector dict).
9821016
mode: "combine" (aggregate) or "pairwise" (individual flows).
9831017
iterations: Number of failure scenarios to simulate.
984-
parallelism: Number of parallel workers (auto-adjusted if needed).
1018+
parallelism: Number of parallel workers. Defaults to ``"auto"``
1019+
(use all CPU cores). Set to ``1`` for serial execution.
9851020
shortest_path: Whether to use shortest paths only.
9861021
flow_placement: Flow placement strategy.
9871022
seed: Optional seed for reproducible results.
@@ -997,6 +1032,9 @@ def run_sensitivity_monte_carlo(
9971032
"""
9981033
from ngraph.analysis.functions import sensitivity_analysis
9991034

1035+
# Resolve "auto" parallelism to CPU count
1036+
parallelism = _resolve_parallelism(parallelism)
1037+
10001038
# Convert string flow_placement to enum if needed
10011039
if isinstance(flow_placement, str):
10021040
flow_placement = FlowPlacement.from_string(flow_placement)

ngraph/workflow/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from .max_flow_step import MaxFlow
77
from .maximum_supported_demand_step import MaximumSupportedDemand
88
from .network_stats import NetworkStats
9+
from .sensitivity_step import Sensitivity
910
from .traffic_matrix_placement_step import TrafficMatrixPlacement
1011

1112
__all__ = [
@@ -14,6 +15,7 @@
1415
"BuildGraph",
1516
"MaxFlow",
1617
"NetworkStats",
18+
"Sensitivity",
1719
"TrafficMatrixPlacement",
1820
"MaximumSupportedDemand",
1921
"CostPower",
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
"""Sensitivity workflow step.
2+
3+
Monte Carlo sensitivity analysis of network bottlenecks between node groups
4+
using FailureManager. Identifies critical edges and quantifies their impact
5+
on flow capacity across failure scenarios.
6+
7+
Baseline (no failures) is always run first as a separate reference. The
8+
``iterations`` parameter specifies how many failure scenarios to run.
9+
Per-iteration results include per-edge flow-reduction deltas. Aggregated
10+
``component_scores`` summarize mean/max/min impact across all iterations.
11+
12+
YAML Configuration Example:
13+
14+
workflow:
15+
- type: Sensitivity
16+
name: "bottleneck_analysis"
17+
source: "^datacenter/.*"
18+
target: "^edge/.*"
19+
mode: "combine"
20+
failure_policy: "random_failures"
21+
iterations: 100
22+
parallelism: auto
23+
shortest_path: false
24+
flow_placement: "PROPORTIONAL"
25+
seed: 42
26+
store_failure_patterns: false
27+
"""
28+
29+
from __future__ import annotations
30+
31+
import time
32+
from dataclasses import dataclass
33+
from typing import TYPE_CHECKING, Any, Dict, Union
34+
35+
from ngraph.analysis.failure_manager import FailureManager
36+
from ngraph.logging import get_logger
37+
from ngraph.results.flow import FlowIterationResult
38+
from ngraph.types.base import FlowPlacement
39+
from ngraph.workflow.base import (
40+
WorkflowStep,
41+
register_workflow_step,
42+
resolve_parallelism,
43+
)
44+
45+
if TYPE_CHECKING:
46+
from ngraph.scenario import Scenario
47+
48+
logger = get_logger(__name__)
49+
50+
51+
@dataclass
class Sensitivity(WorkflowStep):
    """Monte Carlo sensitivity analysis workflow step.

    Measures how much flow capacity between the selected node groups drops
    when individual network edges are removed, repeated across Monte Carlo
    failure scenarios. Output contains per-iteration sensitivity maps plus
    aggregated component scores.

    Baseline (no failures) is always run first as a separate reference. The
    flow_results list holds unique (deduplicated) failure patterns; each
    entry carries an occurrence_count telling how many iterations produced
    that pattern.

    Attributes:
        source: Source node selector (string path or selector dict).
        target: Target node selector (string path or selector dict).
        mode: Flow analysis mode ("combine" or "pairwise").
        failure_policy: Name of failure policy in scenario.failure_policy_set.
        iterations: Number of failure iterations to run.
        parallelism: Number of parallel worker threads, or "auto".
        shortest_path: Whether to use shortest paths only.
        flow_placement: Flow placement strategy.
        seed: Optional seed for reproducible results.
        store_failure_patterns: Whether to store failure patterns in results.
    """

    source: Union[str, Dict[str, Any]] = ""
    target: Union[str, Dict[str, Any]] = ""
    mode: str = "combine"
    failure_policy: str | None = None
    iterations: int = 1
    parallelism: int | str = "auto"
    shortest_path: bool = False
    flow_placement: FlowPlacement | str = FlowPlacement.PROPORTIONAL
    seed: int | None = None
    store_failure_patterns: bool = False

    def __post_init__(self) -> None:
        # Validate eagerly so a bad YAML step fails at scenario-parse time,
        # not mid-workflow.
        if self.iterations < 0:
            raise ValueError("iterations must be >= 0")
        if isinstance(self.parallelism, str) and self.parallelism != "auto":
            raise ValueError("parallelism must be an integer or 'auto'")
        if not isinstance(self.parallelism, str) and self.parallelism < 1:
            raise ValueError("parallelism must be >= 1")
        if self.mode not in {"combine", "pairwise"}:
            raise ValueError("mode must be 'combine' or 'pairwise'")
        if isinstance(self.flow_placement, str):
            # Normalize string spellings to the FlowPlacement enum.
            self.flow_placement = FlowPlacement.from_string(self.flow_placement)

    def run(self, scenario: "Scenario") -> None:
        """Execute the sensitivity Monte Carlo and store results on *scenario*."""
        started = time.perf_counter()
        logger.info("Starting Sensitivity: name=%s", self.name)
        logger.debug(
            "Sensitivity params: source=%s target=%s mode=%s failure_iters=%d "
            "parallelism=%s failure_policy=%s shortest_path=%s",
            self.source,
            self.target,
            self.mode,
            self.iterations,
            self.parallelism,
            self.failure_policy,
            self.shortest_path,
        )

        manager = FailureManager(
            network=scenario.network,
            failure_policy_set=scenario.failure_policy_set,
            policy_name=self.failure_policy,
        )
        workers = resolve_parallelism(self.parallelism)
        raw = manager.run_sensitivity_monte_carlo(
            source=self.source,
            target=self.target,
            mode=self.mode,
            iterations=self.iterations,
            parallelism=workers,
            shortest_path=self.shortest_path,
            flow_placement=self.flow_placement,
            seed=self.seed,
            store_failure_patterns=self.store_failure_patterns,
        )

        scenario.results.put("metadata", raw.get("metadata", {}))

        # Baseline (no failures) is reported separately from failure iterations.
        baseline = raw.get("baseline")
        baseline_dict = (
            baseline.to_dict()
            if baseline is not None and hasattr(baseline, "to_dict")
            else baseline
        )

        # Failure iterations: serialize anything that knows how to become a dict.
        flow_results: list[dict] = []
        for entry in raw.get("results", []):
            if isinstance(entry, FlowIterationResult):
                flow_results.append(entry.to_dict())
            elif hasattr(entry, "to_dict") and callable(entry.to_dict):
                flow_results.append(entry.to_dict())  # type: ignore[union-attr]
            else:
                flow_results.append(entry)

        scenario.results.put(
            "data",
            {
                "baseline": baseline_dict,
                "flow_results": flow_results,
                # Aggregated per-component sensitivity statistics.
                "component_scores": raw.get("component_scores", {}),
                # Echo the query parameters so results are self-describing.
                "context": {
                    "source": self.source,
                    "target": self.target,
                    "mode": self.mode,
                    "shortest_path": bool(self.shortest_path),
                    "flow_placement": getattr(
                        self.flow_placement, "name", str(self.flow_placement)
                    ),
                },
            },
        )

        metadata = raw.get("metadata", {})
        logger.info(
            "Sensitivity completed: name=%s failure_iters=%d unique_patterns=%d "
            "workers=%d duration=%.3fs",
            self.name,
            metadata.get("iterations", self.iterations),
            metadata.get("unique_patterns", 0),
            metadata.get("parallelism", workers),
            time.perf_counter() - started,
        )


register_workflow_step("Sensitivity")(Sensitivity)

0 commit comments

Comments
 (0)