Skip to content

Commit 6b49522

Browse files
Add Sensitivity workflow step and Monte Carlo parallelism improvements
- Add `type: Sensitivity` YAML workflow step for bottleneck analysis, calling FailureManager.run_sensitivity_monte_carlo() and storing baseline, flow_results, component_scores, and context in results - Default Monte Carlo parallelism to "auto" (all CPU cores) instead of 1 - Add Monte Carlo sensitivity docs and examples to api.md Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 222b0b5 commit 6b49522

5 files changed

Lines changed: 658 additions & 8 deletions

File tree

docs/reference/api.md

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,33 @@ for pair, edge_impacts in sensitivity.items():
333333
print(f" {edge_key}: -{flow_reduction:.2f}")
334334
```
335335

336+
### Monte Carlo Sensitivity Analysis

Run sensitivity analysis across random failure scenarios to identify which
components are most critical under realistic failure conditions:

```python
from ngraph import FailureManager

# fm = FailureManager(network=network, ...)  # see FailureManager section below

# parallelism defaults to "auto" (all CPU cores)
results = fm.run_sensitivity_monte_carlo(
    source="^metro1/.*",
    target="^metro5/.*",
    mode="combine",
    iterations=100,
    seed=42,
)

# Aggregated component scores across all failure scenarios
for flow_key, components in results["component_scores"].items():
    sorted_comps = sorted(components.items(), key=lambda x: -x[1]["mean"])
    print(f"Flow: {flow_key}")
    for comp_key, stats in sorted_comps[:5]:
        print(f"  {comp_key}: mean={stats['mean']:.1f}, count={stats['count']:.0f}")
```
362+
336363
## 5. Monte Carlo Analysis
337364

338365
Probabilistic failure analysis using FailureManager.
@@ -366,13 +393,12 @@ fm = FailureManager(
366393
policy_name="single_link"
367394
)
368395

369-
# Run max-flow Monte Carlo analysis
396+
# Run max-flow Monte Carlo analysis (parallelism defaults to "auto")
370397
results = fm.run_max_flow_monte_carlo(
371398
source="^A$",
372399
target="^C$",
373400
mode="combine",
374401
iterations=100,
375-
parallelism=1,
376402
seed=42 # For reproducibility
377403
)
378404

@@ -384,9 +410,16 @@ for iter_result in results["results"]:
384410
**Key Methods:**
385411

386412
- `run_max_flow_monte_carlo(...)` - Max-flow capacity analysis under failures
413+
- `run_sensitivity_monte_carlo(...)` - Component criticality analysis under failures
387414
- `run_demand_placement_monte_carlo(...)` - Traffic demand placement under failures
388415
- `run_monte_carlo_analysis(analysis_func, ...)` - Generic Monte Carlo with custom function
389416

417+
**Performance note:** All Monte Carlo convenience methods default to `parallelism="auto"`
(all CPU cores). Set `parallelism=1` to force serial execution if needed. The C++ Core
backend releases the GIL during computation, enabling true parallelism with threads.
Sensitivity analysis is significantly more expensive per iteration than max-flow
(~1-2s vs ~0.002s per iteration on a 1,280-node network), so parallelism provides
substantial speedup.
422+
390423
## 6. Workflow Steps
391424

392425
Pre-built analysis steps for YAML-driven workflows.

ngraph/analysis/failure_manager.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,22 @@ def _create_cache_key(
9494
return base_key + (tuple(hashable_kwargs),)
9595

9696

97+
def _resolve_parallelism(parallelism: int | str) -> int:
98+
"""Resolve parallelism setting to a concrete worker count.
99+
100+
Args:
101+
parallelism: Either an integer worker count or "auto" for CPU count.
102+
103+
Returns:
104+
Positive integer worker count (minimum 1).
105+
"""
106+
if isinstance(parallelism, str):
107+
if parallelism != "auto":
108+
raise ValueError("parallelism must be an integer or 'auto'")
109+
return max(1, int(os.cpu_count() or 1))
110+
return max(1, int(parallelism))
111+
112+
97113
def _auto_adjust_parallelism(parallelism: int, analysis_func: Any) -> int:
98114
"""Adjust parallelism based on function characteristics.
99115
@@ -759,7 +775,7 @@ def run_max_flow_monte_carlo(
759775
target: str | dict[str, Any],
760776
mode: str = "combine",
761777
iterations: int = 100,
762-
parallelism: int = 1,
778+
parallelism: int | str = "auto",
763779
shortest_path: bool = False,
764780
require_capacity: bool = True,
765781
flow_placement: FlowPlacement | str = FlowPlacement.PROPORTIONAL,
@@ -781,7 +797,9 @@ def run_max_flow_monte_carlo(
781797
target: Target node selector (string path or selector dict).
782798
mode: "combine" (aggregate) or "pairwise" (individual flows).
783799
iterations: Number of failure scenarios to simulate.
784-
parallelism: Number of parallel workers (auto-adjusted if needed).
800+
parallelism: Number of parallel workers. Defaults to ``"auto"``
801+
(use all CPU cores). Set to ``1`` for serial execution. The C++ Core
802+
backend releases the GIL, enabling true parallelism with threads.
785803
shortest_path: Whether to use shortest paths only.
786804
require_capacity: If True (default), path selection considers available
787805
capacity. If False, path selection is cost-only (true IP/IGP semantics).
@@ -800,6 +818,9 @@ def run_max_flow_monte_carlo(
800818
"""
801819
from ngraph.analysis.functions import max_flow_analysis
802820

821+
# Resolve "auto" parallelism to CPU count
822+
parallelism = _resolve_parallelism(parallelism)
823+
803824
# Convert string flow_placement to enum if needed
804825
if isinstance(flow_placement, str):
805826
flow_placement = FlowPlacement.from_string(flow_placement)
@@ -884,7 +905,7 @@ def run_demand_placement_monte_carlo(
884905
demands_config: list[dict[str, Any]]
885906
| Any, # List of demand configs or DemandSet
886907
iterations: int = 100,
887-
parallelism: int = 1,
908+
parallelism: int | str = "auto",
888909
placement_rounds: int | str = "auto",
889910
seed: int | None = None,
890911
store_failure_patterns: bool = False,
@@ -901,7 +922,9 @@ def run_demand_placement_monte_carlo(
901922
Args:
902923
demands_config: List of demand configs or DemandSet object.
903924
iterations: Number of failure scenarios to simulate.
904-
parallelism: Number of parallel workers (auto-adjusted if needed).
925+
parallelism: Number of parallel workers. Defaults to ``"auto"``
926+
(use all CPU cores). Set to ``1`` for serial execution. The C++ Core
927+
backend releases the GIL, enabling true parallelism with threads.
905928
placement_rounds: Optimization rounds for demand placement.
906929
seed: Optional seed for reproducible results.
907930
store_failure_patterns: Whether to store failure trace on results.
@@ -917,6 +940,9 @@ def run_demand_placement_monte_carlo(
917940
"""
918941
from ngraph.analysis.functions import demand_placement_analysis
919942

943+
# Resolve "auto" parallelism to CPU count
944+
parallelism = _resolve_parallelism(parallelism)
945+
920946
# If caller passed a sequence of TrafficDemand objects, convert to dicts
921947
if not isinstance(demands_config, list):
922948
# Accept DemandSet or any container providing get_all_demands()
@@ -962,7 +988,7 @@ def run_sensitivity_monte_carlo(
962988
target: str | dict[str, Any],
963989
mode: str = "combine",
964990
iterations: int = 100,
965-
parallelism: int = 1,
991+
parallelism: int | str = "auto",
966992
shortest_path: bool = False,
967993
flow_placement: FlowPlacement | str = FlowPlacement.PROPORTIONAL,
968994
seed: int | None = None,
@@ -976,12 +1002,21 @@ def run_sensitivity_monte_carlo(
9761002
9771003
Baseline (no failures) is always run first as a separate reference.
9781004
1005+
.. note::
1006+
1007+
Sensitivity analysis is significantly more expensive per iteration than
1008+
plain max-flow (~1-2s vs ~0.002s per iteration on a 1,280-node network).
1009+
Multi-threaded execution (the default ``"auto"`` parallelism) provides
1010+
significant speedup. The C++ Core backend releases the GIL during
1011+
computation, enabling true parallelism with threads.
1012+
9791013
Args:
9801014
source: Source node selector (string path or selector dict).
9811015
target: Target node selector (string path or selector dict).
9821016
mode: "combine" (aggregate) or "pairwise" (individual flows).
9831017
iterations: Number of failure scenarios to simulate.
984-
parallelism: Number of parallel workers (auto-adjusted if needed).
1018+
parallelism: Number of parallel workers. Defaults to ``"auto"``
1019+
(use all CPU cores). Set to ``1`` for serial execution.
9851020
shortest_path: Whether to use shortest paths only.
9861021
flow_placement: Flow placement strategy.
9871022
seed: Optional seed for reproducible results.
@@ -997,6 +1032,9 @@ def run_sensitivity_monte_carlo(
9971032
"""
9981033
from ngraph.analysis.functions import sensitivity_analysis
9991034

1035+
# Resolve "auto" parallelism to CPU count
1036+
parallelism = _resolve_parallelism(parallelism)
1037+
10001038
# Convert string flow_placement to enum if needed
10011039
if isinstance(flow_placement, str):
10021040
flow_placement = FlowPlacement.from_string(flow_placement)

ngraph/workflow/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from .max_flow_step import MaxFlow
77
from .maximum_supported_demand_step import MaximumSupportedDemand
88
from .network_stats import NetworkStats
9+
from .sensitivity_step import Sensitivity
910
from .traffic_matrix_placement_step import TrafficMatrixPlacement
1011

1112
__all__ = [
@@ -14,6 +15,7 @@
1415
"BuildGraph",
1516
"MaxFlow",
1617
"NetworkStats",
18+
"Sensitivity",
1719
"TrafficMatrixPlacement",
1820
"MaximumSupportedDemand",
1921
"CostPower",
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
"""Sensitivity workflow step.
2+
3+
Monte Carlo sensitivity analysis of network bottlenecks between node groups
4+
using FailureManager. Identifies critical edges and quantifies their impact
5+
on flow capacity across failure scenarios.
6+
7+
Baseline (no failures) is always run first as a separate reference. The
8+
``iterations`` parameter specifies how many failure scenarios to run.
9+
Per-iteration results include per-edge flow-reduction deltas. Aggregated
10+
``component_scores`` summarize mean/max/min impact across all iterations.
11+
12+
YAML Configuration Example:
13+
14+
workflow:
15+
- type: Sensitivity
16+
name: "bottleneck_analysis"
17+
source: "^datacenter/.*"
18+
target: "^edge/.*"
19+
mode: "combine"
20+
failure_policy: "random_failures"
21+
iterations: 100
22+
parallelism: auto
23+
shortest_path: false
24+
flow_placement: "PROPORTIONAL"
25+
seed: 42
26+
store_failure_patterns: false
27+
"""
28+
29+
from __future__ import annotations
30+
31+
import time
32+
from dataclasses import dataclass
33+
from typing import TYPE_CHECKING, Any, Dict, Union
34+
35+
from ngraph.analysis.failure_manager import FailureManager
36+
from ngraph.logging import get_logger
37+
from ngraph.results.flow import FlowIterationResult
38+
from ngraph.types.base import FlowPlacement
39+
from ngraph.workflow.base import (
40+
WorkflowStep,
41+
register_workflow_step,
42+
resolve_parallelism,
43+
)
44+
45+
if TYPE_CHECKING:
46+
from ngraph.scenario import Scenario
47+
48+
logger = get_logger(__name__)
49+
50+
51+
@dataclass
class Sensitivity(WorkflowStep):
    """Monte Carlo sensitivity analysis workflow step.

    Measures how much flow capacity between the selected node groups drops
    when individual network edges are removed, repeated across Monte Carlo
    failure scenarios. Output contains per-iteration sensitivity maps plus
    aggregated component scores.

    Baseline (no failures) is always run first as a separate reference. The
    flow_results list holds unique (deduplicated) failure patterns; each
    entry carries an occurrence_count telling how many iterations produced
    that pattern.

    Attributes:
        source: Source node selector (string path or selector dict).
        target: Target node selector (string path or selector dict).
        mode: Flow analysis mode ("combine" or "pairwise").
        failure_policy: Name of failure policy in scenario.failure_policy_set.
        iterations: Number of failure iterations to run.
        parallelism: Number of parallel worker threads, or "auto".
        shortest_path: Whether to use shortest paths only.
        flow_placement: Flow placement strategy.
        seed: Optional seed for reproducible results.
        store_failure_patterns: Whether to store failure patterns in results.
    """

    source: Union[str, Dict[str, Any]] = ""
    target: Union[str, Dict[str, Any]] = ""
    mode: str = "combine"
    failure_policy: str | None = None
    iterations: int = 1
    parallelism: int | str = "auto"
    shortest_path: bool = False
    flow_placement: FlowPlacement | str = FlowPlacement.PROPORTIONAL
    seed: int | None = None
    store_failure_patterns: bool = False

    def __post_init__(self) -> None:
        # Validate eagerly so a bad YAML step fails at scenario-parse time,
        # not mid-workflow.
        if self.iterations < 0:
            raise ValueError("iterations must be >= 0")
        if isinstance(self.parallelism, str) and self.parallelism != "auto":
            raise ValueError("parallelism must be an integer or 'auto'")
        if not isinstance(self.parallelism, str) and self.parallelism < 1:
            raise ValueError("parallelism must be >= 1")
        if self.mode not in {"combine", "pairwise"}:
            raise ValueError("mode must be 'combine' or 'pairwise'")
        if isinstance(self.flow_placement, str):
            # Normalize string spellings to the FlowPlacement enum.
            self.flow_placement = FlowPlacement.from_string(self.flow_placement)

    def run(self, scenario: "Scenario") -> None:
        """Execute the sensitivity Monte Carlo and store results on *scenario*."""
        started = time.perf_counter()
        logger.info("Starting Sensitivity: name=%s", self.name)
        logger.debug(
            "Sensitivity params: source=%s target=%s mode=%s failure_iters=%d "
            "parallelism=%s failure_policy=%s shortest_path=%s",
            self.source,
            self.target,
            self.mode,
            self.iterations,
            self.parallelism,
            self.failure_policy,
            self.shortest_path,
        )

        manager = FailureManager(
            network=scenario.network,
            failure_policy_set=scenario.failure_policy_set,
            policy_name=self.failure_policy,
        )
        workers = resolve_parallelism(self.parallelism)
        raw = manager.run_sensitivity_monte_carlo(
            source=self.source,
            target=self.target,
            mode=self.mode,
            iterations=self.iterations,
            parallelism=workers,
            shortest_path=self.shortest_path,
            flow_placement=self.flow_placement,
            seed=self.seed,
            store_failure_patterns=self.store_failure_patterns,
        )

        scenario.results.put("metadata", raw.get("metadata", {}))

        # Baseline (no failures) is reported separately from failure iterations.
        baseline = raw.get("baseline")
        baseline_dict = (
            baseline.to_dict()
            if baseline is not None and hasattr(baseline, "to_dict")
            else baseline
        )

        # Failure iterations: serialize anything that knows how to become a dict.
        flow_results: list[dict] = []
        for entry in raw.get("results", []):
            if isinstance(entry, FlowIterationResult):
                flow_results.append(entry.to_dict())
            elif hasattr(entry, "to_dict") and callable(entry.to_dict):
                flow_results.append(entry.to_dict())  # type: ignore[union-attr]
            else:
                flow_results.append(entry)

        scenario.results.put(
            "data",
            {
                "baseline": baseline_dict,
                "flow_results": flow_results,
                # Aggregated per-component sensitivity statistics.
                "component_scores": raw.get("component_scores", {}),
                # Echo the query parameters so results are self-describing.
                "context": {
                    "source": self.source,
                    "target": self.target,
                    "mode": self.mode,
                    "shortest_path": bool(self.shortest_path),
                    "flow_placement": getattr(
                        self.flow_placement, "name", str(self.flow_placement)
                    ),
                },
            },
        )

        metadata = raw.get("metadata", {})
        logger.info(
            "Sensitivity completed: name=%s failure_iters=%d unique_patterns=%d "
            "workers=%d duration=%.3fs",
            self.name,
            metadata.get("iterations", self.iterations),
            metadata.get("unique_patterns", 0),
            metadata.get("parallelism", workers),
            time.perf_counter() - started,
        )


register_workflow_step("Sensitivity")(Sensitivity)

0 commit comments

Comments
 (0)