diff --git a/docs/notebooks/08c-clustering.ipynb b/docs/notebooks/08c-clustering.ipynb
index 602468cba..b612756a9 100644
--- a/docs/notebooks/08c-clustering.ipynb
+++ b/docs/notebooks/08c-clustering.ipynb
@@ -14,10 +14,16 @@
"- **Typical periods**: Cluster similar time segments (e.g., days) and solve only representative ones\n",
"- **Weighted costs**: Automatically weight operational costs by cluster occurrence\n",
"- **Two-stage workflow**: Fast sizing with clustering, accurate dispatch at full resolution\n",
+ "- **Segmentation**: Reduce timesteps within each cluster for further compression\n",
"\n",
"!!! note \"Requirements\"\n",
- " This notebook requires the `tsam` package with `ClusterConfig` and `ExtremeConfig` support.\n",
- " Install with: `pip install \"flixopt[full]\"`"
+ " This notebook requires the `tsam` and `tsam_xarray` packages.\n",
+ " Install with: `pip install \"flixopt[full]\"`\n",
+ "\n",
+ "!!! tip \"tsam_xarray\"\n",
+ " flixopt uses [tsam_xarray](https://github.com/FZJ-IEK3-VSA/tsam_xarray) for clustering,\n",
+ " which wraps [tsam](https://github.com/FZJ-IEK3-VSA/tsam). For advanced clustering options\n",
+ " (custom algorithms, weights, tuning), see the tsam_xarray documentation."
]
},
{
@@ -171,7 +177,9 @@
"source": [
"## Understanding the Clustering\n",
"\n",
- "The clustering algorithm groups similar days together. Access all metadata via `fs.clustering`:"
+ "Access clustering metadata via `fs.clustering`. For full access to the underlying\n",
+ "[tsam_xarray ClusteringResult](https://github.com/FZJ-IEK3-VSA/tsam_xarray),\n",
+ "use `fs.clustering.clustering_result`."
]
},
{
@@ -181,202 +189,13 @@
"metadata": {},
"outputs": [],
"source": [
- "# Access clustering metadata directly\n",
- "clustering = fs_clustered.clustering.results\n",
- "clustering"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "12",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Show clustering info using __repr__\n",
+ "# Clustering overview\n",
"fs_clustered.clustering"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "13",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Quality metrics - how well do the clusters represent the original data?\n",
- "# Lower RMSE/MAE = better representation\n",
- "fs_clustered.clustering.metrics.to_dataframe().style.format('{:.3f}')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "14",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Visual comparison: original vs clustered time series\n",
- "fs_clustered.clustering.plot.compare()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "15",
- "metadata": {},
- "source": [
- "## Inspect Clustering Input Data\n",
- "\n",
- "Before clustering, you can inspect which time-varying data will be used.\n",
- "The `clustering_data()` method returns only the arrays that vary over time\n",
- "(constant arrays are excluded since they don't affect clustering):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "16",
- "metadata": {},
- "outputs": [],
- "source": [
- "# See what data will be used for clustering\n",
- "clustering_data = flow_system.transform.clustering_data()\n",
- "print(f'Variables used for clustering ({len(clustering_data.data_vars)} total):')\n",
- "for var in clustering_data.data_vars:\n",
- " print(f' - {var}')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "17",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Visualize the time-varying data (select a few key variables)\n",
- "key_vars = [v for v in clustering_data.data_vars if 'fixed_relative_profile' in v or 'effects_per_flow_hour' in v]\n",
- "clustering_data[key_vars].plotly.line(facet_row='variable', title='Time-Varying Data Used for Clustering')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "18",
- "metadata": {},
- "source": [
- "## Selective Clustering with `data_vars`\n",
- "\n",
- "By default, clustering uses **all** time-varying data to determine typical periods.\n",
- "However, you may want to cluster based on only a **subset** of variables while still\n",
- "applying the clustering to all data.\n",
- "\n",
- "Use the `data_vars` parameter to specify which variables determine the clustering:\n",
- "\n",
- "- **Cluster based on subset**: Only the specified variables affect which days are grouped together\n",
- "- **Apply to all data**: The resulting clustering is applied to ALL time-varying data\n",
- "\n",
- "This is useful when:\n",
- "- You want to cluster based on demand patterns only (ignoring price variations)\n",
- "- You have dominant time series that should drive the clustering\n",
- "- You want to ensure certain patterns are well-represented in typical periods"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "19",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Cluster based ONLY on heat demand pattern (ignore electricity prices)\n",
- "demand_var = 'HeatDemand(Q_th)|fixed_relative_profile'\n",
- "\n",
- "fs_demand_only = flow_system.transform.cluster(\n",
- " n_clusters=8,\n",
- " cluster_duration='1D',\n",
- " data_vars=[demand_var], # Only this variable determines clustering\n",
- " extremes=ExtremeConfig(method='new_cluster', max_value=[demand_var]),\n",
- ")\n",
- "\n",
- "# Verify: clustering was determined by demand but applied to all data\n",
- "print(f'Clustered using: {demand_var}')\n",
- "print(f'But all {len(clustering_data.data_vars)} variables are included in the result')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "20",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Compare metrics: clustering with all data vs. demand-only\n",
- "pd.DataFrame(\n",
- " {\n",
- " 'All Variables': fs_clustered.clustering.metrics.to_dataframe().iloc[0],\n",
- " 'Demand Only': fs_demand_only.clustering.metrics.to_dataframe().iloc[0],\n",
- " }\n",
- ").round(4)"
- ]
- },
{
"cell_type": "markdown",
- "id": "21",
- "metadata": {},
- "source": [
- "## Advanced Clustering Options\n",
- "\n",
- "The `cluster()` method exposes many parameters for fine-tuning:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "22",
- "metadata": {},
- "outputs": [],
- "source": [
- "from tsam import ClusterConfig\n",
- "\n",
- "# Try different clustering algorithms\n",
- "fs_kmeans = flow_system.transform.cluster(\n",
- " n_clusters=8,\n",
- " cluster_duration='1D',\n",
- " cluster=ClusterConfig(method='kmeans'), # Alternative: 'hierarchical' (default), 'kmedoids', 'averaging'\n",
- ")\n",
- "\n",
- "fs_kmeans.clustering"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "23",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Compare quality metrics between algorithms\n",
- "pd.DataFrame(\n",
- " {\n",
- " 'hierarchical': fs_clustered.clustering.metrics.to_dataframe().iloc[0],\n",
- " 'kmeans': fs_kmeans.clustering.metrics.to_dataframe().iloc[0],\n",
- " }\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "24",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Visualize cluster structure with heatmap\n",
- "fs_clustered.clustering.plot.heatmap()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "25",
+ "id": "12",
"metadata": {},
"source": [
"### Apply Existing Clustering\n",
@@ -402,7 +221,7 @@
},
{
"cell_type": "markdown",
- "id": "26",
+ "id": "13",
"metadata": {},
"source": [
"## Method 3: Two-Stage Workflow (Recommended)\n",
@@ -420,7 +239,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "27",
+ "id": "14",
"metadata": {},
"outputs": [],
"source": [
@@ -432,7 +251,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "28",
+ "id": "15",
"metadata": {},
"outputs": [],
"source": [
@@ -451,7 +270,7 @@
},
{
"cell_type": "markdown",
- "id": "29",
+ "id": "16",
"metadata": {},
"source": [
"## Compare Results"
@@ -460,7 +279,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "30",
+ "id": "17",
"metadata": {},
"outputs": [],
"source": [
@@ -509,7 +328,7 @@
},
{
"cell_type": "markdown",
- "id": "31",
+ "id": "18",
"metadata": {},
"source": [
"## Expand Solution to Full Resolution\n",
@@ -521,7 +340,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "32",
+ "id": "19",
"metadata": {},
"outputs": [],
"source": [
@@ -532,7 +351,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "33",
+ "id": "20",
"metadata": {},
"outputs": [],
"source": [
@@ -554,7 +373,7 @@
},
{
"cell_type": "markdown",
- "id": "34",
+ "id": "21",
"metadata": {},
"source": [
"## Visualize Clustered Heat Balance"
@@ -563,7 +382,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "35",
+ "id": "22",
"metadata": {},
"outputs": [],
"source": [
@@ -573,7 +392,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "36",
+ "id": "23",
"metadata": {},
"outputs": [],
"source": [
@@ -582,10 +401,10 @@
},
{
"cell_type": "markdown",
- "id": "37",
+ "id": "24",
"metadata": {},
"source": [
- "markdown## API Reference\n",
+ "## API Reference\n",
"\n",
"### `transform.cluster()` Parameters\n",
"\n",
@@ -593,25 +412,11 @@
"|-----------|------|---------|-------------|\n",
"| `n_clusters` | `int` | - | Number of typical periods (e.g., 8 typical days) |\n",
"| `cluster_duration` | `str \\| float` | - | Duration per cluster ('1D', '24h') or hours |\n",
- "| `data_vars` | `list[str]` | None | Variables to cluster on (applies result to all) |\n",
- "| `weights` | `dict[str, float]` | None | Optional weights for time series in clustering |\n",
- "| `cluster` | `ClusterConfig` | None | Clustering algorithm configuration |\n",
+ "| `cluster` | `ClusterConfig` | None | Clustering algorithm and weights. Use `weights={var: 0}` to exclude variables. |\n",
"| `extremes` | `ExtremeConfig` | None | **Essential**: Force inclusion of peak/min periods |\n",
+ "| `segments` | `SegmentConfig` | None | Intra-period segmentation (variable timestep durations) |\n",
"| `**tsam_kwargs` | - | - | Additional tsam parameters |\n",
"\n",
- "### `transform.clustering_data()` Method\n",
- "\n",
- "Inspect which time-varying data will be used for clustering:\n",
- "\n",
- "```python\n",
- "# Get all time-varying variables\n",
- "clustering_data = flow_system.transform.clustering_data()\n",
- "print(list(clustering_data.data_vars))\n",
- "\n",
- "# Get data for a specific period (multi-period systems)\n",
- "clustering_data = flow_system.transform.clustering_data(period=2024)\n",
- "```\n",
- "\n",
"### Clustering Object Properties\n",
"\n",
"After clustering, access metadata via `fs.clustering`:\n",
@@ -619,31 +424,12 @@
"| Property | Description |\n",
"|----------|-------------|\n",
"| `n_clusters` | Number of clusters |\n",
- "| `n_original_clusters` | Number of original time segments (e.g., 365 days) |\n",
- "| `timesteps_per_cluster` | Timesteps in each cluster (e.g., 24 for daily) |\n",
- "| `cluster_assignments` | xr.DataArray mapping original segment → cluster ID |\n",
+ "| `n_original_clusters` | Number of original time segments (e.g., 31 days) |\n",
+ "| `timesteps_per_cluster` | Timesteps in each cluster (e.g., 96 for daily at 15 min) |\n",
+ "| `cluster_assignments` | xr.DataArray mapping original segment to cluster ID |\n",
"| `cluster_occurrences` | How many original segments each cluster represents |\n",
- "| `metrics` | xr.Dataset with RMSE, MAE per time series |\n",
- "| `results` | `ClusteringResults` with xarray-like interface |\n",
- "| `plot.compare()` | Compare original vs clustered time series |\n",
- "| `plot.heatmap()` | Visualize cluster structure |\n",
- "\n",
- "### ClusteringResults (xarray-like)\n",
- "\n",
- "Access the underlying tsam results via `clustering.results`:\n",
- "\n",
- "```python\n",
- "# Dimension info (like xarray)\n",
- "clustering.results.dims # ('period', 'scenario') or ()\n",
- "clustering.results.coords # {'period': [2020, 2030], 'scenario': ['high', 'low']}\n",
- "\n",
- "# Select specific result (like xarray)\n",
- "clustering.results.sel(period=2020, scenario='high') # Label-based\n",
- "clustering.results.isel(period=0, scenario=1) # Index-based\n",
- "\n",
- "# Apply existing clustering to new data\n",
- "agg_results = clustering.results.apply(dataset) # Returns AggregationResults\n",
- "```\n",
+ "| `clustering_result` | Full [tsam_xarray ClusteringResult](https://github.com/FZJ-IEK3-VSA/tsam_xarray) |\n",
+ "| `aggregation_result` | Full [tsam_xarray AggregationResult](https://github.com/FZJ-IEK3-VSA/tsam_xarray) (pre-IO only) |\n",
"\n",
"### Storage Behavior\n",
"\n",
@@ -656,44 +442,12 @@
"| `'cyclic'` | Each cluster is independent but cyclic (start = end) |\n",
"| `'independent'` | Each cluster is independent, free start/end |\n",
"\n",
- "For a detailed comparison of storage modes, see [08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb).\n",
- "\n",
- "### Peak Forcing with ExtremeConfig\n",
- "\n",
- "```python\n",
- "from tsam import ExtremeConfig\n",
- "\n",
- "extremes = ExtremeConfig(\n",
- " method='new_cluster', # Creates new cluster for extremes\n",
- " max_value=['ComponentName(FlowName)|fixed_relative_profile'], # Capture peak demand\n",
- ")\n",
- "```\n",
- "\n",
- "### Recommended Workflow\n",
- "\n",
- "```python\n",
- "from tsam import ExtremeConfig\n",
- "\n",
- "# Stage 1: Fast sizing\n",
- "fs_sizing = flow_system.transform.cluster(\n",
- " n_clusters=8,\n",
- " cluster_duration='1D',\n",
- " extremes=ExtremeConfig(method='new_cluster', max_value=['Demand(Flow)|fixed_relative_profile']),\n",
- ")\n",
- "fs_sizing.optimize(solver)\n",
- "\n",
- "# Apply safety margin\n",
- "sizes = {k: v.item() * 1.05 for k, v in fs_sizing.stats.sizes.items()}\n",
- "\n",
- "# Stage 2: Accurate dispatch\n",
- "fs_dispatch = flow_system.transform.fix_sizes(sizes)\n",
- "fs_dispatch.optimize(solver)\n",
- "```"
+ "For a detailed comparison of storage modes, see [08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb)."
]
},
{
"cell_type": "markdown",
- "id": "38",
+ "id": "25",
"metadata": {},
"source": [
"## Summary\n",
@@ -701,30 +455,25 @@
"You learned how to:\n",
"\n",
"- Use **`cluster()`** to reduce time series into typical periods\n",
- "- **Inspect clustering data** with `clustering_data()` before clustering\n",
- "- Use **`data_vars`** to cluster based on specific variables only\n",
"- Apply **peak forcing** with `ExtremeConfig` to capture extreme demand days\n",
"- Use **two-stage optimization** for fast yet accurate investment decisions\n",
"- **Expand solutions** back to full resolution with `expand()`\n",
- "- Access **clustering metadata** via `fs.clustering` (metrics, cluster_assignments, cluster_occurrences)\n",
- "- Use **advanced options** like different algorithms with `ClusterConfig`\n",
+ "- Access **clustering metadata** via `fs.clustering`\n",
"- **Apply existing clustering** to other FlowSystems using `apply_clustering()`\n",
"\n",
"### Key Takeaways\n",
"\n",
"1. **Always use peak forcing** (`extremes=ExtremeConfig(max_value=[...])`) for demand time series\n",
- "2. **Inspect data first** with `clustering_data()` to see available variables\n",
- "3. **Use `data_vars`** to cluster on specific variables (e.g., demand only, ignoring prices)\n",
- "4. **Add safety margin** (5-10%) when fixing sizes from clustering\n",
- "5. **Two-stage is recommended**: clustering for sizing, full resolution for dispatch\n",
- "6. **Storage handling** is configurable via `cluster_mode`\n",
- "7. **Check metrics** to evaluate clustering quality\n",
- "8. **Use `apply_clustering()`** to apply the same clustering to different FlowSystem variants\n",
+ "2. **Add safety margin** (5-10%) when fixing sizes from clustering\n",
+ "3. **Two-stage is recommended**: clustering for sizing, full resolution for dispatch\n",
+ "4. **Storage handling** is configurable via `cluster_mode`\n",
+ "5. **Use `apply_clustering()`** to apply the same clustering to different FlowSystem variants\n",
+ "6. For advanced clustering options (weights, algorithms, segmentation, tuning), see\n",
+ " [tsam_xarray](https://github.com/FZJ-IEK3-VSA/tsam_xarray) and [tsam](https://github.com/FZJ-IEK3-VSA/tsam)\n",
"\n",
"### Next Steps\n",
"\n",
- "- **[08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb)**: Compare storage modes using a seasonal storage system\n",
- "- **[08d-clustering-multiperiod](08d-clustering-multiperiod.ipynb)**: Clustering with multiple periods and scenarios"
+ "- **[08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb)**: Compare storage modes using a seasonal storage system"
]
}
],
diff --git a/docs/notebooks/08d-clustering-multiperiod.ipynb b/docs/notebooks/08d-clustering-multiperiod.ipynb
deleted file mode 100644
index 82da05c6f..000000000
--- a/docs/notebooks/08d-clustering-multiperiod.ipynb
+++ /dev/null
@@ -1,609 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "0",
- "metadata": {},
- "source": [
- "# Multi-Period Clustering with `cluster()`\n",
- "\n",
- "Combine time series clustering with multi-period investment optimization.\n",
- "\n",
- "This notebook demonstrates:\n",
- "\n",
- "- **Multi-period modeling**: Optimize investments across multiple planning periods (years)\n",
- "- **Scenario analysis**: Handle demand uncertainty with weighted scenarios\n",
- "- **Clustering per period**: Apply typical-period clustering independently for each period/scenario\n",
- "- **Scalability**: Reduce computational complexity for long-horizon planning\n",
- "\n",
- "!!! note \"Requirements\"\n",
- " This notebook requires the `tsam` package with `ExtremeConfig` support.\n",
- " Install with: `pip install \"flixopt[full]\"`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1",
- "metadata": {},
- "outputs": [],
- "source": [
- "import timeit\n",
- "\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "import plotly.express as px\n",
- "\n",
- "import flixopt as fx\n",
- "\n",
- "fx.CONFIG.notebook()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2",
- "metadata": {},
- "source": [
- "## Create the Multi-Period System\n",
- "\n",
- "We use a multi-period heating system with:\n",
- "- **3 planning periods** (years 2024, 2025, 2026)\n",
- "- **2 scenarios** (high demand 30%, low demand 70%)\n",
- "- **2 weeks** at hourly resolution (336 timesteps)\n",
- "\n",
- "This represents a capacity expansion problem where we optimize component sizes once,\n",
- "but operations are simulated across multiple future years and demand scenarios."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3",
- "metadata": {},
- "outputs": [],
- "source": [
- "from data.generate_example_systems import create_multiperiod_system\n",
- "\n",
- "flow_system = create_multiperiod_system()\n",
- "\n",
- "print(f'Timesteps: {len(flow_system.timesteps)} ({len(flow_system.timesteps) // 24} days)')\n",
- "print(f'Periods: {list(flow_system.periods.values)}')\n",
- "print(f'Scenarios: {list(flow_system.scenarios.values)}')\n",
- "print(f'Scenario weights: {flow_system.scenario_weights.values}')\n",
- "print(f'\\nComponents: {list(flow_system.components.keys())}')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4",
- "metadata": {},
- "source": [
- "## Selecting a Subset with `transform.isel()`\n",
- "\n",
- "For demonstration purposes, we'll use only the first week of data.\n",
- "The `isel()` method (index select) lets you slice FlowSystems by time:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Select first week only (168 hours)\n",
- "flow_system = flow_system.transform.isel(time=slice(0, 168))\n",
- "\n",
- "print(f'After isel: {len(flow_system.timesteps)} timesteps ({len(flow_system.timesteps) // 24} days)')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Visualize demand scenarios (equal across periods)\n",
- "heat_demand = flow_system.components['Building'].inputs[0].fixed_relative_profile\n",
- "\n",
- "fig = px.line(heat_demand.to_dataframe('value').reset_index(), x='time', y='value', facet_row='scenario')\n",
- "\n",
- "fig.update_layout(\n",
- " height=350,\n",
- " title='Heat Demand by Scenario (One Week)',\n",
- " xaxis_title='Time',\n",
- " yaxis_title='Heat Demand [kW]',\n",
- ")\n",
- "fig.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7",
- "metadata": {},
- "source": [
- "## Full Optimization (Baseline)\n",
- "\n",
- "First, solve the complete problem with all timesteps across all periods and scenarios:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8",
- "metadata": {},
- "outputs": [],
- "source": [
- "solver = fx.solvers.HighsSolver(mip_gap=0.01)\n",
- "\n",
- "start = timeit.default_timer()\n",
- "fs_full = flow_system.copy()\n",
- "fs_full.name = 'Full Optimization'\n",
- "fs_full.optimize(solver)\n",
- "time_full = timeit.default_timer() - start\n",
- "\n",
- "print(f'Full optimization: {time_full:.2f} seconds')\n",
- "print(f'Total cost (objective): {fs_full.solution[\"objective\"].item():,.0f} €')\n",
- "print('\\nOptimized sizes:')\n",
- "for name, size in fs_full.stats.sizes.items():\n",
- " print(f' {name}: {size.max().item():.1f}')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9",
- "metadata": {},
- "source": [
- "## Multi-Period Clustering with `cluster()`\n",
- "\n",
- "When applied to a multi-period system, `cluster()` clusters **each period/scenario combination independently**.\n",
- "This is because demand patterns and optimal operations may differ across:\n",
- "\n",
- "- **Periods**: Different years may have different characteristics\n",
- "- **Scenarios**: High vs low demand scenarios need different representative days\n",
- "\n",
- "The investment decisions (sizes) remain consistent across all periods and scenarios,\n",
- "while the operational patterns are optimized for each cluster."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "10",
- "metadata": {},
- "outputs": [],
- "source": [
- "from tsam import ExtremeConfig\n",
- "\n",
- "start = timeit.default_timer()\n",
- "\n",
- "# Force inclusion of peak demand periods\n",
- "peak_series = ['Building(Heat)|fixed_relative_profile']\n",
- "\n",
- "# Cluster to 3 typical days (from 7 days)\n",
- "fs_clustered = flow_system.transform.cluster(\n",
- " n_clusters=3,\n",
- " cluster_duration='1D',\n",
- " extremes=ExtremeConfig(method='replace', max_value=peak_series),\n",
- ")\n",
- "\n",
- "time_clustering = timeit.default_timer() - start\n",
- "\n",
- "print(f'Clustering time: {time_clustering:.2f} seconds')\n",
- "print(f'Reduced: {len(flow_system.timesteps)} → {len(fs_clustered.timesteps)} timesteps per period')\n",
- "print('Total problem reduction: 7 days × 3 periods × 2 scenarios → 3 days × 3 × 2')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "11",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Optimize the reduced system\n",
- "start = timeit.default_timer()\n",
- "fs_clustered.optimize(solver)\n",
- "time_clustered = timeit.default_timer() - start\n",
- "\n",
- "print(f'Clustered optimization: {time_clustered:.2f} seconds')\n",
- "print(f'Total cost (objective): {fs_clustered.solution[\"objective\"].item():,.0f} €')\n",
- "print(f'\\nSpeedup vs full: {time_full / (time_clustering + time_clustered):.1f}x')\n",
- "print('\\nOptimized sizes:')\n",
- "for name, size in fs_clustered.stats.sizes.items():\n",
- " print(f' {name}: {size.max().item():.1f}')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "12",
- "metadata": {},
- "source": [
- "## Visualize Clustering Quality\n",
- "\n",
- "The `.plot` accessor provides built-in visualizations with automatic faceting by period and scenario:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "13",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Duration curves show how well the distribution is preserved per period/scenario\n",
- "fs_clustered.clustering.plot.compare(\n",
- " kind='duration_curve',\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "14",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Heatmap shows cluster assignments - faceted by period and scenario\n",
- "fs_clustered.clustering.plot.heatmap()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "15",
- "metadata": {},
- "source": [
- "## Understand the Cluster Structure\n",
- "\n",
- "Let's inspect how days were grouped into clusters:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "16",
- "metadata": {},
- "outputs": [],
- "source": [
- "clustering = fs_clustered.clustering\n",
- "\n",
- "print('Clustering Configuration:')\n",
- "print(f' Typical periods (clusters): {clustering.n_clusters}')\n",
- "print(f' Timesteps per cluster: {clustering.timesteps_per_cluster}')\n",
- "\n",
- "# Access underlying results via xarray-like interface\n",
- "print(f'\\nClusteringResults dimensions: {clustering.results.dims}')\n",
- "print(f'ClusteringResults coords: {clustering.results.coords}')\n",
- "\n",
- "# The cluster_assignments shows which cluster each original day belongs to\n",
- "# For multi-period systems, select a specific period/scenario combination\n",
- "cluster_assignments = clustering.cluster_assignments.isel(period=0, scenario=0).values\n",
- "day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n",
- "\n",
- "print('\\nCluster assignments per day (period=2024, scenario=High):')\n",
- "for i, cluster_id in enumerate(cluster_assignments):\n",
- " print(f' {day_names[i]}: Cluster {cluster_id}')\n",
- "\n",
- "# Cluster occurrences (how many original days each cluster represents)\n",
- "unique, counts = np.unique(cluster_assignments, return_counts=True)\n",
- "print('\\nCluster weights (days represented):')\n",
- "for cluster_id, count in zip(unique, counts, strict=True):\n",
- " print(f' Cluster {cluster_id}: {count} days')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "17",
- "metadata": {},
- "source": [
- "## Two-Stage Workflow for Multi-Period\n",
- "\n",
- "For investment optimization across multiple periods, the recommended workflow is:\n",
- "\n",
- "1. **Stage 1**: Fast sizing with clustering (reduced timesteps)\n",
- "2. **Stage 2**: Fix sizes and run full-resolution dispatch\n",
- "\n",
- "This gives accurate investment decisions while maintaining computational tractability.\n",
- "\n",
- "### Safety Margin Rationale\n",
- "\n",
- "A 10% safety margin is applied to compensate for:\n",
- "\n",
- "- **Peak underestimation**: Clustering averages similar days, potentially underestimating true peak demands\n",
- "- **Temporal detail loss**: Representative periods may miss short-duration extreme events\n",
- "- **Scenario averaging**: Weighted scenarios smooth out worst-case conditions\n",
- "\n",
- "For critical applications, consider 15-20% margins or validate with full-resolution runs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "18",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Stage 1 already done - apply safety margin\n",
- "SAFETY_MARGIN = 1.10 # 10% buffer for multi-period uncertainty\n",
- "\n",
- "sizes_with_margin = {name: size.max().item() * SAFETY_MARGIN for name, size in fs_clustered.stats.sizes.items()}\n",
- "\n",
- "print('Stage 1: Sizing with clustering')\n",
- "print(f' Time: {time_clustering + time_clustered:.2f} seconds')\n",
- "print(f' Cost estimate: {fs_clustered.solution[\"objective\"].item():,.0f} €')\n",
- "print(f'\\nSizes with {(SAFETY_MARGIN - 1) * 100:.0f}% safety margin:')\n",
- "for name, size in sizes_with_margin.items():\n",
- " original = fs_clustered.stats.sizes[name].max().item()\n",
- " print(f' {name}: {original:.1f} → {size:.1f}')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "19",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Stage 2: Full resolution dispatch with fixed sizes\n",
- "print('Stage 2: Full resolution dispatch')\n",
- "start = timeit.default_timer()\n",
- "\n",
- "fs_dispatch = flow_system.transform.fix_sizes(sizes_with_margin)\n",
- "fs_dispatch.name = 'Two-Stage'\n",
- "fs_dispatch.optimize(solver)\n",
- "\n",
- "time_dispatch = timeit.default_timer() - start\n",
- "\n",
- "print(f' Time: {time_dispatch:.2f} seconds')\n",
- "print(f' Actual cost: {fs_dispatch.solution[\"objective\"].item():,.0f} €')\n",
- "\n",
- "# Total comparison\n",
- "total_two_stage = time_clustering + time_clustered + time_dispatch\n",
- "print(f'\\nTotal two-stage time: {total_two_stage:.2f} seconds')\n",
- "print(f'Speedup vs full: {time_full / total_two_stage:.1f}x')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20",
- "metadata": {},
- "source": [
- "## Compare Results Across Methods"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "21",
- "metadata": {},
- "outputs": [],
- "source": [
- "results = {\n",
- " 'Full (baseline)': {\n",
- " 'Time [s]': time_full,\n",
- " 'Cost [€]': fs_full.solution['objective'].item(),\n",
- " 'Boiler': fs_full.stats.sizes['Boiler(Heat)'].max().item(),\n",
- " 'Storage': fs_full.stats.sizes['ThermalStorage'].max().item(),\n",
- " },\n",
- " 'Clustered (3 days)': {\n",
- " 'Time [s]': time_clustering + time_clustered,\n",
- " 'Cost [€]': fs_clustered.solution['objective'].item(),\n",
- " 'Boiler': fs_clustered.stats.sizes['Boiler(Heat)'].max().item(),\n",
- " 'Storage': fs_clustered.stats.sizes['ThermalStorage'].max().item(),\n",
- " },\n",
- " 'Two-Stage': {\n",
- " 'Time [s]': total_two_stage,\n",
- " 'Cost [€]': fs_dispatch.solution['objective'].item(),\n",
- " 'Boiler': sizes_with_margin['Boiler(Heat)'],\n",
- " 'Storage': sizes_with_margin['ThermalStorage'],\n",
- " },\n",
- "}\n",
- "\n",
- "comparison = pd.DataFrame(results).T\n",
- "baseline_cost = comparison.loc['Full (baseline)', 'Cost [€]']\n",
- "baseline_time = comparison.loc['Full (baseline)', 'Time [s]']\n",
- "comparison['Cost Gap [%]'] = ((comparison['Cost [€]'] - baseline_cost) / abs(baseline_cost) * 100).round(2)\n",
- "comparison['Speedup'] = (baseline_time / comparison['Time [s]']).round(1)\n",
- "\n",
- "comparison.style.format(\n",
- " {\n",
- " 'Time [s]': '{:.2f}',\n",
- " 'Cost [€]': '{:,.0f}',\n",
- " 'Boiler': '{:.1f}',\n",
- " 'Storage': '{:.0f}',\n",
- " 'Cost Gap [%]': '{:.2f}',\n",
- " 'Speedup': '{:.1f}x',\n",
- " }\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "22",
- "metadata": {},
- "source": [
- "## Visualize Optimization Results\n",
- "\n",
- "Use the built-in statistics plotting to compare results across periods and scenarios:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "23",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Plot flow rates with automatic faceting by period and scenario\n",
- "fs_full.stats.plot.flows(component='Boiler')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "24",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Side-by-side comparison using the Comparison class\n",
- "comp = fx.Comparison([fs_full, fs_dispatch])\n",
- "comp.stats.plot.balance('Heat')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "25",
- "metadata": {},
- "source": [
- "## Expand Clustered Solution to Full Resolution\n",
- "\n",
- "Use `expand()` to map the clustered results back to all original timesteps:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "26",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Expand the clustered solution\n",
- "fs_expanded = fs_clustered.transform.expand()\n",
- "\n",
- "print(f'Expanded: {len(fs_clustered.timesteps)} → {len(fs_expanded.timesteps)} timesteps')\n",
- "print(f'Cost (objective): {fs_expanded.solution[\"objective\"].item():,.0f} €')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "27",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Compare expanded solution - shows the repeated cluster patterns\n",
- "fs_expanded.stats.plot.flows(component='Boiler')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "28",
- "metadata": {},
- "source": [
- "## Key Considerations for Multi-Period Clustering\n",
- "\n",
- "### 1. Independent Clustering per Period/Scenario\n",
- "\n",
- "Each period and scenario combination is clustered independently because:\n",
- "- Demand patterns may differ across years (growth, seasonality)\n",
- "- Scenarios represent different futures that shouldn't be mixed\n",
- "- Investment decisions must be robust across all combinations\n",
- "\n",
- "### 2. Safety Margins\n",
- "\n",
- "Multi-period systems often warrant larger safety margins (10-15%) because:\n",
- "- More uncertainty across multiple years\n",
- "- Investments made once must work for all periods\n",
- "- Scenario weights may not perfectly represent actual outcomes\n",
- "\n",
- "### 3. Computational Benefits\n",
- "\n",
- "Clustering becomes more valuable as problem size grows:\n",
- "\n",
- "| Scenario | Full Problem | With Clustering |\n",
- "|----------|--------------|----------------|\n",
- "| 1 period, 1 scenario, 365 days | 8,760 timesteps | ~730 (10 typical days) |\n",
- "| 3 periods, 2 scenarios, 365 days | 52,560 timesteps | ~4,380 |\n",
- "| 10 periods, 3 scenarios, 365 days | 262,800 timesteps | ~21,900 |\n",
- "\n",
- "The speedup factor increases with problem size."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "29",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "You learned how to:\n",
- "\n",
- "- Load **multi-period systems** with periods and scenarios\n",
- "- Use **`transform.isel()`** to select time subsets\n",
- "- Apply **`cluster()`** to multi-dimensional FlowSystems\n",
- "- **Visualize clustering** with the `.plot` accessor (compare, duration curves, heatmaps)\n",
- "- Use the **two-stage workflow** for robust investment optimization\n",
- "- **Expand solutions** back to full resolution with `expand()`\n",
- "\n",
- "### Key Takeaways\n",
- "\n",
- "1. **Clustering is applied per period/scenario**: Each combination gets independent typical periods\n",
- "2. **Investments are shared**: Component sizes are optimized once across all periods/scenarios\n",
- "3. **Use larger safety margins**: Multi-period uncertainty warrants 10-15% buffers\n",
- "4. **Two-stage is recommended**: Fast sizing with clustering, accurate dispatch at full resolution\n",
- "5. **Built-in plotting**: Use `.plot` accessor for automatic faceting by period/scenario\n",
- "\n",
- "### API Reference\n",
- "\n",
- "```python\n",
- "from tsam import ExtremeConfig\n",
- "\n",
- "# Load multi-period system\n",
- "fs = fx.FlowSystem.from_netcdf('multiperiod_system.nc4')\n",
- "\n",
- "# Select time subset (optional)\n",
- "fs = fs.transform.isel(time=slice(0, 168)) # First 168 timesteps\n",
- "\n",
- "# Cluster (applies per period/scenario)\n",
- "# Note: For multi-period systems, only method='replace' is supported\n",
- "fs_clustered = fs.transform.cluster(\n",
- " n_clusters=10,\n",
- " cluster_duration='1D',\n",
- " extremes=ExtremeConfig(method='replace', max_value=['Demand(Flow)|fixed_relative_profile']),\n",
- ")\n",
- "\n",
- "# Visualize clustering quality\n",
- "fs_clustered.clustering.plot.compare(variable='Demand(Flow)|profile')\n",
- "fs_clustered.clustering.plot.heatmap()\n",
- "\n",
- "# Access underlying results (xarray-like interface)\n",
- "fs_clustered.clustering.results.dims # ('period', 'scenario')\n",
- "fs_clustered.clustering.results.coords # {'period': [...], 'scenario': [...]}\n",
- "fs_clustered.clustering.results.sel(period=2024, scenario='High') # Label-based\n",
- "fs_clustered.clustering.results.isel(period=0, scenario=0) # Index-based\n",
- "\n",
- "# Two-stage workflow\n",
- "fs_clustered.optimize(solver)\n",
- "sizes = {k: v.max().item() * 1.10 for k, v in fs_clustered.stats.sizes.items()}\n",
- "fs_dispatch = fs.transform.fix_sizes(sizes)\n",
- "fs_dispatch.optimize(solver)\n",
- "\n",
- "# Visualize results\n",
- "fs_dispatch.stats.plot.flows(component='Boiler')\n",
- "```"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/notebooks/08e-clustering-internals.ipynb b/docs/notebooks/08e-clustering-internals.ipynb
deleted file mode 100644
index 2d099ff34..000000000
--- a/docs/notebooks/08e-clustering-internals.ipynb
+++ /dev/null
@@ -1,540 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "0",
- "metadata": {},
- "source": [
- "# Clustering Internals\n",
- "\n",
- "Understanding the data structures and visualization tools behind time series clustering.\n",
- "\n",
- "This notebook demonstrates:\n",
- "\n",
- "- **Data structure**: The `Clustering` class that stores all clustering information\n",
- "- **Plot accessor**: Built-in visualizations via `.plot`\n",
- "- **Data expansion**: Using `expand_data()` to map aggregated data back to original timesteps\n",
- "- **IO workflow**: What's preserved and lost when saving/loading clustered systems\n",
- "\n",
- "!!! note \"Requirements\"\n",
- " This notebook requires the `tsam` package for time series aggregation.\n",
- " Install with: `pip install \"flixopt[full]\"`\n",
- "\n",
- "!!! note \"Prerequisites\"\n",
- " This notebook assumes familiarity with [08c-clustering](08c-clustering.ipynb)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1",
- "metadata": {},
- "outputs": [],
- "source": [
- "from data.generate_example_systems import create_district_heating_system\n",
- "\n",
- "import flixopt as fx\n",
- "\n",
- "fx.CONFIG.notebook()\n",
- "\n",
- "flow_system = create_district_heating_system()\n",
- "flow_system.connect_and_transform()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2",
- "metadata": {},
- "source": [
- "## Clustering Metadata\n",
- "\n",
- "After calling `cluster()`, metadata is stored in `fs.clustering`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3",
- "metadata": {},
- "outputs": [],
- "source": [
- "from tsam import ExtremeConfig\n",
- "\n",
- "fs_clustered = flow_system.transform.cluster(\n",
- " n_clusters=8,\n",
- " cluster_duration='1D',\n",
- " extremes=ExtremeConfig(method='new_cluster', max_value=['HeatDemand(Q_th)|fixed_relative_profile']),\n",
- ")\n",
- "\n",
- "fs_clustered.clustering"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4",
- "metadata": {},
- "source": [
- "The `Clustering` object contains:\n",
- "- **`cluster_assignments`**: Which cluster each original period maps to\n",
- "- **`cluster_occurrences`**: How many original periods each cluster represents\n",
- "- **`timestep_mapping`**: Maps each original timestep to its representative\n",
- "- **`original_data`** / **`aggregated_data`**: The data before and after clustering\n",
- "- **`results`**: `ClusteringResults` object with xarray-like interface (`.dims`, `.coords`, `.sel()`)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Cluster order shows which cluster each original period maps to\n",
- "fs_clustered.clustering.cluster_assignments"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Cluster occurrences shows how many original periods each cluster represents\n",
- "fs_clustered.clustering.cluster_occurrences"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7",
- "metadata": {},
- "source": [
- "## Visualizing Clustering\n",
- "\n",
- "The `.plot` accessor provides built-in visualizations for understanding clustering results."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Compare original vs aggregated data as timeseries\n",
- "# By default, plots all time-varying variables\n",
- "fs_clustered.clustering.plot.compare()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Use a different approach of visualizing the data using normalize heatmaps\n",
- "ds = fs_clustered.clustering.plot.compare(data_only=True).data\n",
- "\n",
- "ds_normalized = (ds - ds.min()) / (ds.max() - ds.min())\n",
- "ds_normalized.to_array().plotly.imshow(\n",
- " x='time',\n",
- " animation_frame='representation',\n",
- " zmin=0,\n",
- " zmax=1,\n",
- " color_continuous_scale='viridis',\n",
- " title='Normalized Comparison',\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "10",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Compare specific variables only\n",
- "fs_clustered.clustering.plot.compare(variables='HeatDemand(Q_th)|fixed_relative_profile')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "11",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Duration curves show how well the aggregated data preserves the distribution\n",
- "fs_clustered.clustering.plot.compare(kind='duration_curve').data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "12",
- "metadata": {},
- "outputs": [],
- "source": [
- "# View typical period profiles for each cluster\n",
- "# Each line represents a cluster's representative day\n",
- "fs_clustered.clustering.plot.clusters(variables='HeatDemand(Q_th)|fixed_relative_profile', color='cluster')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "13",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Heatmap shows cluster assignments for each original period\n",
- "fs_clustered.clustering.plot.heatmap()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "14",
- "metadata": {},
- "source": [
- "## Expanding Aggregated Data\n",
- "\n",
- "The `Clustering.expand_data()` method maps aggregated data back to original timesteps.\n",
- "This is useful for comparing clustering results before optimization:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "15",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get original and aggregated data\n",
- "clustering = fs_clustered.clustering\n",
- "original = clustering.original_data['HeatDemand(Q_th)|fixed_relative_profile']\n",
- "aggregated = clustering.aggregated_data['HeatDemand(Q_th)|fixed_relative_profile']\n",
- "\n",
- "# Expand aggregated data back to original timesteps\n",
- "expanded = clustering.expand_data(aggregated)\n",
- "\n",
- "print(f'Original: {len(original.time)} timesteps')\n",
- "print(f'Aggregated: {len(aggregated.time)} timesteps')\n",
- "print(f'Expanded: {len(expanded.time)} timesteps')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "16",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "| Property | Description |\n",
- "|----------|-------------|\n",
- "| `clustering.n_clusters` | Number of representative clusters |\n",
- "| `clustering.timesteps_per_cluster` | Timesteps in each cluster period |\n",
- "| `clustering.cluster_assignments` | Maps original periods to clusters |\n",
- "| `clustering.cluster_occurrences` | Count of original periods per cluster |\n",
- "| `clustering.timestep_mapping` | Maps original timesteps to representative indices |\n",
- "| `clustering.original_data` | Dataset before clustering |\n",
- "| `clustering.aggregated_data` | Dataset after clustering |\n",
- "| `clustering.results` | `ClusteringResults` with xarray-like interface |\n",
- "\n",
- "### ClusteringResults (xarray-like)\n",
- "\n",
- "Access the underlying tsam results via `clustering.results`:\n",
- "\n",
- "```python\n",
- "# Dimension info (like xarray)\n",
- "clustering.results.dims # ('period', 'scenario') or ()\n",
- "clustering.results.coords # {'period': [2020, 2030], 'scenario': ['high', 'low']}\n",
- "\n",
- "# Select specific result (like xarray)\n",
- "clustering.results.sel(period=2020, scenario='high') # Label-based\n",
- "clustering.results.isel(period=0, scenario=1) # Index-based\n",
- "```\n",
- "\n",
- "### Plot Accessor Methods\n",
- "\n",
- "| Method | Description |\n",
- "|--------|-------------|\n",
- "| `plot.compare()` | Compare original vs aggregated data (timeseries) |\n",
- "| `plot.compare(kind='duration_curve')` | Compare as duration curves |\n",
- "| `plot.clusters()` | View each cluster's profile |\n",
- "| `plot.heatmap()` | Visualize cluster assignments |\n",
- "\n",
- "### Key Parameters\n",
- "\n",
- "```python\n",
- "# Compare with options\n",
- "clustering.plot.compare(\n",
- " variables='Demand|profile', # Single variable, list, or None (all)\n",
- " kind='timeseries', # 'timeseries' or 'duration_curve'\n",
- " select={'scenario': 'Base'}, # xarray-style selection\n",
- " colors='viridis', # Colorscale name, list, or dict\n",
- " facet_col='period', # Facet by period if present\n",
- " facet_row='scenario', # Facet by scenario if present\n",
- ")\n",
- "\n",
- "# Heatmap shows cluster assignments (no variable needed)\n",
- "clustering.plot.heatmap()\n",
- "\n",
- "# Expand aggregated data to original timesteps\n",
- "expanded = clustering.expand_data(aggregated_data)\n",
- "```"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "17",
- "metadata": {},
- "source": [
- "## Cluster Weights\n",
- "\n",
- "Each representative timestep has a weight equal to the number of original periods it represents.\n",
- "This ensures operational costs scale correctly:\n",
- "\n",
- "$$\\text{Objective} = \\sum_{t \\in \\text{typical}} w_t \\cdot c_t$$\n",
- "\n",
- "The weights sum to the original timestep count:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "18",
- "metadata": {},
- "outputs": [],
- "source": [
- "print(f'Sum of weights: {fs_clustered.cluster_weight.sum().item():.0f}')\n",
- "print(f'Original timesteps: {len(flow_system.timesteps)}')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "19",
- "metadata": {},
- "source": [
- "## Solution Expansion\n",
- "\n",
- "After optimization, `expand()` maps results back to full resolution:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "20",
- "metadata": {},
- "outputs": [],
- "source": [
- "solver = fx.solvers.HighsSolver(mip_gap=0.01, log_to_console=False)\n",
- "fs_clustered.optimize(solver)\n",
- "\n",
- "fs_expanded = fs_clustered.transform.expand()\n",
- "\n",
- "print(f'Clustered: {len(fs_clustered.timesteps)} timesteps')\n",
- "print(f'Expanded: {len(fs_expanded.timesteps)} timesteps')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "21",
- "metadata": {},
- "source": [
- "## IO Workflow\n",
- "\n",
- "When saving and loading a clustered FlowSystem, most clustering information is preserved.\n",
- "However, some methods that access tsam's internal `AggregationResult` objects are not available after IO.\n",
- "\n",
- "### What's Preserved After IO\n",
- "\n",
- "- **Structure**: `n_clusters`, `timesteps_per_cluster`, `dims`, `coords`\n",
- "- **Mappings**: `cluster_assignments`, `cluster_occurrences`, `timestep_mapping`\n",
- "- **Data**: `original_data`, `aggregated_data`\n",
- "- **Original timesteps**: `original_timesteps`\n",
- "- **Results structure**: `results.sel()`, `results.isel()` for `ClusteringResult` access\n",
- "\n",
- "### What's Lost After IO\n",
- "\n",
- "- **`clustering.sel()`**: Accessing full `AggregationResult` objects\n",
- "- **`clustering.items()`**: Iterating over `AggregationResult` objects\n",
- "- **tsam internals**: `AggregationResult.accuracy`, `AggregationResult.plot`, etc."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "22",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Before IO: Full tsam access is available\n",
- "result = fs_clustered.clustering.sel() # Get the AggregationResult\n",
- "print(f'Before IO - AggregationResult available: {type(result).__name__}')\n",
- "print(f' - n_clusters: {result.n_clusters}')\n",
- "print(f' - accuracy.rmse (mean): {result.accuracy.rmse.mean():.4f}')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "23",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Save and load the clustered system\n",
- "import tempfile\n",
- "from pathlib import Path\n",
- "\n",
- "try:\n",
- " with tempfile.TemporaryDirectory() as tmpdir:\n",
- " path = Path(tmpdir) / 'clustered_system.nc'\n",
- " fs_clustered.to_netcdf(path)\n",
- " fs_loaded = fx.FlowSystem.from_netcdf(path)\n",
- "\n",
- " # Structure is preserved\n",
- " print('After IO - Structure preserved:')\n",
- " print(f' - n_clusters: {fs_loaded.clustering.n_clusters}')\n",
- " print(f' - dims: {fs_loaded.clustering.dims}')\n",
- " print(f' - original_data variables: {list(fs_loaded.clustering.original_data.data_vars)[:3]}...')\n",
- "except OSError as e:\n",
- " print(f'Note: NetCDF save/load skipped due to environment issue: {type(e).__name__}')\n",
- " print('This can happen in some CI environments. The functionality works locally.')\n",
- " fs_loaded = fs_clustered # Use original for subsequent cells"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "24",
- "metadata": {},
- "outputs": [],
- "source": [
- "# After IO: sel() raises ValueError because AggregationResult is not preserved\n",
- "try:\n",
- " fs_loaded.clustering.sel()\n",
- "except ValueError as e:\n",
- " print('After IO - sel() raises ValueError:')\n",
- " print(f' \"{e}\"')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "25",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Key operations still work after IO:\n",
- "# - Optimization\n",
- "# - Expansion back to full resolution\n",
- "# - Accessing original_data and aggregated_data\n",
- "\n",
- "fs_loaded.optimize(solver)\n",
- "fs_loaded_expanded = fs_loaded.transform.expand()\n",
- "\n",
- "print('Loaded system can still be:')\n",
- "print(f' - Optimized: {fs_loaded.solution is not None}')\n",
- "print(f' - Expanded: {len(fs_loaded_expanded.timesteps)} timesteps')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "26",
- "metadata": {},
- "source": [
- "### IO Workflow Summary\n",
- "\n",
- "```\n",
- "┌─────────────────┐ to_netcdf() ┌─────────────────┐\n",
- "│ fs_clustered │ ─────────────────► │ NetCDF file │\n",
- "│ │ │ │\n",
- "│ ✓ clustering │ │ ✓ structure │\n",
- "│ ✓ sel() │ │ ✓ mappings │\n",
- "│ ✓ items() │ │ ✓ data │\n",
- "│ ✓ AggregationResult │ ✗ AggregationResult\n",
- "└─────────────────┘ └─────────────────┘\n",
- " │\n",
- " │ from_netcdf()\n",
- " ▼\n",
- " ┌─────────────────┐\n",
- " │ fs_loaded │\n",
- " │ │\n",
- " │ ✓ optimize() │\n",
- " │ ✓ expand() │\n",
- " │ ✓ original_data │\n",
- " │ ✗ sel() │\n",
- " │ ✗ items() │\n",
- " └─────────────────┘\n",
- "```\n",
- "\n",
- "!!! tip \"Best Practice\"\n",
- " If you need tsam's `AggregationResult` for analysis (accuracy metrics, built-in plots),\n",
- " do this **before** saving the FlowSystem. After loading, the core workflow\n",
- " (optimize → expand) works normally."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "27",
- "metadata": {},
- "source": [
- "### Reducing File Size\n",
- "\n",
- "For smaller files (~38% reduction), use `include_original_data=False` when saving.\n",
- "This disables `plot.compare()` after loading, but the core workflow still works:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "28",
- "metadata": {},
- "outputs": [],
- "source": [
- "import tempfile\n",
- "from pathlib import Path\n",
- "\n",
- "# Compare file sizes with and without original_data\n",
- "try:\n",
- " with tempfile.TemporaryDirectory() as tmpdir:\n",
- " path_full = Path(tmpdir) / 'full.nc'\n",
- " path_small = Path(tmpdir) / 'small.nc'\n",
- "\n",
- " fs_clustered.to_netcdf(path_full, include_original_data=True)\n",
- " fs_clustered.to_netcdf(path_small, include_original_data=False)\n",
- "\n",
- " size_full = path_full.stat().st_size / 1024\n",
- " size_small = path_small.stat().st_size / 1024\n",
- "\n",
- " print(f'With original_data: {size_full:.1f} KB')\n",
- " print(f'Without original_data: {size_small:.1f} KB')\n",
- " print(f'Size reduction: {(1 - size_small / size_full) * 100:.0f}%')\n",
- "except OSError as e:\n",
- " print(f'Note: File size comparison skipped due to environment issue: {type(e).__name__}')"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/notebooks/08f-clustering-segmentation.ipynb b/docs/notebooks/08f-clustering-segmentation.ipynb
deleted file mode 100644
index bc1915de4..000000000
--- a/docs/notebooks/08f-clustering-segmentation.ipynb
+++ /dev/null
@@ -1,647 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "0",
- "metadata": {},
- "source": [
- "# Intra-Period Segmentation with `cluster()`\n",
- "\n",
- "Reduce timesteps within each typical period using segmentation.\n",
- "\n",
- "This notebook demonstrates:\n",
- "\n",
- "- **Segmentation**: Aggregate timesteps within each cluster into fewer segments\n",
- "- **Variable durations**: Each segment can have different duration (hours)\n",
- "- **Combined reduction**: Use clustering AND segmentation for maximum speedup\n",
- "- **Expansion**: Map segmented results back to original timesteps\n",
- "\n",
- "!!! note \"Requirements\"\n",
- " This notebook requires the `tsam` package with `SegmentConfig` and `ExtremeConfig` support.\n",
- " Install with: `pip install \"flixopt[full]\"`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1",
- "metadata": {},
- "outputs": [],
- "source": [
- "import timeit\n",
- "\n",
- "import pandas as pd\n",
- "import plotly.express as px\n",
- "\n",
- "import flixopt as fx\n",
- "\n",
- "fx.CONFIG.notebook()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2",
- "metadata": {},
- "source": [
- "## What is Segmentation?\n",
- "\n",
- "**Clustering** groups similar time periods (e.g., days) into representative clusters.\n",
- "\n",
- "**Segmentation** goes further by aggregating timesteps *within* each cluster into fewer segments with variable durations.\n",
- "\n",
- "```\n",
- "Original: | Day 1 (24h) | Day 2 (24h) | Day 3 (24h) | ... | Day 365 (24h) |\n",
- " ↓ ↓ ↓ ↓\n",
- "Clustered: | Typical Day A (24h) | Typical Day B (24h) | Typical Day C (24h) |\n",
- " ↓ ↓ ↓\n",
- "Segmented: | Seg1 (4h) | Seg2 (8h) | Seg3 (8h) | Seg4 (4h) | (per typical day)\n",
- "```\n",
- "\n",
- "This can dramatically reduce problem size:\n",
- "- **Original**: 365 days × 24 hours = 8,760 timesteps\n",
- "- **Clustered (8 days)**: 8 × 24 = 192 timesteps\n",
- "- **Segmented (6 segments)**: 8 × 6 = 48 timesteps"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3",
- "metadata": {},
- "source": [
- "## Create the FlowSystem\n",
- "\n",
- "We use a district heating system with one month of data at 15-min resolution:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4",
- "metadata": {},
- "outputs": [],
- "source": [
- "from data.generate_example_systems import create_district_heating_system\n",
- "\n",
- "flow_system = create_district_heating_system()\n",
- "flow_system.connect_and_transform()\n",
- "\n",
- "print(f'Timesteps: {len(flow_system.timesteps)}')\n",
- "print(f'Duration: {(flow_system.timesteps[-1] - flow_system.timesteps[0]).days + 1} days')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Visualize input data\n",
- "heat_demand = flow_system.components['HeatDemand'].inputs[0].fixed_relative_profile\n",
- "heat_demand.plotly.line(title='Heat Demand Profile')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6",
- "metadata": {},
- "source": [
- "## Full Optimization (Baseline)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7",
- "metadata": {},
- "outputs": [],
- "source": [
- "solver = fx.solvers.HighsSolver(mip_gap=0.01)\n",
- "\n",
- "start = timeit.default_timer()\n",
- "fs_full = flow_system.copy()\n",
- "fs_full.name = 'Full Optimization'\n",
- "fs_full.optimize(solver)\n",
- "time_full = timeit.default_timer() - start\n",
- "\n",
- "print(f'Full optimization: {time_full:.2f} seconds')\n",
- "print(f'Total cost: {fs_full.solution[\"costs\"].item():,.0f} €')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8",
- "metadata": {},
- "source": [
- "## Clustering with Segmentation\n",
- "\n",
- "Use `SegmentConfig` to enable intra-period segmentation:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9",
- "metadata": {},
- "outputs": [],
- "source": [
- "from tsam import ExtremeConfig, SegmentConfig\n",
- "\n",
- "start = timeit.default_timer()\n",
- "\n",
- "# Cluster into 8 typical days with 6 segments each\n",
- "fs_segmented = flow_system.transform.cluster(\n",
- " n_clusters=8,\n",
- " cluster_duration='1D',\n",
- " segments=SegmentConfig(n_segments=6), # 6 segments per day instead of 96 quarter-hours\n",
- " extremes=ExtremeConfig(method='replace', max_value=['HeatDemand(Q_th)|fixed_relative_profile']),\n",
- ")\n",
- "\n",
- "time_clustering = timeit.default_timer() - start\n",
- "\n",
- "print(f'Clustering time: {time_clustering:.2f} seconds')\n",
- "print(f'Original timesteps: {len(flow_system.timesteps)}')\n",
- "print(\n",
- " f'Segmented timesteps: {len(fs_segmented.timesteps)} × {len(fs_segmented.clusters)} clusters = {len(fs_segmented.timesteps) * len(fs_segmented.clusters)}'\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "10",
- "metadata": {},
- "source": [
- "## Understanding Segmentation Properties\n",
- "\n",
- "After segmentation, the clustering object has additional properties:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "11",
- "metadata": {},
- "outputs": [],
- "source": [
- "clustering = fs_segmented.clustering\n",
- "\n",
- "print('Segmentation Properties:')\n",
- "print(f' is_segmented: {clustering.is_segmented}')\n",
- "print(f' n_segments: {clustering.n_segments}')\n",
- "print(f' n_clusters: {clustering.n_clusters}')\n",
- "print(f' timesteps_per_cluster (original): {clustering.timesteps_per_cluster}')\n",
- "print(f'\\nTime dimension uses RangeIndex: {type(fs_segmented.timesteps)}')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "12",
- "metadata": {},
- "source": [
- "## Variable Timestep Durations\n",
- "\n",
- "Each segment has a different duration, determined by how many original timesteps it represents:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "13",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Timestep duration is now a DataArray with (cluster, time) dimensions\n",
- "timestep_duration = fs_segmented.timestep_duration\n",
- "\n",
- "print(f'Timestep duration shape: {dict(timestep_duration.sizes)}')\n",
- "print('\\nSegment durations for cluster 0:')\n",
- "cluster_0_durations = timestep_duration.sel(cluster=0).values\n",
- "for i, dur in enumerate(cluster_0_durations):\n",
- " print(f' Segment {i}: {dur:.2f} hours')\n",
- "print(f' Total: {cluster_0_durations.sum():.2f} hours (should be 24h)')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "14",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Visualize segment durations across clusters\n",
- "duration_df = timestep_duration.to_dataframe('duration').reset_index()\n",
- "fig = px.bar(\n",
- " duration_df,\n",
- " x='time',\n",
- " y='duration',\n",
- " facet_col='cluster',\n",
- " facet_col_wrap=4,\n",
- " title='Segment Durations by Cluster',\n",
- " labels={'time': 'Segment', 'duration': 'Duration [hours]'},\n",
- ")\n",
- "fig.update_layout(height=400)\n",
- "fig.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "15",
- "metadata": {},
- "source": [
- "## Optimize the Segmented System"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "16",
- "metadata": {},
- "outputs": [],
- "source": [
- "start = timeit.default_timer()\n",
- "fs_segmented.optimize(solver)\n",
- "time_segmented = timeit.default_timer() - start\n",
- "\n",
- "print(f'Segmented optimization: {time_segmented:.2f} seconds')\n",
- "print(f'Total cost: {fs_segmented.solution[\"costs\"].item():,.0f} €')\n",
- "print(f'\\nSpeedup vs full: {time_full / (time_clustering + time_segmented):.1f}x')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "17",
- "metadata": {},
- "source": [
- "## Compare Clustering Quality\n",
- "\n",
- "View how well the segmented data represents the original:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "18",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Duration curves show how well the distribution is preserved\n",
- "fs_segmented.clustering.plot.compare(kind='duration_curve')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "19",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Clustering quality metrics\n",
- "fs_segmented.clustering.metrics.to_dataframe().style.format('{:.3f}')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "20",
- "metadata": {},
- "source": [
- "## Expand to Original Timesteps\n",
- "\n",
- "Use `expand()` to map the segmented solution back to all original timesteps:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "21",
- "metadata": {},
- "outputs": [],
- "source": [
- "start = timeit.default_timer()\n",
- "fs_expanded = fs_segmented.transform.expand()\n",
- "time_expand = timeit.default_timer() - start\n",
- "\n",
- "print(f'Expansion time: {time_expand:.3f} seconds')\n",
- "print(f'Expanded timesteps: {len(fs_expanded.timesteps)}')\n",
- "print(f'Objective preserved: {fs_expanded.solution[\"costs\"].item():,.0f} €')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "22",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Compare flow rates: Full vs Expanded\n",
- "import xarray as xr\n",
- "\n",
- "flow_var = 'CHP(Q_th)|flow_rate'\n",
- "comparison_ds = xr.concat(\n",
- " [fs_full.solution[flow_var], fs_expanded.solution[flow_var]],\n",
- " dim=pd.Index(['Full', 'Expanded'], name='method'),\n",
- ")\n",
- "comparison_ds.plotly.line(color='method', title='CHP Heat Output Comparison')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "23",
- "metadata": {},
- "source": [
- "## Two-Stage Workflow with Segmentation\n",
- "\n",
- "For investment optimization, use segmentation for fast sizing, then dispatch at full resolution:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "24",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Stage 1: Sizing with segmentation (already done)\n",
- "SAFETY_MARGIN = 1.05\n",
- "sizes_with_margin = {name: float(size.item()) * SAFETY_MARGIN for name, size in fs_segmented.stats.sizes.items()}\n",
- "\n",
- "print('Optimized sizes with safety margin:')\n",
- "for name, size in sizes_with_margin.items():\n",
- " print(f' {name}: {size:.1f}')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "25",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Stage 2: Full resolution dispatch with fixed sizes\n",
- "start = timeit.default_timer()\n",
- "fs_dispatch = flow_system.transform.fix_sizes(sizes_with_margin)\n",
- "fs_dispatch.name = 'Two-Stage'\n",
- "fs_dispatch.optimize(solver)\n",
- "time_dispatch = timeit.default_timer() - start\n",
- "\n",
- "print(f'Dispatch time: {time_dispatch:.2f} seconds')\n",
- "print(f'Final cost: {fs_dispatch.solution[\"costs\"].item():,.0f} €')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "26",
- "metadata": {},
- "source": [
- "## Compare Results"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "27",
- "metadata": {},
- "outputs": [],
- "source": [
- "total_segmented = time_clustering + time_segmented\n",
- "total_two_stage = total_segmented + time_dispatch\n",
- "\n",
- "results = {\n",
- " 'Full (baseline)': {\n",
- " 'Time [s]': time_full,\n",
- " 'Cost [€]': fs_full.solution['costs'].item(),\n",
- " 'CHP': fs_full.stats.sizes['CHP(Q_th)'].item(),\n",
- " 'Boiler': fs_full.stats.sizes['Boiler(Q_th)'].item(),\n",
- " 'Storage': fs_full.stats.sizes['Storage'].item(),\n",
- " },\n",
- " 'Segmented (8×6)': {\n",
- " 'Time [s]': total_segmented,\n",
- " 'Cost [€]': fs_segmented.solution['costs'].item(),\n",
- " 'CHP': fs_segmented.stats.sizes['CHP(Q_th)'].item(),\n",
- " 'Boiler': fs_segmented.stats.sizes['Boiler(Q_th)'].item(),\n",
- " 'Storage': fs_segmented.stats.sizes['Storage'].item(),\n",
- " },\n",
- " 'Two-Stage': {\n",
- " 'Time [s]': total_two_stage,\n",
- " 'Cost [€]': fs_dispatch.solution['costs'].item(),\n",
- " 'CHP': sizes_with_margin['CHP(Q_th)'],\n",
- " 'Boiler': sizes_with_margin['Boiler(Q_th)'],\n",
- " 'Storage': sizes_with_margin['Storage'],\n",
- " },\n",
- "}\n",
- "\n",
- "comparison = pd.DataFrame(results).T\n",
- "baseline_cost = comparison.loc['Full (baseline)', 'Cost [€]']\n",
- "baseline_time = comparison.loc['Full (baseline)', 'Time [s]']\n",
- "comparison['Cost Gap [%]'] = ((comparison['Cost [€]'] - baseline_cost) / abs(baseline_cost) * 100).round(2)\n",
- "comparison['Speedup'] = (baseline_time / comparison['Time [s]']).round(1)\n",
- "\n",
- "comparison.style.format(\n",
- " {\n",
- " 'Time [s]': '{:.2f}',\n",
- " 'Cost [€]': '{:,.0f}',\n",
- " 'CHP': '{:.1f}',\n",
- " 'Boiler': '{:.1f}',\n",
- " 'Storage': '{:.0f}',\n",
- " 'Cost Gap [%]': '{:.2f}',\n",
- " 'Speedup': '{:.1f}x',\n",
- " }\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "28",
- "metadata": {},
- "source": [
- "## Segmentation with Multi-Period Systems\n",
- "\n",
- "Segmentation works with multi-period systems (multiple years, scenarios).\n",
- "Each period/scenario combination is segmented independently:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "29",
- "metadata": {},
- "outputs": [],
- "source": [
- "from data.generate_example_systems import create_multiperiod_system\n",
- "\n",
- "fs_multi = create_multiperiod_system()\n",
- "# Use first week only for faster demo\n",
- "fs_multi = fs_multi.transform.isel(time=slice(0, 168))\n",
- "\n",
- "print(f'Periods: {list(fs_multi.periods.values)}')\n",
- "print(f'Scenarios: {list(fs_multi.scenarios.values)}')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "30",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Cluster with segmentation\n",
- "fs_multi_seg = fs_multi.transform.cluster(\n",
- " n_clusters=3,\n",
- " cluster_duration='1D',\n",
- " segments=SegmentConfig(n_segments=6),\n",
- " extremes=ExtremeConfig(method='replace', max_value=['Building(Heat)|fixed_relative_profile']),\n",
- ")\n",
- "\n",
- "print(f'Original: {len(fs_multi.timesteps)} timesteps')\n",
- "print(f'Segmented: {len(fs_multi_seg.timesteps)} × {len(fs_multi_seg.clusters)} clusters')\n",
- "print(f'is_segmented: {fs_multi_seg.clustering.is_segmented}')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "31",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Cluster assignments have period/scenario dimensions\n",
- "fs_multi_seg.clustering.cluster_assignments"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "32",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Optimize and expand\n",
- "fs_multi_seg.optimize(solver)\n",
- "fs_multi_expanded = fs_multi_seg.transform.expand()\n",
- "\n",
- "print(f'Expanded timesteps: {len(fs_multi_expanded.timesteps)}')\n",
- "print(f'Objective: {fs_multi_expanded.solution[\"objective\"].item():,.0f} €')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "33",
- "metadata": {},
- "source": [
- "## API Reference\n",
- "\n",
- "### SegmentConfig Parameters\n",
- "\n",
- "```python\n",
- "from tsam import SegmentConfig\n",
- "\n",
- "segments = SegmentConfig(\n",
- " n_segments=6, # Number of segments per cluster period\n",
- " representation_method='mean', # How to represent segment values ('mean', 'medoid', etc.)\n",
- ")\n",
- "```\n",
- "\n",
- "### Segmentation Properties\n",
- "\n",
- "After segmentation, `fs.clustering` has additional properties:\n",
- "\n",
- "| Property | Description |\n",
- "|----------|-------------|\n",
- "| `is_segmented` | `True` if segmentation was used |\n",
- "| `n_segments` | Number of segments per cluster |\n",
- "| `timesteps_per_cluster` | Original timesteps per cluster (before segmentation) |\n",
- "\n",
- "### Timestep Duration\n",
- "\n",
- "For segmented systems, `fs.timestep_duration` is a DataArray with `(cluster, time)` dimensions:\n",
- "\n",
- "```python\n",
- "# Each segment has different duration\n",
- "fs_segmented.timestep_duration # Shape: (n_clusters, n_segments)\n",
- "\n",
- "# Sum should equal original period duration\n",
- "fs_segmented.timestep_duration.sum('time') # Should be 24h for daily clusters\n",
- "```\n",
- "\n",
- "### Example Workflow\n",
- "\n",
- "```python\n",
- "from tsam import ExtremeConfig, SegmentConfig\n",
- "\n",
- "# Cluster with segmentation\n",
- "fs_segmented = flow_system.transform.cluster(\n",
- " n_clusters=8,\n",
- " cluster_duration='1D',\n",
- " segments=SegmentConfig(n_segments=6),\n",
- " extremes=ExtremeConfig(method='new_cluster', max_value=['Demand|profile']),\n",
- ")\n",
- "\n",
- "# Optimize\n",
- "fs_segmented.optimize(solver)\n",
- "\n",
- "# Expand back to original timesteps\n",
- "fs_expanded = fs_segmented.transform.expand()\n",
- "\n",
- "# Two-stage workflow\n",
- "sizes = {k: v.item() * 1.05 for k, v in fs_segmented.stats.sizes.items()}\n",
- "fs_dispatch = flow_system.transform.fix_sizes(sizes)\n",
- "fs_dispatch.optimize(solver)\n",
- "```"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "34",
- "metadata": {},
- "source": [
- "## Summary\n",
- "\n",
- "You learned how to:\n",
- "\n",
- "- Use **`SegmentConfig`** to enable intra-period segmentation\n",
- "- Work with **variable timestep durations** for each segment\n",
- "- **Combine clustering and segmentation** for maximum problem size reduction\n",
- "- **Expand segmented solutions** back to original timesteps\n",
- "- Use segmentation with **multi-period systems**\n",
- "\n",
- "### Key Takeaways\n",
- "\n",
- "1. **Segmentation reduces problem size further**: From 8×24=192 to 8×6=48 timesteps\n",
- "2. **Variable durations preserve accuracy**: Important periods get more timesteps\n",
- "3. **Works with multi-period**: Each period/scenario is segmented independently\n",
- "4. **expand() works correctly**: Maps segment values to all original timesteps\n",
- "5. **Two-stage is still recommended**: Use segmentation for sizing, full resolution for dispatch\n",
- "\n",
- "### Trade-offs\n",
- "\n",
- "| More Segments | Fewer Segments |\n",
- "|---------------|----------------|\n",
- "| Higher accuracy | Lower accuracy |\n",
- "| Slower solve | Faster solve |\n",
- "| More memory | Less memory |\n",
- "\n",
- "Start with 6-12 segments and adjust based on your accuracy needs."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/flixopt/clustering/__init__.py b/flixopt/clustering/__init__.py
index 07e6e775f..9f12f2a02 100644
--- a/flixopt/clustering/__init__.py
+++ b/flixopt/clustering/__init__.py
@@ -1,13 +1,11 @@
"""
Time Series Aggregation Module for flixopt.
-This module provides wrapper classes around tsam's clustering functionality:
-- Clustering: Top-level class stored on FlowSystem after clustering
-- ClusteringResults: Manages collection of tsam ClusteringResult objects (for IO)
+This module provides the Clustering class stored on FlowSystem after clustering,
+wrapping tsam_xarray's ClusteringResult.
Example usage:
- # Cluster a FlowSystem to reduce timesteps
from tsam import ExtremeConfig
fs_clustered = flow_system.transform.cluster(
@@ -16,36 +14,21 @@
extremes=ExtremeConfig(method='new_cluster', max_value=['Demand|fixed_relative_profile']),
)
- # Access clustering structure (available before AND after IO)
clustering = fs_clustered.clustering
print(f'Number of clusters: {clustering.n_clusters}')
- print(f'Dims: {clustering.dims}') # e.g., ('period', 'scenario')
- print(f'Coords: {clustering.coords}') # e.g., {'period': [2024, 2025]}
+ print(f'Clustering result: {clustering.clustering_result}')
- # Access tsam AggregationResult for detailed analysis
- # NOTE: Only available BEFORE saving/loading. Lost after IO.
- result = clustering.sel(period=2024, scenario='high')
- result.cluster_representatives # DataFrame with aggregated time series
- result.accuracy # AccuracyMetrics (rmse, mae)
- result.plot.compare() # tsam's built-in comparison plot
-
- # Iterate over all results (only before IO)
- for key, result in clustering.items():
- print(f'{key}: {result.n_clusters} clusters')
-
- # Save and load - structure preserved, AggregationResult access lost
- fs_clustered.to_netcdf('system.nc')
- # Use include_original_data=False for smaller files (~38% reduction)
- fs_clustered.to_netcdf('system.nc', include_original_data=False)
+ # Access tsam_xarray AggregationResult (only before saving/loading)
+ result = clustering.aggregation_result
+ result.cluster_representatives # DataArray
+ result.accuracy # AccuracyMetrics
# Expand back to full resolution
fs_expanded = fs_clustered.transform.expand()
"""
-from .base import AggregationResults, Clustering, ClusteringResults
+from .base import Clustering
__all__ = [
- 'ClusteringResults',
- 'AggregationResults',
'Clustering',
]
diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py
index 7082929c3..b78bf89b3 100644
--- a/flixopt/clustering/base.py
+++ b/flixopt/clustering/base.py
@@ -1,960 +1,286 @@
"""
Clustering classes for time series aggregation.
-This module provides wrapper classes around tsam's clustering functionality:
-- `ClusteringResults`: Collection of tsam ClusteringResult objects for multi-dim (period, scenario) data
-- `Clustering`: Top-level class stored on FlowSystem after clustering
+This module provides the `Clustering` class stored on FlowSystem after clustering,
+wrapping tsam_xarray's ClusteringResult for structure access and AggregationResult
+for full data access (pre-serialization only).
"""
from __future__ import annotations
-import functools
import json
-from collections import Counter
from typing import TYPE_CHECKING, Any
-import numpy as np
import pandas as pd
-import xarray as xr
if TYPE_CHECKING:
from pathlib import Path
- from tsam import AggregationResult
- from tsam import ClusteringResult as TsamClusteringResult
+ import xarray as xr
+ from tsam_xarray import AggregationResult as TsamXarrayAggregationResult
+ from tsam_xarray import ClusteringResult
- from ..color_processing import ColorType
- from ..plot_result import PlotResult
- from ..statistics_accessor import SelectType
-from ..statistics_accessor import _build_color_kwargs
-
-
-def _apply_slot_defaults(plotly_kwargs: dict, defaults: dict[str, str | None]) -> None:
- """Apply default slot assignments to plotly kwargs.
-
- Args:
- plotly_kwargs: The kwargs dict to update (modified in place).
- defaults: Default slot assignments. None values block slots.
- """
- for slot, value in defaults.items():
- plotly_kwargs.setdefault(slot, value)
-
-
-def _select_dims(da: xr.DataArray, period: Any = None, scenario: Any = None) -> xr.DataArray:
- """Select from DataArray by period/scenario if those dimensions exist."""
- if 'period' in da.dims and period is not None:
- da = da.sel(period=period)
- if 'scenario' in da.dims and scenario is not None:
- da = da.sel(scenario=scenario)
- return da
-
-
-def _cluster_occurrences(cr: TsamClusteringResult) -> np.ndarray:
- """Compute cluster occurrences from ClusteringResult."""
- counts = Counter(cr.cluster_assignments)
- return np.array([counts.get(i, 0) for i in range(cr.n_clusters)])
+class Clustering:
+ """Clustering information for a FlowSystem.
+ Wraps tsam_xarray's ClusteringResult for structure access and optionally
+ AggregationResult for full data access (pre-serialization only).
-def _build_timestep_mapping(cr: TsamClusteringResult, n_timesteps: int) -> np.ndarray:
- """Build mapping from original timesteps to representative timestep indices.
-
- For segmented systems, the mapping uses segment_assignments from tsam to map
- each original timestep position to its corresponding segment index.
- """
- timesteps_per_cluster = cr.n_timesteps_per_period
- # For segmented systems, representative time dimension has n_segments entries
- # For non-segmented, it has timesteps_per_cluster entries
- n_segments = cr.n_segments
- is_segmented = n_segments is not None
- time_dim_size = n_segments if is_segmented else timesteps_per_cluster
-
- # For segmented systems, tsam provides segment_assignments which maps
- # each position within a period to its segment index
- segment_assignments = cr.segment_assignments if is_segmented else None
-
- mapping = np.zeros(n_timesteps, dtype=np.int32)
- for period_idx, cluster_id in enumerate(cr.cluster_assignments):
- for pos in range(timesteps_per_cluster):
- orig_idx = period_idx * timesteps_per_cluster + pos
- if orig_idx < n_timesteps:
- if is_segmented and segment_assignments is not None:
- # For segmented: use tsam's segment_assignments to get segment index
- # segment_assignments[cluster_id][pos] gives the segment index
- segment_idx = segment_assignments[cluster_id][pos]
- mapping[orig_idx] = int(cluster_id) * time_dim_size + segment_idx
- else:
- # Non-segmented: direct position mapping
- mapping[orig_idx] = int(cluster_id) * time_dim_size + pos
- return mapping
-
-
-class ClusteringResults:
- """Collection of tsam ClusteringResult objects for multi-dimensional data.
-
- Manages multiple ClusteringResult objects keyed by (period, scenario) tuples
- and provides convenient access and multi-dimensional DataArray building.
-
- Follows xarray-like patterns with `.dims`, `.coords`, `.sel()`, and `.isel()`.
-
- Attributes:
- dims: Tuple of dimension names, e.g., ('period', 'scenario').
- coords: Dict mapping dimension names to their coordinate values.
+ For advanced access to clustering structure (dims, coords, cluster_centers,
+ segment_centers, etc.), use ``clustering_result`` directly.
Example:
- >>> results = ClusteringResults({(): cr}, dim_names=[])
- >>> results.n_clusters
- 2
- >>> results.cluster_assignments # Returns DataArray
-
-
- >>> # Multi-dimensional case
- >>> results = ClusteringResults(
- ... {(2024, 'high'): cr1, (2024, 'low'): cr2},
- ... dim_names=['period', 'scenario'],
- ... )
- >>> results.dims
- ('period', 'scenario')
- >>> results.coords
- {'period': [2024], 'scenario': ['high', 'low']}
- >>> results.sel(period=2024, scenario='high') # Label-based
-
- >>> results.isel(period=0, scenario=1) # Index-based
-
+ >>> clustering = fs_clustered.clustering
+ >>> clustering.n_clusters
+ 8
+ >>> clustering.clustering_result # tsam_xarray ClusteringResult for full access
"""
def __init__(
self,
- results: dict[tuple, TsamClusteringResult],
- dim_names: list[str],
+ clustering_result: ClusteringResult | dict | None = None,
+ original_timesteps: pd.DatetimeIndex | list[str] | None = None,
+ # Internal: tsam_xarray AggregationResult for full data access
+ _aggregation_result: TsamXarrayAggregationResult | None = None,
+ # Internal: mapping from renamed dims back to originals (e.g., _period -> period)
+ _unrename_map: dict[str, str] | None = None,
+ # Legacy: accept 'results' kwarg for netcdf files saved before this refactor.
+ # The IO resolver passes serialized dict keys as kwargs to __init__().
+ # Remove once all users have re-saved their netcdf files with the new format.
+ results: Any = None,
+ # Legacy kwargs ignored (removed: original_data, aggregated_data, _metrics, refs)
+ **_ignored: Any,
):
- """Initialize ClusteringResults.
-
- Args:
- results: Dict mapping (period, scenario) tuples to tsam ClusteringResult objects.
- For simple cases without periods/scenarios, use {(): result}.
- dim_names: Names of extra dimensions, e.g., ['period', 'scenario'].
- """
- if not results:
- raise ValueError('results cannot be empty')
- self._results = results
- self._dim_names = dim_names
-
- # ==========================================================================
- # xarray-like interface
- # ==========================================================================
-
- @property
- def dims(self) -> tuple[str, ...]:
- """Dimension names as tuple (xarray-like)."""
- return tuple(self._dim_names)
-
- @property
- def dim_names(self) -> list[str]:
- """Dimension names as list (backwards compatibility)."""
- return list(self._dim_names)
-
- @property
- def coords(self) -> dict[str, list]:
- """Coordinate values for each dimension (xarray-like).
-
- Returns:
- Dict mapping dimension names to lists of coordinate values.
- """
- return {dim: self._get_dim_values(dim) for dim in self._dim_names}
-
- def sel(self, **kwargs: Any) -> TsamClusteringResult:
- """Select result by dimension labels (xarray-like).
-
- Args:
- **kwargs: Dimension name=value pairs, e.g., period=2024, scenario='high'.
-
- Returns:
- The tsam ClusteringResult for the specified combination.
-
- Raises:
- KeyError: If no result found for the specified combination.
-
- Example:
- >>> results.sel(period=2024, scenario='high')
-
- """
- key = self._make_key(**kwargs)
- if key not in self._results:
- raise KeyError(f'No result found for {kwargs}')
- return self._results[key]
-
- def isel(self, **kwargs: int) -> TsamClusteringResult:
- """Select result by dimension indices (xarray-like).
-
- Args:
- **kwargs: Dimension name=index pairs, e.g., period=0, scenario=1.
-
- Returns:
- The tsam ClusteringResult for the specified combination.
-
- Raises:
- IndexError: If index is out of range for a dimension.
-
- Example:
- >>> results.isel(period=0, scenario=1)
-
- """
- label_kwargs = {}
- for dim, idx in kwargs.items():
- coord_values = self._get_dim_values(dim)
- if coord_values is None:
- raise KeyError(f"Dimension '{dim}' not found in dims {self.dims}")
- if idx < 0 or idx >= len(coord_values):
- raise IndexError(f"Index {idx} out of range for dimension '{dim}' with {len(coord_values)} values")
- label_kwargs[dim] = coord_values[idx]
- return self.sel(**label_kwargs)
-
- def __getitem__(self, key: tuple) -> TsamClusteringResult:
- """Get result by key tuple."""
- return self._results[key]
-
- # === Iteration ===
-
- def __iter__(self):
- """Iterate over ClusteringResult objects."""
- return iter(self._results.values())
-
- def __len__(self) -> int:
- """Number of ClusteringResult objects."""
- return len(self._results)
-
- def items(self):
- """Iterate over (key, ClusteringResult) pairs."""
- return self._results.items()
-
- def keys(self):
- """Iterate over keys."""
- return self._results.keys()
-
- def values(self):
- """Iterate over ClusteringResult objects."""
- return self._results.values()
-
- # === Properties from first result ===
-
- @property
- def _first_result(self) -> TsamClusteringResult:
- """Get the first ClusteringResult (for structure info)."""
- return next(iter(self._results.values()))
-
- @property
- def n_clusters(self) -> int:
- """Number of clusters (same for all results)."""
- return self._first_result.n_clusters
-
- @property
- def timesteps_per_cluster(self) -> int:
- """Number of timesteps per cluster (same for all results)."""
- return self._first_result.n_timesteps_per_period
-
- @property
- def n_original_periods(self) -> int:
- """Number of original periods (same for all results)."""
- return self._first_result.n_original_periods
-
- @property
- def n_segments(self) -> int | None:
- """Number of segments per cluster, or None if not segmented."""
- return self._first_result.n_segments
-
- # === Multi-dim DataArrays ===
-
- @property
- def cluster_assignments(self) -> xr.DataArray:
- """Maps each original cluster to its typical cluster index.
-
- Returns:
- DataArray with dims [original_cluster, period?, scenario?].
- """
- # Note: No coords on original_cluster - they cause issues when used as isel() indexer
- return self._build_property_array(
- lambda cr: np.array(cr.cluster_assignments),
- base_dims=['original_cluster'],
- name='cluster_assignments',
- )
-
- @property
- def cluster_occurrences(self) -> xr.DataArray:
- """How many original clusters map to each typical cluster.
-
- Returns:
- DataArray with dims [cluster, period?, scenario?].
- """
- return self._build_property_array(
- _cluster_occurrences,
- base_dims=['cluster'],
- base_coords={'cluster': range(self.n_clusters)},
- name='cluster_occurrences',
- )
-
- @property
- def cluster_centers(self) -> xr.DataArray:
- """Which original cluster is the representative (center) for each typical cluster.
-
- Returns:
- DataArray with dims [cluster, period?, scenario?].
- """
- return self._build_property_array(
- lambda cr: np.array(cr.cluster_centers),
- base_dims=['cluster'],
- base_coords={'cluster': range(self.n_clusters)},
- name='cluster_centers',
- )
-
- @property
- def segment_assignments(self) -> xr.DataArray | None:
- """For each timestep within a cluster, which segment it belongs to.
-
- Returns:
- DataArray with dims [cluster, time, period?, scenario?], or None if not segmented.
- """
- if self._first_result.segment_assignments is None:
- return None
- timesteps = self._first_result.n_timesteps_per_period
- return self._build_property_array(
- lambda cr: np.array(cr.segment_assignments),
- base_dims=['cluster', 'time'],
- base_coords={'cluster': range(self.n_clusters), 'time': range(timesteps)},
- name='segment_assignments',
- )
-
- @property
- def segment_durations(self) -> xr.DataArray | None:
- """Duration of each segment in timesteps.
-
- Returns:
- DataArray with dims [cluster, segment, period?, scenario?], or None if not segmented.
- """
- if self._first_result.segment_durations is None:
- return None
- n_segments = self._first_result.n_segments
-
- def _get_padded_durations(cr: TsamClusteringResult) -> np.ndarray:
- """Pad ragged segment durations to uniform shape."""
- return np.array([list(d) + [np.nan] * (n_segments - len(d)) for d in cr.segment_durations])
-
- return self._build_property_array(
- _get_padded_durations,
- base_dims=['cluster', 'segment'],
- base_coords={'cluster': range(self.n_clusters), 'segment': range(n_segments)},
- name='segment_durations',
- )
-
- @property
- def segment_centers(self) -> xr.DataArray | None:
- """Center of each intra-period segment.
-
- Only available if segmentation was configured during clustering.
-
- Returns:
- DataArray or None if no segmentation.
- """
- first = self._first_result
- if first.segment_centers is None:
- return None
-
- n_segments = first.n_segments
- return self._build_property_array(
- lambda cr: np.array(cr.segment_centers),
- base_dims=['cluster', 'segment'],
- base_coords={'cluster': range(self.n_clusters), 'segment': range(n_segments)},
- name='segment_centers',
- )
-
- @property
- def position_within_segment(self) -> xr.DataArray | None:
- """Position of each timestep within its segment (0-indexed).
-
- For each (cluster, time) position, returns how many timesteps into the
- segment that position is. Used for interpolation within segments.
-
- Returns:
- DataArray with dims [cluster, time] or [cluster, time, period?, scenario?].
- Returns None if no segmentation.
- """
- segment_assignments = self.segment_assignments
- if segment_assignments is None:
- return None
-
- def _compute_positions(seg_assigns: np.ndarray) -> np.ndarray:
- """Compute position within segment for each (cluster, time)."""
- n_clusters, n_times = seg_assigns.shape
- positions = np.zeros_like(seg_assigns)
- for c in range(n_clusters):
- pos = 0
- prev_seg = -1
- for t in range(n_times):
- seg = seg_assigns[c, t]
- if seg != prev_seg:
- pos = 0
- prev_seg = seg
- positions[c, t] = pos
- pos += 1
- return positions
-
- # Handle extra dimensions by applying _compute_positions to each slice
- extra_dims = [d for d in segment_assignments.dims if d not in ('cluster', 'time')]
-
- if not extra_dims:
- positions = _compute_positions(segment_assignments.values)
- return xr.DataArray(
- positions,
- dims=['cluster', 'time'],
- coords=segment_assignments.coords,
- name='position_within_segment',
- )
-
- # Multi-dimensional case: compute for each period/scenario slice
- result = xr.apply_ufunc(
- _compute_positions,
- segment_assignments,
- input_core_dims=[['cluster', 'time']],
- output_core_dims=[['cluster', 'time']],
- vectorize=True,
- )
- return result.rename('position_within_segment')
-
- # === Serialization ===
-
- def to_dict(self) -> dict:
- """Serialize to dict.
+ from tsam_xarray import ClusteringResult as ClusteringResultClass
- The dict can be used to reconstruct via from_dict().
- """
- return {
- 'dim_names': list(self._dim_names),
- 'results': {self._key_to_str(key): result.to_dict() for key, result in self._results.items()},
- }
-
- @classmethod
- def from_dict(cls, d: dict) -> ClusteringResults:
- """Reconstruct from dict.
-
- Args:
- d: Dict from to_dict().
+ # Handle ISO timestamp strings from serialization
+ if (
+ isinstance(original_timesteps, list)
+ and len(original_timesteps) > 0
+ and isinstance(original_timesteps[0], str)
+ ):
+ original_timesteps = pd.DatetimeIndex([pd.Timestamp(ts) for ts in original_timesteps])
- Returns:
- Reconstructed ClusteringResults.
- """
- from tsam import ClusteringResult
-
- dim_names = d['dim_names']
- results = {}
- for key_str, result_dict in d['results'].items():
- key = cls._str_to_key(key_str, dim_names)
- results[key] = ClusteringResult.from_dict(result_dict)
- return cls(results, dim_names)
-
- # === Private helpers ===
-
- def _make_key(self, **kwargs: Any) -> tuple:
- """Create a key tuple from dimension keyword arguments."""
- key_parts = []
- for dim in self._dim_names:
- if dim in kwargs:
- key_parts.append(kwargs[dim])
- return tuple(key_parts)
-
- def _get_dim_values(self, dim: str) -> list | None:
- """Get unique values for a dimension, or None if dimension not present.
-
- Preserves insertion order to ensure .isel() positional indexing matches
- the original FlowSystem dimension order.
- """
- if dim not in self._dim_names:
- return None
- idx = self._dim_names.index(dim)
- # Use dict.fromkeys to preserve insertion order while removing duplicates
- values = [k[idx] for k in self._results.keys()]
- return list(dict.fromkeys(values))
+ # Store tsam_xarray AggregationResult if provided (full data access)
+ self._aggregation_result = _aggregation_result
- def _build_property_array(
- self,
- get_data: callable,
- base_dims: list[str],
- base_coords: dict | None = None,
- name: str | None = None,
- ) -> xr.DataArray:
- """Build a DataArray property, handling both single and multi-dimensional cases."""
- slices = []
- for key, cr in self._results.items():
- da = xr.DataArray(get_data(cr), dims=base_dims, coords=base_coords or {}, name=name)
- for dim_name, coord_val in zip(self._dim_names, key, strict=True):
- da = da.expand_dims({dim_name: [coord_val]})
- slices.append(da)
-
- if len(slices) == 1:
- result = slices[0]
- else:
- combined = xr.combine_by_coords(slices)
- if isinstance(combined, xr.Dataset):
- result = combined[name]
+ # Resolve ClusteringResult from various sources
+ if clustering_result is not None:
+ if isinstance(clustering_result, dict):
+ self._clustering_result = self._clustering_result_from_dict(clustering_result)
else:
- result = combined
- return result.transpose(*base_dims, *self._dim_names)
-
- @staticmethod
- def _key_to_str(key: tuple) -> str:
- """Convert key tuple to string for serialization."""
- if not key:
- return '__single__'
- return '|'.join(str(k) for k in key)
-
- @staticmethod
- def _str_to_key(key_str: str, dim_names: list[str]) -> tuple:
- """Convert string back to key tuple."""
- if key_str == '__single__':
- return ()
- parts = key_str.split('|')
- # Try to convert to int if possible (for period years)
- result = []
- for part in parts:
- try:
- result.append(int(part))
- except ValueError:
- result.append(part)
- return tuple(result)
-
- def __repr__(self) -> str:
- if not self.dims:
- return f'ClusteringResults(n_clusters={self.n_clusters})'
- coords_str = ', '.join(f'{k}: {len(v)}' for k, v in self.coords.items())
- return f'ClusteringResults(dims={self.dims}, coords=({coords_str}), n_clusters={self.n_clusters})'
-
- def apply(self, data: xr.Dataset) -> AggregationResults:
- """Apply clustering to dataset for all (period, scenario) combinations.
-
- Args:
- data: Dataset with time-varying data. Must have 'time' dimension.
- May have 'period' and/or 'scenario' dimensions matching this object.
-
- Returns:
- AggregationResults with full access to aggregated data.
- Use `.clustering` on the result to get ClusteringResults for IO.
-
- Example:
- >>> agg_results = clustering_results.apply(dataset)
- >>> agg_results.clustering # Get ClusteringResults for IO
- >>> for key, result in agg_results:
- ... print(result.cluster_representatives)
- """
- from ..core import drop_constant_arrays
-
- results = {}
- for key, cr in self._results.items():
- # Build selector from key based on dim_names
- selector = {dim_name: key[i] for i, dim_name in enumerate(self._dim_names)}
- data_slice = data.sel(**selector, drop=True) if selector else data
-
- # Drop constant arrays and convert to DataFrame
- time_varying = drop_constant_arrays(data_slice, dim='time')
- df = time_varying.to_dataframe()
+ self._clustering_result = clustering_result
+ elif _aggregation_result is not None:
+ self._clustering_result = _aggregation_result.clustering
+ elif results is not None:
+ # Legacy path: accept old ClusteringResults or dict
+ if isinstance(results, dict):
+ self._clustering_result = self._clustering_result_from_dict(results)
+ elif hasattr(results, '_results') and hasattr(results, '_dim_names'):
+ self._clustering_result = ClusteringResultClass(
+ time_dim='time',
+ cluster_dim=['variable'],
+ slice_dims=list(results._dim_names),
+ clusterings=dict(results._results),
+ )
+ else:
+ raise TypeError(f'Cannot create ClusteringResult from {type(results)}')
+ else:
+ raise ValueError('Either clustering_result or _aggregation_result must be provided')
- # Apply clustering
- results[key] = cr.apply(df)
+ # Resolve unrename_map: if not explicitly provided, infer from slice_dims
+ # (e.g., '_period' in slice_dims → {'_period': 'period'})
+ if _unrename_map:
+ self._unrename_map = _unrename_map
+ else:
+ known_renames = {'_period': 'period', '_cluster': 'cluster'}
+ self._unrename_map = {k: v for k, v in known_renames.items() if k in self._clustering_result.slice_dims}
- return Clustering._from_aggregation_results(results, self._dim_names)
+ # Flag indicating this was loaded from serialization (missing full AggregationResult data)
+ self._from_serialization = _aggregation_result is None
+ self.original_timesteps = original_timesteps if original_timesteps is not None else pd.DatetimeIndex([])
-class Clustering:
- """Clustering information for a FlowSystem.
+ # Ensure time_coords is set on ClusteringResult (needed for disaggregate)
+ if self._clustering_result.time_coords is None and len(self.original_timesteps) > 0:
+ object.__setattr__(self._clustering_result, 'time_coords', self.original_timesteps)
- Thin wrapper around tsam 3.0's AggregationResult objects, providing:
- 1. Multi-dimensional access for (period, scenario) combinations
- 2. Structure properties (n_clusters, dims, coords, cluster_assignments)
- 3. JSON persistence via ClusteringResults
+ @staticmethod
+ def _clustering_result_from_dict(d: dict) -> ClusteringResult:
+ """Create ClusteringResult from serialized dict."""
+ from tsam_xarray import ClusteringResult as ClusteringResultClass
- Use ``sel()`` to access individual tsam AggregationResult objects for
- detailed analysis (cluster_representatives, accuracy, plotting).
+ return ClusteringResultClass.from_dict(d)
- Attributes:
- results: ClusteringResults for structure access (works after JSON load).
- original_timesteps: Original timesteps before clustering.
- dims: Dimension names, e.g., ('period', 'scenario').
- coords: Coordinate values, e.g., {'period': [2024, 2025]}.
+ # ==========================================================================
+ # Helper for dim unrenaming
+ # ==========================================================================
- Example:
- >>> clustering = fs_clustered.clustering
- >>> clustering.n_clusters
- 8
- >>> clustering.dims
- ('period',)
-
- # Access tsam AggregationResult for detailed analysis
- >>> result = clustering.sel(period=2024)
- >>> result.cluster_representatives # DataFrame
- >>> result.accuracy # AccuracyMetrics
- >>> result.plot.compare() # tsam's built-in plotting
- """
+ def _unrename(self, da: xr.DataArray) -> xr.DataArray:
+ """Rename tsam_xarray output dims back to original names (e.g., _period -> period)."""
+ if not self._unrename_map:
+ return da
+ renames = {k: v for k, v in self._unrename_map.items() if k in da.dims}
+ return da.rename(renames) if renames else da
# ==========================================================================
- # Core properties (delegated to ClusteringResults)
+ # Core properties (delegated to ClusteringResult)
# ==========================================================================
+ @property
+ def clustering_result(self) -> ClusteringResult:
+ """tsam_xarray ClusteringResult for reuse with apply_clustering()."""
+ return self._clustering_result
+
@property
def n_clusters(self) -> int:
"""Number of clusters (typical periods)."""
- return self.results.n_clusters
+ return self._clustering_result.n_clusters
@property
def timesteps_per_cluster(self) -> int:
"""Number of timesteps in each cluster."""
- return self.results.timesteps_per_cluster
-
- @property
- def timesteps_per_period(self) -> int:
- """Alias for timesteps_per_cluster."""
- return self.timesteps_per_cluster
+ return self._clustering_result.n_timesteps_per_period
@property
def n_original_clusters(self) -> int:
"""Number of original periods (before clustering)."""
- return self.results.n_original_periods
-
- @property
- def dim_names(self) -> list[str]:
- """Names of extra dimensions, e.g., ['period', 'scenario']."""
- return self.results.dim_names
-
- @property
- def dims(self) -> tuple[str, ...]:
- """Dimension names as tuple (xarray-like)."""
- return self.results.dims
-
- @property
- def coords(self) -> dict[str, list]:
- """Coordinate values for each dimension (xarray-like).
-
- Returns:
- Dict mapping dimension names to lists of coordinate values.
-
- Example:
- >>> clustering.coords
- {'period': [2024, 2025], 'scenario': ['low', 'high']}
- """
- return self.results.coords
-
- def sel(
- self,
- period: int | str | None = None,
- scenario: str | None = None,
- ) -> AggregationResult:
- """Select AggregationResult by period and/or scenario.
-
- Access individual tsam AggregationResult objects for detailed analysis.
-
- Note:
- This method is only available before saving/loading the FlowSystem.
- After IO (to_dataset/from_dataset or to_json), the full AggregationResult
- data is not preserved. Use `results.sel()` for structure-only access
- after loading.
-
- Args:
- period: Period value (e.g., 2024). Required if clustering has periods.
- scenario: Scenario name (e.g., 'high'). Required if clustering has scenarios.
-
- Returns:
- The tsam AggregationResult for the specified combination.
- Access its properties like `cluster_representatives`, `accuracy`, etc.
-
- Raises:
- KeyError: If no result found for the specified combination.
- ValueError: If accessed on a Clustering loaded from JSON/NetCDF.
-
- Example:
- >>> result = clustering.sel(period=2024, scenario='high')
- >>> result.cluster_representatives # DataFrame with aggregated data
- >>> result.accuracy # AccuracyMetrics
- >>> result.plot.compare() # tsam's built-in comparison plot
- """
- self._require_full_data('sel()')
- # Build key from provided args in dim order
- key_parts = []
- if 'period' in self._dim_names:
- if period is None:
- raise KeyError(f"'period' is required. Available: {self.coords.get('period', [])}")
- key_parts.append(period)
- if 'scenario' in self._dim_names:
- if scenario is None:
- raise KeyError(f"'scenario' is required. Available: {self.coords.get('scenario', [])}")
- key_parts.append(scenario)
- key = tuple(key_parts)
- if key not in self._aggregation_results:
- raise KeyError(f'No result found for period={period}, scenario={scenario}')
- return self._aggregation_results[key]
-
- @property
- def is_segmented(self) -> bool:
- """Whether intra-period segmentation was used.
-
- Segmented systems have variable timestep durations within each cluster,
- where each segment represents a different number of original timesteps.
- """
- return self.results.n_segments is not None
+ return self._clustering_result.n_original_periods
@property
def n_segments(self) -> int | None:
"""Number of segments per cluster, or None if not segmented."""
- return self.results.n_segments
+ return self._clustering_result.n_segments
@property
- def cluster_assignments(self) -> xr.DataArray:
- """Mapping from original periods to cluster IDs.
-
- Returns:
- DataArray with dims [original_cluster] or [original_cluster, period?, scenario?].
- """
- return self.results.cluster_assignments
+ def is_segmented(self) -> bool:
+ """Whether intra-period segmentation was used."""
+ return self._clustering_result.n_segments is not None
@property
- def n_representatives(self) -> int:
- """Number of representative timesteps after clustering."""
- if self.is_segmented:
- return self.n_clusters * self.n_segments
- return self.n_clusters * self.timesteps_per_cluster
+ def dim_names(self) -> list[str]:
+ """Names of extra dimensions, e.g., ['period', 'scenario']."""
+ return [self._unrename_map.get(d, d) for d in self._clustering_result.slice_dims]
# ==========================================================================
- # Derived properties
+ # DataArray properties (delegated to ClusteringResult with unrename)
# ==========================================================================
@property
- def cluster_occurrences(self) -> xr.DataArray:
- """Count of how many original periods each cluster represents.
-
- Returns:
- DataArray with dims [cluster] or [cluster, period?, scenario?].
- """
- return self.results.cluster_occurrences
-
- @property
- def representative_weights(self) -> xr.DataArray:
- """Weight for each cluster (number of original periods it represents).
-
- This is the same as cluster_occurrences but named for API consistency.
- Used as cluster_weight in FlowSystem.
- """
- return self.cluster_occurrences.rename('representative_weights')
-
- @functools.cached_property
- def timestep_mapping(self) -> xr.DataArray:
- """Mapping from original timesteps to representative timestep indices.
-
- Each value indicates which representative timestep index (0 to n_representatives-1)
- corresponds to each original timestep.
-
- Note: This property is cached for performance since it's accessed frequently
- during expand() operations.
- """
- return self._build_timestep_mapping()
-
- @property
- def metrics(self) -> xr.Dataset:
- """Clustering quality metrics (RMSE, MAE, etc.).
-
- Returns:
- Dataset with dims [time_series, period?, scenario?], or empty Dataset if no metrics.
- """
- if self._metrics is None:
- return xr.Dataset()
- return self._metrics
-
- @property
- def cluster_start_positions(self) -> np.ndarray:
- """Integer positions where clusters start in reduced timesteps.
+ def cluster_assignments(self) -> xr.DataArray:
+ """Mapping from original periods to cluster IDs.
Returns:
- 1D array: [0, T, 2T, ...] where T = timesteps_per_cluster (or n_segments if segmented).
+ DataArray with dims [original_cluster, period?, scenario?].
"""
- if self.is_segmented:
- n_timesteps = self.n_clusters * self.n_segments
- return np.arange(0, n_timesteps, self.n_segments)
- n_timesteps = self.n_clusters * self.timesteps_per_cluster
- return np.arange(0, n_timesteps, self.timesteps_per_cluster)
+ da = self._clustering_result.cluster_assignments
+ # Rename tsam_xarray's 'period' dim to our 'original_cluster' convention
+ # (must happen before _unrename to avoid conflict with _period → period rename)
+ if 'period' in da.dims:
+ da = da.rename({'period': 'original_cluster'})
+ da = self._unrename(da)
+ # Ensure original_cluster is first dim (tsam_xarray puts slice dims first)
+ if 'original_cluster' in da.dims and da.dims[0] != 'original_cluster':
+ other_dims = [d for d in da.dims if d != 'original_cluster']
+ da = da.transpose('original_cluster', *other_dims)
+ return da
@property
- def cluster_centers(self) -> xr.DataArray:
- """Which original period is the representative (center) for each cluster.
+ def cluster_occurrences(self) -> xr.DataArray:
+ """How many original clusters map to each typical cluster.
Returns:
- DataArray with dims [cluster] containing original period indices.
+ DataArray with dims [cluster, period?, scenario?].
"""
- return self.results.cluster_centers
+ return self._unrename(self._clustering_result.cluster_occurrences)
@property
def segment_assignments(self) -> xr.DataArray | None:
- """For each timestep within a cluster, which intra-period segment it belongs to.
-
- Only available if segmentation was configured during clustering.
+ """For each timestep within a cluster, which segment it belongs to.
Returns:
- DataArray with dims [cluster, time] or None if no segmentation.
+ DataArray with dims [cluster, time, period?, scenario?], or None if not segmented.
"""
- return self.results.segment_assignments
+ result = self._clustering_result.segment_assignments
+ if result is None:
+ return None
+ # tsam_xarray uses 'timestep', we use 'time'
+ if 'timestep' in result.dims:
+ result = result.rename({'timestep': 'time'})
+ return self._unrename(result)
@property
def segment_durations(self) -> xr.DataArray | None:
- """Duration of each intra-period segment in hours.
-
- Only available if segmentation was configured during clustering.
-
- Returns:
- DataArray with dims [cluster, segment] or None if no segmentation.
- """
- return self.results.segment_durations
-
- @property
- def segment_centers(self) -> xr.DataArray | None:
- """Center of each intra-period segment.
-
- Only available if segmentation was configured during clustering.
+ """Duration of each segment in timesteps.
Returns:
- DataArray with dims [cluster, segment] or None if no segmentation.
+ DataArray with dims [cluster, segment, period?, scenario?], or None if not segmented.
"""
- return self.results.segment_centers
+ result = self._clustering_result.segment_durations
+ if result is None:
+ return None
+ # tsam_xarray uses 'timestep', we use 'segment'
+ if 'timestep' in result.dims:
+ result = result.rename({'timestep': 'segment'})
+ return self._unrename(result)
# ==========================================================================
# Methods
# ==========================================================================
- def expand_data(
- self,
- aggregated: xr.DataArray,
- original_time: pd.DatetimeIndex | None = None,
- ) -> xr.DataArray:
- """Expand aggregated data back to original timesteps.
-
- Uses the timestep_mapping to map each original timestep to its
- representative value from the aggregated data. Fully vectorized using
- xarray's advanced indexing - no loops over period/scenario dimensions.
+ def disaggregate(self, data: xr.DataArray) -> xr.DataArray:
+ """Expand clustered data back to original timesteps.
- Args:
- aggregated: DataArray with aggregated (cluster, time) or (time,) dimension.
- original_time: Original time coordinates. Defaults to self.original_timesteps.
-
- Returns:
- DataArray expanded to original timesteps.
- """
- if original_time is None:
- original_time = self.original_timesteps
+ Delegates to tsam_xarray's ClusteringResult.disaggregate(). Handles
+ the dim rename from flixopt's ``(cluster, time)`` to tsam_xarray's
+ ``(cluster, timestep)`` convention.
- timestep_mapping = self.timestep_mapping # Already multi-dimensional DataArray
-
- if 'cluster' not in aggregated.dims:
- # No cluster dimension: use mapping directly as time index
- expanded = aggregated.isel(time=timestep_mapping)
- else:
- # Has cluster dimension: compute cluster and time indices from mapping
- # For segmented systems, time dimension is n_segments, not timesteps_per_cluster
- if self.is_segmented and self.n_segments is not None:
- time_dim_size = self.n_segments
- else:
- time_dim_size = self.timesteps_per_cluster
-
- cluster_indices = timestep_mapping // time_dim_size
- time_indices = timestep_mapping % time_dim_size
-
- # xarray's advanced indexing handles broadcasting across period/scenario dims
- expanded = aggregated.isel(cluster=cluster_indices, time=time_indices)
-
- # Clean up: drop coordinate artifacts from isel, then rename original_time -> time
- # The isel operation may leave 'cluster' and 'time' as non-dimension coordinates
- expanded = expanded.drop_vars(['cluster', 'time'], errors='ignore')
- expanded = expanded.rename({'original_time': 'time'}).assign_coords(time=original_time)
-
- return expanded.transpose('time', ...).assign_attrs(aggregated.attrs)
-
- def build_expansion_divisor(
- self,
- original_time: pd.DatetimeIndex | None = None,
- ) -> xr.DataArray:
- """Build divisor for correcting segment totals when expanding to hourly.
-
- For segmented systems, each segment value is a total that gets repeated N times
- when expanded to hourly resolution (where N = segment duration in timesteps).
- This divisor allows converting those totals back to hourly rates during expansion.
-
- For each original timestep, returns the number of original timesteps that map
- to the same (cluster, segment) - i.e., the segment duration in timesteps.
-
- Fully vectorized using xarray's advanced indexing - no loops over period/scenario.
-
- Args:
- original_time: Original time coordinates. Defaults to self.original_timesteps.
-
- Returns:
- DataArray with dims ['time'] or ['time', 'period'?, 'scenario'?] containing
- the number of timesteps in each segment, aligned to original timesteps.
- """
- if not self.is_segmented or self.n_segments is None:
- raise ValueError('build_expansion_divisor requires a segmented clustering')
-
- if original_time is None:
- original_time = self.original_timesteps
-
- timestep_mapping = self.timestep_mapping # Already multi-dimensional
- segment_durations = self.results.segment_durations # [cluster, segment, period?, scenario?]
-
- # Decode cluster and segment indices from timestep_mapping
- # For segmented systems, encoding is: cluster_id * n_segments + segment_idx
- time_dim_size = self.n_segments
- cluster_indices = timestep_mapping // time_dim_size
- segment_indices = timestep_mapping % time_dim_size # This IS the segment index
-
- # Get duration for each segment directly
- # segment_durations[cluster, segment] -> duration
- divisor = segment_durations.isel(cluster=cluster_indices, segment=segment_indices)
-
- # Clean up coordinates and rename
- divisor = divisor.drop_vars(['cluster', 'time', 'segment'], errors='ignore')
- divisor = divisor.rename({'original_time': 'time'}).assign_coords(time=original_time)
-
- return divisor.transpose('time', ...).rename('expansion_divisor')
-
- def get_result(
- self,
- period: Any = None,
- scenario: Any = None,
- ) -> TsamClusteringResult:
- """Get the tsam ClusteringResult for a specific (period, scenario).
+ For non-segmented systems, values are repeated for each timestep in the period.
+ For segmented systems, values are placed at segment boundaries with NaN
+ elsewhere — use ``.ffill()``, ``.interpolate_na()``, or ``.fillna()``
+ on the result.
Args:
- period: Period label (if applicable).
- scenario: Scenario label (if applicable).
+ data: DataArray with ``(cluster, time)`` or ``(cluster, segment)`` dims.
Returns:
- The tsam ClusteringResult for the specified combination.
+ DataArray with ``time`` dim restored to original timesteps.
"""
- return self.results.sel(period=period, scenario=scenario)
+ # Rename flixopt dim names to tsam_xarray's 'timestep' convention
+ flixopt_to_tsam = {'time': 'timestep', 'segment': 'timestep'}
+ renames_to_tsam = {k: v for k, v in flixopt_to_tsam.items() if k in data.dims}
+ if renames_to_tsam:
+ data = data.rename(renames_to_tsam)
+ # Rename period/scenario dims to internal names (_period, _scenario)
+ reverse_unrename = {v: k for k, v in self._unrename_map.items()}
+ renames = {k: v for k, v in reverse_unrename.items() if k in data.dims}
+ if renames:
+ data = data.rename(renames)
+ result = self._clustering_result.disaggregate(data)
+ return self._unrename(result)
def apply(
self,
- data: pd.DataFrame,
- period: Any = None,
- scenario: Any = None,
- ) -> AggregationResult:
+ data: xr.DataArray,
+ ) -> TsamXarrayAggregationResult:
"""Apply the saved clustering to new data.
Args:
- data: DataFrame with time series data to cluster.
- period: Period label (if applicable).
- scenario: Scenario label (if applicable).
+ data: DataArray with time series data to cluster.
Returns:
- tsam AggregationResult with the clustering applied.
+ tsam_xarray AggregationResult with the clustering applied.
"""
- return self.results.sel(period=period, scenario=scenario).apply(data)
+ return self._clustering_result.apply(data)
+
+ # ==========================================================================
+ # Serialization
+ # ==========================================================================
def to_json(self, path: str | Path) -> None:
"""Save the clustering for reuse.
- Uses ClusteringResults.to_dict() which preserves full tsam ClusteringResult.
Can be loaded later with Clustering.from_json() and used with
flow_system.transform.apply_clustering().
@@ -962,7 +288,7 @@ def to_json(self, path: str | Path) -> None:
path: Path to save the JSON file.
"""
data = {
- 'results': self.results.to_dict(),
+ 'clustering_result': self._clustering_result.to_dict(),
'original_timesteps': [ts.isoformat() for ts in self.original_timesteps],
}
@@ -977,8 +303,8 @@ def from_json(
) -> Clustering:
"""Load a clustering from JSON.
- The loaded Clustering has full apply() support because ClusteringResult
- is fully preserved via tsam's serialization.
+ The loaded Clustering has full apply() and disaggregate() support
+ because ClusteringResult is fully preserved via serialization.
Args:
path: Path to the JSON file.
@@ -991,286 +317,59 @@ def from_json(
with open(path) as f:
data = json.load(f)
- results = ClusteringResults.from_dict(data['results'])
+ # Support both new format (clustering_result) and legacy format (results)
+ if 'clustering_result' in data:
+ clustering_result = data['clustering_result']
+ elif 'results' in data:
+ clustering_result = data['results'] # Legacy format, handled by __init__
+ else:
+ raise ValueError('JSON file must contain "clustering_result" or "results" key')
if original_timesteps is None:
original_timesteps = pd.DatetimeIndex([pd.Timestamp(ts) for ts in data['original_timesteps']])
return cls(
- results=results,
+ clustering_result=clustering_result,
original_timesteps=original_timesteps,
)
- # ==========================================================================
- # Visualization
- # ==========================================================================
-
- @property
- def plot(self) -> ClusteringPlotAccessor:
- """Access plotting methods for clustering visualization.
-
- Returns:
- ClusteringPlotAccessor with compare(), heatmap(), and clusters() methods.
- """
- return ClusteringPlotAccessor(self)
-
- # ==========================================================================
- # Private helpers
- # ==========================================================================
-
- def _build_timestep_mapping(self) -> xr.DataArray:
- """Build timestep_mapping DataArray."""
- n_original = len(self.original_timesteps)
- original_time_coord = self.original_timesteps.rename('original_time')
- return self.results._build_property_array(
- lambda cr: _build_timestep_mapping(cr, n_original),
- base_dims=['original_time'],
- base_coords={'original_time': original_time_coord},
- name='timestep_mapping',
- )
-
- def _create_reference_structure(self, include_original_data: bool = True) -> tuple[dict, dict[str, xr.DataArray]]:
+ def _create_reference_structure(self) -> tuple[dict, dict[str, xr.DataArray]]:
"""Create serialization structure for to_dataset().
- Args:
- include_original_data: Whether to include original_data in serialization.
- Set to False for smaller files when plot.compare() isn't needed after IO.
- Defaults to True.
-
Returns:
Tuple of (reference_dict, arrays_dict).
"""
- arrays = {}
-
- # Collect original_data arrays
- # Rename 'time' to 'original_time' to avoid conflict with clustered FlowSystem's time coord
- original_data_refs = None
- if include_original_data and self.original_data is not None:
- original_data_refs = []
- # Use variables for faster access (avoids _construct_dataarray overhead)
- variables = self.original_data.variables
- for name in self.original_data.data_vars:
- var = variables[name]
- ref_name = f'original_data|{name}'
- # Rename time dim to avoid xarray alignment issues
- if 'time' in var.dims:
- new_dims = tuple('original_time' if d == 'time' else d for d in var.dims)
- arrays[ref_name] = xr.Variable(new_dims, var.values, attrs=var.attrs)
- else:
- arrays[ref_name] = var
- original_data_refs.append(f':::{ref_name}')
-
- # NOTE: aggregated_data is NOT serialized - it's identical to the FlowSystem's
- # main data arrays and would be redundant. After loading, aggregated_data is
- # reconstructed from the FlowSystem's dataset.
-
- # Collect metrics arrays
- metrics_refs = None
- if self._metrics is not None:
- metrics_refs = []
- # Use variables for faster access (avoids _construct_dataarray overhead)
- metrics_vars = self._metrics.variables
- for name in self._metrics.data_vars:
- ref_name = f'metrics|{name}'
- arrays[ref_name] = metrics_vars[name]
- metrics_refs.append(f':::{ref_name}')
-
reference = {
'__class__': 'Clustering',
- 'results': self.results.to_dict(), # Full ClusteringResults serialization
+ 'clustering_result': self._clustering_result.to_dict(),
'original_timesteps': [ts.isoformat() for ts in self.original_timesteps],
- '_original_data_refs': original_data_refs,
- '_metrics_refs': metrics_refs,
}
-
- return reference, arrays
-
- def __init__(
- self,
- results: ClusteringResults | dict | None = None,
- original_timesteps: pd.DatetimeIndex | list[str] | None = None,
- original_data: xr.Dataset | None = None,
- aggregated_data: xr.Dataset | None = None,
- _metrics: xr.Dataset | None = None,
- # These are for reconstruction from serialization
- _original_data_refs: list[str] | None = None,
- _metrics_refs: list[str] | None = None,
- # Internal: AggregationResult dict for full data access
- _aggregation_results: dict[tuple, AggregationResult] | None = None,
- _dim_names: list[str] | None = None,
- ):
- """Initialize Clustering object.
-
- Args:
- results: ClusteringResults instance, or dict from to_dict() (for deserialization).
- Not needed if _aggregation_results is provided.
- original_timesteps: Original timesteps before clustering.
- original_data: Original dataset before clustering (for expand/plotting).
- aggregated_data: Aggregated dataset after clustering (for plotting).
- After loading from file, this is reconstructed from FlowSystem data.
- _metrics: Pre-computed metrics dataset.
- _original_data_refs: Internal: resolved DataArrays from serialization.
- _metrics_refs: Internal: resolved DataArrays from serialization.
- _aggregation_results: Internal: dict of AggregationResult for full data access.
- _dim_names: Internal: dimension names when using _aggregation_results.
- """
- # Handle ISO timestamp strings from serialization
- if (
- isinstance(original_timesteps, list)
- and len(original_timesteps) > 0
- and isinstance(original_timesteps[0], str)
- ):
- original_timesteps = pd.DatetimeIndex([pd.Timestamp(ts) for ts in original_timesteps])
-
- # Store AggregationResults if provided (full data access)
- self._aggregation_results = _aggregation_results
- self._dim_names = _dim_names or []
-
- # Handle results - only needed for serialization path
- if results is not None:
- if isinstance(results, dict):
- results = ClusteringResults.from_dict(results)
- self._results_cache = results
- else:
- self._results_cache = None
-
- # Flag indicating this was loaded from serialization (missing full AggregationResult data)
- self._from_serialization = _aggregation_results is None and results is not None
-
- self.original_timesteps = original_timesteps if original_timesteps is not None else pd.DatetimeIndex([])
- self._metrics = _metrics
-
- # Handle reconstructed data from refs (list of DataArrays)
- if _original_data_refs is not None and isinstance(_original_data_refs, list):
- # These are resolved DataArrays from the structure resolver
- if all(isinstance(da, xr.DataArray) for da in _original_data_refs):
- # Rename 'original_time' back to 'time' and strip 'original_data|' prefix
- data_vars = {}
- for da in _original_data_refs:
- if 'original_time' in da.dims:
- da = da.rename({'original_time': 'time'})
- # Strip 'original_data|' prefix from name (added during serialization)
- name = da.name
- if name.startswith('original_data|'):
- name = name[14:] # len('original_data|') = 14
- data_vars[name] = da.rename(name)
- self.original_data = xr.Dataset(data_vars)
- else:
- self.original_data = original_data
- else:
- self.original_data = original_data
-
- self.aggregated_data = aggregated_data
-
- if _metrics_refs is not None and isinstance(_metrics_refs, list):
- if all(isinstance(da, xr.DataArray) for da in _metrics_refs):
- # Strip 'metrics|' prefix from name (added during serialization)
- data_vars = {}
- for da in _metrics_refs:
- name = da.name
- if name.startswith('metrics|'):
- name = name[8:] # len('metrics|') = 8
- data_vars[name] = da.rename(name)
- self._metrics = xr.Dataset(data_vars)
-
- @property
- def results(self) -> ClusteringResults:
- """ClusteringResults for structure access (derived from AggregationResults or cached)."""
- if self._results_cache is not None:
- return self._results_cache
- if self._aggregation_results is not None:
- # Derive from AggregationResults (cached on first access)
- self._results_cache = ClusteringResults(
- {k: r.clustering for k, r in self._aggregation_results.items()},
- self._dim_names,
- )
- return self._results_cache
- raise ValueError('No results available - neither AggregationResults nor ClusteringResults set')
-
- @classmethod
- def _from_aggregation_results(
- cls,
- aggregation_results: dict[tuple, AggregationResult],
- dim_names: list[str],
- original_timesteps: pd.DatetimeIndex | None = None,
- original_data: xr.Dataset | None = None,
- ) -> Clustering:
- """Create Clustering from AggregationResult dict.
-
- This is the primary way to create a Clustering with full data access.
- Called by ClusteringResults.apply() and TransformAccessor.
-
- Args:
- aggregation_results: Dict mapping (period, scenario) tuples to AggregationResult.
- dim_names: Dimension names, e.g., ['period', 'scenario'].
- original_timesteps: Original timesteps (optional, for expand).
- original_data: Original dataset (optional, for plotting).
-
- Returns:
- Clustering with full AggregationResult access.
- """
- return cls(
- original_timesteps=original_timesteps,
- original_data=original_data,
- _aggregation_results=aggregation_results,
- _dim_names=dim_names,
- )
+ return reference, {}
# ==========================================================================
- # Iteration over AggregationResults (for direct access to tsam results)
+ # Access to tsam_xarray AggregationResult
# ==========================================================================
- def __iter__(self):
- """Iterate over (key, AggregationResult) pairs.
+ @property
+ def aggregation_result(self) -> TsamXarrayAggregationResult:
+ """The tsam_xarray AggregationResult for full data access.
+
+ Only available before serialization. After loading from file,
+ use clustering_result for structure-only access.
Raises:
- ValueError: If accessed on a Clustering loaded from JSON.
+ ValueError: If accessed on a Clustering loaded from JSON/NetCDF.
"""
- self._require_full_data('iteration')
- return iter(self._aggregation_results.items())
+ self._require_full_data('aggregation_result')
+ return self._aggregation_result
def __len__(self) -> int:
"""Number of (period, scenario) combinations."""
- if self._aggregation_results is not None:
- return len(self._aggregation_results)
- return len(list(self.results.keys()))
-
- def __getitem__(self, key: tuple) -> AggregationResult:
- """Get AggregationResult by (period, scenario) key.
-
- Raises:
- ValueError: If accessed on a Clustering loaded from JSON.
- """
- self._require_full_data('item access')
- return self._aggregation_results[key]
-
- def items(self):
- """Iterate over (key, AggregationResult) pairs.
-
- Raises:
- ValueError: If accessed on a Clustering loaded from JSON.
- """
- self._require_full_data('items()')
- return self._aggregation_results.items()
-
- def keys(self):
- """Iterate over (period, scenario) keys."""
- if self._aggregation_results is not None:
- return self._aggregation_results.keys()
- return self.results.keys()
-
- def values(self):
- """Iterate over AggregationResult objects.
-
- Raises:
- ValueError: If accessed on a Clustering loaded from JSON.
- """
- self._require_full_data('values()')
- return self._aggregation_results.values()
+ return len(self._clustering_result.clusterings)
def _require_full_data(self, operation: str) -> None:
"""Raise error if full AggregationResult data is not available."""
- if self._from_serialization:
+ if self._from_serialization or self._aggregation_result is None:
raise ValueError(
f'{operation} requires full AggregationResult data, '
f'but this Clustering was loaded from JSON. '
@@ -1287,376 +386,6 @@ def __repr__(self) -> str:
)
-class ClusteringPlotAccessor:
- """Plot accessor for Clustering objects.
-
- Provides visualization methods for comparing original vs aggregated data
- and understanding the clustering structure.
- """
-
- def __init__(self, clustering: Clustering):
- self._clustering = clustering
-
- def compare(
- self,
- kind: str = 'timeseries',
- variables: str | list[str] | None = None,
- *,
- select: SelectType | None = None,
- colors: ColorType | None = None,
- show: bool | None = None,
- data_only: bool = False,
- **plotly_kwargs: Any,
- ) -> PlotResult:
- """Compare original vs aggregated data.
-
- Args:
- kind: Type of comparison plot.
- - 'timeseries': Time series comparison (default)
- - 'duration_curve': Sorted duration curve comparison
- variables: Variable(s) to plot. Can be a string, list of strings,
- or None to plot all time-varying variables.
- select: xarray-style selection dict, e.g. {'scenario': 'Base Case'}.
- colors: Color specification (colorscale name, color list, or label-to-color dict).
- show: Whether to display the figure.
- Defaults to CONFIG.Plotting.default_show.
- data_only: If True, skip figure creation and return only data.
- **plotly_kwargs: Additional arguments passed to plotly (e.g., color, line_dash,
- facet_col, facet_row). Defaults: x='time'/'duration', color='variable',
- line_dash='representation', symbol=None.
-
- Returns:
- PlotResult containing the comparison figure and underlying data.
- """
- import plotly.graph_objects as go
-
- from ..config import CONFIG
- from ..plot_result import PlotResult
- from ..statistics_accessor import _apply_selection
-
- if kind not in ('timeseries', 'duration_curve'):
- raise ValueError(f"Unknown kind '{kind}'. Use 'timeseries' or 'duration_curve'.")
-
- clustering = self._clustering
- if clustering.original_data is None or clustering.aggregated_data is None:
- raise ValueError('No original/aggregated data available for comparison')
-
- resolved_variables = self._resolve_variables(variables)
-
- # Build Dataset with variables as data_vars
- data_vars = {}
- for var in resolved_variables:
- original = clustering.original_data[var]
- clustered = clustering.expand_data(clustering.aggregated_data[var])
- combined = xr.concat([original, clustered], dim=pd.Index(['Original', 'Clustered'], name='representation'))
- data_vars[var] = combined
- ds = xr.Dataset(data_vars)
-
- ds = _apply_selection(ds, select)
-
- if kind == 'duration_curve':
- sorted_vars = {}
- # Use variables for faster access (avoids _construct_dataarray overhead)
- variables = ds.variables
- rep_values = ds.coords['representation'].values
- rep_idx = {rep: i for i, rep in enumerate(rep_values)}
- for var in ds.data_vars:
- data = variables[var].values
- for rep in rep_values:
- # Direct numpy indexing instead of .sel()
- values = np.sort(data[rep_idx[rep]].flatten())[::-1]
- sorted_vars[(var, rep)] = values
- # Get length from first sorted array
- n = len(next(iter(sorted_vars.values())))
- ds = xr.Dataset(
- {
- var: xr.DataArray(
- [sorted_vars[(var, r)] for r in ['Original', 'Clustered']],
- dims=['representation', 'duration'],
- coords={'representation': ['Original', 'Clustered'], 'duration': range(n)},
- )
- for var in resolved_variables
- }
- )
-
- title = (
- (
- 'Original vs Clustered'
- if len(resolved_variables) > 1
- else f'Original vs Clustered: {resolved_variables[0]}'
- )
- if kind == 'timeseries'
- else ('Duration Curve' if len(resolved_variables) > 1 else f'Duration Curve: {resolved_variables[0]}')
- )
-
- # Early return for data_only mode
- if data_only:
- return PlotResult(data=ds, figure=go.Figure())
-
- # Apply slot defaults
- defaults = {
- 'x': 'duration' if kind == 'duration_curve' else 'time',
- 'color': 'variable',
- 'line_dash': 'representation',
- 'line_dash_map': {'Original': 'dot', 'Clustered': 'solid'},
- 'symbol': None, # Block symbol slot
- }
- _apply_slot_defaults(plotly_kwargs, defaults)
-
- color_kwargs = _build_color_kwargs(colors, list(ds.data_vars))
- fig = ds.plotly.line(
- title=title,
- **color_kwargs,
- **plotly_kwargs,
- )
- fig.update_yaxes(matches=None)
- fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
-
- plot_result = PlotResult(data=ds, figure=fig)
-
- if show is None:
- show = CONFIG.Plotting.default_show
- if show:
- plot_result.show()
-
- return plot_result
-
- def _get_time_varying_variables(self) -> list[str]:
- """Get list of time-varying variables from original data that also exist in aggregated data."""
- if self._clustering.original_data is None:
- return []
- # Get variables that exist in both original and aggregated data
- aggregated_vars = (
- set(self._clustering.aggregated_data.data_vars)
- if self._clustering.aggregated_data is not None
- else set(self._clustering.original_data.data_vars)
- )
- return [
- name
- for name in self._clustering.original_data.data_vars
- if name in aggregated_vars
- and 'time' in self._clustering.original_data[name].dims
- and not np.isclose(
- self._clustering.original_data[name].min(),
- self._clustering.original_data[name].max(),
- )
- ]
-
- def _resolve_variables(self, variables: str | list[str] | None) -> list[str]:
- """Resolve variables parameter to a list of valid variable names."""
- time_vars = self._get_time_varying_variables()
- if not time_vars:
- raise ValueError('No time-varying variables found')
-
- if variables is None:
- return time_vars
- elif isinstance(variables, str):
- if variables not in time_vars:
- raise ValueError(f"Variable '{variables}' not found. Available: {time_vars}")
- return [variables]
- else:
- invalid = [v for v in variables if v not in time_vars]
- if invalid:
- raise ValueError(f'Variables {invalid} not found. Available: {time_vars}')
- return list(variables)
-
- def heatmap(
- self,
- *,
- select: SelectType | None = None,
- colors: str | list[str] | None = None,
- show: bool | None = None,
- data_only: bool = False,
- **plotly_kwargs: Any,
- ) -> PlotResult:
- """Plot cluster assignments over time as a heatmap timeline.
-
- Shows which cluster each timestep belongs to as a horizontal color bar.
- The x-axis is time, color indicates cluster assignment. This visualization
- aligns with time series data, making it easy to correlate cluster
- assignments with other plots.
-
- For multi-period/scenario data, uses faceting and/or animation.
-
- Args:
- select: xarray-style selection dict, e.g. {'scenario': 'Base Case'}.
- colors: Colorscale name (str) or list of colors for heatmap coloring.
- Dicts are not supported for heatmaps.
- Defaults to plotly template's sequential colorscale.
- show: Whether to display the figure.
- Defaults to CONFIG.Plotting.default_show.
- data_only: If True, skip figure creation and return only data.
- **plotly_kwargs: Additional arguments passed to plotly (e.g., facet_col, animation_frame).
-
- Returns:
- PlotResult containing the heatmap figure and cluster assignment data.
- The data has 'cluster' variable with time dimension, matching original timesteps.
- """
- import plotly.graph_objects as go
-
- from ..config import CONFIG
- from ..plot_result import PlotResult
- from ..statistics_accessor import _apply_selection
-
- clustering = self._clustering
- cluster_assignments = clustering.cluster_assignments
- timesteps_per_cluster = clustering.timesteps_per_cluster
- original_time = clustering.original_timesteps
-
- if select:
- cluster_assignments = _apply_selection(cluster_assignments.to_dataset(name='cluster'), select)['cluster']
-
- # Expand cluster_assignments to per-timestep
- extra_dims = [d for d in cluster_assignments.dims if d != 'original_cluster']
- expanded_values = np.repeat(cluster_assignments.values, timesteps_per_cluster, axis=0)
-
- coords = {'time': original_time}
- coords.update({d: cluster_assignments.coords[d].values for d in extra_dims})
- cluster_da = xr.DataArray(expanded_values, dims=['time'] + extra_dims, coords=coords)
- cluster_da.name = 'cluster'
-
- # Early return for data_only mode
- if data_only:
- return PlotResult(data=xr.Dataset({'cluster': cluster_da}), figure=go.Figure())
-
- heatmap_da = cluster_da.expand_dims('y', axis=-1).assign_coords(y=['Cluster'])
- heatmap_da.name = 'cluster_assignment'
- heatmap_da = heatmap_da.transpose('time', 'y', ...)
-
- # Use plotly.imshow for heatmap
- # Only pass color_continuous_scale if explicitly provided (template handles default)
- if colors is not None:
- plotly_kwargs.setdefault('color_continuous_scale', colors)
- fig = heatmap_da.plotly.imshow(
- title='Cluster Assignments',
- aspect='auto',
- **plotly_kwargs,
- )
-
- fig.update_yaxes(showticklabels=False)
- fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
-
- # Data is exactly what we plotted (without dummy y dimension)
- data = xr.Dataset({'cluster': cluster_da})
- plot_result = PlotResult(data=data, figure=fig)
-
- if show is None:
- show = CONFIG.Plotting.default_show
- if show:
- plot_result.show()
-
- return plot_result
-
- def clusters(
- self,
- variables: str | list[str] | None = None,
- *,
- select: SelectType | None = None,
- colors: ColorType | None = None,
- show: bool | None = None,
- data_only: bool = False,
- **plotly_kwargs: Any,
- ) -> PlotResult:
- """Plot each cluster's typical period profile.
-
- Shows each cluster as a separate faceted subplot with all variables
- colored differently. Useful for understanding what each cluster represents.
-
- Args:
- variables: Variable(s) to plot. Can be a string, list of strings,
- or None to plot all time-varying variables.
- select: xarray-style selection dict, e.g. {'scenario': 'Base Case'}.
- colors: Color specification (colorscale name, color list, or label-to-color dict).
- show: Whether to display the figure.
- Defaults to CONFIG.Plotting.default_show.
- data_only: If True, skip figure creation and return only data.
- **plotly_kwargs: Additional arguments passed to plotly (e.g., color, facet_col,
- facet_col_wrap). Defaults: x='time', color='variable', symbol=None.
-
- Returns:
- PlotResult containing the figure and underlying data.
- """
- import plotly.graph_objects as go
-
- from ..config import CONFIG
- from ..plot_result import PlotResult
- from ..statistics_accessor import _apply_selection
-
- clustering = self._clustering
- if clustering.aggregated_data is None:
- raise ValueError('No aggregated data available')
-
- aggregated_data = _apply_selection(clustering.aggregated_data, select)
- resolved_variables = self._resolve_variables(variables)
-
- n_clusters = clustering.n_clusters
- timesteps_per_cluster = clustering.timesteps_per_cluster
- cluster_occurrences = clustering.cluster_occurrences
-
- # Build cluster labels
- occ_extra_dims = [d for d in cluster_occurrences.dims if d != 'cluster']
- if occ_extra_dims:
- cluster_labels = [f'Cluster {c}' for c in range(n_clusters)]
- else:
- cluster_labels = [
- f'Cluster {c} (×{int(cluster_occurrences.sel(cluster=c).values)})' for c in range(n_clusters)
- ]
-
- data_vars = {}
- for var in resolved_variables:
- da = aggregated_data[var]
- if 'cluster' in da.dims:
- data_by_cluster = da.values
- else:
- data_by_cluster = da.values.reshape(n_clusters, timesteps_per_cluster)
- data_vars[var] = xr.DataArray(
- data_by_cluster,
- dims=['cluster', 'time'],
- coords={'cluster': cluster_labels, 'time': range(timesteps_per_cluster)},
- )
-
- ds = xr.Dataset(data_vars)
-
- # Early return for data_only mode (include occurrences in result)
- if data_only:
- data_vars['occurrences'] = cluster_occurrences
- return PlotResult(data=xr.Dataset(data_vars), figure=go.Figure())
-
- title = 'Clusters' if len(resolved_variables) > 1 else f'Clusters: {resolved_variables[0]}'
-
- # Apply slot defaults
- defaults = {
- 'x': 'time',
- 'color': 'variable',
- 'symbol': None, # Block symbol slot
- }
- _apply_slot_defaults(plotly_kwargs, defaults)
-
- color_kwargs = _build_color_kwargs(colors, list(ds.data_vars))
- fig = ds.plotly.line(
- title=title,
- **color_kwargs,
- **plotly_kwargs,
- )
- fig.update_yaxes(matches=None)
- fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
-
- data_vars['occurrences'] = cluster_occurrences
- result_data = xr.Dataset(data_vars)
- plot_result = PlotResult(data=result_data, figure=fig)
-
- if show is None:
- show = CONFIG.Plotting.default_show
- if show:
- plot_result.show()
-
- return plot_result
-
-
-# Backwards compatibility alias
-AggregationResults = Clustering
-
-
def _register_clustering_classes():
"""Register clustering classes for IO."""
from ..structure import CLASS_REGISTRY
diff --git a/flixopt/core.py b/flixopt/core.py
index aca380f5e..c2a32349d 100644
--- a/flixopt/core.py
+++ b/flixopt/core.py
@@ -40,32 +40,11 @@ class TimeSeriesData(xr.DataArray):
def __init__(
self,
*args: Any,
- clustering_group: str | None = None,
- clustering_weight: float | None = None,
**kwargs: Any,
):
- """
- Args:
- *args: Arguments passed to DataArray
- clustering_group: Clustering group name. Use this when multiple time series should share the same
- clustering weight (1/n where n is the number of series in the group). Mutually exclusive with clustering_weight.
- clustering_weight: Clustering weight (0-1). Use this to assign a specific weight to a single time series.
- Mutually exclusive with clustering_group.
- **kwargs: Additional arguments passed to DataArray
- """
-
- if (clustering_group is not None) and (clustering_weight is not None):
- raise ValueError('Use either clustering_group or clustering_weight, not both')
-
# Let xarray handle all the initialization complexity
super().__init__(*args, **kwargs)
- # Add our metadata to attrs after initialization
- if clustering_group is not None:
- self.attrs['clustering_group'] = clustering_group
- if clustering_weight is not None:
- self.attrs['clustering_weight'] = clustering_weight
-
# Always mark as TimeSeriesData
self.attrs['__timeseries_data__'] = True
@@ -81,33 +60,16 @@ def fit_to_coords(
da = DataConverter.to_dataarray(self.data, coords=coords)
return self.__class__(
da,
- clustering_group=self.clustering_group,
- clustering_weight=self.clustering_weight,
name=name if name is not None else self.name,
)
- @property
- def clustering_group(self) -> str | None:
- return self.attrs.get('clustering_group')
-
- @property
- def clustering_weight(self) -> float | None:
- return self.attrs.get('clustering_weight')
-
@classmethod
def from_dataarray(
cls,
da: xr.DataArray,
- clustering_group: str | None = None,
- clustering_weight: float | None = None,
):
"""Create TimeSeriesData from DataArray, extracting metadata from attrs."""
- final_clustering_group = clustering_group if clustering_group is not None else da.attrs.get('clustering_group')
- final_clustering_weight = (
- clustering_weight if clustering_weight is not None else da.attrs.get('clustering_weight')
- )
-
- return cls(da, clustering_group=final_clustering_group, clustering_weight=final_clustering_weight)
+ return cls(da)
@classmethod
def is_timeseries_data(cls, obj) -> bool:
@@ -115,13 +77,7 @@ def is_timeseries_data(cls, obj) -> bool:
return isinstance(obj, xr.DataArray) and obj.attrs.get('__timeseries_data__', False)
def __repr__(self):
- clustering_info = []
- if self.clustering_group:
- clustering_info.append(f"clustering_group='{self.clustering_group}'")
- if self.clustering_weight is not None:
- clustering_info.append(f'clustering_weight={self.clustering_weight}')
-
- info_str = f'TimeSeriesData({", ".join(clustering_info)})' if clustering_info else 'TimeSeriesData'
+ info_str = 'TimeSeriesData'
return f'{info_str}\n{super().__repr__()}'
diff --git a/flixopt/flow_system.py b/flixopt/flow_system.py
index e838c5480..df71e2ff5 100644
--- a/flixopt/flow_system.py
+++ b/flixopt/flow_system.py
@@ -696,7 +696,7 @@ def _create_reference_structure(self) -> tuple[dict, dict[str, xr.DataArray]]:
return reference_structure, all_extracted_arrays
- def to_dataset(self, include_solution: bool = True, include_original_data: bool = True) -> xr.Dataset:
+ def to_dataset(self, include_solution: bool = True) -> xr.Dataset:
"""
Convert the FlowSystem to an xarray Dataset.
Ensures FlowSystem is connected before serialization.
@@ -714,10 +714,6 @@ def to_dataset(self, include_solution: bool = True, include_original_data: bool
include_solution: Whether to include the optimization solution in the dataset.
Defaults to True. Set to False to get only the FlowSystem structure
without solution data (useful for copying or saving templates).
- include_original_data: Whether to include clustering.original_data in the dataset.
- Defaults to True. Set to False for smaller files (~38% reduction) when
- clustering.plot.compare() isn't needed after loading. The core workflow
- (optimize → expand) works without original_data.
Returns:
xr.Dataset: Dataset containing all DataArrays with structure in attributes
@@ -734,7 +730,7 @@ def to_dataset(self, include_solution: bool = True, include_original_data: bool
base_ds = super().to_dataset()
# Add FlowSystem-specific data (solution, clustering, metadata)
- return fx_io.flow_system_to_dataset(self, base_ds, include_solution, include_original_data)
+ return fx_io.flow_system_to_dataset(self, base_ds, include_solution)
@classmethod
def from_dataset(cls, ds: xr.Dataset) -> FlowSystem:
@@ -766,7 +762,6 @@ def to_netcdf(
path: str | pathlib.Path,
compression: int = 5,
overwrite: bool = False,
- include_original_data: bool = True,
):
"""
Save the FlowSystem to a NetCDF file.
@@ -779,9 +774,6 @@ def to_netcdf(
path: The path to the netCDF file. Parent directories are created if they don't exist.
compression: The compression level to use when saving the file (0-9).
overwrite: If True, overwrite existing file. If False, raise error if file exists.
- include_original_data: Whether to include clustering.original_data in the file.
- Defaults to True. Set to False for smaller files (~38% reduction) when
- clustering.plot.compare() isn't needed after loading.
Raises:
FileExistsError: If overwrite=False and file already exists.
@@ -801,7 +793,7 @@ def to_netcdf(
self.name = path.stem
try:
- ds = self.to_dataset(include_original_data=include_original_data)
+ ds = self.to_dataset()
fx_io.save_dataset_to_netcdf(ds, path, compression=compression)
logger.info(f'Saved FlowSystem to {path}')
except Exception as e:
diff --git a/flixopt/io.py b/flixopt/io.py
index c9ce26919..20d302204 100644
--- a/flixopt/io.py
+++ b/flixopt/io.py
@@ -1858,16 +1858,9 @@ def _restore_clustering(
clustering = fs_cls._resolve_reference_structure(clustering_structure, clustering_arrays)
flow_system.clustering = clustering
- # Reconstruct aggregated_data from FlowSystem's main data arrays
- if clustering.aggregated_data is None and main_var_names:
- from .core import drop_constant_arrays
-
- main_vars = {name: arrays_dict[name] for name in main_var_names}
- clustering.aggregated_data = drop_constant_arrays(xr.Dataset(main_vars), dim='time')
-
- # Restore cluster_weight from clustering's representative_weights
- if hasattr(clustering, 'representative_weights'):
- flow_system.cluster_weight = clustering.representative_weights
+ # Restore cluster_weight from clustering's cluster_occurrences
+ if hasattr(clustering, 'cluster_occurrences'):
+ flow_system.cluster_weight = clustering.cluster_occurrences.rename('cluster_weight')
@staticmethod
def _restore_metadata(
@@ -1904,7 +1897,6 @@ def to_dataset(
flow_system: FlowSystem,
base_dataset: xr.Dataset,
include_solution: bool = True,
- include_original_data: bool = True,
) -> xr.Dataset:
"""Convert FlowSystem-specific data to dataset.
@@ -1915,7 +1907,6 @@ def to_dataset(
flow_system: The FlowSystem to serialize
base_dataset: Dataset from parent class with basic structure
include_solution: Whether to include optimization solution
- include_original_data: Whether to include clustering.original_data
Returns:
Complete dataset with all FlowSystem data
@@ -1931,7 +1922,7 @@ def to_dataset(
ds = cls._add_carriers_to_dataset(ds, flow_system._carriers)
# Add clustering
- ds = cls._add_clustering_to_dataset(ds, flow_system.clustering, include_original_data)
+ ds = cls._add_clustering_to_dataset(ds, flow_system.clustering)
# Add variable categories
ds = cls._add_variable_categories_to_dataset(ds, flow_system._variable_categories)
@@ -1996,17 +1987,13 @@ def _add_clustering_to_dataset(
cls,
ds: xr.Dataset,
clustering: Any,
- include_original_data: bool,
) -> xr.Dataset:
"""Add clustering object to dataset."""
if clustering is not None:
- clustering_ref, clustering_arrays = clustering._create_reference_structure(
- include_original_data=include_original_data
- )
- # Add clustering arrays with prefix using batch assignment
- # (individual ds[name] = arr assignments are slow)
- prefixed_arrays = {f'{cls.CLUSTERING_PREFIX}{name}': arr for name, arr in clustering_arrays.items()}
- ds = ds.assign(prefixed_arrays)
+ clustering_ref, clustering_arrays = clustering._create_reference_structure()
+ if clustering_arrays:
+ prefixed_arrays = {f'{cls.CLUSTERING_PREFIX}{name}': arr for name, arr in clustering_arrays.items()}
+ ds = ds.assign(prefixed_arrays)
ds.attrs['clustering'] = json.dumps(clustering_ref, ensure_ascii=False)
return ds
@@ -2064,7 +2051,6 @@ def flow_system_to_dataset(
flow_system: FlowSystem,
base_dataset: xr.Dataset,
include_solution: bool = True,
- include_original_data: bool = True,
) -> xr.Dataset:
"""Convert FlowSystem-specific data to dataset.
@@ -2075,7 +2061,6 @@ def flow_system_to_dataset(
flow_system: The FlowSystem to serialize
base_dataset: Dataset from parent class with basic structure
include_solution: Whether to include optimization solution
- include_original_data: Whether to include clustering.original_data
Returns:
Complete dataset with all FlowSystem data
@@ -2083,4 +2068,4 @@ def flow_system_to_dataset(
See Also:
FlowSystemDatasetIO: Class containing the implementation
"""
- return FlowSystemDatasetIO.to_dataset(flow_system, base_dataset, include_solution, include_original_data)
+ return FlowSystemDatasetIO.to_dataset(flow_system, base_dataset, include_solution)
diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py
index f7a3698cc..b200fa9cb 100644
--- a/flixopt/transform_accessor.py
+++ b/flixopt/transform_accessor.py
@@ -28,102 +28,37 @@
logger = logging.getLogger('flixopt')
-def _combine_dataarray_slices(
- slices: list[xr.DataArray],
- base_dims: list[str],
- extra_dims: list[str],
- name: str | None = None,
-) -> xr.DataArray:
- """Combine DataArray slices with extra dimensions into a single DataArray.
-
- Args:
- slices: List of DataArrays, each with extra dims already expanded.
- base_dims: Base dimension names (e.g., ['cluster', 'time']).
- extra_dims: Extra dimension names (e.g., ['period', 'scenario']).
- name: Optional name for the result.
-
- Returns:
- Combined DataArray with dims [*base_dims, *extra_dims].
- """
- if len(slices) == 1:
- result = slices[0]
- else:
- combined = xr.combine_by_coords(slices)
- # combine_by_coords returns Dataset when DataArrays have names
- if isinstance(combined, xr.Dataset):
- result = list(combined.data_vars.values())[0]
- else:
- result = combined
-
- # Ensure consistent dimension order for both single and multi-slice paths
- result = result.transpose(*base_dims, *extra_dims)
-
- if name is not None:
- result = result.rename(name)
- return result
-
-
-def _expand_dims_for_key(da: xr.DataArray, dim_names: list[str], key: tuple) -> xr.DataArray:
- """Add dimensions to a DataArray based on key values.
-
- Args:
- da: DataArray without extra dimensions.
- dim_names: Names of dimensions to add (e.g., ['period', 'scenario']).
- key: Tuple of coordinate values matching dim_names.
-
- Returns:
- DataArray with extra dimensions added.
- """
- for dim_name, coord_val in zip(dim_names, key, strict=True):
- da = da.expand_dims({dim_name: [coord_val]})
- return da
-
-
class _ReducedFlowSystemBuilder:
- """Builds a reduced FlowSystem from tsam aggregation results.
+ """Builds a reduced FlowSystem from a tsam_xarray AggregationResult.
This class encapsulates the construction of reduced FlowSystem datasets,
- pre-computing shared coordinates and providing methods for building
- each component (weights, typical periods, segment durations, metrics).
+ extracting cluster representatives, weights, and metrics from the
+ tsam_xarray result.
Args:
fs: The original FlowSystem being reduced.
- aggregation_results: Dict mapping key tuples to tsam AggregationResult.
+ agg_result: tsam_xarray AggregationResult with DataArray-based results.
timesteps_per_cluster: Number of timesteps per cluster.
dt: Hours per timestep.
- dim_names: Names of extra dimensions (e.g., ['period', 'scenario']).
"""
def __init__(
self,
fs: FlowSystem,
- aggregation_results: dict[tuple, Any],
+ agg_result: Any, # tsam_xarray.AggregationResult
timesteps_per_cluster: int,
dt: float,
- dim_names: list[str],
+ unrename_map: dict[str, str] | None = None,
):
self._fs = fs
- self._aggregation_results = aggregation_results
+ self._agg_result = agg_result
self._timesteps_per_cluster = timesteps_per_cluster
self._dt = dt
- self._dim_names = dim_names
-
- # Extract info from first result (all should be consistent)
- first_result = next(iter(aggregation_results.values()))
- self._n_reduced_timesteps = len(first_result.cluster_representatives)
- self._n_clusters = first_result.n_clusters
- self._is_segmented = first_result.n_segments is not None
- self._n_segments = first_result.n_segments
-
- # Validate all results have consistent structure
- for key, result in aggregation_results.items():
- if result.n_clusters != self._n_clusters:
- key_str = dict(zip(dim_names, key, strict=False)) if dim_names else key
- raise ValueError(
- f'Inconsistent cluster counts across periods/scenarios: '
- f'{key_str} has {result.n_clusters} clusters, but expected {self._n_clusters}. '
- f'This can happen when ExtremeConfig does not preserve cluster counts.'
- )
+ self._unrename_map = unrename_map or {}
+
+ self._n_clusters = agg_result.n_clusters
+ self._is_segmented = agg_result.n_segments is not None
+ self._n_segments = agg_result.n_segments
# Pre-compute coordinates
self._cluster_coords = np.arange(self._n_clusters)
@@ -142,135 +77,65 @@ def __init__(
self._base_coords = {'cluster': self._cluster_coords, 'time': self._time_coords}
- def _expand_and_combine(
- self,
- data_per_key: dict[tuple, xr.DataArray],
- base_dims: list[str],
- name: str | None = None,
- ) -> xr.DataArray:
- """Expand dims for each key and combine slices.
-
- Args:
- data_per_key: Dict mapping keys to DataArrays without extra dims.
- base_dims: Base dimension names (e.g., ['cluster'] or ['cluster', 'time']).
- name: Optional name for the result.
-
- Returns:
- Combined DataArray with dims [*base_dims, *dim_names].
- """
- slices = [_expand_dims_for_key(da, self._dim_names, key) for key, da in data_per_key.items()]
- return _combine_dataarray_slices(slices, base_dims, self._dim_names, name=name)
+ def _unrename(self, da: xr.DataArray) -> xr.DataArray:
+ """Rename tsam_xarray output dims back to original names (e.g., _period -> period)."""
+ renames = {k: v for k, v in self._unrename_map.items() if k in da.dims}
+ return da.rename(renames) if renames else da
def build_cluster_weights(self) -> xr.DataArray:
- """Build cluster_weight DataArray from aggregation results.
+ """Build cluster_weight DataArray from aggregation result.
Returns:
- DataArray with dims [cluster, *dim_names].
+ DataArray with dims [cluster, period?, scenario?].
"""
- data_per_key = {}
- for key, result in self._aggregation_results.items():
- weights = np.array([result.cluster_weights.get(c, 0) for c in range(self._n_clusters)])
- data_per_key[key] = xr.DataArray(weights, dims=['cluster'], coords={'cluster': self._cluster_coords})
- return self._expand_and_combine(data_per_key, ['cluster'], name='cluster_weight')
+ weights = self._agg_result.cluster_weights.rename(cluster='cluster')
+ return self._unrename(weights.rename('cluster_weight'))
def build_typical_periods(self) -> dict[str, xr.DataArray]:
- """Build typical periods DataArrays with (cluster, time, *dim_names) shape.
+ """Build typical periods DataArrays with (cluster, time, ...) shape.
Returns:
- Dict mapping column names to combined DataArrays.
+ Dict mapping column names to DataArrays.
"""
- column_slices: dict[str, dict[tuple, xr.DataArray]] = {}
-
- for key, tsam_result in self._aggregation_results.items():
- typical_df = tsam_result.cluster_representatives
- for col in typical_df.columns:
- series = typical_df[col]
- if self._is_segmented:
- # Segmented: MultiIndex (cluster, segment_step, segment_duration)
- # Drop duration level and unstack by segment step
- unstacked = series.droplevel('Segment Duration').unstack(level='Segment Step')
- else:
- # Non-segmented: MultiIndex (cluster, timestep)
- unstacked = series.unstack(level='timestep')
- da = xr.DataArray(unstacked.values, dims=['cluster', 'time'], coords=self._base_coords)
- column_slices.setdefault(col, {})[key] = da
-
- return {
- col: self._expand_and_combine(data_per_key, ['cluster', 'time'])
- for col, data_per_key in column_slices.items()
- }
+ representatives = self._agg_result.cluster_representatives
+ # representatives has dims: (cluster, timestep, variable, _period?, scenario?)
+ # We need to split by variable and rename timestep -> time
+ result = {}
+ # Exclude known dims (including renamed variants like _period, _cluster)
+ known_dims = {'cluster', 'timestep', 'period', 'scenario'} | set(self._unrename_map.keys())
+ unknown_dims = [d for d in representatives.dims if d not in known_dims]
+ assert len(unknown_dims) == 1, (
+ f'Expected exactly 1 variable dim, got {unknown_dims} (known: {known_dims}, all: {representatives.dims})'
+ )
+ variable_dim = unknown_dims[0]
+ for var_name in representatives.coords[variable_dim].values:
+ da = representatives.sel({variable_dim: var_name}, drop=True)
+ # Rename timestep -> time and assign our coordinates
+ da = da.rename({'timestep': 'time'})
+ da = da.assign_coords(cluster=self._cluster_coords, time=self._time_coords)
+ # Ensure cluster and time are first two dims
+ other_dims = [d for d in da.dims if d not in ('cluster', 'time')]
+ da = da.transpose('cluster', 'time', *other_dims)
+ result[str(var_name)] = self._unrename(da)
+ return result
def build_segment_durations(self) -> xr.DataArray:
"""Build timestep_duration DataArray from segment durations.
Returns:
- DataArray with dims [cluster, time, *dim_names].
-
- Raises:
- ValueError: If not a segmented system.
+ DataArray with dims [cluster, time, period?, scenario?].
"""
if not self._is_segmented:
raise ValueError('build_segment_durations() requires a segmented system')
- data_per_key = {}
- for key, tsam_result in self._aggregation_results.items():
- seg_durs = tsam_result.segment_durations
- data = np.array(
- [[seg_durs[c][s] * self._dt for s in range(self._n_segments)] for c in range(self._n_clusters)]
- )
- data_per_key[key] = xr.DataArray(data, dims=['cluster', 'time'], coords=self._base_coords)
-
- return self._expand_and_combine(data_per_key, ['cluster', 'time'], name='timestep_duration')
-
- def build_metrics(self) -> xr.Dataset:
- """Build clustering metrics Dataset from aggregation results.
-
- Returns:
- Dataset with RMSE, MAE, RMSE_duration metrics.
- """
- # Convert accuracy to DataFrames, filtering out failures
- metrics_dfs: dict[tuple, pd.DataFrame] = {}
- for key, result in self._aggregation_results.items():
- try:
- metrics_dfs[key] = _accuracy_to_dataframe(result.accuracy)
- except Exception as e:
- logger.warning(f'Failed to compute clustering metrics for {key}: {e}')
- metrics_dfs[key] = pd.DataFrame()
-
- non_empty_metrics = {k: v for k, v in metrics_dfs.items() if not v.empty}
-
- if not non_empty_metrics:
- return xr.Dataset()
-
- # Single slice case
- if len(metrics_dfs) == 1 and len(non_empty_metrics) == 1:
- metrics_df = next(iter(non_empty_metrics.values()))
- return xr.Dataset(
- {
- col: xr.DataArray(
- metrics_df[col].values,
- dims=['time_series'],
- coords={'time_series': metrics_df.index},
- )
- for col in metrics_df.columns
- }
- )
-
- # Multi-dim case - all periods have same time series
- sample_df = next(iter(non_empty_metrics.values()))
- time_series_index = list(sample_df.index)
- data_vars = {}
-
- for metric in sample_df.columns:
- data_per_key = {}
- for key, df in metrics_dfs.items():
- values = np.full(len(time_series_index), np.nan) if df.empty else df[metric].values
- data_per_key[key] = xr.DataArray(
- values, dims=['time_series'], coords={'time_series': time_series_index}
- )
- data_vars[metric] = self._expand_and_combine(data_per_key, ['time_series'], name=metric)
-
- return xr.Dataset(data_vars)
+ seg_durs = self._agg_result.segment_durations
+ # Convert from timestep counts to hours
+ da = seg_durs * self._dt
+ # Rename dims to match our convention
+ da = da.rename({'timestep': 'time'})
+ da = da.assign_coords(cluster=self._cluster_coords, time=self._time_coords)
+ other_dims = [d for d in da.dims if d not in ('cluster', 'time')]
+ return self._unrename(da.transpose('cluster', 'time', *other_dims).rename('timestep_duration'))
def build_reduced_dataset(self, ds: xr.Dataset, typical_das: dict[str, xr.DataArray]) -> xr.Dataset:
"""Build the reduced dataset with (cluster, time) structure.
@@ -284,6 +149,8 @@ def build_reduced_dataset(self, ds: xr.Dataset, typical_das: dict[str, xr.DataAr
"""
from .core import TimeSeriesData
+ n_reduced_timesteps = self._n_clusters * self._n_time_points
+
ds_new_vars = {}
variables = ds.variables
coord_cache = {k: ds.coords[k].values for k in ds.coords}
@@ -298,7 +165,7 @@ def build_reduced_dataset(self, ds: xr.Dataset, typical_das: dict[str, xr.DataAr
# Time-dependent but constant: reshape to (cluster, time, ...)
time_idx = var.dims.index('time')
slices = [slice(None)] * len(var.dims)
- slices[time_idx] = slice(0, self._n_reduced_timesteps)
+ slices[time_idx] = slice(0, n_reduced_timesteps)
sliced_values = var.values[tuple(slices)]
other_dims = [d for d in var.dims if d != 'time']
@@ -337,13 +204,11 @@ def build(self, ds: xr.Dataset) -> FlowSystem:
Reduced FlowSystem with clustering metadata attached.
"""
from .clustering import Clustering
- from .core import drop_constant_arrays
from .flow_system import FlowSystem
# Build all components
cluster_weight = self.build_cluster_weights()
typical_das = self.build_typical_periods()
- metrics = self.build_metrics()
ds_new = self.build_reduced_dataset(ds, typical_das)
# Add segment durations if segmented
@@ -374,34 +239,13 @@ def build(self, ds: xr.Dataset) -> FlowSystem:
# Create Clustering object with full AggregationResult access
reduced_fs.clustering = Clustering(
original_timesteps=self._fs.timesteps,
- original_data=drop_constant_arrays(ds, dim='time'),
- aggregated_data=drop_constant_arrays(ds_new, dim='time'),
- _metrics=metrics if metrics.data_vars else None,
- _aggregation_results=self._aggregation_results,
- _dim_names=self._dim_names,
+ _aggregation_result=self._agg_result,
+ _unrename_map=self._unrename_map,
)
return reduced_fs
-def _accuracy_to_dataframe(accuracy: Any) -> pd.DataFrame:
- """Convert tsam ClusteringAccuracy to DataFrame with metrics.
-
- Args:
- accuracy: tsam ClusteringAccuracy object.
-
- Returns:
- DataFrame with RMSE, MAE, RMSE_duration columns indexed by time series name.
- """
- return pd.DataFrame(
- {
- 'RMSE': accuracy.rmse,
- 'MAE': accuracy.mae,
- 'RMSE_duration': accuracy.rmse_duration,
- }
- )
-
-
class _Expander:
"""Handles expansion of clustered FlowSystem to original timesteps.
@@ -419,8 +263,6 @@ def __init__(self, fs: FlowSystem, clustering: Clustering):
# Pre-compute clustering dimensions
self._timesteps_per_cluster = clustering.timesteps_per_cluster
- self._n_segments = clustering.n_segments
- self._time_dim_size = self._n_segments if self._n_segments else self._timesteps_per_cluster
self._n_clusters = clustering.n_clusters
self._n_original_clusters = clustering.n_original_clusters
@@ -439,69 +281,16 @@ def __init__(self, fs: FlowSystem, clustering: Clustering):
self._n_original_clusters - 1,
)
- # Build variable category sets
- self._variable_categories = getattr(fs, '_variable_categories', {})
- if self._variable_categories:
- self._state_vars = {name for name, cat in self._variable_categories.items() if cat in EXPAND_INTERPOLATE}
- self._first_timestep_vars = {
- name for name, cat in self._variable_categories.items() if cat in EXPAND_FIRST_TIMESTEP
- }
- self._segment_total_vars = {name for name, cat in self._variable_categories.items() if cat in EXPAND_DIVIDE}
- else:
- # Fallback to pattern matching for old FlowSystems without categories
- self._state_vars = set()
- self._first_timestep_vars = set()
- self._segment_total_vars = self._build_segment_total_varnames() if clustering.is_segmented else set()
+ # Build variable category sets from registered categories
+ variable_categories = fs._variable_categories
+ self._state_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_INTERPOLATE}
+ self._first_timestep_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_FIRST_TIMESTEP}
+ self._segment_total_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_DIVIDE}
- # Build expansion divisor for segmented systems
+ # Pre-compute expansion divisor for segmented systems (segment durations on original time)
self._expansion_divisor = None
if clustering.is_segmented:
- self._expansion_divisor = clustering.build_expansion_divisor(original_time=self._original_timesteps)
-
- def _is_state_variable(self, var_name: str) -> bool:
- """Check if variable is a state variable requiring interpolation."""
- return var_name in self._state_vars or (not self._variable_categories and var_name.endswith('|charge_state'))
-
- def _is_first_timestep_variable(self, var_name: str) -> bool:
- """Check if variable is a first-timestep-only variable (startup/shutdown)."""
- return var_name in self._first_timestep_vars or (
- not self._variable_categories and (var_name.endswith('|startup') or var_name.endswith('|shutdown'))
- )
-
- def _build_segment_total_varnames(self) -> set[str]:
- """Build segment total variable names - BACKWARDS COMPATIBILITY FALLBACK.
-
- This method is only used when variable_categories is empty (old FlowSystems
- saved before category registration was implemented). New FlowSystems use
- the VariableCategory registry with EXPAND_DIVIDE categories (PER_TIMESTEP, SHARE).
-
- Returns:
- Set of variable names that should be divided by expansion divisor.
- """
- segment_total_vars: set[str] = set()
- effect_names = list(self._fs.effects.keys())
-
- # 1. Per-timestep totals for each effect
- for effect in effect_names:
- segment_total_vars.add(f'{effect}(temporal)|per_timestep')
-
- # 2. Flow contributions to effects
- for flow_label in self._fs.flows:
- for effect in effect_names:
- segment_total_vars.add(f'{flow_label}->{effect}(temporal)')
-
- # 3. Component contributions to effects
- for component_label in self._fs.components:
- for effect in effect_names:
- segment_total_vars.add(f'{component_label}->{effect}(temporal)')
-
- # 4. Effect-to-effect contributions
- for target_effect_name, target_effect in self._fs.effects.items():
- if target_effect.share_from_temporal:
- for source_effect_name in target_effect.share_from_temporal:
- segment_total_vars.add(f'{source_effect_name}(temporal)->{target_effect_name}(temporal)')
-
- return segment_total_vars
+ self._expansion_divisor = clustering.disaggregate(clustering.segment_durations).ffill(dim='time')
def _append_final_state(self, expanded: xr.DataArray, da: xr.DataArray) -> xr.DataArray:
"""Append final state value from original data to expanded data."""
@@ -516,113 +305,15 @@ def _append_final_state(self, expanded: xr.DataArray, da: xr.DataArray) -> xr.Da
extra_val = extra_val.expand_dims(time=[self._original_timesteps_extra[-1]])
return xr.concat([expanded, extra_val], dim='time')
- def _interpolate_charge_state_segmented(self, da: xr.DataArray) -> xr.DataArray:
- """Interpolate charge_state values within segments for segmented systems.
-
- For segmented systems, charge_state has values at segment boundaries (n_segments+1).
- This method interpolates between start and end boundary values to show the
- actual charge trajectory as the storage charges/discharges.
-
- Args:
- da: charge_state DataArray with dims (cluster, time) where time has n_segments+1 entries.
-
- Returns:
- Interpolated charge_state with dims (time, ...) for original timesteps.
- """
- clustering = self._clustering
-
- # Get multi-dimensional properties from Clustering
- segment_assignments = clustering.results.segment_assignments
- segment_durations = clustering.results.segment_durations
- position_within_segment = clustering.results.position_within_segment
- cluster_assignments = clustering.cluster_assignments
-
- # Compute original period index and position within period
- original_period_indices = np.minimum(
- np.arange(self._n_original_timesteps) // self._timesteps_per_cluster,
- self._n_original_clusters - 1,
- )
- positions_in_period = np.arange(self._n_original_timesteps) % self._timesteps_per_cluster
-
- # Create DataArrays for indexing
- original_period_da = xr.DataArray(original_period_indices, dims=['original_time'])
- position_in_period_da = xr.DataArray(positions_in_period, dims=['original_time'])
-
- # Map original period to cluster
- cluster_indices = cluster_assignments.isel(original_cluster=original_period_da)
-
- # Get segment index and position for each original timestep
- seg_indices = segment_assignments.isel(cluster=cluster_indices, time=position_in_period_da)
- positions = position_within_segment.isel(cluster=cluster_indices, time=position_in_period_da)
- durations = segment_durations.isel(cluster=cluster_indices, segment=seg_indices)
-
- # Calculate interpolation factor: position within segment (0 to 1)
- factor = xr.where(durations > 1, (positions + 0.5) / durations, 0.5)
-
- # Get start and end boundary values from charge_state
- start_vals = da.isel(cluster=cluster_indices, time=seg_indices)
- end_vals = da.isel(cluster=cluster_indices, time=seg_indices + 1)
-
- # Linear interpolation
- interpolated = start_vals + (end_vals - start_vals) * factor
-
- # Clean up coordinate artifacts and rename
- interpolated = interpolated.drop_vars(['cluster', 'time', 'segment'], errors='ignore')
- interpolated = interpolated.rename({'original_time': 'time'}).assign_coords(time=self._original_timesteps)
-
- return interpolated.transpose('time', ...).assign_attrs(da.attrs)
-
- def _expand_first_timestep_only(self, da: xr.DataArray) -> xr.DataArray:
- """Expand binary event variables to first timestep of each segment only.
-
- For segmented systems, binary event variables like startup and shutdown indicate
- that an event occurred somewhere in the segment. When expanded, the event is placed
- at the first timestep of each segment, with zeros elsewhere.
-
- Args:
- da: Binary event DataArray with dims including (cluster, time).
-
- Returns:
- Expanded DataArray with event values only at first timestep of each segment.
- """
- clustering = self._clustering
-
- # First expand normally (repeats values)
- expanded = clustering.expand_data(da, original_time=self._original_timesteps)
-
- # Build mask: True only at first timestep of each segment
- position_within_segment = clustering.results.position_within_segment
- cluster_assignments = clustering.cluster_assignments
-
- # Compute original period index and position within period
- original_period_indices = np.minimum(
- np.arange(self._n_original_timesteps) // self._timesteps_per_cluster,
- self._n_original_clusters - 1,
- )
- positions_in_period = np.arange(self._n_original_timesteps) % self._timesteps_per_cluster
-
- # Create DataArrays for indexing
- original_period_da = xr.DataArray(original_period_indices, dims=['original_time'])
- position_in_period_da = xr.DataArray(positions_in_period, dims=['original_time'])
-
- # Map to cluster and get position within segment
- cluster_indices = cluster_assignments.isel(original_cluster=original_period_da)
- pos_in_segment = position_within_segment.isel(cluster=cluster_indices, time=position_in_period_da)
-
- # Clean up and create mask
- pos_in_segment = pos_in_segment.drop_vars(['cluster', 'time'], errors='ignore')
- pos_in_segment = pos_in_segment.rename({'original_time': 'time'}).assign_coords(time=self._original_timesteps)
-
- # First timestep of segment has position 0
- is_first = pos_in_segment == 0
-
- # Apply mask: keep value at first timestep, zero elsewhere
- result = xr.where(is_first, expanded, 0)
- return result.assign_attrs(da.attrs)
-
def expand_dataarray(self, da: xr.DataArray, var_name: str = '', is_solution: bool = False) -> xr.DataArray:
"""Expand a DataArray from clustered to original timesteps.
+ Uses clustering.disaggregate() as the core expansion, then applies
+ post-processing based on variable category:
+ - State variables (segmented): interpolate within segments
+ - First-timestep variables (segmented): value at segment start, zero elsewhere
+ - Segment totals: divide by segment duration for hourly rate
+
Args:
da: DataArray to expand.
var_name: Variable name for category-based expansion handling.
@@ -636,19 +327,30 @@ def expand_dataarray(self, da: xr.DataArray, var_name: str = '', is_solution: bo
clustering = self._clustering
has_cluster_dim = 'cluster' in da.dims
- is_state = self._is_state_variable(var_name) and has_cluster_dim
- is_first_timestep = self._is_first_timestep_variable(var_name) and has_cluster_dim
+ is_state = var_name in self._state_vars and has_cluster_dim
+ is_first_timestep = var_name in self._first_timestep_vars and has_cluster_dim
is_segment_total = is_solution and var_name in self._segment_total_vars
- # Choose expansion method
- if is_state and clustering.is_segmented:
- expanded = self._interpolate_charge_state_segmented(da)
- elif is_first_timestep and is_solution and clustering.is_segmented:
- return self._expand_first_timestep_only(da)
- else:
- expanded = clustering.expand_data(da, original_time=self._original_timesteps)
- if is_segment_total and self._expansion_divisor is not None:
- expanded = expanded / self._expansion_divisor
+ # Solution variables have n+1 timesteps (extra boundary value).
+ # Strip it before disaggregating — it will be appended back for state variables.
+ expected_time = clustering.n_segments if clustering.is_segmented else clustering.timesteps_per_cluster
+ has_extra = has_cluster_dim and da.sizes.get('time', 0) > expected_time
+ da_for_disagg = da.isel(time=slice(None, expected_time)) if has_extra else da
+
+ # Disaggregate: map (cluster, time) back to original time axis.
+ # For non-segmented: values are repeated. For segmented: NaN between boundaries.
+ expanded = clustering.disaggregate(da_for_disagg)
+
+ # Post-processing for segmented systems
+ if clustering.is_segmented and has_cluster_dim:
+ if is_state:
+ expanded = expanded.interpolate_na(dim='time')
+ elif is_first_timestep and is_solution:
+ return expanded.fillna(0).assign_attrs(da.attrs)
+ else:
+ expanded = expanded.ffill(dim='time')
+ if is_segment_total and self._expansion_divisor is not None:
+ expanded = expanded / self._expansion_divisor
# State variables need final state appended
if is_state:
@@ -822,8 +524,10 @@ def expand_flow_system(self) -> FlowSystem:
n_combinations = (len(self._fs.periods) if has_periods else 1) * (
len(self._fs.scenarios) if has_scenarios else 1
)
- n_reduced_timesteps = self._n_clusters * self._time_dim_size
- segmented_info = f' ({self._n_segments} segments)' if self._n_segments else ''
+ n_segments = self._clustering.n_segments
+ time_dim_size = n_segments if n_segments else self._timesteps_per_cluster
+ n_reduced_timesteps = self._n_clusters * time_dim_size
+ segmented_info = f' ({n_segments} segments)' if n_segments else ''
logger.info(
f'Expanded FlowSystem from {n_reduced_timesteps} to {self._n_original_timesteps} timesteps '
f'({self._n_clusters} clusters{segmented_info}'
@@ -866,81 +570,6 @@ def __init__(self, flow_system: FlowSystem) -> None:
"""
self._fs = flow_system
- @staticmethod
- def _calculate_clustering_weights(ds) -> dict[str, float]:
- """Calculate weights for clustering based on dataset attributes."""
- from collections import Counter
-
- import numpy as np
-
- groups = [da.attrs.get('clustering_group') for da in ds.data_vars.values() if 'clustering_group' in da.attrs]
- group_counts = Counter(groups)
-
- # Calculate weight for each group (1/count)
- group_weights = {group: 1 / count for group, count in group_counts.items()}
-
- weights = {}
- variables = ds.variables
- for name in ds.data_vars:
- var_attrs = variables[name].attrs
- clustering_group = var_attrs.get('clustering_group')
- group_weight = group_weights.get(clustering_group)
- if group_weight is not None:
- weights[name] = group_weight
- else:
- weights[name] = var_attrs.get('clustering_weight', 1)
-
- if np.all(np.isclose(list(weights.values()), 1, atol=1e-6)):
- logger.debug('All Clustering weights were set to 1')
-
- return weights
-
- @staticmethod
- def _build_cluster_config_with_weights(
- cluster: ClusterConfig | None,
- auto_weights: dict[str, float],
- available_columns: set[str] | None = None,
- ) -> ClusterConfig:
- """Merge auto-calculated weights into ClusterConfig.
-
- Args:
- cluster: Optional user-provided ClusterConfig.
- auto_weights: Automatically calculated weights based on data variance.
- available_columns: Column names present in the clustering DataFrame.
- If provided, weights are filtered to only include these columns.
- This prevents tsam errors when some time series are dropped
- (e.g., constant arrays removed before clustering).
-
- Returns:
- ClusterConfig with weights set (either user-provided or auto-calculated).
- """
- from tsam import ClusterConfig
-
- # Determine weights: user-provided take priority over auto-calculated
- if cluster is not None and cluster.weights is not None:
- weights = dict(cluster.weights)
- else:
- weights = auto_weights
-
- # Filter weights to only include columns present in the clustering data
- if available_columns is not None:
- weights = {name: w for name, w in weights.items() if name in available_columns}
-
- # No ClusterConfig provided - use defaults with weights
- if cluster is None:
- return ClusterConfig(weights=weights)
-
- # ClusterConfig provided - use its settings with (possibly filtered) weights
- return ClusterConfig(
- method=cluster.method,
- representation=cluster.representation,
- weights=weights,
- normalize_column_means=cluster.normalize_column_means,
- use_duration_curves=cluster.use_duration_curves,
- include_period_sums=cluster.include_period_sums,
- solver=cluster.solver,
- )
-
def sel(
self,
time: str | slice | list[str] | pd.Timestamp | pd.DatetimeIndex | None = None,
@@ -1465,90 +1094,10 @@ def fix_sizes(
return new_fs
- def clustering_data(
- self,
- period: Any | None = None,
- scenario: Any | None = None,
- ) -> xr.Dataset:
- """
- Get the time-varying data that would be used for clustering.
-
- This method extracts only the data arrays that vary over time, which is
- the data that clustering algorithms use to identify typical periods.
- Constant arrays (same value for all timesteps) are excluded since they
- don't contribute to pattern identification.
-
- Use this to inspect or pre-process the data before clustering, or to
- understand which variables influence the clustering result.
-
- Args:
- period: Optional period label to select. If None and the FlowSystem
- has multiple periods, returns data for all periods.
- scenario: Optional scenario label to select. If None and the FlowSystem
- has multiple scenarios, returns data for all scenarios.
-
- Returns:
- xr.Dataset containing only time-varying data arrays. The dataset
- includes arrays like demand profiles, price profiles, and other
- time series that vary over the time dimension.
-
- Examples:
- Inspect clustering input data:
-
- >>> data = flow_system.transform.clustering_data()
- >>> print(f'Variables used for clustering: {list(data.data_vars)}')
- >>> data['HeatDemand(Q)|fixed_relative_profile'].plot()
-
- Get data for a specific period/scenario:
-
- >>> data_2024 = flow_system.transform.clustering_data(period=2024)
- >>> data_high = flow_system.transform.clustering_data(scenario='high')
-
- Convert to DataFrame for external tools:
-
- >>> df = flow_system.transform.clustering_data().to_dataframe()
- """
- from .core import drop_constant_arrays
-
- if not self._fs.connected_and_transformed:
- self._fs.connect_and_transform()
-
- ds = self._fs.to_dataset(include_solution=False)
-
- # Build selector for period/scenario
- selector = {}
- if period is not None:
- selector['period'] = period
- if scenario is not None:
- selector['scenario'] = scenario
-
- # Apply selection if specified
- if selector:
- ds = ds.sel(**selector, drop=True)
-
- # Filter to only time-varying arrays
- result = drop_constant_arrays(ds, dim='time')
-
- # Guard against empty dataset (all variables are constant)
- if not result.data_vars:
- selector_info = f' for {selector}' if selector else ''
- raise ValueError(
- f'No time-varying data found{selector_info}. '
- f'All variables are constant over time. Check your period/scenario filter or input data.'
- )
-
- # Remove attrs for cleaner output
- result.attrs = {}
- for var in result.data_vars:
- result[var].attrs = {}
-
- return result
-
def cluster(
self,
n_clusters: int,
cluster_duration: str | float,
- data_vars: list[str] | None = None,
cluster: ClusterConfig | None = None,
extremes: ExtremeConfig | None = None,
segments: SegmentConfig | None = None,
@@ -1580,16 +1129,11 @@ def cluster(
n_clusters: Number of clusters (typical periods) to extract (e.g., 8 typical days).
cluster_duration: Duration of each cluster. Can be a pandas-style string
('1D', '24h', '6h') or a numeric value in hours.
- data_vars: Optional list of variable names to use for clustering. If specified,
- only these variables are used to determine cluster assignments, but the
- clustering is then applied to ALL time-varying data in the FlowSystem.
- Use ``transform.clustering_data()`` to see available variables.
- Example: ``data_vars=['HeatDemand(Q)|fixed_relative_profile']`` to cluster
- based only on heat demand patterns.
cluster: Optional tsam ``ClusterConfig`` object specifying clustering algorithm,
- representation method, and weights. If None, uses default settings (hierarchical
- clustering with medoid representation) and automatically calculated weights
- based on data variance.
+ representation method, and weights. Use ``weights={var: 0}`` to exclude
+ specific variables from influencing cluster assignments while still
+ aggregating them. If None, uses default settings (hierarchical clustering
+ with medoid representation).
extremes: Optional tsam ``ExtremeConfig`` object specifying how to handle
extreme periods (peaks). Use this to ensure peak demand days are captured.
Example: ``ExtremeConfig(method='new_cluster', max_value=['demand'])``.
@@ -1632,16 +1176,18 @@ def cluster(
... )
>>> fs_clustered.optimize(solver)
- Clustering based on specific variables only:
+ Clustering based on specific variables only (zero-weight the rest):
- >>> # See available variables for clustering
- >>> print(flow_system.transform.clustering_data().data_vars)
- >>>
- >>> # Cluster based only on demand profile
+ >>> from tsam import ClusterConfig
>>> fs_clustered = flow_system.transform.cluster(
... n_clusters=8,
... cluster_duration='1D',
- ... data_vars=['HeatDemand(Q)|fixed_relative_profile'],
+ ... cluster=ClusterConfig(
+ ... weights={
+ ... 'HeatDemand(Q)|fixed_relative_profile': 1,
+ ... 'GasSource(Gas)|costs|per_flow_hour': 0, # ignored for clustering
+ ... }
+ ... ),
... )
Note:
@@ -1651,10 +1197,7 @@ def cluster(
- For seasonal storage (e.g., hydrogen, thermal storage), set
``Storage.cluster_mode='intercluster'`` or ``'intercluster_cyclic'``
"""
- import tsam
-
- from .clustering import ClusteringResults
- from .core import drop_constant_arrays
+ import tsam_xarray
# Parse cluster_duration to hours
hours_per_cluster = (
@@ -1677,50 +1220,19 @@ def cluster(
has_periods = self._fs.periods is not None
has_scenarios = self._fs.scenarios is not None
- # Determine iteration dimensions
- periods = list(self._fs.periods) if has_periods else [None]
- scenarios = list(self._fs.scenarios) if has_scenarios else [None]
-
ds = self._fs.to_dataset(include_solution=False)
- # Validate and prepare data_vars for clustering
- if data_vars is not None:
- missing = set(data_vars) - set(ds.data_vars)
- if missing:
- raise ValueError(
- f'data_vars not found in FlowSystem: {missing}. '
- f'Available time-varying variables can be found via transform.clustering_data().'
- )
- ds_for_clustering = ds[list(data_vars)]
- else:
- ds_for_clustering = ds
-
- # Validate user-provided weight keys against the selected clustering input
- if cluster is not None and cluster.weights is not None:
- selected_vars = set(ds_for_clustering.data_vars)
- unknown = sorted(set(cluster.weights) - selected_vars)
- if unknown:
- raise ValueError(
- f'ClusterConfig weights reference unknown variables: {unknown}. '
- f'Available variables can be found via transform.clustering_data().'
- )
+ # Only keep variables with a time dimension for clustering
+ ds_for_clustering = ds[[name for name in ds.data_vars if 'time' in ds[name].dims]]
- # Filter constant arrays once on the full dataset (not per slice)
- # This ensures all slices have the same variables for consistent metrics
- ds_for_clustering = drop_constant_arrays(ds_for_clustering, dim='time')
-
- # Guard against empty dataset after removing constant arrays
if not ds_for_clustering.data_vars:
- filter_info = f'data_vars={data_vars}' if data_vars else 'all variables'
- raise ValueError(
- f'No time-varying data found for clustering ({filter_info}). '
- f'All variables are constant over time. Check your data_vars filter or input data.'
- )
+ raise ValueError('No time-varying data found for clustering. Check your input data.')
# Validate tsam_kwargs doesn't override explicit parameters
reserved_tsam_keys = {
'n_clusters',
'period_duration', # exposed as cluster_duration
+ 'temporal_resolution', # computed automatically
'timestep_duration', # computed automatically
'cluster',
'segments',
@@ -1738,9 +1250,8 @@ def cluster(
)
# Validate ExtremeConfig compatibility with multi-period/scenario systems
- # Only method='replace' reliably produces consistent cluster counts across all slices.
- total_slices = len(periods) * len(scenarios)
- if total_slices > 1 and extremes is not None:
+ has_slices = has_periods or has_scenarios
+ if has_slices and extremes is not None:
if extremes.method != 'replace':
raise ValueError(
f"ExtremeConfig method='{extremes.method}' is not supported for multi-period "
@@ -1749,75 +1260,90 @@ def cluster(
"ExtremeConfig(..., method='replace')"
)
- # Build dim_names and clean key helper
- dim_names: list[str] = []
+ # Rename reserved dimension names to avoid conflict with tsam_xarray
+ # tsam_xarray reserves: 'period', 'cluster', 'timestep'
+ reserved_renames = {'period': '_period', 'cluster': '_cluster'}
+ # Check against full ds dims (period/cluster may only exist as coords, not in ds_for_clustering)
+ rename_map = {k: v for k, v in reserved_renames.items() if k in ds.dims}
+ unrename_map = {v: k for k, v in rename_map.items()}
+
+ if rename_map:
+ # Only rename dims that exist in each dataset
+ clustering_renames = {k: v for k, v in rename_map.items() if k in ds_for_clustering.dims}
+ if clustering_renames:
+ ds_for_clustering = ds_for_clustering.rename(clustering_renames)
+ ds = ds.rename(rename_map)
+
+ # Stack Dataset into a single DataArray with 'variable' dimension
+ da_for_clustering = ds_for_clustering.to_dataarray(dim='variable')
+
+ # Ensure period/scenario dimensions are present in the DataArray
+ # even if the data doesn't vary across them (tsam_xarray needs them for slicing)
+ extra_dims = []
if has_periods:
- dim_names.append('period')
+ extra_dims.append(rename_map.get('period', 'period'))
if has_scenarios:
- dim_names.append('scenario')
-
- def to_clean_key(period_label, scenario_label) -> tuple:
- """Convert (period, scenario) to clean key based on which dims exist."""
- key_parts = []
- if has_periods:
- key_parts.append(period_label)
- if has_scenarios:
- key_parts.append(scenario_label)
- return tuple(key_parts)
-
- # Cluster each (period, scenario) combination using tsam directly
- aggregation_results: dict[tuple, Any] = {}
-
- for period_label in periods:
- for scenario_label in scenarios:
- key = to_clean_key(period_label, scenario_label)
- selector = {k: v for k, v in [('period', period_label), ('scenario', scenario_label)] if v is not None}
-
- # Select data slice for clustering
- ds_slice = ds_for_clustering.sel(**selector, drop=True) if selector else ds_for_clustering
- df_for_clustering = ds_slice.to_dataframe()
-
- if selector:
- logger.info(f'Clustering {", ".join(f"{k}={v}" for k, v in selector.items())}...')
-
- # Suppress tsam warning about minimal value constraints (informational, not actionable)
- with warnings.catch_warnings():
- warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*')
-
- # Build ClusterConfig with auto-calculated weights, filtered to available columns
- clustering_weights = self._calculate_clustering_weights(ds_slice)
- cluster_config = self._build_cluster_config_with_weights(
- cluster, clustering_weights, available_columns=set(df_for_clustering.columns)
- )
-
- # Perform clustering based on selected data_vars (or all if not specified)
- aggregation_results[key] = tsam.aggregate(
- df_for_clustering,
- n_clusters=n_clusters,
- period_duration=hours_per_cluster,
- temporal_resolution=dt,
- cluster=cluster_config,
- extremes=extremes,
- segments=segments,
- preserve_column_means=preserve_column_means,
- rescale_exclude_columns=rescale_exclude_columns,
- round_decimals=round_decimals,
- numerical_tolerance=numerical_tolerance,
- **tsam_kwargs,
- )
-
- # If data_vars was specified, apply clustering to FULL data
- if data_vars is not None:
- # Build ClusteringResults from subset clustering
- clustering_results = ClusteringResults(
- {k: r.clustering for k, r in aggregation_results.items()},
- dim_names,
+ extra_dims.append(rename_map.get('scenario', 'scenario'))
+ for dim_name in extra_dims:
+ if dim_name not in da_for_clustering.dims and dim_name in ds.dims:
+ # Drop as non-dim coordinate first (to_dataarray may keep it as scalar coord)
+ if dim_name in da_for_clustering.coords:
+ da_for_clustering = da_for_clustering.drop_vars(dim_name)
+ da_for_clustering = da_for_clustering.expand_dims({dim_name: ds.coords[dim_name].values})
+
+ # Pass user-specified weights to tsam_xarray (validates unknown keys)
+ if cluster is not None and cluster.weights is not None:
+ weights = dict(cluster.weights)
+ else:
+ weights = {}
+
+ # Build tsam_kwargs with explicit parameters
+ tsam_kwargs_full = {
+ 'period_duration': hours_per_cluster,
+ 'temporal_resolution': dt,
+ 'extremes': extremes,
+ 'segments': segments,
+ 'preserve_column_means': preserve_column_means,
+ 'rescale_exclude_columns': rescale_exclude_columns,
+ 'round_decimals': round_decimals,
+ 'numerical_tolerance': numerical_tolerance,
+ **tsam_kwargs,
+ }
+
+ # Pass cluster config settings (without weights, which go to tsam_xarray directly)
+ if cluster is not None:
+ from tsam import ClusterConfig
+
+ cluster_config = ClusterConfig(
+ method=cluster.method,
+ representation=cluster.representation,
+ normalize_column_means=cluster.normalize_column_means,
+ use_duration_curves=cluster.use_duration_curves,
+ include_period_sums=cluster.include_period_sums,
+ solver=cluster.solver,
)
- # Apply to full data and replace results
- aggregation_results = dict(clustering_results.apply(ds))
+ tsam_kwargs_full['cluster'] = cluster_config
+
+ # Suppress tsam warning about minimal value constraints (informational, not actionable)
+ with warnings.catch_warnings():
+ warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*')
+
+ # Single call: tsam_xarray handles (period, scenario) slicing automatically
+ agg_result = tsam_xarray.aggregate(
+ da_for_clustering,
+ time_dim='time',
+ cluster_dim='variable',
+ n_clusters=n_clusters,
+ weights=weights,
+ **tsam_kwargs_full,
+ )
+
+ # Rename reserved dims back to original names in the dataset
+ if unrename_map:
+ ds = ds.rename(unrename_map)
# Build and return the reduced FlowSystem
- builder = _ReducedFlowSystemBuilder(self._fs, aggregation_results, timesteps_per_cluster, dt, dim_names)
+ builder = _ReducedFlowSystemBuilder(self._fs, agg_result, timesteps_per_cluster, dt, unrename_map)
return builder.build(ds)
def apply_clustering(
@@ -1864,7 +1390,6 @@ def apply_clustering(
# Get timesteps_per_cluster from the clustering object (survives serialization)
timesteps_per_cluster = clustering.timesteps_per_cluster
- dim_names = clustering.results.dim_names
ds = self._fs.to_dataset(include_solution=False)
@@ -1877,20 +1402,54 @@ def apply_clustering(
f'FlowSystem has {current_timesteps} timesteps, but clustering expects '
f'{expected_timesteps} timesteps ({clustering.n_original_clusters} clusters × '
f'{clustering.timesteps_per_cluster} timesteps/cluster). '
- f'Ensure self._fs.timesteps matches the original data used for clustering.results.apply(ds).'
+ f'Ensure self._fs.timesteps matches the original data used for clustering.'
)
- # Apply existing clustering to all (period, scenario) combinations at once
+ # Rename reserved dimension names to avoid conflict with tsam_xarray
+ reserved_renames = {'period': '_period', 'cluster': '_cluster'}
+ rename_map = {k: v for k, v in reserved_renames.items() if k in ds.dims}
+ unrename_map = {v: k for k, v in rename_map.items()}
+
+ if rename_map:
+ ds = ds.rename(rename_map)
+
+ # Apply existing clustering to full data
logger.info('Applying clustering...')
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*')
- agg_results = clustering.results.apply(ds)
+ da_full = ds.to_dataarray(dim='variable')
+
+ # Ensure extra dims are present in DataArray
+ for _orig_name, renamed in rename_map.items():
+ if renamed not in da_full.dims and renamed in ds.dims:
+ if renamed in da_full.coords:
+ da_full = da_full.drop_vars(renamed)
+ da_full = da_full.expand_dims({renamed: ds.coords[renamed].values})
+
+ # Get clustering result with correct dim names for the renamed data
+ from tsam_xarray import ClusteringResult as ClusteringResultClass
+
+ cr_result = clustering.clustering_result
+ # Map dim names to renamed versions (e.g., period → _period)
+ slice_dims = [rename_map.get(d, d) for d in clustering.dim_names]
+ cr_result = ClusteringResultClass(
+ time_dim='time',
+ cluster_dim=['variable'],
+ slice_dims=slice_dims,
+ clusterings=dict(cr_result.clusterings),
+ )
+ # TODO(tsam_xarray): Same workaround as in cluster() above — remove
+ # once tsam_xarray handles mismatched weights in apply().
+ for cr in cr_result.clusterings.values():
+ object.__setattr__(cr, 'weights', {})
+ agg_result = cr_result.apply(da_full)
- # Convert AggregationResults to dict format
- aggregation_results = dict(agg_results)
+ # Rename back
+ if unrename_map:
+ ds = ds.rename(unrename_map)
# Build and return the reduced FlowSystem
- builder = _ReducedFlowSystemBuilder(self._fs, aggregation_results, timesteps_per_cluster, dt, dim_names)
+ builder = _ReducedFlowSystemBuilder(self._fs, agg_result, timesteps_per_cluster, dt, unrename_map)
return builder.build(ds)
def _validate_for_expansion(self) -> Clustering:
diff --git a/mkdocs.yml b/mkdocs.yml
index e827a5d89..ea81f4487 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -79,9 +79,6 @@ nav:
- Clustering:
- Introduction: notebooks/08c-clustering.ipynb
- Storage Modes: notebooks/08c2-clustering-storage-modes.ipynb
- - Multi-Period: notebooks/08d-clustering-multiperiod.ipynb
- - Segmentation: notebooks/08f-clustering-segmentation.ipynb
- - Internals: notebooks/08e-clustering-internals.ipynb
- Results:
- Plotting: notebooks/09-plotting-and-data-access.ipynb
diff --git a/pyproject.toml b/pyproject.toml
index 0431d1833..4762ff7d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,8 @@ network_viz = [
# Full feature set (everything except dev tools)
full = [
- "tsam >= 3.1.2, < 4", # Time series aggregation for clustering (3.0.0 and 3.1.0 yanked)
+ "tsam_xarray >= 0.5.1, < 1", # Time series aggregation for clustering (wraps tsam)
+ "tsam >= 3.1.2, < 4", # Directly imported for ClusterConfig, ExtremeConfig, SegmentConfig
"pyvis==0.3.2", # Visualizing FlowSystem Network
"scipy >= 1.15.1, < 2", # Used by tsam. Prior versions have conflict with highspy. See https://github.com/scipy/scipy/issues/22257
"gurobipy >= 10.0.0, < 14; python_version < '3.14'", # No Python 3.14 wheels yet (expected Q1 2026)
@@ -77,7 +78,7 @@ full = [
# Development tools and testing
dev = [
- "tsam==3.1.2", # Time series aggregation for clustering
+ "tsam_xarray>=0.5.1", # Time series aggregation for clustering (wraps tsam)
"pytest==9.0.2",
"pytest-xdist==3.8.0",
"nbformat==5.10.4",
diff --git a/tests/conftest.py b/tests/conftest.py
index 84b137c84..b3950cc35 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -566,11 +566,11 @@ def flow_system_long():
thermal_load_ts, electrical_load_ts = (
fx.TimeSeriesData(thermal_load),
- fx.TimeSeriesData(electrical_load, clustering_weight=0.7),
+ fx.TimeSeriesData(electrical_load),
)
p_feed_in, p_sell = (
- fx.TimeSeriesData(-(p_el - 0.5), clustering_group='p_el'),
- fx.TimeSeriesData(p_el + 0.5, clustering_group='p_el'),
+ fx.TimeSeriesData(-(p_el - 0.5)),
+ fx.TimeSeriesData(p_el + 0.5),
)
flow_system = fx.FlowSystem(pd.DatetimeIndex(data.index))
diff --git a/tests/deprecated/conftest.py b/tests/deprecated/conftest.py
index efa9fa119..ff0538073 100644
--- a/tests/deprecated/conftest.py
+++ b/tests/deprecated/conftest.py
@@ -562,11 +562,11 @@ def flow_system_long():
thermal_load_ts, electrical_load_ts = (
fx.TimeSeriesData(thermal_load),
- fx.TimeSeriesData(electrical_load, clustering_weight=0.7),
+ fx.TimeSeriesData(electrical_load),
)
p_feed_in, p_sell = (
- fx.TimeSeriesData(-(p_el - 0.5), clustering_group='p_el'),
- fx.TimeSeriesData(p_el + 0.5, clustering_group='p_el'),
+ fx.TimeSeriesData(-(p_el - 0.5)),
+ fx.TimeSeriesData(p_el + 0.5),
)
flow_system = fx.FlowSystem(pd.DatetimeIndex(data.index))
diff --git a/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py b/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py
index bbb03f06b..95797888e 100644
--- a/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py
+++ b/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py
@@ -58,9 +58,9 @@ def get_solutions(optimizations: list, variable: str) -> xr.Dataset:
# TimeSeriesData objects
TS_heat_demand = fx.TimeSeriesData(heat_demand)
- TS_electricity_demand = fx.TimeSeriesData(electricity_demand, clustering_weight=0.7)
- TS_electricity_price_sell = fx.TimeSeriesData(-(electricity_price - 0.5), clustering_group='p_el')
- TS_electricity_price_buy = fx.TimeSeriesData(electricity_price + 0.5, clustering_group='p_el')
+ TS_electricity_demand = fx.TimeSeriesData(electricity_demand)
+ TS_electricity_price_sell = fx.TimeSeriesData(-(electricity_price - 0.5))
+ TS_electricity_price_buy = fx.TimeSeriesData(electricity_price + 0.5)
flow_system = fx.FlowSystem(timesteps)
flow_system.add_elements(
diff --git a/tests/test_clustering/test_base.py b/tests/test_clustering/test_base.py
index 81afc2a97..f69de4cdf 100644
--- a/tests/test_clustering/test_base.py
+++ b/tests/test_clustering/test_base.py
@@ -5,8 +5,29 @@
import pytest
import xarray as xr
-from flixopt.clustering import Clustering, ClusteringResults
-from flixopt.clustering.base import _build_timestep_mapping, _cluster_occurrences
+from flixopt.clustering import Clustering
+
+tsam_xarray = pytest.importorskip('tsam_xarray')
+
+
+def _make_clustering_result(clusterings: dict, dim_names: list[str]):
+ """Create a ClusteringResult from a dict of tsam ClusteringResult-like objects."""
+ return tsam_xarray.ClusteringResult(
+ time_dim='time',
+ cluster_dim=['variable'],
+ slice_dims=dim_names,
+ clusterings=clusterings,
+ )
+
+
+def _make_clustering(clusterings: dict, dim_names: list[str], n_timesteps: int | None = None):
+ """Create a Clustering from mock ClusteringResult objects."""
+ cr_result = _make_clustering_result(clusterings, dim_names)
+ first = next(iter(clusterings.values()))
+ if n_timesteps is None:
+ n_timesteps = first.n_original_periods * first.n_timesteps_per_period
+ original_timesteps = pd.date_range('2024-01-01', periods=n_timesteps, freq='h')
+ return Clustering(clustering_result=cr_result, original_timesteps=original_timesteps)
class TestHelperFunctions:
@@ -22,194 +43,28 @@ class MockClusteringResult:
n_timesteps_per_period = 24
cluster_assignments = (0, 1, 0, 1, 2, 0)
period_duration = 24.0
- n_segments = None # None indicates non-segmented
- segment_assignments = None # None indicates non-segmented
-
- def to_dict(self):
- return {
- 'n_clusters': self.n_clusters,
- 'n_original_periods': self.n_original_periods,
- 'n_timesteps_per_period': self.n_timesteps_per_period,
- 'cluster_assignments': list(self.cluster_assignments),
- 'period_duration': self.period_duration,
- }
-
- def apply(self, data):
- """Mock apply method."""
- return {'applied': True}
+ n_segments = None
+ segment_assignments = None
+ cluster_centers = (0, 1, 4)
return MockClusteringResult()
def test_cluster_occurrences(self, mock_clustering_result):
- """Test _cluster_occurrences helper."""
- occurrences = _cluster_occurrences(mock_clustering_result)
+ """Test cluster_occurrences via Clustering."""
+ clustering = _make_clustering({(): mock_clustering_result}, [])
+ occurrences = clustering.cluster_occurrences
# cluster 0: 3 occurrences (indices 0, 2, 5)
# cluster 1: 2 occurrences (indices 1, 3)
# cluster 2: 1 occurrence (index 4)
- np.testing.assert_array_equal(occurrences, [3, 2, 1])
-
- def test_build_timestep_mapping(self, mock_clustering_result):
- """Test _build_timestep_mapping helper."""
- mapping = _build_timestep_mapping(mock_clustering_result, n_timesteps=144)
- assert len(mapping) == 144
-
- # First 24 timesteps should map to cluster 0's representative (0-23)
- np.testing.assert_array_equal(mapping[:24], np.arange(24))
-
- # Second 24 timesteps (period 1 -> cluster 1) should map to cluster 1's representative (24-47)
- np.testing.assert_array_equal(mapping[24:48], np.arange(24, 48))
-
-
-class TestClusteringResults:
- """Tests for ClusteringResults collection class."""
-
- @pytest.fixture
- def mock_clustering_result_factory(self):
- """Factory for creating mock ClusteringResult objects."""
-
- def create_result(cluster_assignments, n_timesteps_per_period=24):
- class MockClusteringResult:
- n_clusters = max(cluster_assignments) + 1 if cluster_assignments else 0
- n_original_periods = len(cluster_assignments)
- period_duration = 24.0
- n_segments = None # None indicates non-segmented
- segment_assignments = None # None indicates non-segmented
-
- def __init__(self, assignments, n_timesteps):
- self.cluster_assignments = tuple(assignments)
- self.n_timesteps_per_period = n_timesteps
-
- def to_dict(self):
- return {
- 'n_clusters': self.n_clusters,
- 'n_original_periods': self.n_original_periods,
- 'n_timesteps_per_period': self.n_timesteps_per_period,
- 'cluster_assignments': list(self.cluster_assignments),
- 'period_duration': self.period_duration,
- }
-
- def apply(self, data):
- return {'applied': True}
-
- return MockClusteringResult(cluster_assignments, n_timesteps_per_period)
-
- return create_result
-
- def test_single_result(self, mock_clustering_result_factory):
- """Test ClusteringResults with single result."""
- cr = mock_clustering_result_factory([0, 1, 0])
- results = ClusteringResults({(): cr}, dim_names=[])
-
- assert results.n_clusters == 2
- assert results.timesteps_per_cluster == 24
- assert len(results) == 1
-
- def test_multi_period_results(self, mock_clustering_result_factory):
- """Test ClusteringResults with multiple periods."""
- cr_2020 = mock_clustering_result_factory([0, 1, 0])
- cr_2030 = mock_clustering_result_factory([1, 0, 1])
-
- results = ClusteringResults(
- {(2020,): cr_2020, (2030,): cr_2030},
- dim_names=['period'],
- )
-
- assert results.n_clusters == 2
- assert len(results) == 2
-
- # Access by period
- assert results.sel(period=2020) is cr_2020
- assert results.sel(period=2030) is cr_2030
-
- def test_dims_property(self, mock_clustering_result_factory):
- """Test dims property returns tuple (xarray-like)."""
- cr = mock_clustering_result_factory([0, 1, 0])
- results = ClusteringResults({(): cr}, dim_names=[])
- assert results.dims == ()
-
- cr_2020 = mock_clustering_result_factory([0, 1, 0])
- cr_2030 = mock_clustering_result_factory([1, 0, 1])
- results = ClusteringResults(
- {(2020,): cr_2020, (2030,): cr_2030},
- dim_names=['period'],
- )
- assert results.dims == ('period',)
-
- def test_coords_property(self, mock_clustering_result_factory):
- """Test coords property returns dict (xarray-like)."""
- cr_2020 = mock_clustering_result_factory([0, 1, 0])
- cr_2030 = mock_clustering_result_factory([1, 0, 1])
- results = ClusteringResults(
- {(2020,): cr_2020, (2030,): cr_2030},
- dim_names=['period'],
- )
- assert results.coords == {'period': [2020, 2030]}
-
- def test_sel_method(self, mock_clustering_result_factory):
- """Test sel() method (xarray-like selection)."""
- cr_2020 = mock_clustering_result_factory([0, 1, 0])
- cr_2030 = mock_clustering_result_factory([1, 0, 1])
- results = ClusteringResults(
- {(2020,): cr_2020, (2030,): cr_2030},
- dim_names=['period'],
- )
- assert results.sel(period=2020) is cr_2020
- assert results.sel(period=2030) is cr_2030
-
- def test_sel_invalid_key_raises(self, mock_clustering_result_factory):
- """Test sel() raises KeyError for invalid key."""
- cr = mock_clustering_result_factory([0, 1, 0])
- results = ClusteringResults({(2020,): cr}, dim_names=['period'])
-
- with pytest.raises(KeyError):
- results.sel(period=2030)
-
- def test_isel_method(self, mock_clustering_result_factory):
- """Test isel() method (xarray-like integer selection)."""
- cr_2020 = mock_clustering_result_factory([0, 1, 0])
- cr_2030 = mock_clustering_result_factory([1, 0, 1])
- results = ClusteringResults(
- {(2020,): cr_2020, (2030,): cr_2030},
- dim_names=['period'],
- )
- assert results.isel(period=0) is cr_2020
- assert results.isel(period=1) is cr_2030
-
- def test_isel_invalid_index_raises(self, mock_clustering_result_factory):
- """Test isel() raises IndexError for out-of-range index."""
- cr = mock_clustering_result_factory([0, 1, 0])
- results = ClusteringResults({(2020,): cr}, dim_names=['period'])
-
- with pytest.raises(IndexError):
- results.isel(period=5)
-
- def test_cluster_assignments_dataarray(self, mock_clustering_result_factory):
- """Test cluster_assignments returns correct DataArray."""
- cr = mock_clustering_result_factory([0, 1, 0])
- results = ClusteringResults({(): cr}, dim_names=[])
-
- cluster_assignments = results.cluster_assignments
- assert isinstance(cluster_assignments, xr.DataArray)
- assert 'original_cluster' in cluster_assignments.dims
- np.testing.assert_array_equal(cluster_assignments.values, [0, 1, 0])
-
- def test_cluster_occurrences_dataarray(self, mock_clustering_result_factory):
- """Test cluster_occurrences returns correct DataArray."""
- cr = mock_clustering_result_factory([0, 1, 0]) # 2 x cluster 0, 1 x cluster 1
- results = ClusteringResults({(): cr}, dim_names=[])
-
- occurrences = results.cluster_occurrences
- assert isinstance(occurrences, xr.DataArray)
- assert 'cluster' in occurrences.dims
- np.testing.assert_array_equal(occurrences.values, [2, 1])
+ np.testing.assert_array_equal(occurrences.values, [3, 2, 1])
class TestClustering:
- """Tests for Clustering dataclass."""
+ """Tests for Clustering class."""
@pytest.fixture
- def basic_cluster_results(self):
- """Create basic ClusteringResults for testing."""
+ def mock_cr(self):
+ """Create a mock tsam ClusteringResult."""
class MockClusteringResult:
n_clusters = 3
@@ -217,33 +72,16 @@ class MockClusteringResult:
n_timesteps_per_period = 24
cluster_assignments = (0, 1, 0, 1, 2, 0)
period_duration = 24.0
- n_segments = None # None indicates non-segmented
- segment_assignments = None # None indicates non-segmented
+ n_segments = None
+ segment_assignments = None
+ cluster_centers = (0, 1, 4)
- def to_dict(self):
- return {
- 'n_clusters': self.n_clusters,
- 'n_original_periods': self.n_original_periods,
- 'n_timesteps_per_period': self.n_timesteps_per_period,
- 'cluster_assignments': list(self.cluster_assignments),
- 'period_duration': self.period_duration,
- }
-
- def apply(self, data):
- return {'applied': True}
-
- mock_cr = MockClusteringResult()
- return ClusteringResults({(): mock_cr}, dim_names=[])
+ return MockClusteringResult()
@pytest.fixture
- def basic_clustering(self, basic_cluster_results):
+ def basic_clustering(self, mock_cr):
"""Create a basic Clustering instance for testing."""
- original_timesteps = pd.date_range('2024-01-01', periods=144, freq='h')
-
- return Clustering(
- results=basic_cluster_results,
- original_timesteps=original_timesteps,
- )
+ return _make_clustering({(): mock_cr}, [])
def test_basic_creation(self, basic_clustering):
"""Test basic Clustering creation."""
@@ -251,10 +89,6 @@ def test_basic_creation(self, basic_clustering):
assert basic_clustering.timesteps_per_cluster == 24
assert basic_clustering.n_original_clusters == 6
- def test_n_representatives(self, basic_clustering):
- """Test n_representatives property."""
- assert basic_clustering.n_representatives == 72 # 3 * 24
-
def test_cluster_occurrences(self, basic_clustering):
"""Test cluster_occurrences property returns correct values."""
occurrences = basic_clustering.cluster_occurrences
@@ -265,39 +99,6 @@ def test_cluster_occurrences(self, basic_clustering):
assert occurrences.sel(cluster=1).item() == 2
assert occurrences.sel(cluster=2).item() == 1
- def test_representative_weights(self, basic_clustering):
- """Test representative_weights is same as cluster_occurrences."""
- weights = basic_clustering.representative_weights
- occurrences = basic_clustering.cluster_occurrences
- xr.testing.assert_equal(
- weights.drop_vars('cluster', errors='ignore'),
- occurrences.drop_vars('cluster', errors='ignore'),
- )
-
- def test_timestep_mapping(self, basic_clustering):
- """Test timestep_mapping property."""
- mapping = basic_clustering.timestep_mapping
- assert isinstance(mapping, xr.DataArray)
- assert 'original_time' in mapping.dims
- assert len(mapping) == 144 # Original timesteps
-
- def test_metrics(self, basic_clustering):
- """Test metrics property returns empty Dataset when no metrics."""
- metrics = basic_clustering.metrics
- assert isinstance(metrics, xr.Dataset)
- # No metrics provided, so should be empty
- assert len(metrics.data_vars) == 0
-
- def test_cluster_start_positions(self, basic_clustering):
- """Test cluster_start_positions property."""
- positions = basic_clustering.cluster_start_positions
- np.testing.assert_array_equal(positions, [0, 24, 48])
-
- def test_empty_results_raises(self):
- """Test that empty results raises ValueError."""
- with pytest.raises(ValueError, match='cannot be empty'):
- ClusteringResults({}, dim_names=[])
-
def test_repr(self, basic_clustering):
"""Test string representation."""
repr_str = repr(basic_clustering)
@@ -305,12 +106,16 @@ def test_repr(self, basic_clustering):
assert '6 periods' in repr_str
assert '3 clusters' in repr_str
+ def test_dim_names_no_extra(self, basic_clustering):
+ """Test dim_names with no extra dimensions."""
+ assert basic_clustering.dim_names == []
+
class TestClusteringMultiDim:
"""Tests for Clustering with period/scenario dimensions."""
@pytest.fixture
- def mock_clustering_result_factory(self):
+ def mock_cr_factory(self):
"""Factory for creating mock ClusteringResult objects."""
def create_result(cluster_assignments, n_timesteps_per_period=24):
@@ -318,167 +123,28 @@ class MockClusteringResult:
n_clusters = max(cluster_assignments) + 1 if cluster_assignments else 0
n_original_periods = len(cluster_assignments)
period_duration = 24.0
- n_segments = None # None indicates non-segmented
- segment_assignments = None # None indicates non-segmented
+ n_segments = None
+ segment_assignments = None
+ cluster_centers = tuple(range(max(cluster_assignments) + 1)) if cluster_assignments else ()
def __init__(self, assignments, n_timesteps):
self.cluster_assignments = tuple(assignments)
self.n_timesteps_per_period = n_timesteps
- def to_dict(self):
- return {
- 'n_clusters': self.n_clusters,
- 'n_original_periods': self.n_original_periods,
- 'n_timesteps_per_period': self.n_timesteps_per_period,
- 'cluster_assignments': list(self.cluster_assignments),
- 'period_duration': self.period_duration,
- }
-
- def apply(self, data):
- return {'applied': True}
-
return MockClusteringResult(cluster_assignments, n_timesteps_per_period)
return create_result
- def test_multi_period_clustering(self, mock_clustering_result_factory):
+ def test_multi_period_clustering(self, mock_cr_factory):
"""Test Clustering with multiple periods."""
- cr_2020 = mock_clustering_result_factory([0, 1, 0])
- cr_2030 = mock_clustering_result_factory([1, 0, 1])
+ cr_2020 = mock_cr_factory([0, 1, 0])
+ cr_2030 = mock_cr_factory([1, 0, 1])
- results = ClusteringResults(
+ clustering = _make_clustering(
{(2020,): cr_2020, (2030,): cr_2030},
- dim_names=['period'],
- )
- original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h')
-
- clustering = Clustering(
- results=results,
- original_timesteps=original_timesteps,
+ ['period'],
)
assert clustering.n_clusters == 2
assert 'period' in clustering.cluster_occurrences.dims
-
- def test_get_result(self, mock_clustering_result_factory):
- """Test get_result method."""
- cr = mock_clustering_result_factory([0, 1, 0])
- results = ClusteringResults({(): cr}, dim_names=[])
- original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h')
-
- clustering = Clustering(
- results=results,
- original_timesteps=original_timesteps,
- )
-
- retrieved = clustering.get_result()
- assert retrieved is cr
-
- def test_get_result_invalid_key(self, mock_clustering_result_factory):
- """Test get_result with invalid key raises KeyError."""
- cr = mock_clustering_result_factory([0, 1, 0])
- results = ClusteringResults({(2020,): cr}, dim_names=['period'])
- original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h')
-
- clustering = Clustering(
- results=results,
- original_timesteps=original_timesteps,
- )
-
- with pytest.raises(KeyError):
- clustering.get_result(period=2030)
-
-
-class TestClusteringPlotAccessor:
- """Tests for ClusteringPlotAccessor."""
-
- @pytest.fixture
- def clustering_with_data(self):
- """Create Clustering with original and aggregated data."""
-
- class MockClusteringResult:
- n_clusters = 2
- n_original_periods = 3
- n_timesteps_per_period = 24
- cluster_assignments = (0, 1, 0)
- period_duration = 24.0
-
- def to_dict(self):
- return {
- 'n_clusters': self.n_clusters,
- 'n_original_periods': self.n_original_periods,
- 'n_timesteps_per_period': self.n_timesteps_per_period,
- 'cluster_assignments': list(self.cluster_assignments),
- 'period_duration': self.period_duration,
- }
-
- def apply(self, data):
- return {'applied': True}
-
- mock_cr = MockClusteringResult()
- results = ClusteringResults({(): mock_cr}, dim_names=[])
-
- original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h')
-
- original_data = xr.Dataset(
- {
- 'col1': xr.DataArray(np.random.randn(72), dims=['time'], coords={'time': original_timesteps}),
- }
- )
- aggregated_data = xr.Dataset(
- {
- 'col1': xr.DataArray(
- np.random.randn(2, 24),
- dims=['cluster', 'time'],
- coords={'cluster': [0, 1], 'time': pd.date_range('2000-01-01', periods=24, freq='h')},
- ),
- }
- )
-
- return Clustering(
- results=results,
- original_timesteps=original_timesteps,
- original_data=original_data,
- aggregated_data=aggregated_data,
- )
-
- def test_plot_accessor_exists(self, clustering_with_data):
- """Test that plot accessor is available."""
- assert hasattr(clustering_with_data, 'plot')
- assert hasattr(clustering_with_data.plot, 'compare')
- assert hasattr(clustering_with_data.plot, 'heatmap')
- assert hasattr(clustering_with_data.plot, 'clusters')
-
- def test_compare_requires_data(self):
- """Test compare() raises when no data available."""
-
- class MockClusteringResult:
- n_clusters = 2
- n_original_periods = 2
- n_timesteps_per_period = 24
- cluster_assignments = (0, 1)
- period_duration = 24.0
-
- def to_dict(self):
- return {
- 'n_clusters': self.n_clusters,
- 'n_original_periods': self.n_original_periods,
- 'n_timesteps_per_period': self.n_timesteps_per_period,
- 'cluster_assignments': list(self.cluster_assignments),
- 'period_duration': self.period_duration,
- }
-
- def apply(self, data):
- return {'applied': True}
-
- mock_cr = MockClusteringResult()
- results = ClusteringResults({(): mock_cr}, dim_names=[])
- original_timesteps = pd.date_range('2024-01-01', periods=48, freq='h')
-
- clustering = Clustering(
- results=results,
- original_timesteps=original_timesteps,
- )
-
- with pytest.raises(ValueError, match='No original/aggregated data'):
- clustering.plot.compare()
+ assert clustering.dim_names == ['period']
diff --git a/tests/test_clustering/test_cluster_reduce_expand.py b/tests/test_clustering/test_cluster_reduce_expand.py
index 679307fba..d8c9cbf74 100644
--- a/tests/test_clustering/test_cluster_reduce_expand.py
+++ b/tests/test_clustering/test_cluster_reduce_expand.py
@@ -915,8 +915,8 @@ def test_extremes_append_with_segments(self, solver_fixture, timesteps_8_days):
n_clusters = fs_clustered.clustering.n_clusters
assert n_clusters >= 2
- # n_representatives = n_clusters * n_segments
- assert fs_clustered.clustering.n_representatives == n_clusters * 6
+ # n_clusters * n_segments
+ assert n_clusters * fs_clustered.clustering.n_segments == n_clusters * 6
# Verify optimization works
fs_clustered.optimize(solver_fixture)
@@ -930,157 +930,6 @@ def test_extremes_append_with_segments(self, solver_fixture, timesteps_8_days):
assert int(fs_clustered.clustering.cluster_occurrences.sum()) == 8
-# ==================== Data Vars Parameter Tests ====================
-
-
-class TestDataVarsParameter:
- """Tests for data_vars parameter in cluster() method."""
-
- def test_cluster_with_data_vars_subset(self, timesteps_8_days):
- """Test clustering with a subset of variables."""
- # Create system with multiple time-varying data
- hours = len(timesteps_8_days)
- demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15
- price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 # Different pattern
-
- fs = fx.FlowSystem(timesteps_8_days)
- fs.add_elements(
- fx.Bus('Heat'),
- fx.Bus('Gas'),
- fx.Effect('costs', '€', is_standard=True, is_objective=True),
- fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]),
- fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]),
- fx.linear_converters.Boiler(
- 'Boiler',
- thermal_efficiency=0.9,
- fuel_flow=fx.Flow('Q_fu', bus='Gas'),
- thermal_flow=fx.Flow('Q_th', bus='Heat'),
- ),
- )
-
- # Cluster based only on demand profile (not price)
- fs_reduced = fs.transform.cluster(
- n_clusters=2,
- cluster_duration='1D',
- data_vars=['HeatDemand(Q)|fixed_relative_profile'],
- )
-
- # Should have clustered structure
- assert len(fs_reduced.timesteps) == 24
- assert len(fs_reduced.clusters) == 2
-
- def test_data_vars_validation_error(self, timesteps_8_days):
- """Test that invalid data_vars raises ValueError."""
- fs = create_simple_system(timesteps_8_days)
-
- with pytest.raises(ValueError, match='data_vars not found'):
- fs.transform.cluster(
- n_clusters=2,
- cluster_duration='1D',
- data_vars=['NonExistentVariable'],
- )
-
- def test_data_vars_preserves_all_flowsystem_data(self, timesteps_8_days):
- """Test that clustering with data_vars preserves all FlowSystem variables."""
- # Create system with multiple time-varying data
- hours = len(timesteps_8_days)
- demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15
- price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05
-
- fs = fx.FlowSystem(timesteps_8_days)
- fs.add_elements(
- fx.Bus('Heat'),
- fx.Bus('Gas'),
- fx.Effect('costs', '€', is_standard=True, is_objective=True),
- fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]),
- fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]),
- fx.linear_converters.Boiler(
- 'Boiler',
- thermal_efficiency=0.9,
- fuel_flow=fx.Flow('Q_fu', bus='Gas'),
- thermal_flow=fx.Flow('Q_th', bus='Heat'),
- ),
- )
-
- # Cluster based only on demand profile
- fs_reduced = fs.transform.cluster(
- n_clusters=2,
- cluster_duration='1D',
- data_vars=['HeatDemand(Q)|fixed_relative_profile'],
- )
-
- # Both demand and price should be preserved in the reduced FlowSystem
- ds = fs_reduced.to_dataset()
- assert 'HeatDemand(Q)|fixed_relative_profile' in ds.data_vars
- assert 'GasSource(Gas)|costs|per_flow_hour' in ds.data_vars
-
- def test_data_vars_optimization_works(self, solver_fixture, timesteps_8_days):
- """Test that FlowSystem clustered with data_vars can be optimized."""
- hours = len(timesteps_8_days)
- demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15
- price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05
-
- fs = fx.FlowSystem(timesteps_8_days)
- fs.add_elements(
- fx.Bus('Heat'),
- fx.Bus('Gas'),
- fx.Effect('costs', '€', is_standard=True, is_objective=True),
- fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]),
- fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]),
- fx.linear_converters.Boiler(
- 'Boiler',
- thermal_efficiency=0.9,
- fuel_flow=fx.Flow('Q_fu', bus='Gas'),
- thermal_flow=fx.Flow('Q_th', bus='Heat'),
- ),
- )
-
- fs_reduced = fs.transform.cluster(
- n_clusters=2,
- cluster_duration='1D',
- data_vars=['HeatDemand(Q)|fixed_relative_profile'],
- )
-
- # Should optimize successfully
- fs_reduced.optimize(solver_fixture)
- assert fs_reduced.solution is not None
- assert 'Boiler(Q_th)|flow_rate' in fs_reduced.solution
-
- def test_data_vars_with_multiple_variables(self, timesteps_8_days):
- """Test clustering with multiple selected variables."""
- hours = len(timesteps_8_days)
- demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15
- price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05
-
- fs = fx.FlowSystem(timesteps_8_days)
- fs.add_elements(
- fx.Bus('Heat'),
- fx.Bus('Gas'),
- fx.Effect('costs', '€', is_standard=True, is_objective=True),
- fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]),
- fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]),
- fx.linear_converters.Boiler(
- 'Boiler',
- thermal_efficiency=0.9,
- fuel_flow=fx.Flow('Q_fu', bus='Gas'),
- thermal_flow=fx.Flow('Q_th', bus='Heat'),
- ),
- )
-
- # Cluster based on both demand and price
- fs_reduced = fs.transform.cluster(
- n_clusters=2,
- cluster_duration='1D',
- data_vars=[
- 'HeatDemand(Q)|fixed_relative_profile',
- 'GasSource(Gas)|costs|per_flow_hour',
- ],
- )
-
- assert len(fs_reduced.timesteps) == 24
- assert len(fs_reduced.clusters) == 2
-
-
# ==================== Segmentation Tests ====================
@@ -1249,28 +1098,6 @@ def test_segmented_statistics_after_expand(self, solver_fixture, timesteps_8_day
flow_rates = stats.flow_rates
assert 'time' in flow_rates.dims
- def test_segmented_timestep_mapping_uses_segment_assignments(self, timesteps_8_days):
- """Test that timestep_mapping correctly maps original timesteps to segments."""
- from tsam import SegmentConfig
-
- fs = create_simple_system(timesteps_8_days)
-
- fs_segmented = fs.transform.cluster(
- n_clusters=2,
- cluster_duration='1D',
- segments=SegmentConfig(n_segments=6),
- )
-
- mapping = fs_segmented.clustering.timestep_mapping
-
- # Mapping should have original timestep count
- assert len(mapping.values) == 192
-
- # Each mapped value should be in valid range: [0, n_clusters * n_segments)
- max_valid_idx = 2 * 6 - 1 # n_clusters * n_segments - 1
- assert mapping.min().item() >= 0
- assert mapping.max().item() <= max_valid_idx
-
@pytest.mark.parametrize('freq', ['1h', '2h'])
def test_segmented_total_effects_match_solution(self, solver_fixture, freq):
"""Test that total_effects matches solution Cost after expand with segmentation.
@@ -1449,13 +1276,6 @@ def test_segmented_expand_maps_correctly_per_period(self, solver_fixture, timest
fs_segmented.optimize(solver_fixture)
- # Get the timestep_mapping which should be multi-dimensional
- mapping = fs_segmented.clustering.timestep_mapping
-
- # Mapping should have period dimension
- assert 'period' in mapping.dims
- assert mapping.sizes['period'] == 2
-
# Expand and verify each period has correct number of timesteps
fs_expanded = fs_segmented.transform.expand()
flow_var = 'Boiler(Q_th)|flow_rate'
diff --git a/tests/test_clustering/test_clustering_io.py b/tests/test_clustering/test_clustering_io.py
index 0e2200885..93769b167 100644
--- a/tests/test_clustering/test_clustering_io.py
+++ b/tests/test_clustering/test_clustering_io.py
@@ -70,13 +70,9 @@ def test_clustering_to_dataset_has_clustering_attrs(self, simple_system_8_days):
ds = fs_clustered.to_dataset(include_solution=False)
- # Check that clustering attrs are present
+ # Check that clustering attrs are present (serialized as JSON string)
assert 'clustering' in ds.attrs
- # Check that clustering arrays are present with prefix
- clustering_vars = [name for name in ds.data_vars if name.startswith('clustering|')]
- assert len(clustering_vars) > 0
-
def test_clustering_roundtrip_preserves_clustering_object(self, simple_system_8_days):
"""Clustering object should be restored after roundtrip."""
from flixopt.clustering import Clustering
@@ -124,17 +120,6 @@ def test_clustering_roundtrip_preserves_original_timesteps(self, simple_system_8
# check_names=False because index name may be lost during serialization
pd.testing.assert_index_equal(fs_restored.clustering.original_timesteps, original_timesteps, check_names=False)
- def test_clustering_roundtrip_preserves_timestep_mapping(self, simple_system_8_days):
- """Timestep mapping should be preserved after roundtrip."""
- fs = simple_system_8_days
- fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
- original_mapping = fs_clustered.clustering.timestep_mapping.values.copy()
-
- ds = fs_clustered.to_dataset(include_solution=False)
- fs_restored = fx.FlowSystem.from_dataset(ds)
-
- np.testing.assert_array_equal(fs_restored.clustering.timestep_mapping.values, original_mapping)
-
class TestClusteringWithSolutionRoundtrip:
"""Test that clustering with solution survives roundtrip."""
@@ -623,23 +608,23 @@ def test_cluster_assignments_preserved_after_roundtrip(self, system_with_periods
# cluster_assignments should be exactly preserved
xr.testing.assert_equal(original_cluster_assignments, fs_restored.clustering.cluster_assignments)
- def test_results_preserved_after_load(self, system_with_periods_and_scenarios, tmp_path):
- """ClusteringResults should be preserved after loading (via ClusteringResults.to_dict())."""
+ def test_clustering_result_preserved_after_load(self, system_with_periods_and_scenarios, tmp_path):
+ """ClusteringResult should be preserved after loading."""
fs = system_with_periods_and_scenarios
fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
- # Before save, results exists
- assert fs_clustered.clustering.results is not None
+ # Before save, clustering_result exists
+ assert fs_clustered.clustering.clustering_result is not None
# Roundtrip
nc_path = tmp_path / 'multi_dim_clustering.nc'
fs_clustered.to_netcdf(nc_path)
fs_restored = fx.FlowSystem.from_netcdf(nc_path)
- # After load, results should be reconstructed
- assert fs_restored.clustering.results is not None
- # The restored results should have the same structure
- assert len(fs_restored.clustering.results) == len(fs_clustered.clustering.results)
+ # After load, clustering_result should be reconstructed
+ assert fs_restored.clustering.clustering_result is not None
+ # The restored clustering should have the same structure
+ assert len(fs_restored.clustering) == len(fs_clustered.clustering)
def test_derived_properties_work_after_load(self, system_with_periods_and_scenarios, tmp_path):
"""Derived properties should work correctly after loading (computed from cluster_assignments)."""
@@ -676,8 +661,8 @@ def test_apply_clustering_after_load(self, system_with_periods_and_scenarios, tm
# Load the full FlowSystem with clustering
fs_loaded = fx.FlowSystem.from_netcdf(nc_path)
clustering_loaded = fs_loaded.clustering
- # ClusteringResults should be fully preserved after load
- assert clustering_loaded.results is not None
+ # ClusteringResult should be fully preserved after load
+ assert clustering_loaded.clustering_result is not None
# Create a fresh FlowSystem (copy the original, unclustered one)
fs_fresh = fs.copy()
diff --git a/tests/test_clustering/test_expansion_regression.py b/tests/test_clustering/test_expansion_regression.py
new file mode 100644
index 000000000..1bce3b4e7
--- /dev/null
+++ b/tests/test_clustering/test_expansion_regression.py
@@ -0,0 +1,157 @@
+"""Regression tests for cluster → optimize → expand numerical equivalence.
+
+These tests verify that the expanded solution values match known reference
+values, catching any changes in the clustering/expansion pipeline.
+"""
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import flixopt as fx
+
+tsam = pytest.importorskip('tsam')
+
+
+@pytest.fixture
+def system_with_storage():
+ """System with storage (tests charge_state) and effects (tests segment totals)."""
+ ts = pd.date_range('2020-01-01', periods=192, freq='h') # 8 days
+ demand = np.sin(np.linspace(0, 16 * np.pi, 192)) * 10 + 15
+
+ fs = fx.FlowSystem(ts)
+ fs.add_elements(
+ fx.Bus('Heat'),
+ fx.Bus('Gas'),
+ fx.Effect('costs', '€', is_standard=True, is_objective=True),
+ fx.Sink('D', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]),
+ fx.Source('G', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=0.05)]),
+ fx.linear_converters.Boiler(
+ 'B',
+ thermal_efficiency=0.9,
+ fuel_flow=fx.Flow('Q_fu', bus='Gas'),
+ thermal_flow=fx.Flow('Q_th', bus='Heat'),
+ ),
+ fx.Storage(
+ 'S',
+ capacity_in_flow_hours=50,
+ initial_charge_state=0.5,
+ charging=fx.Flow('in', bus='Heat', size=10),
+ discharging=fx.Flow('out', bus='Heat', size=10),
+ ),
+ )
+ return fs
+
+
+class TestNonSegmentedExpansion:
+ """Test that non-segmented cluster → expand produces correct values."""
+
+ def test_expanded_objective_matches(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D')
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ assert fs_e.solution['objective'].item() == pytest.approx(160.0, abs=1e-6)
+
+ def test_expanded_flow_rates(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D')
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ sol = fs_e.solution
+ assert float(np.nansum(sol['B(Q_th)|flow_rate'].values)) == pytest.approx(2880.0, abs=1e-6)
+ assert float(np.nansum(sol['D(Q)|flow_rate'].values)) == pytest.approx(2880.0, abs=1e-6)
+ assert float(np.nansum(sol['G(Gas)|flow_rate'].values)) == pytest.approx(3200.0, abs=1e-6)
+
+ def test_expanded_costs(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D')
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ sol = fs_e.solution
+ assert float(np.nansum(sol['costs(temporal)|per_timestep'].values)) == pytest.approx(160.0, abs=1e-6)
+ assert float(np.nansum(sol['G(Gas)->costs(temporal)'].values)) == pytest.approx(160.0, abs=1e-6)
+
+ def test_expanded_storage(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D')
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ sol = fs_e.solution
+ # Storage dispatch varies by solver — check charge_state is non-trivial
+ assert float(np.nansum(sol['S|charge_state'].values)) > 0
+ # Net discharge should be ~0 (balanced storage)
+ assert float(np.nansum(sol['S|netto_discharge'].values)) == pytest.approx(0, abs=1e-4)
+
+ def test_expanded_shapes(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D')
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ sol = fs_e.solution
+ # 192 original timesteps + 1 extra boundary = 193
+ for name in sol.data_vars:
+ if 'time' in sol[name].dims:
+ assert sol[name].sizes['time'] == 193, f'{name} has wrong time size'
+
+
+class TestSegmentedExpansion:
+ """Test that segmented cluster → expand produces correct values."""
+
+ def test_expanded_objective_matches(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(
+ n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6)
+ )
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ assert fs_e.solution['objective'].item() == pytest.approx(160.0, abs=1e-6)
+
+ def test_expanded_flow_rates(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(
+ n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6)
+ )
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ sol = fs_e.solution
+ assert float(np.nansum(sol['B(Q_th)|flow_rate'].values)) == pytest.approx(2880.0, abs=1e-6)
+ assert float(np.nansum(sol['D(Q)|flow_rate'].values)) == pytest.approx(2880.0, abs=1e-6)
+ assert float(np.nansum(sol['G(Gas)|flow_rate'].values)) == pytest.approx(3200.0, abs=1e-6)
+
+ def test_expanded_costs(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(
+ n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6)
+ )
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ sol = fs_e.solution
+ assert float(np.nansum(sol['costs(temporal)|per_timestep'].values)) == pytest.approx(160.0, abs=1e-6)
+ assert float(np.nansum(sol['G(Gas)->costs(temporal)'].values)) == pytest.approx(160.0, abs=1e-6)
+
+ def test_expanded_shapes(self, system_with_storage, solver_fixture):
+ fs_c = system_with_storage.transform.cluster(
+ n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6)
+ )
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ sol = fs_e.solution
+ for name in sol.data_vars:
+ if 'time' in sol[name].dims:
+ assert sol[name].sizes['time'] == 193, f'{name} has wrong time size'
+
+ def test_no_nans_in_expanded_flow_rates(self, system_with_storage, solver_fixture):
+ """Segmented expansion must ffill — no NaNs in flow rates (except extra boundary)."""
+ fs_c = system_with_storage.transform.cluster(
+ n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6)
+ )
+ fs_c.optimize(solver_fixture)
+ fs_e = fs_c.transform.expand()
+
+ sol = fs_e.solution
+ for name in ['B(Q_th)|flow_rate', 'D(Q)|flow_rate', 'G(Gas)|flow_rate']:
+ # Exclude last timestep (extra boundary, may be NaN for non-state variables)
+ vals = sol[name].isel(time=slice(None, -1))
+ assert not vals.isnull().any(), f'{name} has NaN values after expansion'
diff --git a/tests/test_clustering/test_integration.py b/tests/test_clustering/test_integration.py
index f5d23c691..c424ceca2 100644
--- a/tests/test_clustering/test_integration.py
+++ b/tests/test_clustering/test_integration.py
@@ -122,97 +122,6 @@ def test_weights_with_cluster_weight(self):
np.testing.assert_array_almost_equal(fs.temporal_weight.values, expected.values)
-class TestClusteringData:
- """Tests for FlowSystem.transform.clustering_data method."""
-
- def test_clustering_data_method_exists(self):
- """Test that transform.clustering_data method exists."""
- fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=48, freq='h'))
-
- assert hasattr(fs.transform, 'clustering_data')
- assert callable(fs.transform.clustering_data)
-
- def test_clustering_data_returns_dataset(self):
- """Test that clustering_data returns an xr.Dataset."""
- from flixopt import Bus, Flow, Sink, Source
-
- n_hours = 48
- fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'))
-
- # Add components with time-varying data
- demand_data = np.sin(np.linspace(0, 4 * np.pi, n_hours)) + 2
- bus = Bus('electricity')
- source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)])
- sink = Sink(
- 'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)]
- )
- fs.add_elements(source, sink, bus)
-
- clustering_data = fs.transform.clustering_data()
-
- assert isinstance(clustering_data, xr.Dataset)
-
- def test_clustering_data_contains_only_time_varying(self):
- """Test that clustering_data returns only time-varying data."""
- from flixopt import Bus, Flow, Sink, Source
-
- n_hours = 48
- fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'))
-
- # Add components with time-varying and constant data
- demand_data = np.sin(np.linspace(0, 4 * np.pi, n_hours)) + 2
- bus = Bus('electricity')
- source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)])
- sink = Sink(
- 'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)]
- )
- fs.add_elements(source, sink, bus)
-
- clustering_data = fs.transform.clustering_data()
-
- # Should contain the demand profile
- assert 'demand(demand_out)|fixed_relative_profile' in clustering_data.data_vars
-
- # All arrays should have 'time' dimension
- for var in clustering_data.data_vars:
- assert 'time' in clustering_data[var].dims
-
- def test_clustering_data_with_periods(self):
- """Test clustering_data with multi-period system."""
- from flixopt import Bus, Effect, Flow, Sink, Source
-
- n_hours = 48
- periods = pd.Index([2024, 2030], name='period')
- fs = FlowSystem(
- timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'),
- periods=periods,
- )
-
- # Add components
- demand_data = xr.DataArray(
- np.random.rand(n_hours, 2),
- dims=['time', 'period'],
- coords={'time': fs.timesteps, 'period': periods},
- )
- bus = Bus('electricity')
- effect = Effect('costs', '€', is_objective=True)
- source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)])
- sink = Sink(
- 'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)]
- )
- fs.add_elements(source, sink, bus, effect)
-
- # Get data for specific period
- data_2024 = fs.transform.clustering_data(period=2024)
-
- # Should not have period dimension (it was selected)
- assert 'period' not in data_2024.dims
-
- # Get data for all periods
- data_all = fs.transform.clustering_data()
- assert 'period' in data_all.dims
-
-
class TestClusterMethod:
"""Tests for FlowSystem.transform.cluster method."""
@@ -302,15 +211,6 @@ def test_hierarchical_is_deterministic(self, basic_flow_system):
# Hierarchical clustering should produce identical cluster orders
xr.testing.assert_equal(fs1.clustering.cluster_assignments, fs2.clustering.cluster_assignments)
- def test_metrics_available(self, basic_flow_system):
- """Test that clustering metrics are available after clustering."""
- fs_clustered = basic_flow_system.transform.cluster(n_clusters=2, cluster_duration='1D')
-
- assert fs_clustered.clustering.metrics is not None
- assert isinstance(fs_clustered.clustering.metrics, xr.Dataset)
- assert 'time_series' in fs_clustered.clustering.metrics.dims
- assert len(fs_clustered.clustering.metrics.data_vars) > 0
-
def test_representation_method_parameter(self, basic_flow_system):
"""Test that representation method via ClusterConfig works."""
from tsam import ClusterConfig
@@ -338,118 +238,26 @@ def test_tsam_kwargs_passthrough(self, basic_flow_system):
def test_unknown_weight_keys_raise(self, basic_flow_system):
"""Test that unknown keys in ClusterConfig.weights raise ValueError.
- Regression test: weight keys that don't match any variable in the
- FlowSystem are likely typos and should be caught early with a clear
- error message.
+ tsam_xarray validates weight keys and raises ValueError for unknown coords.
"""
from tsam import ClusterConfig
# Get actual clustering column names
- clustering_data = basic_flow_system.transform.clustering_data()
- real_columns = list(clustering_data.data_vars)
+ ds = basic_flow_system.to_dataset(include_solution=False)
+ real_columns = [n for n in ds.data_vars if 'time' in ds[n].dims]
# Build weights with real keys + extra bogus keys
weights = {col: 1.0 for col in real_columns}
weights['nonexistent_variable'] = 0.5
weights['another_missing_col'] = 0.3
- with pytest.raises(ValueError, match='unknown variables'):
- basic_flow_system.transform.cluster(
- n_clusters=2,
- cluster_duration='1D',
- cluster=ClusterConfig(weights=weights),
- )
-
- def test_weight_keys_excluded_by_data_vars_raise(self, basic_flow_system):
- """Test that weight keys excluded by the data_vars allow-list raise ValueError.
-
- A variable may exist on the FlowSystem but be intentionally omitted from
- the clustering input via data_vars. Weights referencing such excluded
- variables should be rejected.
- """
- from tsam import ClusterConfig
-
- ds = basic_flow_system.to_dataset(include_solution=False)
- clustering_columns = list(basic_flow_system.transform.clustering_data().data_vars)
- excluded_var = sorted(set(ds.data_vars) - set(clustering_columns))[0]
-
- # Weight references both a selected var and an excluded var
- weights = {clustering_columns[0]: 1.0, excluded_var: 0.5}
-
- with pytest.raises(ValueError, match='unknown variables'):
+ with pytest.raises(ValueError, match='unknown'):
basic_flow_system.transform.cluster(
n_clusters=2,
cluster_duration='1D',
- data_vars=clustering_columns,
cluster=ClusterConfig(weights=weights),
)
- def test_extra_weight_keys_filtered_with_constant_column(self):
- """Test that weights for constant (dropped) columns are filtered out.
-
- When a time series is constant over time it is removed before clustering.
- User-provided weights referencing such columns must be silently dropped.
- """
- pytest.importorskip('tsam')
- from tsam import ClusterConfig
-
- from flixopt import Bus, Flow, Sink, Source
- from flixopt.core import TimeSeriesData
-
- n_hours = 168 # 7 days
- fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'))
-
- demand_data = np.sin(np.linspace(0, 14 * np.pi, n_hours)) + 2
- bus = Bus('electricity')
- grid_flow = Flow('grid_in', bus='electricity', size=100)
- # One varying profile, one constant profile
- demand_flow = Flow(
- 'demand_out',
- bus='electricity',
- size=100,
- fixed_relative_profile=TimeSeriesData(demand_data / 100),
- )
- constant_flow = Flow(
- 'constant_out',
- bus='electricity',
- size=50,
- fixed_relative_profile=TimeSeriesData(np.full(n_hours, 0.8)),
- )
- source = Source('grid', outputs=[grid_flow])
- sink = Sink('demand', inputs=[demand_flow])
- constant_sink = Sink('constant_load', inputs=[constant_flow])
- fs.add_elements(source, sink, constant_sink, bus)
-
- # Use to_dataset() to get ALL columns including the constant one
- # (clustering_data() already strips constants, so it wouldn't test the path)
- all_data = fs.to_dataset(include_solution=False)
- all_columns = set(all_data.data_vars)
- clustering_columns = set(fs.transform.clustering_data().data_vars)
-
- # Identify constant columns: variables with a single unique value across time
- constant_columns = set()
- for name in all_data.data_vars:
- var = all_data[name]
- if 'time' not in var.dims or np.nanmax(var.values) - np.nanmin(var.values) < 1e-10:
- constant_columns.add(name)
-
- assert len(constant_columns) > 0, 'Test requires at least one constant column'
- assert constant_columns <= all_columns, 'Constant columns must be in the full dataset'
- for col in constant_columns:
- assert col not in clustering_columns, f'Constant column {col!r} should not be in clustering_data()'
-
- # Build weights that reference ALL columns including the constant one
- # that will be dropped — these are valid variables, just constant over time
- weights = {col: 1.0 for col in all_columns}
-
- # Must not raise: constant columns are silently filtered, not rejected
- fs_clustered = fs.transform.cluster(
- n_clusters=2,
- cluster_duration='1D',
- cluster=ClusterConfig(weights=weights),
- )
- assert len(fs_clustered.clusters) == 2
-
def test_unknown_weight_keys_raise_multiperiod(self):
"""Test that unknown weight keys raise ValueError in multi-period clustering."""
pytest.importorskip('tsam')
@@ -477,11 +285,11 @@ def test_unknown_weight_keys_raise_multiperiod(self):
sink = Sink('demand', inputs=[demand_flow])
fs.add_elements(source, sink, bus)
- clustering_data = fs.transform.clustering_data()
- weights = {col: 1.0 for col in clustering_data.data_vars}
+ ds = fs.to_dataset(include_solution=False)
+ weights = {n: 1.0 for n in ds.data_vars if 'time' in ds[n].dims}
weights['nonexistent_period_var'] = 0.7
- with pytest.raises(ValueError, match='unknown variables'):
+ with pytest.raises(ValueError, match='unknown'):
fs.transform.cluster(
n_clusters=2,
cluster_duration='1D',
@@ -519,8 +327,8 @@ def test_valid_weight_keys_multiperiod(self):
sink = Sink('demand', inputs=[demand_flow])
fs.add_elements(source, sink, bus)
- clustering_data = fs.transform.clustering_data()
- weights = {col: 1.0 for col in clustering_data.data_vars}
+ ds = fs.to_dataset(include_solution=False)
+ weights = {n: 1.0 for n in ds.data_vars if 'time' in ds[n].dims}
fs_clustered = fs.transform.cluster(
n_clusters=2,
@@ -528,36 +336,6 @@ def test_valid_weight_keys_multiperiod(self):
cluster=ClusterConfig(weights=weights),
)
assert len(fs_clustered.clusters) == 2
- assert 'period' in fs_clustered.clustering.metrics.dims
-
- def test_metrics_with_periods(self):
- """Test that metrics have period dimension for multi-period FlowSystems."""
- pytest.importorskip('tsam')
- from flixopt import Bus, Flow, Sink, Source
- from flixopt.core import TimeSeriesData
-
- n_hours = 168 # 7 days
- fs = FlowSystem(
- timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'),
- periods=pd.Index([2025, 2030], name='period'),
- )
-
- demand_data = np.sin(np.linspace(0, 14 * np.pi, n_hours)) + 2
- bus = Bus('electricity')
- grid_flow = Flow('grid_in', bus='electricity', size=100)
- demand_flow = Flow(
- 'demand_out', bus='electricity', size=100, fixed_relative_profile=TimeSeriesData(demand_data / 100)
- )
- source = Source('grid', outputs=[grid_flow])
- sink = Sink('demand', inputs=[demand_flow])
- fs.add_elements(source, sink, bus)
-
- fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D')
-
- # Metrics should have period dimension
- assert fs_clustered.clustering.metrics is not None
- assert 'period' in fs_clustered.clustering.metrics.dims
- assert len(fs_clustered.clustering.metrics.period) == 2
class TestClusteringModuleImports:
diff --git a/tests/test_clustering/test_multiperiod_extremes.py b/tests/test_clustering/test_multiperiod_extremes.py
index 973efe79d..01cb7d026 100644
--- a/tests/test_clustering/test_multiperiod_extremes.py
+++ b/tests/test_clustering/test_multiperiod_extremes.py
@@ -522,9 +522,9 @@ def test_append_with_segments(self, solver_fixture, timesteps_8_days):
assert fs_clustered.clustering.is_segmented is True
assert fs_clustered.clustering.n_segments == 4
- # n_representatives = n_clusters * n_segments
+ # total representative timesteps = n_clusters * n_segments
 n_clusters = fs_clustered.clustering.n_clusters
- assert fs_clustered.clustering.n_representatives == n_clusters * 4
+ assert len(fs_clustered.clusters) * fs_clustered.clustering.n_segments == n_clusters * 4
fs_clustered.optimize(solver_fixture)
assert fs_clustered.solution is not None
@@ -987,22 +987,3 @@ def test_cluster_occurrences_sum_to_original(self, timesteps_8_days, periods_2):
f'Occurrences for period {period} with n_clusters={n_clusters}: '
f'{int(period_occurrences.sum())} != 8'
)
-
- def test_timestep_mapping_valid_range(self, timesteps_8_days, periods_2):
- """Test that timestep_mapping values are within valid range."""
- fs = create_multiperiod_system_with_different_profiles(timesteps_8_days, periods_2)
-
- fs_clustered = fs.transform.cluster(n_clusters=3, cluster_duration='1D')
-
- mapping = fs_clustered.clustering.timestep_mapping
-
- # Mapping values should be in [0, n_clusters * timesteps_per_cluster - 1]
- max_valid = 3 * 24 - 1 # n_clusters * timesteps_per_cluster - 1
- assert mapping.min().item() >= 0
- assert mapping.max().item() <= max_valid
-
- # Each period should have valid mappings
- for period in periods_2:
- period_mapping = mapping.sel(period=period)
- assert period_mapping.min().item() >= 0
- assert period_mapping.max().item() <= max_valid
diff --git a/tests/utilities/test_dataconverter.py b/tests/utilities/test_dataconverter.py
index f9f2df889..0909b3d25 100644
--- a/tests/utilities/test_dataconverter.py
+++ b/tests/utilities/test_dataconverter.py
@@ -478,7 +478,7 @@ class TestTimeSeriesDataConversion:
def test_timeseries_data_basic(self, time_coords):
"""TimeSeriesData should work like DataArray."""
data_array = xr.DataArray([10, 20, 30, 40, 50], coords={'time': time_coords}, dims='time')
- ts_data = TimeSeriesData(data_array, clustering_group='test')
+ ts_data = TimeSeriesData(data_array)
result = DataConverter.to_dataarray(ts_data, coords={'time': time_coords})