diff --git a/docs/notebooks/08c-clustering.ipynb b/docs/notebooks/08c-clustering.ipynb index 602468cba..b612756a9 100644 --- a/docs/notebooks/08c-clustering.ipynb +++ b/docs/notebooks/08c-clustering.ipynb @@ -14,10 +14,16 @@ "- **Typical periods**: Cluster similar time segments (e.g., days) and solve only representative ones\n", "- **Weighted costs**: Automatically weight operational costs by cluster occurrence\n", "- **Two-stage workflow**: Fast sizing with clustering, accurate dispatch at full resolution\n", + "- **Segmentation**: Reduce timesteps within each cluster for further compression\n", "\n", "!!! note \"Requirements\"\n", - " This notebook requires the `tsam` package with `ClusterConfig` and `ExtremeConfig` support.\n", - " Install with: `pip install \"flixopt[full]\"`" + " This notebook requires the `tsam` and `tsam_xarray` packages.\n", + " Install with: `pip install \"flixopt[full]\"`\n", + "\n", + "!!! tip \"tsam_xarray\"\n", + " flixopt uses [tsam_xarray](https://github.com/FZJ-IEK3-VSA/tsam_xarray) for clustering,\n", + " which wraps [tsam](https://github.com/FZJ-IEK3-VSA/tsam). For advanced clustering options\n", + " (custom algorithms, weights, tuning), see the tsam_xarray documentation." ] }, { @@ -171,7 +177,9 @@ "source": [ "## Understanding the Clustering\n", "\n", - "The clustering algorithm groups similar days together. Access all metadata via `fs.clustering`:" + "Access clustering metadata via `fs.clustering`. For full access to the underlying\n", + "[tsam_xarray ClusteringResult](https://github.com/FZJ-IEK3-VSA/tsam_xarray),\n", + "use `fs.clustering.clustering_result`." ] }, { @@ -181,202 +189,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Access clustering metadata directly\n", - "clustering = fs_clustered.clustering.results\n", - "clustering" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12", - "metadata": {}, - "outputs": [], - "source": [ - "# Show clustering info using __repr__\n", + "# Clustering overview\n", "fs_clustered.clustering" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "# Quality metrics - how well do the clusters represent the original data?\n", - "# Lower RMSE/MAE = better representation\n", - "fs_clustered.clustering.metrics.to_dataframe().style.format('{:.3f}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14", - "metadata": {}, - "outputs": [], - "source": [ - "# Visual comparison: original vs clustered time series\n", - "fs_clustered.clustering.plot.compare()" - ] - }, - { - "cell_type": "markdown", - "id": "15", - "metadata": {}, - "source": [ - "## Inspect Clustering Input Data\n", - "\n", - "Before clustering, you can inspect which time-varying data will be used.\n", - "The `clustering_data()` method returns only the arrays that vary over time\n", - "(constant arrays are excluded since they don't affect clustering):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16", - "metadata": {}, - "outputs": [], - "source": [ - "# See what data will be used for clustering\n", - "clustering_data = flow_system.transform.clustering_data()\n", - "print(f'Variables used for clustering ({len(clustering_data.data_vars)} total):')\n", - "for var in clustering_data.data_vars:\n", - " print(f' - {var}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize the time-varying data (select a few key variables)\n", - "key_vars = [v for v in clustering_data.data_vars if 'fixed_relative_profile' in v or 'effects_per_flow_hour' in v]\n", - "clustering_data[key_vars].plotly.line(facet_row='variable', title='Time-Varying Data Used for Clustering')" - ] - }, - { - "cell_type": "markdown", - "id": "18", - "metadata": {}, - "source": [ - "## Selective Clustering with `data_vars`\n", - "\n", - "By default, clustering uses **all** time-varying data to determine typical periods.\n", - "However, you may want to cluster based on only a **subset** of variables while still\n", - "applying the clustering to all data.\n", - "\n", - "Use the `data_vars` parameter to specify which variables determine the clustering:\n", - "\n", - "- **Cluster based on subset**: Only the specified variables affect which days are grouped together\n", - "- **Apply to all data**: The resulting clustering is applied to ALL time-varying data\n", - "\n", - "This is useful when:\n", - "- You want to cluster based on demand patterns only (ignoring price variations)\n", - "- You have dominant time series that should drive the clustering\n", - "- You want to ensure certain patterns are well-represented in typical periods" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": {}, - "outputs": [], - "source": [ - "# Cluster based ONLY on heat demand pattern (ignore electricity prices)\n", - "demand_var = 'HeatDemand(Q_th)|fixed_relative_profile'\n", - "\n", - "fs_demand_only = flow_system.transform.cluster(\n", - " n_clusters=8,\n", - " cluster_duration='1D',\n", - " data_vars=[demand_var], # Only this variable determines clustering\n", - " extremes=ExtremeConfig(method='new_cluster', max_value=[demand_var]),\n", - ")\n", - "\n", - "# Verify: clustering was determined by demand but applied to all data\n", - "print(f'Clustered using: {demand_var}')\n", - "print(f'But all {len(clustering_data.data_vars)} variables are included in the result')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20", - "metadata": {}, - "outputs": [], - "source": [ - "# Compare metrics: clustering with all data vs. demand-only\n", - "pd.DataFrame(\n", - " {\n", - " 'All Variables': fs_clustered.clustering.metrics.to_dataframe().iloc[0],\n", - " 'Demand Only': fs_demand_only.clustering.metrics.to_dataframe().iloc[0],\n", - " }\n", - ").round(4)" - ] - }, { "cell_type": "markdown", - "id": "21", - "metadata": {}, - "source": [ - "## Advanced Clustering Options\n", - "\n", - "The `cluster()` method exposes many parameters for fine-tuning:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22", - "metadata": {}, - "outputs": [], - "source": [ - "from tsam import ClusterConfig\n", - "\n", - "# Try different clustering algorithms\n", - "fs_kmeans = flow_system.transform.cluster(\n", - " n_clusters=8,\n", - " cluster_duration='1D',\n", - " cluster=ClusterConfig(method='kmeans'), # Alternative: 'hierarchical' (default), 'kmedoids', 'averaging'\n", - ")\n", - "\n", - "fs_kmeans.clustering" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23", - "metadata": {}, - "outputs": [], - "source": [ - "# Compare quality metrics between algorithms\n", - "pd.DataFrame(\n", - " {\n", - " 'hierarchical': fs_clustered.clustering.metrics.to_dataframe().iloc[0],\n", - " 'kmeans': fs_kmeans.clustering.metrics.to_dataframe().iloc[0],\n", - " }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize cluster structure with heatmap\n", - "fs_clustered.clustering.plot.heatmap()" - ] - }, - { - "cell_type": "markdown", - "id": "25", + "id": "12", "metadata": {}, "source": [ "### Apply Existing Clustering\n", @@ -402,7 +221,7 @@ }, { "cell_type": "markdown", - "id": "26", + "id": "13", "metadata": {}, "source": [ "## Method 3: Two-Stage Workflow (Recommended)\n", @@ -420,7 +239,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27", + "id": "14", "metadata": {}, "outputs": [], "source": [ @@ -432,7 +251,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28", + "id": "15", "metadata": {}, "outputs": [], "source": [ @@ -451,7 +270,7 @@ }, { "cell_type": "markdown", - "id": "29", + "id": "16", "metadata": {}, "source": [ "## Compare Results" @@ -460,7 +279,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30", + "id": "17", "metadata": {}, "outputs": [], "source": [ @@ -509,7 +328,7 @@ }, { "cell_type": "markdown", - "id": "31", + "id": "18", "metadata": {}, "source": [ "## Expand Solution to Full Resolution\n", @@ -521,7 +340,7 @@ { "cell_type": "code", "execution_count": null, - "id": "32", + "id": "19", "metadata": {}, "outputs": [], "source": [ @@ -532,7 +351,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33", + "id": "20", "metadata": {}, "outputs": [], "source": [ @@ -554,7 +373,7 @@ }, { "cell_type": "markdown", - "id": "34", + "id": "21", "metadata": {}, "source": [ "## Visualize Clustered Heat Balance" @@ -563,7 +382,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35", + "id": "22", "metadata": {}, "outputs": [], "source": [ @@ -573,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36", + "id": "23", "metadata": {}, "outputs": [], "source": [ @@ -582,10 +401,10 @@ }, { "cell_type": "markdown", - "id": "37", + "id": "24", "metadata": {}, "source": [ - "markdown## API Reference\n", + "## API Reference\n", "\n", "### `transform.cluster()` Parameters\n", "\n", @@ -593,25 +412,11 @@ "|-----------|------|---------|-------------|\n", "| `n_clusters` | `int` | - | Number of typical periods (e.g., 8 typical days) |\n", "| `cluster_duration` | `str \\| float` | - | Duration per cluster ('1D', '24h') or hours |\n", - "| `data_vars` | `list[str]` | None | Variables to cluster on (applies result to all) |\n", - "| `weights` | `dict[str, float]` | None | Optional weights for time series in clustering |\n", - "| `cluster` | `ClusterConfig` | None | Clustering algorithm configuration |\n", + "| `cluster` | `ClusterConfig` | None | Clustering algorithm and weights. Use `weights={var: 0}` to exclude variables. |\n", "| `extremes` | `ExtremeConfig` | None | **Essential**: Force inclusion of peak/min periods |\n", + "| `segments` | `SegmentConfig` | None | Intra-period segmentation (variable timestep durations) |\n", "| `**tsam_kwargs` | - | - | Additional tsam parameters |\n", "\n", - "### `transform.clustering_data()` Method\n", - "\n", - "Inspect which time-varying data will be used for clustering:\n", - "\n", - "```python\n", - "# Get all time-varying variables\n", - "clustering_data = flow_system.transform.clustering_data()\n", - "print(list(clustering_data.data_vars))\n", - "\n", - "# Get data for a specific period (multi-period systems)\n", - "clustering_data = flow_system.transform.clustering_data(period=2024)\n", - "```\n", - "\n", "### Clustering Object Properties\n", "\n", "After clustering, access metadata via `fs.clustering`:\n", @@ -619,31 +424,12 @@ "| Property | Description |\n", "|----------|-------------|\n", "| `n_clusters` | Number of clusters |\n", - "| `n_original_clusters` | Number of original time segments (e.g., 365 days) |\n", - "| `timesteps_per_cluster` | Timesteps in each cluster (e.g., 24 for daily) |\n", - "| `cluster_assignments` | xr.DataArray mapping original segment → cluster ID |\n", + "| `n_original_clusters` | Number of original time segments (e.g., 31 days) |\n", + "| `timesteps_per_cluster` | Timesteps in each cluster (e.g., 96 for daily at 15 min) |\n", + "| `cluster_assignments` | xr.DataArray mapping original segment to cluster ID |\n", "| `cluster_occurrences` | How many original segments each cluster represents |\n", - "| `metrics` | xr.Dataset with RMSE, MAE per time series |\n", - "| `results` | `ClusteringResults` with xarray-like interface |\n", - "| `plot.compare()` | Compare original vs clustered time series |\n", - "| `plot.heatmap()` | Visualize cluster structure |\n", - "\n", - "### ClusteringResults (xarray-like)\n", - "\n", - "Access the underlying tsam results via `clustering.results`:\n", - "\n", - "```python\n", - "# Dimension info (like xarray)\n", - "clustering.results.dims # ('period', 'scenario') or ()\n", - "clustering.results.coords # {'period': [2020, 2030], 'scenario': ['high', 'low']}\n", - "\n", - "# Select specific result (like xarray)\n", - "clustering.results.sel(period=2020, scenario='high') # Label-based\n", - "clustering.results.isel(period=0, scenario=1) # Index-based\n", - "\n", - "# Apply existing clustering to new data\n", - "agg_results = clustering.results.apply(dataset) # Returns AggregationResults\n", - "```\n", + "| `clustering_result` | Full [tsam_xarray ClusteringResult](https://github.com/FZJ-IEK3-VSA/tsam_xarray) |\n", + "| `aggregation_result` | Full [tsam_xarray AggregationResult](https://github.com/FZJ-IEK3-VSA/tsam_xarray) (pre-IO only) |\n", "\n", "### Storage Behavior\n", "\n", @@ -656,44 +442,12 @@ "| `'cyclic'` | Each cluster is independent but cyclic (start = end) |\n", "| `'independent'` | Each cluster is independent, free start/end |\n", "\n", - "For a detailed comparison of storage modes, see [08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb).\n", - "\n", - "### Peak Forcing with ExtremeConfig\n", - "\n", - "```python\n", - "from tsam import ExtremeConfig\n", - "\n", - "extremes = ExtremeConfig(\n", - " method='new_cluster', # Creates new cluster for extremes\n", - " max_value=['ComponentName(FlowName)|fixed_relative_profile'], # Capture peak demand\n", - ")\n", - "```\n", - "\n", - "### Recommended Workflow\n", - "\n", - "```python\n", - "from tsam import ExtremeConfig\n", - "\n", - "# Stage 1: Fast sizing\n", - "fs_sizing = flow_system.transform.cluster(\n", - " n_clusters=8,\n", - " cluster_duration='1D',\n", - " extremes=ExtremeConfig(method='new_cluster', max_value=['Demand(Flow)|fixed_relative_profile']),\n", - ")\n", - "fs_sizing.optimize(solver)\n", - "\n", - "# Apply safety margin\n", - "sizes = {k: v.item() * 1.05 for k, v in fs_sizing.stats.sizes.items()}\n", - "\n", - "# Stage 2: Accurate dispatch\n", - "fs_dispatch = flow_system.transform.fix_sizes(sizes)\n", - "fs_dispatch.optimize(solver)\n", - "```" + "For a detailed comparison of storage modes, see [08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb)." ] }, { "cell_type": "markdown", - "id": "38", + "id": "25", "metadata": {}, "source": [ "## Summary\n", @@ -701,30 +455,25 @@ "You learned how to:\n", "\n", "- Use **`cluster()`** to reduce time series into typical periods\n", - "- **Inspect clustering data** with `clustering_data()` before clustering\n", - "- Use **`data_vars`** to cluster based on specific variables only\n", "- Apply **peak forcing** with `ExtremeConfig` to capture extreme demand days\n", "- Use **two-stage optimization** for fast yet accurate investment decisions\n", "- **Expand solutions** back to full resolution with `expand()`\n", - "- Access **clustering metadata** via `fs.clustering` (metrics, cluster_assignments, cluster_occurrences)\n", - "- Use **advanced options** like different algorithms with `ClusterConfig`\n", + "- Access **clustering metadata** via `fs.clustering`\n", "- **Apply existing clustering** to other FlowSystems using `apply_clustering()`\n", "\n", "### Key Takeaways\n", "\n", "1. **Always use peak forcing** (`extremes=ExtremeConfig(max_value=[...])`) for demand time series\n", - "2. **Inspect data first** with `clustering_data()` to see available variables\n", - "3. **Use `data_vars`** to cluster on specific variables (e.g., demand only, ignoring prices)\n", - "4. **Add safety margin** (5-10%) when fixing sizes from clustering\n", - "5. **Two-stage is recommended**: clustering for sizing, full resolution for dispatch\n", - "6. **Storage handling** is configurable via `cluster_mode`\n", - "7. **Check metrics** to evaluate clustering quality\n", - "8. **Use `apply_clustering()`** to apply the same clustering to different FlowSystem variants\n", + "2. **Add safety margin** (5-10%) when fixing sizes from clustering\n", + "3. **Two-stage is recommended**: clustering for sizing, full resolution for dispatch\n", + "4. **Storage handling** is configurable via `cluster_mode`\n", + "5. **Use `apply_clustering()`** to apply the same clustering to different FlowSystem variants\n", + "6. For advanced clustering options (weights, algorithms, segmentation, tuning), see\n", + " [tsam_xarray](https://github.com/FZJ-IEK3-VSA/tsam_xarray) and [tsam](https://github.com/FZJ-IEK3-VSA/tsam)\n", "\n", "### Next Steps\n", "\n", - "- **[08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb)**: Compare storage modes using a seasonal storage system\n", - "- **[08d-clustering-multiperiod](08d-clustering-multiperiod.ipynb)**: Clustering with multiple periods and scenarios" + "- **[08c2-clustering-storage-modes](08c2-clustering-storage-modes.ipynb)**: Compare storage modes using a seasonal storage system" ] } ], diff --git a/docs/notebooks/08d-clustering-multiperiod.ipynb b/docs/notebooks/08d-clustering-multiperiod.ipynb deleted file mode 100644 index 82da05c6f..000000000 --- a/docs/notebooks/08d-clustering-multiperiod.ipynb +++ /dev/null @@ -1,609 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0", - "metadata": {}, - "source": [ - "# Multi-Period Clustering with `cluster()`\n", - "\n", - "Combine time series clustering with multi-period investment optimization.\n", - "\n", - "This notebook demonstrates:\n", - "\n", - "- **Multi-period modeling**: Optimize investments across multiple planning periods (years)\n", - "- **Scenario analysis**: Handle demand uncertainty with weighted scenarios\n", - "- **Clustering per period**: Apply typical-period clustering independently for each period/scenario\n", - "- **Scalability**: Reduce computational complexity for long-horizon planning\n", - "\n", - "!!! note \"Requirements\"\n", - " This notebook requires the `tsam` package with `ExtremeConfig` support.\n", - " Install with: `pip install \"flixopt[full]\"`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "import timeit\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import plotly.express as px\n", - "\n", - "import flixopt as fx\n", - "\n", - "fx.CONFIG.notebook()" - ] - }, - { - "cell_type": "markdown", - "id": "2", - "metadata": {}, - "source": [ - "## Create the Multi-Period System\n", - "\n", - "We use a multi-period heating system with:\n", - "- **3 planning periods** (years 2024, 2025, 2026)\n", - "- **2 scenarios** (high demand 30%, low demand 70%)\n", - "- **2 weeks** at hourly resolution (336 timesteps)\n", - "\n", - "This represents a capacity expansion problem where we optimize component sizes once,\n", - "but operations are simulated across multiple future years and demand scenarios." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": {}, - "outputs": [], - "source": [ - "from data.generate_example_systems import create_multiperiod_system\n", - "\n", - "flow_system = create_multiperiod_system()\n", - "\n", - "print(f'Timesteps: {len(flow_system.timesteps)} ({len(flow_system.timesteps) // 24} days)')\n", - "print(f'Periods: {list(flow_system.periods.values)}')\n", - "print(f'Scenarios: {list(flow_system.scenarios.values)}')\n", - "print(f'Scenario weights: {flow_system.scenario_weights.values}')\n", - "print(f'\\nComponents: {list(flow_system.components.keys())}')" - ] - }, - { - "cell_type": "markdown", - "id": "4", - "metadata": {}, - "source": [ - "## Selecting a Subset with `transform.isel()`\n", - "\n", - "For demonstration purposes, we'll use only the first week of data.\n", - "The `isel()` method (index select) lets you slice FlowSystems by time:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": {}, - "outputs": [], - "source": [ - "# Select first week only (168 hours)\n", - "flow_system = flow_system.transform.isel(time=slice(0, 168))\n", - "\n", - "print(f'After isel: {len(flow_system.timesteps)} timesteps ({len(flow_system.timesteps) // 24} days)')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize demand scenarios (equal across periods)\n", - "heat_demand = flow_system.components['Building'].inputs[0].fixed_relative_profile\n", - "\n", - "fig = px.line(heat_demand.to_dataframe('value').reset_index(), x='time', y='value', facet_row='scenario')\n", - "\n", - "fig.update_layout(\n", - " height=350,\n", - " title='Heat Demand by Scenario (One Week)',\n", - " xaxis_title='Time',\n", - " yaxis_title='Heat Demand [kW]',\n", - ")\n", - "fig.show()" - ] - }, - { - "cell_type": "markdown", - "id": "7", - "metadata": {}, - "source": [ - "## Full Optimization (Baseline)\n", - "\n", - "First, solve the complete problem with all timesteps across all periods and scenarios:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8", - "metadata": {}, - "outputs": [], - "source": [ - "solver = fx.solvers.HighsSolver(mip_gap=0.01)\n", - "\n", - "start = timeit.default_timer()\n", - "fs_full = flow_system.copy()\n", - "fs_full.name = 'Full Optimization'\n", - "fs_full.optimize(solver)\n", - "time_full = timeit.default_timer() - start\n", - "\n", - "print(f'Full optimization: {time_full:.2f} seconds')\n", - "print(f'Total cost (objective): {fs_full.solution[\"objective\"].item():,.0f} €')\n", - "print('\\nOptimized sizes:')\n", - "for name, size in fs_full.stats.sizes.items():\n", - " print(f' {name}: {size.max().item():.1f}')" - ] - }, - { - "cell_type": "markdown", - "id": "9", - "metadata": {}, - "source": [ - "## Multi-Period Clustering with `cluster()`\n", - "\n", - "When applied to a multi-period system, `cluster()` clusters **each period/scenario combination independently**.\n", - "This is because demand patterns and optimal operations may differ across:\n", - "\n", - "- **Periods**: Different years may have different characteristics\n", - "- **Scenarios**: High vs low demand scenarios need different representative days\n", - "\n", - "The investment decisions (sizes) remain consistent across all periods and scenarios,\n", - "while the operational patterns are optimized for each cluster." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - "metadata": {}, - "outputs": [], - "source": [ - "from tsam import ExtremeConfig\n", - "\n", - "start = timeit.default_timer()\n", - "\n", - "# Force inclusion of peak demand periods\n", - "peak_series = ['Building(Heat)|fixed_relative_profile']\n", - "\n", - "# Cluster to 3 typical days (from 7 days)\n", - "fs_clustered = flow_system.transform.cluster(\n", - " n_clusters=3,\n", - " cluster_duration='1D',\n", - " extremes=ExtremeConfig(method='replace', max_value=peak_series),\n", - ")\n", - "\n", - "time_clustering = timeit.default_timer() - start\n", - "\n", - "print(f'Clustering time: {time_clustering:.2f} seconds')\n", - "print(f'Reduced: {len(flow_system.timesteps)} → {len(fs_clustered.timesteps)} timesteps per period')\n", - "print('Total problem reduction: 7 days × 3 periods × 2 scenarios → 3 days × 3 × 2')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "# Optimize the reduced system\n", - "start = timeit.default_timer()\n", - "fs_clustered.optimize(solver)\n", - "time_clustered = timeit.default_timer() - start\n", - "\n", - "print(f'Clustered optimization: {time_clustered:.2f} seconds')\n", - "print(f'Total cost (objective): {fs_clustered.solution[\"objective\"].item():,.0f} €')\n", - "print(f'\\nSpeedup vs full: {time_full / (time_clustering + time_clustered):.1f}x')\n", - "print('\\nOptimized sizes:')\n", - "for name, size in fs_clustered.stats.sizes.items():\n", - " print(f' {name}: {size.max().item():.1f}')" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": {}, - "source": [ - "## Visualize Clustering Quality\n", - "\n", - "The `.plot` accessor provides built-in visualizations with automatic faceting by period and scenario:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "# Duration curves show how well the distribution is preserved per period/scenario\n", - "fs_clustered.clustering.plot.compare(\n", - " kind='duration_curve',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14", - "metadata": {}, - "outputs": [], - "source": [ - "# Heatmap shows cluster assignments - faceted by period and scenario\n", - "fs_clustered.clustering.plot.heatmap()" - ] - }, - { - "cell_type": "markdown", - "id": "15", - "metadata": {}, - "source": [ - "## Understand the Cluster Structure\n", - "\n", - "Let's inspect how days were grouped into clusters:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16", - "metadata": {}, - "outputs": [], - "source": [ - "clustering = fs_clustered.clustering\n", - "\n", - "print('Clustering Configuration:')\n", - "print(f' Typical periods (clusters): {clustering.n_clusters}')\n", - "print(f' Timesteps per cluster: {clustering.timesteps_per_cluster}')\n", - "\n", - "# Access underlying results via xarray-like interface\n", - "print(f'\\nClusteringResults dimensions: {clustering.results.dims}')\n", - "print(f'ClusteringResults coords: {clustering.results.coords}')\n", - "\n", - "# The cluster_assignments shows which cluster each original day belongs to\n", - "# For multi-period systems, select a specific period/scenario combination\n", - "cluster_assignments = clustering.cluster_assignments.isel(period=0, scenario=0).values\n", - "day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n", - "\n", - "print('\\nCluster assignments per day (period=2024, scenario=High):')\n", - "for i, cluster_id in enumerate(cluster_assignments):\n", - " print(f' {day_names[i]}: Cluster {cluster_id}')\n", - "\n", - "# Cluster occurrences (how many original days each cluster represents)\n", - "unique, counts = np.unique(cluster_assignments, return_counts=True)\n", - "print('\\nCluster weights (days represented):')\n", - "for cluster_id, count in zip(unique, counts, strict=True):\n", - " print(f' Cluster {cluster_id}: {count} days')" - ] - }, - { - "cell_type": "markdown", - "id": "17", - "metadata": {}, - "source": [ - "## Two-Stage Workflow for Multi-Period\n", - "\n", - "For investment optimization across multiple periods, the recommended workflow is:\n", - "\n", - "1. **Stage 1**: Fast sizing with clustering (reduced timesteps)\n", - "2. **Stage 2**: Fix sizes and run full-resolution dispatch\n", - "\n", - "This gives accurate investment decisions while maintaining computational tractability.\n", - "\n", - "### Safety Margin Rationale\n", - "\n", - "A 10% safety margin is applied to compensate for:\n", - "\n", - "- **Peak underestimation**: Clustering averages similar days, potentially underestimating true peak demands\n", - "- **Temporal detail loss**: Representative periods may miss short-duration extreme events\n", - "- **Scenario averaging**: Weighted scenarios smooth out worst-case conditions\n", - "\n", - "For critical applications, consider 15-20% margins or validate with full-resolution runs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18", - "metadata": {}, - "outputs": [], - "source": [ - "# Stage 1 already done - apply safety margin\n", - "SAFETY_MARGIN = 1.10 # 10% buffer for multi-period uncertainty\n", - "\n", - "sizes_with_margin = {name: size.max().item() * SAFETY_MARGIN for name, size in fs_clustered.stats.sizes.items()}\n", - "\n", - "print('Stage 1: Sizing with clustering')\n", - "print(f' Time: {time_clustering + time_clustered:.2f} seconds')\n", - "print(f' Cost estimate: {fs_clustered.solution[\"objective\"].item():,.0f} €')\n", - "print(f'\\nSizes with {(SAFETY_MARGIN - 1) * 100:.0f}% safety margin:')\n", - "for name, size in sizes_with_margin.items():\n", - " original = fs_clustered.stats.sizes[name].max().item()\n", - " print(f' {name}: {original:.1f} → {size:.1f}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": {}, - "outputs": [], - "source": [ - "# Stage 2: Full resolution dispatch with fixed sizes\n", - "print('Stage 2: Full resolution dispatch')\n", - "start = timeit.default_timer()\n", - "\n", - "fs_dispatch = flow_system.transform.fix_sizes(sizes_with_margin)\n", - "fs_dispatch.name = 'Two-Stage'\n", - "fs_dispatch.optimize(solver)\n", - "\n", - "time_dispatch = timeit.default_timer() - start\n", - "\n", - "print(f' Time: {time_dispatch:.2f} seconds')\n", - "print(f' Actual cost: {fs_dispatch.solution[\"objective\"].item():,.0f} €')\n", - "\n", - "# Total comparison\n", - "total_two_stage = time_clustering + time_clustered + time_dispatch\n", - "print(f'\\nTotal two-stage time: {total_two_stage:.2f} seconds')\n", - "print(f'Speedup vs full: {time_full / total_two_stage:.1f}x')" - ] - }, - { - "cell_type": "markdown", - "id": "20", - "metadata": {}, - "source": [ - "## Compare Results Across Methods" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21", - "metadata": {}, - "outputs": [], - "source": [ - "results = {\n", - " 'Full (baseline)': {\n", - " 'Time [s]': time_full,\n", - " 'Cost [€]': fs_full.solution['objective'].item(),\n", - " 'Boiler': fs_full.stats.sizes['Boiler(Heat)'].max().item(),\n", - " 'Storage': fs_full.stats.sizes['ThermalStorage'].max().item(),\n", - " },\n", - " 'Clustered (3 days)': {\n", - " 'Time [s]': time_clustering + time_clustered,\n", - " 'Cost [€]': fs_clustered.solution['objective'].item(),\n", - " 'Boiler': fs_clustered.stats.sizes['Boiler(Heat)'].max().item(),\n", - " 'Storage': fs_clustered.stats.sizes['ThermalStorage'].max().item(),\n", - " },\n", - " 'Two-Stage': {\n", - " 'Time [s]': total_two_stage,\n", - " 'Cost [€]': fs_dispatch.solution['objective'].item(),\n", - " 'Boiler': sizes_with_margin['Boiler(Heat)'],\n", - " 'Storage': sizes_with_margin['ThermalStorage'],\n", - " },\n", - "}\n", - "\n", - "comparison = pd.DataFrame(results).T\n", - "baseline_cost = comparison.loc['Full (baseline)', 'Cost [€]']\n", - "baseline_time = comparison.loc['Full (baseline)', 'Time [s]']\n", - "comparison['Cost Gap [%]'] = ((comparison['Cost [€]'] - baseline_cost) / abs(baseline_cost) * 100).round(2)\n", - "comparison['Speedup'] = (baseline_time / comparison['Time [s]']).round(1)\n", - "\n", - "comparison.style.format(\n", - " {\n", - " 'Time [s]': '{:.2f}',\n", - " 'Cost [€]': '{:,.0f}',\n", - " 'Boiler': '{:.1f}',\n", - " 'Storage': '{:.0f}',\n", - " 'Cost Gap [%]': '{:.2f}',\n", - " 'Speedup': '{:.1f}x',\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "22", - "metadata": {}, - "source": [ - "## Visualize Optimization Results\n", - "\n", - "Use the built-in statistics plotting to compare results across periods and scenarios:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23", - "metadata": {}, - "outputs": [], - "source": [ - "# Plot flow rates with automatic faceting by period and scenario\n", - "fs_full.stats.plot.flows(component='Boiler')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24", - "metadata": {}, - "outputs": [], - "source": [ - "# Side-by-side comparison using the Comparison class\n", - "comp = fx.Comparison([fs_full, fs_dispatch])\n", - "comp.stats.plot.balance('Heat')" - ] - }, - { - "cell_type": "markdown", - "id": "25", - "metadata": {}, - "source": [ - "## Expand Clustered Solution to Full Resolution\n", - "\n", - "Use `expand()` to map the clustered results back to all original timesteps:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "26", - "metadata": {}, - "outputs": [], - "source": [ - "# Expand the clustered solution\n", - "fs_expanded = fs_clustered.transform.expand()\n", - "\n", - "print(f'Expanded: {len(fs_clustered.timesteps)} → {len(fs_expanded.timesteps)} timesteps')\n", - "print(f'Cost (objective): {fs_expanded.solution[\"objective\"].item():,.0f} €')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27", - "metadata": {}, - "outputs": [], - "source": [ - "# Compare expanded solution - shows the repeated cluster patterns\n", - "fs_expanded.stats.plot.flows(component='Boiler')" - ] - }, - { - "cell_type": "markdown", - "id": "28", - "metadata": {}, - "source": [ - "## Key Considerations for Multi-Period Clustering\n", - "\n", - "### 1. Independent Clustering per Period/Scenario\n", - "\n", - "Each period and scenario combination is clustered independently because:\n", - "- Demand patterns may differ across years (growth, seasonality)\n", - "- Scenarios represent different futures that shouldn't be mixed\n", - "- Investment decisions must be robust across all combinations\n", - "\n", - "### 2. Safety Margins\n", - "\n", - "Multi-period systems often warrant larger safety margins (10-15%) because:\n", - "- More uncertainty across multiple years\n", - "- Investments made once must work for all periods\n", - "- Scenario weights may not perfectly represent actual outcomes\n", - "\n", - "### 3. Computational Benefits\n", - "\n", - "Clustering becomes more valuable as problem size grows:\n", - "\n", - "| Scenario | Full Problem | With Clustering |\n", - "|----------|--------------|----------------|\n", - "| 1 period, 1 scenario, 365 days | 8,760 timesteps | ~730 (10 typical days) |\n", - "| 3 periods, 2 scenarios, 365 days | 52,560 timesteps | ~4,380 |\n", - "| 10 periods, 3 scenarios, 365 days | 262,800 timesteps | ~21,900 |\n", - "\n", - "The speedup factor increases with problem size." - ] - }, - { - "cell_type": "markdown", - "id": "29", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "You learned how to:\n", - "\n", - "- Load **multi-period systems** with periods and scenarios\n", - "- Use **`transform.isel()`** to select time subsets\n", - "- Apply **`cluster()`** to multi-dimensional FlowSystems\n", - "- **Visualize clustering** with the `.plot` accessor (compare, duration curves, heatmaps)\n", - "- Use the **two-stage workflow** for robust investment optimization\n", - "- **Expand solutions** back to full resolution with `expand()`\n", - "\n", - "### Key Takeaways\n", - "\n", - "1. **Clustering is applied per period/scenario**: Each combination gets independent typical periods\n", - "2. **Investments are shared**: Component sizes are optimized once across all periods/scenarios\n", - "3. **Use larger safety margins**: Multi-period uncertainty warrants 10-15% buffers\n", - "4. **Two-stage is recommended**: Fast sizing with clustering, accurate dispatch at full resolution\n", - "5. **Built-in plotting**: Use `.plot` accessor for automatic faceting by period/scenario\n", - "\n", - "### API Reference\n", - "\n", - "```python\n", - "from tsam import ExtremeConfig\n", - "\n", - "# Load multi-period system\n", - "fs = fx.FlowSystem.from_netcdf('multiperiod_system.nc4')\n", - "\n", - "# Select time subset (optional)\n", - "fs = fs.transform.isel(time=slice(0, 168)) # First 168 timesteps\n", - "\n", - "# Cluster (applies per period/scenario)\n", - "# Note: For multi-period systems, only method='replace' is supported\n", - "fs_clustered = fs.transform.cluster(\n", - " n_clusters=10,\n", - " cluster_duration='1D',\n", - " extremes=ExtremeConfig(method='replace', max_value=['Demand(Flow)|fixed_relative_profile']),\n", - ")\n", - "\n", - "# Visualize clustering quality\n", - "fs_clustered.clustering.plot.compare(variable='Demand(Flow)|profile')\n", - "fs_clustered.clustering.plot.heatmap()\n", - "\n", - "# Access underlying results (xarray-like interface)\n", - "fs_clustered.clustering.results.dims # ('period', 'scenario')\n", - "fs_clustered.clustering.results.coords # {'period': [...], 'scenario': [...]}\n", - "fs_clustered.clustering.results.sel(period=2024, scenario='High') # Label-based\n", - "fs_clustered.clustering.results.isel(period=0, scenario=0) # Index-based\n", - "\n", - "# Two-stage workflow\n", - "fs_clustered.optimize(solver)\n", - "sizes = {k: v.max().item() * 1.10 for k, v in fs_clustered.stats.sizes.items()}\n", - "fs_dispatch = fs.transform.fix_sizes(sizes)\n", - "fs_dispatch.optimize(solver)\n", - "\n", - "# Visualize results\n", - "fs_dispatch.stats.plot.flows(component='Boiler')\n", - "```" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/notebooks/08e-clustering-internals.ipynb b/docs/notebooks/08e-clustering-internals.ipynb deleted file mode 100644 index 2d099ff34..000000000 --- a/docs/notebooks/08e-clustering-internals.ipynb +++ /dev/null @@ -1,540 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0", - "metadata": {}, - "source": [ - "# Clustering Internals\n", - "\n", - "Understanding the data structures and visualization tools behind time series clustering.\n", - "\n", - "This notebook demonstrates:\n", - "\n", - "- **Data structure**: The `Clustering` class that stores all clustering information\n", - "- **Plot accessor**: Built-in visualizations via `.plot`\n", - "- **Data expansion**: Using `expand_data()` to map aggregated data back to original timesteps\n", - "- **IO workflow**: What's preserved and lost when saving/loading clustered systems\n", - "\n", - "!!! note \"Requirements\"\n", - " This notebook requires the `tsam` package for time series aggregation.\n", - " Install with: `pip install \"flixopt[full]\"`\n", - "\n", - "!!! note \"Prerequisites\"\n", - " This notebook assumes familiarity with [08c-clustering](08c-clustering.ipynb)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "from data.generate_example_systems import create_district_heating_system\n", - "\n", - "import flixopt as fx\n", - "\n", - "fx.CONFIG.notebook()\n", - "\n", - "flow_system = create_district_heating_system()\n", - "flow_system.connect_and_transform()" - ] - }, - { - "cell_type": "markdown", - "id": "2", - "metadata": {}, - "source": [ - "## Clustering Metadata\n", - "\n", - "After calling `cluster()`, metadata is stored in `fs.clustering`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": {}, - "outputs": [], - "source": [ - "from tsam import ExtremeConfig\n", - "\n", - "fs_clustered = flow_system.transform.cluster(\n", - " n_clusters=8,\n", - " cluster_duration='1D',\n", - " extremes=ExtremeConfig(method='new_cluster', max_value=['HeatDemand(Q_th)|fixed_relative_profile']),\n", - ")\n", - "\n", - "fs_clustered.clustering" - ] - }, - { - "cell_type": "markdown", - "id": "4", - "metadata": {}, - "source": [ - "The `Clustering` object contains:\n", - "- **`cluster_assignments`**: Which cluster each original period maps to\n", - "- **`cluster_occurrences`**: How many original periods each cluster represents\n", - "- **`timestep_mapping`**: Maps each original timestep to its representative\n", - "- **`original_data`** / **`aggregated_data`**: The data before and after clustering\n", - "- **`results`**: `ClusteringResults` object with xarray-like interface (`.dims`, `.coords`, `.sel()`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": {}, - "outputs": [], - "source": [ - "# Cluster order shows which cluster each original period maps to\n", - "fs_clustered.clustering.cluster_assignments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6", - "metadata": {}, - "outputs": [], - "source": [ - "# Cluster occurrences shows how many original periods each cluster represents\n", - "fs_clustered.clustering.cluster_occurrences" - ] - }, - { - "cell_type": "markdown", - "id": "7", - "metadata": {}, - "source": [ - "## Visualizing Clustering\n", - "\n", - "The `.plot` accessor provides built-in visualizations for understanding clustering results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8", - "metadata": {}, - "outputs": [], - "source": [ - "# Compare original vs aggregated data as timeseries\n", - "# By default, plots all time-varying variables\n", - "fs_clustered.clustering.plot.compare()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "# Use a different approach of visualizing the data using normalize heatmaps\n", - "ds = fs_clustered.clustering.plot.compare(data_only=True).data\n", - "\n", - "ds_normalized = (ds - ds.min()) / (ds.max() - ds.min())\n", - "ds_normalized.to_array().plotly.imshow(\n", - " x='time',\n", - " animation_frame='representation',\n", - " zmin=0,\n", - " zmax=1,\n", - " color_continuous_scale='viridis',\n", - " title='Normalized Comparison',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10", - "metadata": {}, - "outputs": [], - "source": [ - "# Compare specific variables only\n", - "fs_clustered.clustering.plot.compare(variables='HeatDemand(Q_th)|fixed_relative_profile')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "# Duration curves show how well the aggregated data preserves the distribution\n", - "fs_clustered.clustering.plot.compare(kind='duration_curve').data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12", - "metadata": {}, - "outputs": [], - "source": [ - "# View typical period profiles for each cluster\n", - "# Each line represents a cluster's representative day\n", - "fs_clustered.clustering.plot.clusters(variables='HeatDemand(Q_th)|fixed_relative_profile', color='cluster')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "# Heatmap shows cluster assignments for each original period\n", - "fs_clustered.clustering.plot.heatmap()" - ] - }, - { - "cell_type": "markdown", - "id": "14", - "metadata": {}, - "source": [ - "## Expanding Aggregated Data\n", - "\n", - "The `Clustering.expand_data()` method maps aggregated data back to original timesteps.\n", - "This is useful for comparing clustering results before optimization:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "15", - "metadata": {}, - "outputs": [], - "source": [ - "# Get original and aggregated data\n", - "clustering = fs_clustered.clustering\n", - "original = clustering.original_data['HeatDemand(Q_th)|fixed_relative_profile']\n", - "aggregated = clustering.aggregated_data['HeatDemand(Q_th)|fixed_relative_profile']\n", - "\n", - "# Expand aggregated data back to original timesteps\n", - "expanded = clustering.expand_data(aggregated)\n", - "\n", - "print(f'Original: {len(original.time)} timesteps')\n", - "print(f'Aggregated: {len(aggregated.time)} timesteps')\n", - "print(f'Expanded: {len(expanded.time)} timesteps')" - ] - }, - { - "cell_type": "markdown", - "id": "16", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "| Property | Description |\n", - "|----------|-------------|\n", - "| `clustering.n_clusters` | Number of representative clusters |\n", - "| `clustering.timesteps_per_cluster` | Timesteps in each cluster period |\n", - "| `clustering.cluster_assignments` | Maps original periods to clusters |\n", - "| `clustering.cluster_occurrences` | Count of original periods per cluster |\n", - "| `clustering.timestep_mapping` | Maps original timesteps to representative indices |\n", - "| `clustering.original_data` | Dataset before clustering |\n", - "| `clustering.aggregated_data` | Dataset after clustering |\n", - "| `clustering.results` | `ClusteringResults` with xarray-like interface |\n", - "\n", - "### ClusteringResults (xarray-like)\n", - "\n", - "Access the underlying tsam results via `clustering.results`:\n", - "\n", - "```python\n", - "# Dimension info (like xarray)\n", - "clustering.results.dims # ('period', 'scenario') or ()\n", - "clustering.results.coords # {'period': [2020, 2030], 'scenario': ['high', 'low']}\n", - "\n", - "# Select specific result (like xarray)\n", - "clustering.results.sel(period=2020, scenario='high') # Label-based\n", - "clustering.results.isel(period=0, scenario=1) # Index-based\n", - "```\n", - "\n", - "### Plot Accessor Methods\n", - "\n", - "| Method | Description |\n", - "|--------|-------------|\n", - "| `plot.compare()` | Compare original vs aggregated data (timeseries) |\n", - "| `plot.compare(kind='duration_curve')` | Compare as duration curves |\n", - "| `plot.clusters()` | View each cluster's profile |\n", - "| `plot.heatmap()` | Visualize cluster assignments |\n", - "\n", - "### Key Parameters\n", - "\n", - "```python\n", - "# Compare with options\n", - "clustering.plot.compare(\n", - " variables='Demand|profile', # Single variable, list, or None (all)\n", - " kind='timeseries', # 'timeseries' or 'duration_curve'\n", - " select={'scenario': 'Base'}, # xarray-style selection\n", - " colors='viridis', # Colorscale name, list, or dict\n", - " facet_col='period', # Facet by period if present\n", - " facet_row='scenario', # Facet by scenario if present\n", - ")\n", - "\n", - "# Heatmap shows cluster assignments (no variable needed)\n", - "clustering.plot.heatmap()\n", - "\n", - "# Expand aggregated data to original timesteps\n", - "expanded = clustering.expand_data(aggregated_data)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "17", - "metadata": {}, - "source": [ - "## Cluster Weights\n", - "\n", - "Each representative timestep has a weight equal to the number of original periods it represents.\n", - "This ensures operational costs scale correctly:\n", - "\n", - "$$\\text{Objective} = \\sum_{t \\in \\text{typical}} w_t \\cdot c_t$$\n", - "\n", - "The weights sum to the original timestep count:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18", - "metadata": {}, - "outputs": [], - "source": [ - "print(f'Sum of weights: {fs_clustered.cluster_weight.sum().item():.0f}')\n", - "print(f'Original timesteps: {len(flow_system.timesteps)}')" - ] - }, - { - "cell_type": "markdown", - "id": "19", - "metadata": {}, - "source": [ - "## Solution Expansion\n", - "\n", - "After optimization, `expand()` maps results back to full resolution:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20", - "metadata": {}, - "outputs": [], - "source": [ - "solver = fx.solvers.HighsSolver(mip_gap=0.01, log_to_console=False)\n", - "fs_clustered.optimize(solver)\n", - "\n", - "fs_expanded = fs_clustered.transform.expand()\n", - "\n", - "print(f'Clustered: {len(fs_clustered.timesteps)} timesteps')\n", - "print(f'Expanded: {len(fs_expanded.timesteps)} timesteps')" - ] - }, - { - "cell_type": "markdown", - "id": "21", - "metadata": {}, - "source": [ - "## IO Workflow\n", - "\n", - "When saving and loading a clustered FlowSystem, most clustering information is preserved.\n", - "However, some methods that access tsam's internal `AggregationResult` objects are not available after IO.\n", - "\n", - "### What's Preserved After IO\n", - "\n", - "- **Structure**: `n_clusters`, `timesteps_per_cluster`, `dims`, `coords`\n", - "- **Mappings**: `cluster_assignments`, `cluster_occurrences`, `timestep_mapping`\n", - "- **Data**: `original_data`, `aggregated_data`\n", - "- **Original timesteps**: `original_timesteps`\n", - "- **Results structure**: `results.sel()`, `results.isel()` for `ClusteringResult` access\n", - "\n", - "### What's Lost After IO\n", - "\n", - "- **`clustering.sel()`**: Accessing full `AggregationResult` objects\n", - "- **`clustering.items()`**: Iterating over `AggregationResult` objects\n", - "- **tsam internals**: `AggregationResult.accuracy`, `AggregationResult.plot`, etc." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22", - "metadata": {}, - "outputs": [], - "source": [ - "# Before IO: Full tsam access is available\n", - "result = fs_clustered.clustering.sel() # Get the AggregationResult\n", - "print(f'Before IO - AggregationResult available: {type(result).__name__}')\n", - "print(f' - n_clusters: {result.n_clusters}')\n", - "print(f' - accuracy.rmse (mean): {result.accuracy.rmse.mean():.4f}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23", - "metadata": {}, - "outputs": [], - "source": [ - "# Save and load the clustered system\n", - "import tempfile\n", - "from pathlib import Path\n", - "\n", - "try:\n", - " with tempfile.TemporaryDirectory() as tmpdir:\n", - " path = Path(tmpdir) / 'clustered_system.nc'\n", - " fs_clustered.to_netcdf(path)\n", - " fs_loaded = fx.FlowSystem.from_netcdf(path)\n", - "\n", - " # Structure is preserved\n", - " print('After IO - Structure preserved:')\n", - " print(f' - n_clusters: {fs_loaded.clustering.n_clusters}')\n", - " print(f' - dims: {fs_loaded.clustering.dims}')\n", - " print(f' - original_data variables: {list(fs_loaded.clustering.original_data.data_vars)[:3]}...')\n", - "except OSError as e:\n", - " print(f'Note: NetCDF save/load skipped due to environment issue: {type(e).__name__}')\n", - " print('This can happen in some CI environments. The functionality works locally.')\n", - " fs_loaded = fs_clustered # Use original for subsequent cells" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24", - "metadata": {}, - "outputs": [], - "source": [ - "# After IO: sel() raises ValueError because AggregationResult is not preserved\n", - "try:\n", - " fs_loaded.clustering.sel()\n", - "except ValueError as e:\n", - " print('After IO - sel() raises ValueError:')\n", - " print(f' \"{e}\"')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25", - "metadata": {}, - "outputs": [], - "source": [ - "# Key operations still work after IO:\n", - "# - Optimization\n", - "# - Expansion back to full resolution\n", - "# - Accessing original_data and aggregated_data\n", - "\n", - "fs_loaded.optimize(solver)\n", - "fs_loaded_expanded = fs_loaded.transform.expand()\n", - "\n", - "print('Loaded system can still be:')\n", - "print(f' - Optimized: {fs_loaded.solution is not None}')\n", - "print(f' - Expanded: {len(fs_loaded_expanded.timesteps)} timesteps')" - ] - }, - { - "cell_type": "markdown", - "id": "26", - "metadata": {}, - "source": [ - "### IO Workflow Summary\n", - "\n", - "```\n", - "┌─────────────────┐ to_netcdf() ┌─────────────────┐\n", - "│ fs_clustered │ ─────────────────► │ NetCDF file │\n", - "│ │ │ │\n", - "│ ✓ clustering │ │ ✓ structure │\n", - "│ ✓ sel() │ │ ✓ mappings │\n", - "│ ✓ items() │ │ ✓ data │\n", - "│ ✓ AggregationResult │ ✗ AggregationResult\n", - "└─────────────────┘ └─────────────────┘\n", - " │\n", - " │ from_netcdf()\n", - " ▼\n", - " ┌─────────────────┐\n", - " │ fs_loaded │\n", - " │ │\n", - " │ ✓ optimize() │\n", - " │ ✓ expand() │\n", - " │ ✓ original_data │\n", - " │ ✗ sel() │\n", - " │ ✗ items() │\n", - " └─────────────────┘\n", - "```\n", - "\n", - "!!! tip \"Best Practice\"\n", - " If you need tsam's `AggregationResult` for analysis (accuracy metrics, built-in plots),\n", - " do this **before** saving the FlowSystem. After loading, the core workflow\n", - " (optimize → expand) works normally." - ] - }, - { - "cell_type": "markdown", - "id": "27", - "metadata": {}, - "source": [ - "### Reducing File Size\n", - "\n", - "For smaller files (~38% reduction), use `include_original_data=False` when saving.\n", - "This disables `plot.compare()` after loading, but the core workflow still works:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28", - "metadata": {}, - "outputs": [], - "source": [ - "import tempfile\n", - "from pathlib import Path\n", - "\n", - "# Compare file sizes with and without original_data\n", - "try:\n", - " with tempfile.TemporaryDirectory() as tmpdir:\n", - " path_full = Path(tmpdir) / 'full.nc'\n", - " path_small = Path(tmpdir) / 'small.nc'\n", - "\n", - " fs_clustered.to_netcdf(path_full, include_original_data=True)\n", - " fs_clustered.to_netcdf(path_small, include_original_data=False)\n", - "\n", - " size_full = path_full.stat().st_size / 1024\n", - " size_small = path_small.stat().st_size / 1024\n", - "\n", - " print(f'With original_data: {size_full:.1f} KB')\n", - " print(f'Without original_data: {size_small:.1f} KB')\n", - " print(f'Size reduction: {(1 - size_small / size_full) * 100:.0f}%')\n", - "except OSError as e:\n", - " print(f'Note: File size comparison skipped due to environment issue: {type(e).__name__}')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/notebooks/08f-clustering-segmentation.ipynb b/docs/notebooks/08f-clustering-segmentation.ipynb deleted file mode 100644 index bc1915de4..000000000 --- a/docs/notebooks/08f-clustering-segmentation.ipynb +++ /dev/null @@ -1,647 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0", - "metadata": {}, - "source": [ - "# Intra-Period Segmentation with `cluster()`\n", - "\n", - "Reduce timesteps within each typical period using segmentation.\n", - "\n", - "This notebook demonstrates:\n", - "\n", - "- **Segmentation**: Aggregate timesteps within each cluster into fewer segments\n", - "- **Variable durations**: Each segment can have different duration (hours)\n", - "- **Combined reduction**: Use clustering AND segmentation for maximum speedup\n", - "- **Expansion**: Map segmented results back to original timesteps\n", - "\n", - "!!! note \"Requirements\"\n", - " This notebook requires the `tsam` package with `SegmentConfig` and `ExtremeConfig` support.\n", - " Install with: `pip install \"flixopt[full]\"`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "import timeit\n", - "\n", - "import pandas as pd\n", - "import plotly.express as px\n", - "\n", - "import flixopt as fx\n", - "\n", - "fx.CONFIG.notebook()" - ] - }, - { - "cell_type": "markdown", - "id": "2", - "metadata": {}, - "source": [ - "## What is Segmentation?\n", - "\n", - "**Clustering** groups similar time periods (e.g., days) into representative clusters.\n", - "\n", - "**Segmentation** goes further by aggregating timesteps *within* each cluster into fewer segments with variable durations.\n", - "\n", - "```\n", - "Original: | Day 1 (24h) | Day 2 (24h) | Day 3 (24h) | ... | Day 365 (24h) |\n", - " ↓ ↓ ↓ ↓\n", - "Clustered: | Typical Day A (24h) | Typical Day B (24h) | Typical Day C (24h) |\n", - " ↓ ↓ ↓\n", - "Segmented: | Seg1 (4h) | Seg2 (8h) | Seg3 (8h) | Seg4 (4h) | (per typical day)\n", - "```\n", - "\n", - "This can dramatically reduce problem size:\n", - "- **Original**: 365 days × 24 hours = 8,760 timesteps\n", - "- **Clustered (8 days)**: 8 × 24 = 192 timesteps\n", - "- **Segmented (6 segments)**: 8 × 6 = 48 timesteps" - ] - }, - { - "cell_type": "markdown", - "id": "3", - "metadata": {}, - "source": [ - "## Create the FlowSystem\n", - "\n", - "We use a district heating system with one month of data at 15-min resolution:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "from data.generate_example_systems import create_district_heating_system\n", - "\n", - "flow_system = create_district_heating_system()\n", - "flow_system.connect_and_transform()\n", - "\n", - "print(f'Timesteps: {len(flow_system.timesteps)}')\n", - "print(f'Duration: {(flow_system.timesteps[-1] - flow_system.timesteps[0]).days + 1} days')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize input data\n", - "heat_demand = flow_system.components['HeatDemand'].inputs[0].fixed_relative_profile\n", - "heat_demand.plotly.line(title='Heat Demand Profile')" - ] - }, - { - "cell_type": "markdown", - "id": "6", - "metadata": {}, - "source": [ - "## Full Optimization (Baseline)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "solver = fx.solvers.HighsSolver(mip_gap=0.01)\n", - "\n", - "start = timeit.default_timer()\n", - "fs_full = flow_system.copy()\n", - "fs_full.name = 'Full Optimization'\n", - "fs_full.optimize(solver)\n", - "time_full = timeit.default_timer() - start\n", - "\n", - "print(f'Full optimization: {time_full:.2f} seconds')\n", - "print(f'Total cost: {fs_full.solution[\"costs\"].item():,.0f} €')" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "## Clustering with Segmentation\n", - "\n", - "Use `SegmentConfig` to enable intra-period segmentation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "from tsam import ExtremeConfig, SegmentConfig\n", - "\n", - "start = timeit.default_timer()\n", - "\n", - "# Cluster into 8 typical days with 6 segments each\n", - "fs_segmented = flow_system.transform.cluster(\n", - " n_clusters=8,\n", - " cluster_duration='1D',\n", - " segments=SegmentConfig(n_segments=6), # 6 segments per day instead of 96 quarter-hours\n", - " extremes=ExtremeConfig(method='replace', max_value=['HeatDemand(Q_th)|fixed_relative_profile']),\n", - ")\n", - "\n", - "time_clustering = timeit.default_timer() - start\n", - "\n", - "print(f'Clustering time: {time_clustering:.2f} seconds')\n", - "print(f'Original timesteps: {len(flow_system.timesteps)}')\n", - "print(\n", - " f'Segmented timesteps: {len(fs_segmented.timesteps)} × {len(fs_segmented.clusters)} clusters = {len(fs_segmented.timesteps) * len(fs_segmented.clusters)}'\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": {}, - "source": [ - "## Understanding Segmentation Properties\n", - "\n", - "After segmentation, the clustering object has additional properties:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "clustering = fs_segmented.clustering\n", - "\n", - "print('Segmentation Properties:')\n", - "print(f' is_segmented: {clustering.is_segmented}')\n", - "print(f' n_segments: {clustering.n_segments}')\n", - "print(f' n_clusters: {clustering.n_clusters}')\n", - "print(f' timesteps_per_cluster (original): {clustering.timesteps_per_cluster}')\n", - "print(f'\\nTime dimension uses RangeIndex: {type(fs_segmented.timesteps)}')" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": {}, - "source": [ - "## Variable Timestep Durations\n", - "\n", - "Each segment has a different duration, determined by how many original timesteps it represents:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "# Timestep duration is now a DataArray with (cluster, time) dimensions\n", - "timestep_duration = fs_segmented.timestep_duration\n", - "\n", - "print(f'Timestep duration shape: {dict(timestep_duration.sizes)}')\n", - "print('\\nSegment durations for cluster 0:')\n", - "cluster_0_durations = timestep_duration.sel(cluster=0).values\n", - "for i, dur in enumerate(cluster_0_durations):\n", - " print(f' Segment {i}: {dur:.2f} hours')\n", - "print(f' Total: {cluster_0_durations.sum():.2f} hours (should be 24h)')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize segment durations across clusters\n", - "duration_df = timestep_duration.to_dataframe('duration').reset_index()\n", - "fig = px.bar(\n", - " duration_df,\n", - " x='time',\n", - " y='duration',\n", - " facet_col='cluster',\n", - " facet_col_wrap=4,\n", - " title='Segment Durations by Cluster',\n", - " labels={'time': 'Segment', 'duration': 'Duration [hours]'},\n", - ")\n", - "fig.update_layout(height=400)\n", - "fig.show()" - ] - }, - { - "cell_type": "markdown", - "id": "15", - "metadata": {}, - "source": [ - "## Optimize the Segmented System" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16", - "metadata": {}, - "outputs": [], - "source": [ - "start = timeit.default_timer()\n", - "fs_segmented.optimize(solver)\n", - "time_segmented = timeit.default_timer() - start\n", - "\n", - "print(f'Segmented optimization: {time_segmented:.2f} seconds')\n", - "print(f'Total cost: {fs_segmented.solution[\"costs\"].item():,.0f} €')\n", - "print(f'\\nSpeedup vs full: {time_full / (time_clustering + time_segmented):.1f}x')" - ] - }, - { - "cell_type": "markdown", - "id": "17", - "metadata": {}, - "source": [ - "## Compare Clustering Quality\n", - "\n", - "View how well the segmented data represents the original:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18", - "metadata": {}, - "outputs": [], - "source": [ - "# Duration curves show how well the distribution is preserved\n", - "fs_segmented.clustering.plot.compare(kind='duration_curve')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19", - "metadata": {}, - "outputs": [], - "source": [ - "# Clustering quality metrics\n", - "fs_segmented.clustering.metrics.to_dataframe().style.format('{:.3f}')" - ] - }, - { - "cell_type": "markdown", - "id": "20", - "metadata": {}, - "source": [ - "## Expand to Original Timesteps\n", - "\n", - "Use `expand()` to map the segmented solution back to all original timesteps:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21", - "metadata": {}, - "outputs": [], - "source": [ - "start = timeit.default_timer()\n", - "fs_expanded = fs_segmented.transform.expand()\n", - "time_expand = timeit.default_timer() - start\n", - "\n", - "print(f'Expansion time: {time_expand:.3f} seconds')\n", - "print(f'Expanded timesteps: {len(fs_expanded.timesteps)}')\n", - "print(f'Objective preserved: {fs_expanded.solution[\"costs\"].item():,.0f} €')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22", - "metadata": {}, - "outputs": [], - "source": [ - "# Compare flow rates: Full vs Expanded\n", - "import xarray as xr\n", - "\n", - "flow_var = 'CHP(Q_th)|flow_rate'\n", - "comparison_ds = xr.concat(\n", - " [fs_full.solution[flow_var], fs_expanded.solution[flow_var]],\n", - " dim=pd.Index(['Full', 'Expanded'], name='method'),\n", - ")\n", - "comparison_ds.plotly.line(color='method', title='CHP Heat Output Comparison')" - ] - }, - { - "cell_type": "markdown", - "id": "23", - "metadata": {}, - "source": [ - "## Two-Stage Workflow with Segmentation\n", - "\n", - "For investment optimization, use segmentation for fast sizing, then dispatch at full resolution:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24", - "metadata": {}, - "outputs": [], - "source": [ - "# Stage 1: Sizing with segmentation (already done)\n", - "SAFETY_MARGIN = 1.05\n", - "sizes_with_margin = {name: float(size.item()) * SAFETY_MARGIN for name, size in fs_segmented.stats.sizes.items()}\n", - "\n", - "print('Optimized sizes with safety margin:')\n", - "for name, size in sizes_with_margin.items():\n", - " print(f' {name}: {size:.1f}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25", - "metadata": {}, - "outputs": [], - "source": [ - "# Stage 2: Full resolution dispatch with fixed sizes\n", - "start = timeit.default_timer()\n", - "fs_dispatch = flow_system.transform.fix_sizes(sizes_with_margin)\n", - "fs_dispatch.name = 'Two-Stage'\n", - "fs_dispatch.optimize(solver)\n", - "time_dispatch = timeit.default_timer() - start\n", - "\n", - "print(f'Dispatch time: {time_dispatch:.2f} seconds')\n", - "print(f'Final cost: {fs_dispatch.solution[\"costs\"].item():,.0f} €')" - ] - }, - { - "cell_type": "markdown", - "id": "26", - "metadata": {}, - "source": [ - "## Compare Results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27", - "metadata": {}, - "outputs": [], - "source": [ - "total_segmented = time_clustering + time_segmented\n", - "total_two_stage = total_segmented + time_dispatch\n", - "\n", - "results = {\n", - " 'Full (baseline)': {\n", - " 'Time [s]': time_full,\n", - " 'Cost [€]': fs_full.solution['costs'].item(),\n", - " 'CHP': fs_full.stats.sizes['CHP(Q_th)'].item(),\n", - " 'Boiler': fs_full.stats.sizes['Boiler(Q_th)'].item(),\n", - " 'Storage': fs_full.stats.sizes['Storage'].item(),\n", - " },\n", - " 'Segmented (8×6)': {\n", - " 'Time [s]': total_segmented,\n", - " 'Cost [€]': fs_segmented.solution['costs'].item(),\n", - " 'CHP': fs_segmented.stats.sizes['CHP(Q_th)'].item(),\n", - " 'Boiler': fs_segmented.stats.sizes['Boiler(Q_th)'].item(),\n", - " 'Storage': fs_segmented.stats.sizes['Storage'].item(),\n", - " },\n", - " 'Two-Stage': {\n", - " 'Time [s]': total_two_stage,\n", - " 'Cost [€]': fs_dispatch.solution['costs'].item(),\n", - " 'CHP': sizes_with_margin['CHP(Q_th)'],\n", - " 'Boiler': sizes_with_margin['Boiler(Q_th)'],\n", - " 'Storage': sizes_with_margin['Storage'],\n", - " },\n", - "}\n", - "\n", - "comparison = pd.DataFrame(results).T\n", - "baseline_cost = comparison.loc['Full (baseline)', 'Cost [€]']\n", - "baseline_time = comparison.loc['Full (baseline)', 'Time [s]']\n", - "comparison['Cost Gap [%]'] = ((comparison['Cost [€]'] - baseline_cost) / abs(baseline_cost) * 100).round(2)\n", - "comparison['Speedup'] = (baseline_time / comparison['Time [s]']).round(1)\n", - "\n", - "comparison.style.format(\n", - " {\n", - " 'Time [s]': '{:.2f}',\n", - " 'Cost [€]': '{:,.0f}',\n", - " 'CHP': '{:.1f}',\n", - " 'Boiler': '{:.1f}',\n", - " 'Storage': '{:.0f}',\n", - " 'Cost Gap [%]': '{:.2f}',\n", - " 'Speedup': '{:.1f}x',\n", - " }\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "28", - "metadata": {}, - "source": [ - "## Segmentation with Multi-Period Systems\n", - "\n", - "Segmentation works with multi-period systems (multiple years, scenarios).\n", - "Each period/scenario combination is segmented independently:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29", - "metadata": {}, - "outputs": [], - "source": [ - "from data.generate_example_systems import create_multiperiod_system\n", - "\n", - "fs_multi = create_multiperiod_system()\n", - "# Use first week only for faster demo\n", - "fs_multi = fs_multi.transform.isel(time=slice(0, 168))\n", - "\n", - "print(f'Periods: {list(fs_multi.periods.values)}')\n", - "print(f'Scenarios: {list(fs_multi.scenarios.values)}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30", - "metadata": {}, - "outputs": [], - "source": [ - "# Cluster with segmentation\n", - "fs_multi_seg = fs_multi.transform.cluster(\n", - " n_clusters=3,\n", - " cluster_duration='1D',\n", - " segments=SegmentConfig(n_segments=6),\n", - " extremes=ExtremeConfig(method='replace', max_value=['Building(Heat)|fixed_relative_profile']),\n", - ")\n", - "\n", - "print(f'Original: {len(fs_multi.timesteps)} timesteps')\n", - "print(f'Segmented: {len(fs_multi_seg.timesteps)} × {len(fs_multi_seg.clusters)} clusters')\n", - "print(f'is_segmented: {fs_multi_seg.clustering.is_segmented}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31", - "metadata": {}, - "outputs": [], - "source": [ - "# Cluster assignments have period/scenario dimensions\n", - "fs_multi_seg.clustering.cluster_assignments" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32", - "metadata": {}, - "outputs": [], - "source": [ - "# Optimize and expand\n", - "fs_multi_seg.optimize(solver)\n", - "fs_multi_expanded = fs_multi_seg.transform.expand()\n", - "\n", - "print(f'Expanded timesteps: {len(fs_multi_expanded.timesteps)}')\n", - "print(f'Objective: {fs_multi_expanded.solution[\"objective\"].item():,.0f} €')" - ] - }, - { - "cell_type": "markdown", - "id": "33", - "metadata": {}, - "source": [ - "## API Reference\n", - "\n", - "### SegmentConfig Parameters\n", - "\n", - "```python\n", - "from tsam import SegmentConfig\n", - "\n", - "segments = SegmentConfig(\n", - " n_segments=6, # Number of segments per cluster period\n", - " representation_method='mean', # How to represent segment values ('mean', 'medoid', etc.)\n", - ")\n", - "```\n", - "\n", - "### Segmentation Properties\n", - "\n", - "After segmentation, `fs.clustering` has additional properties:\n", - "\n", - "| Property | Description |\n", - "|----------|-------------|\n", - "| `is_segmented` | `True` if segmentation was used |\n", - "| `n_segments` | Number of segments per cluster |\n", - "| `timesteps_per_cluster` | Original timesteps per cluster (before segmentation) |\n", - "\n", - "### Timestep Duration\n", - "\n", - "For segmented systems, `fs.timestep_duration` is a DataArray with `(cluster, time)` dimensions:\n", - "\n", - "```python\n", - "# Each segment has different duration\n", - "fs_segmented.timestep_duration # Shape: (n_clusters, n_segments)\n", - "\n", - "# Sum should equal original period duration\n", - "fs_segmented.timestep_duration.sum('time') # Should be 24h for daily clusters\n", - "```\n", - "\n", - "### Example Workflow\n", - "\n", - "```python\n", - "from tsam import ExtremeConfig, SegmentConfig\n", - "\n", - "# Cluster with segmentation\n", - "fs_segmented = flow_system.transform.cluster(\n", - " n_clusters=8,\n", - " cluster_duration='1D',\n", - " segments=SegmentConfig(n_segments=6),\n", - " extremes=ExtremeConfig(method='new_cluster', max_value=['Demand|profile']),\n", - ")\n", - "\n", - "# Optimize\n", - "fs_segmented.optimize(solver)\n", - "\n", - "# Expand back to original timesteps\n", - "fs_expanded = fs_segmented.transform.expand()\n", - "\n", - "# Two-stage workflow\n", - "sizes = {k: v.item() * 1.05 for k, v in fs_segmented.stats.sizes.items()}\n", - "fs_dispatch = flow_system.transform.fix_sizes(sizes)\n", - "fs_dispatch.optimize(solver)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "34", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "You learned how to:\n", - "\n", - "- Use **`SegmentConfig`** to enable intra-period segmentation\n", - "- Work with **variable timestep durations** for each segment\n", - "- **Combine clustering and segmentation** for maximum problem size reduction\n", - "- **Expand segmented solutions** back to original timesteps\n", - "- Use segmentation with **multi-period systems**\n", - "\n", - "### Key Takeaways\n", - "\n", - "1. **Segmentation reduces problem size further**: From 8×24=192 to 8×6=48 timesteps\n", - "2. **Variable durations preserve accuracy**: Important periods get more timesteps\n", - "3. **Works with multi-period**: Each period/scenario is segmented independently\n", - "4. **expand() works correctly**: Maps segment values to all original timesteps\n", - "5. **Two-stage is still recommended**: Use segmentation for sizing, full resolution for dispatch\n", - "\n", - "### Trade-offs\n", - "\n", - "| More Segments | Fewer Segments |\n", - "|---------------|----------------|\n", - "| Higher accuracy | Lower accuracy |\n", - "| Slower solve | Faster solve |\n", - "| More memory | Less memory |\n", - "\n", - "Start with 6-12 segments and adjust based on your accuracy needs." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/flixopt/clustering/__init__.py b/flixopt/clustering/__init__.py index 07e6e775f..9f12f2a02 100644 --- a/flixopt/clustering/__init__.py +++ b/flixopt/clustering/__init__.py @@ -1,13 +1,11 @@ """ Time Series Aggregation Module for flixopt. -This module provides wrapper classes around tsam's clustering functionality: -- Clustering: Top-level class stored on FlowSystem after clustering -- ClusteringResults: Manages collection of tsam ClusteringResult objects (for IO) +This module provides the Clustering class stored on FlowSystem after clustering, +wrapping tsam_xarray's ClusteringResult. Example usage: - # Cluster a FlowSystem to reduce timesteps from tsam import ExtremeConfig fs_clustered = flow_system.transform.cluster( @@ -16,36 +14,21 @@ extremes=ExtremeConfig(method='new_cluster', max_value=['Demand|fixed_relative_profile']), ) - # Access clustering structure (available before AND after IO) clustering = fs_clustered.clustering print(f'Number of clusters: {clustering.n_clusters}') - print(f'Dims: {clustering.dims}') # e.g., ('period', 'scenario') - print(f'Coords: {clustering.coords}') # e.g., {'period': [2024, 2025]} + print(f'Clustering result: {clustering.clustering_result}') - # Access tsam AggregationResult for detailed analysis - # NOTE: Only available BEFORE saving/loading. Lost after IO. - result = clustering.sel(period=2024, scenario='high') - result.cluster_representatives # DataFrame with aggregated time series - result.accuracy # AccuracyMetrics (rmse, mae) - result.plot.compare() # tsam's built-in comparison plot - - # Iterate over all results (only before IO) - for key, result in clustering.items(): - print(f'{key}: {result.n_clusters} clusters') - - # Save and load - structure preserved, AggregationResult access lost - fs_clustered.to_netcdf('system.nc') - # Use include_original_data=False for smaller files (~38% reduction) - fs_clustered.to_netcdf('system.nc', include_original_data=False) + # Access tsam_xarray AggregationResult (only before saving/loading) + result = clustering.aggregation_result + result.cluster_representatives # DataArray + result.accuracy # AccuracyMetrics # Expand back to full resolution fs_expanded = fs_clustered.transform.expand() """ -from .base import AggregationResults, Clustering, ClusteringResults +from .base import Clustering __all__ = [ - 'ClusteringResults', - 'AggregationResults', 'Clustering', ] diff --git a/flixopt/clustering/base.py b/flixopt/clustering/base.py index 7082929c3..b78bf89b3 100644 --- a/flixopt/clustering/base.py +++ b/flixopt/clustering/base.py @@ -1,960 +1,286 @@ """ Clustering classes for time series aggregation. -This module provides wrapper classes around tsam's clustering functionality: -- `ClusteringResults`: Collection of tsam ClusteringResult objects for multi-dim (period, scenario) data -- `Clustering`: Top-level class stored on FlowSystem after clustering +This module provides the `Clustering` class stored on FlowSystem after clustering, +wrapping tsam_xarray's ClusteringResult for structure access and AggregationResult +for full data access (pre-serialization only). """ from __future__ import annotations -import functools import json -from collections import Counter from typing import TYPE_CHECKING, Any -import numpy as np import pandas as pd -import xarray as xr if TYPE_CHECKING: from pathlib import Path - from tsam import AggregationResult - from tsam import ClusteringResult as TsamClusteringResult + import xarray as xr + from tsam_xarray import AggregationResult as TsamXarrayAggregationResult + from tsam_xarray import ClusteringResult - from ..color_processing import ColorType - from ..plot_result import PlotResult - from ..statistics_accessor import SelectType -from ..statistics_accessor import _build_color_kwargs - - -def _apply_slot_defaults(plotly_kwargs: dict, defaults: dict[str, str | None]) -> None: - """Apply default slot assignments to plotly kwargs. - - Args: - plotly_kwargs: The kwargs dict to update (modified in place). - defaults: Default slot assignments. None values block slots. - """ - for slot, value in defaults.items(): - plotly_kwargs.setdefault(slot, value) - - -def _select_dims(da: xr.DataArray, period: Any = None, scenario: Any = None) -> xr.DataArray: - """Select from DataArray by period/scenario if those dimensions exist.""" - if 'period' in da.dims and period is not None: - da = da.sel(period=period) - if 'scenario' in da.dims and scenario is not None: - da = da.sel(scenario=scenario) - return da - - -def _cluster_occurrences(cr: TsamClusteringResult) -> np.ndarray: - """Compute cluster occurrences from ClusteringResult.""" - counts = Counter(cr.cluster_assignments) - return np.array([counts.get(i, 0) for i in range(cr.n_clusters)]) +class Clustering: + """Clustering information for a FlowSystem. + Wraps tsam_xarray's ClusteringResult for structure access and optionally + AggregationResult for full data access (pre-serialization only). -def _build_timestep_mapping(cr: TsamClusteringResult, n_timesteps: int) -> np.ndarray: - """Build mapping from original timesteps to representative timestep indices. - - For segmented systems, the mapping uses segment_assignments from tsam to map - each original timestep position to its corresponding segment index. - """ - timesteps_per_cluster = cr.n_timesteps_per_period - # For segmented systems, representative time dimension has n_segments entries - # For non-segmented, it has timesteps_per_cluster entries - n_segments = cr.n_segments - is_segmented = n_segments is not None - time_dim_size = n_segments if is_segmented else timesteps_per_cluster - - # For segmented systems, tsam provides segment_assignments which maps - # each position within a period to its segment index - segment_assignments = cr.segment_assignments if is_segmented else None - - mapping = np.zeros(n_timesteps, dtype=np.int32) - for period_idx, cluster_id in enumerate(cr.cluster_assignments): - for pos in range(timesteps_per_cluster): - orig_idx = period_idx * timesteps_per_cluster + pos - if orig_idx < n_timesteps: - if is_segmented and segment_assignments is not None: - # For segmented: use tsam's segment_assignments to get segment index - # segment_assignments[cluster_id][pos] gives the segment index - segment_idx = segment_assignments[cluster_id][pos] - mapping[orig_idx] = int(cluster_id) * time_dim_size + segment_idx - else: - # Non-segmented: direct position mapping - mapping[orig_idx] = int(cluster_id) * time_dim_size + pos - return mapping - - -class ClusteringResults: - """Collection of tsam ClusteringResult objects for multi-dimensional data. - - Manages multiple ClusteringResult objects keyed by (period, scenario) tuples - and provides convenient access and multi-dimensional DataArray building. - - Follows xarray-like patterns with `.dims`, `.coords`, `.sel()`, and `.isel()`. - - Attributes: - dims: Tuple of dimension names, e.g., ('period', 'scenario'). - coords: Dict mapping dimension names to their coordinate values. + For advanced access to clustering structure (dims, coords, cluster_centers, + segment_centers, etc.), use ``clustering_result`` directly. Example: - >>> results = ClusteringResults({(): cr}, dim_names=[]) - >>> results.n_clusters - 2 - >>> results.cluster_assignments # Returns DataArray - - - >>> # Multi-dimensional case - >>> results = ClusteringResults( - ... {(2024, 'high'): cr1, (2024, 'low'): cr2}, - ... dim_names=['period', 'scenario'], - ... ) - >>> results.dims - ('period', 'scenario') - >>> results.coords - {'period': [2024], 'scenario': ['high', 'low']} - >>> results.sel(period=2024, scenario='high') # Label-based - - >>> results.isel(period=0, scenario=1) # Index-based - + >>> clustering = fs_clustered.clustering + >>> clustering.n_clusters + 8 + >>> clustering.clustering_result # tsam_xarray ClusteringResult for full access """ def __init__( self, - results: dict[tuple, TsamClusteringResult], - dim_names: list[str], + clustering_result: ClusteringResult | dict | None = None, + original_timesteps: pd.DatetimeIndex | list[str] | None = None, + # Internal: tsam_xarray AggregationResult for full data access + _aggregation_result: TsamXarrayAggregationResult | None = None, + # Internal: mapping from renamed dims back to originals (e.g., _period -> period) + _unrename_map: dict[str, str] | None = None, + # Legacy: accept 'results' kwarg for netcdf files saved before this refactor. + # The IO resolver passes serialized dict keys as kwargs to __init__(). + # Remove once all users have re-saved their netcdf files with the new format. + results: Any = None, + # Legacy kwargs ignored (removed: original_data, aggregated_data, _metrics, refs) + **_ignored: Any, ): - """Initialize ClusteringResults. - - Args: - results: Dict mapping (period, scenario) tuples to tsam ClusteringResult objects. - For simple cases without periods/scenarios, use {(): result}. - dim_names: Names of extra dimensions, e.g., ['period', 'scenario']. - """ - if not results: - raise ValueError('results cannot be empty') - self._results = results - self._dim_names = dim_names - - # ========================================================================== - # xarray-like interface - # ========================================================================== - - @property - def dims(self) -> tuple[str, ...]: - """Dimension names as tuple (xarray-like).""" - return tuple(self._dim_names) - - @property - def dim_names(self) -> list[str]: - """Dimension names as list (backwards compatibility).""" - return list(self._dim_names) - - @property - def coords(self) -> dict[str, list]: - """Coordinate values for each dimension (xarray-like). - - Returns: - Dict mapping dimension names to lists of coordinate values. - """ - return {dim: self._get_dim_values(dim) for dim in self._dim_names} - - def sel(self, **kwargs: Any) -> TsamClusteringResult: - """Select result by dimension labels (xarray-like). - - Args: - **kwargs: Dimension name=value pairs, e.g., period=2024, scenario='high'. - - Returns: - The tsam ClusteringResult for the specified combination. - - Raises: - KeyError: If no result found for the specified combination. - - Example: - >>> results.sel(period=2024, scenario='high') - - """ - key = self._make_key(**kwargs) - if key not in self._results: - raise KeyError(f'No result found for {kwargs}') - return self._results[key] - - def isel(self, **kwargs: int) -> TsamClusteringResult: - """Select result by dimension indices (xarray-like). - - Args: - **kwargs: Dimension name=index pairs, e.g., period=0, scenario=1. - - Returns: - The tsam ClusteringResult for the specified combination. - - Raises: - IndexError: If index is out of range for a dimension. - - Example: - >>> results.isel(period=0, scenario=1) - - """ - label_kwargs = {} - for dim, idx in kwargs.items(): - coord_values = self._get_dim_values(dim) - if coord_values is None: - raise KeyError(f"Dimension '{dim}' not found in dims {self.dims}") - if idx < 0 or idx >= len(coord_values): - raise IndexError(f"Index {idx} out of range for dimension '{dim}' with {len(coord_values)} values") - label_kwargs[dim] = coord_values[idx] - return self.sel(**label_kwargs) - - def __getitem__(self, key: tuple) -> TsamClusteringResult: - """Get result by key tuple.""" - return self._results[key] - - # === Iteration === - - def __iter__(self): - """Iterate over ClusteringResult objects.""" - return iter(self._results.values()) - - def __len__(self) -> int: - """Number of ClusteringResult objects.""" - return len(self._results) - - def items(self): - """Iterate over (key, ClusteringResult) pairs.""" - return self._results.items() - - def keys(self): - """Iterate over keys.""" - return self._results.keys() - - def values(self): - """Iterate over ClusteringResult objects.""" - return self._results.values() - - # === Properties from first result === - - @property - def _first_result(self) -> TsamClusteringResult: - """Get the first ClusteringResult (for structure info).""" - return next(iter(self._results.values())) - - @property - def n_clusters(self) -> int: - """Number of clusters (same for all results).""" - return self._first_result.n_clusters - - @property - def timesteps_per_cluster(self) -> int: - """Number of timesteps per cluster (same for all results).""" - return self._first_result.n_timesteps_per_period - - @property - def n_original_periods(self) -> int: - """Number of original periods (same for all results).""" - return self._first_result.n_original_periods - - @property - def n_segments(self) -> int | None: - """Number of segments per cluster, or None if not segmented.""" - return self._first_result.n_segments - - # === Multi-dim DataArrays === - - @property - def cluster_assignments(self) -> xr.DataArray: - """Maps each original cluster to its typical cluster index. - - Returns: - DataArray with dims [original_cluster, period?, scenario?]. - """ - # Note: No coords on original_cluster - they cause issues when used as isel() indexer - return self._build_property_array( - lambda cr: np.array(cr.cluster_assignments), - base_dims=['original_cluster'], - name='cluster_assignments', - ) - - @property - def cluster_occurrences(self) -> xr.DataArray: - """How many original clusters map to each typical cluster. - - Returns: - DataArray with dims [cluster, period?, scenario?]. - """ - return self._build_property_array( - _cluster_occurrences, - base_dims=['cluster'], - base_coords={'cluster': range(self.n_clusters)}, - name='cluster_occurrences', - ) - - @property - def cluster_centers(self) -> xr.DataArray: - """Which original cluster is the representative (center) for each typical cluster. - - Returns: - DataArray with dims [cluster, period?, scenario?]. - """ - return self._build_property_array( - lambda cr: np.array(cr.cluster_centers), - base_dims=['cluster'], - base_coords={'cluster': range(self.n_clusters)}, - name='cluster_centers', - ) - - @property - def segment_assignments(self) -> xr.DataArray | None: - """For each timestep within a cluster, which segment it belongs to. - - Returns: - DataArray with dims [cluster, time, period?, scenario?], or None if not segmented. - """ - if self._first_result.segment_assignments is None: - return None - timesteps = self._first_result.n_timesteps_per_period - return self._build_property_array( - lambda cr: np.array(cr.segment_assignments), - base_dims=['cluster', 'time'], - base_coords={'cluster': range(self.n_clusters), 'time': range(timesteps)}, - name='segment_assignments', - ) - - @property - def segment_durations(self) -> xr.DataArray | None: - """Duration of each segment in timesteps. - - Returns: - DataArray with dims [cluster, segment, period?, scenario?], or None if not segmented. - """ - if self._first_result.segment_durations is None: - return None - n_segments = self._first_result.n_segments - - def _get_padded_durations(cr: TsamClusteringResult) -> np.ndarray: - """Pad ragged segment durations to uniform shape.""" - return np.array([list(d) + [np.nan] * (n_segments - len(d)) for d in cr.segment_durations]) - - return self._build_property_array( - _get_padded_durations, - base_dims=['cluster', 'segment'], - base_coords={'cluster': range(self.n_clusters), 'segment': range(n_segments)}, - name='segment_durations', - ) - - @property - def segment_centers(self) -> xr.DataArray | None: - """Center of each intra-period segment. - - Only available if segmentation was configured during clustering. - - Returns: - DataArray or None if no segmentation. - """ - first = self._first_result - if first.segment_centers is None: - return None - - n_segments = first.n_segments - return self._build_property_array( - lambda cr: np.array(cr.segment_centers), - base_dims=['cluster', 'segment'], - base_coords={'cluster': range(self.n_clusters), 'segment': range(n_segments)}, - name='segment_centers', - ) - - @property - def position_within_segment(self) -> xr.DataArray | None: - """Position of each timestep within its segment (0-indexed). - - For each (cluster, time) position, returns how many timesteps into the - segment that position is. Used for interpolation within segments. - - Returns: - DataArray with dims [cluster, time] or [cluster, time, period?, scenario?]. - Returns None if no segmentation. - """ - segment_assignments = self.segment_assignments - if segment_assignments is None: - return None - - def _compute_positions(seg_assigns: np.ndarray) -> np.ndarray: - """Compute position within segment for each (cluster, time).""" - n_clusters, n_times = seg_assigns.shape - positions = np.zeros_like(seg_assigns) - for c in range(n_clusters): - pos = 0 - prev_seg = -1 - for t in range(n_times): - seg = seg_assigns[c, t] - if seg != prev_seg: - pos = 0 - prev_seg = seg - positions[c, t] = pos - pos += 1 - return positions - - # Handle extra dimensions by applying _compute_positions to each slice - extra_dims = [d for d in segment_assignments.dims if d not in ('cluster', 'time')] - - if not extra_dims: - positions = _compute_positions(segment_assignments.values) - return xr.DataArray( - positions, - dims=['cluster', 'time'], - coords=segment_assignments.coords, - name='position_within_segment', - ) - - # Multi-dimensional case: compute for each period/scenario slice - result = xr.apply_ufunc( - _compute_positions, - segment_assignments, - input_core_dims=[['cluster', 'time']], - output_core_dims=[['cluster', 'time']], - vectorize=True, - ) - return result.rename('position_within_segment') - - # === Serialization === - - def to_dict(self) -> dict: - """Serialize to dict. + from tsam_xarray import ClusteringResult as ClusteringResultClass - The dict can be used to reconstruct via from_dict(). - """ - return { - 'dim_names': list(self._dim_names), - 'results': {self._key_to_str(key): result.to_dict() for key, result in self._results.items()}, - } - - @classmethod - def from_dict(cls, d: dict) -> ClusteringResults: - """Reconstruct from dict. - - Args: - d: Dict from to_dict(). + # Handle ISO timestamp strings from serialization + if ( + isinstance(original_timesteps, list) + and len(original_timesteps) > 0 + and isinstance(original_timesteps[0], str) + ): + original_timesteps = pd.DatetimeIndex([pd.Timestamp(ts) for ts in original_timesteps]) - Returns: - Reconstructed ClusteringResults. - """ - from tsam import ClusteringResult - - dim_names = d['dim_names'] - results = {} - for key_str, result_dict in d['results'].items(): - key = cls._str_to_key(key_str, dim_names) - results[key] = ClusteringResult.from_dict(result_dict) - return cls(results, dim_names) - - # === Private helpers === - - def _make_key(self, **kwargs: Any) -> tuple: - """Create a key tuple from dimension keyword arguments.""" - key_parts = [] - for dim in self._dim_names: - if dim in kwargs: - key_parts.append(kwargs[dim]) - return tuple(key_parts) - - def _get_dim_values(self, dim: str) -> list | None: - """Get unique values for a dimension, or None if dimension not present. - - Preserves insertion order to ensure .isel() positional indexing matches - the original FlowSystem dimension order. - """ - if dim not in self._dim_names: - return None - idx = self._dim_names.index(dim) - # Use dict.fromkeys to preserve insertion order while removing duplicates - values = [k[idx] for k in self._results.keys()] - return list(dict.fromkeys(values)) + # Store tsam_xarray AggregationResult if provided (full data access) + self._aggregation_result = _aggregation_result - def _build_property_array( - self, - get_data: callable, - base_dims: list[str], - base_coords: dict | None = None, - name: str | None = None, - ) -> xr.DataArray: - """Build a DataArray property, handling both single and multi-dimensional cases.""" - slices = [] - for key, cr in self._results.items(): - da = xr.DataArray(get_data(cr), dims=base_dims, coords=base_coords or {}, name=name) - for dim_name, coord_val in zip(self._dim_names, key, strict=True): - da = da.expand_dims({dim_name: [coord_val]}) - slices.append(da) - - if len(slices) == 1: - result = slices[0] - else: - combined = xr.combine_by_coords(slices) - if isinstance(combined, xr.Dataset): - result = combined[name] + # Resolve ClusteringResult from various sources + if clustering_result is not None: + if isinstance(clustering_result, dict): + self._clustering_result = self._clustering_result_from_dict(clustering_result) else: - result = combined - return result.transpose(*base_dims, *self._dim_names) - - @staticmethod - def _key_to_str(key: tuple) -> str: - """Convert key tuple to string for serialization.""" - if not key: - return '__single__' - return '|'.join(str(k) for k in key) - - @staticmethod - def _str_to_key(key_str: str, dim_names: list[str]) -> tuple: - """Convert string back to key tuple.""" - if key_str == '__single__': - return () - parts = key_str.split('|') - # Try to convert to int if possible (for period years) - result = [] - for part in parts: - try: - result.append(int(part)) - except ValueError: - result.append(part) - return tuple(result) - - def __repr__(self) -> str: - if not self.dims: - return f'ClusteringResults(n_clusters={self.n_clusters})' - coords_str = ', '.join(f'{k}: {len(v)}' for k, v in self.coords.items()) - return f'ClusteringResults(dims={self.dims}, coords=({coords_str}), n_clusters={self.n_clusters})' - - def apply(self, data: xr.Dataset) -> AggregationResults: - """Apply clustering to dataset for all (period, scenario) combinations. - - Args: - data: Dataset with time-varying data. Must have 'time' dimension. - May have 'period' and/or 'scenario' dimensions matching this object. - - Returns: - AggregationResults with full access to aggregated data. - Use `.clustering` on the result to get ClusteringResults for IO. - - Example: - >>> agg_results = clustering_results.apply(dataset) - >>> agg_results.clustering # Get ClusteringResults for IO - >>> for key, result in agg_results: - ... print(result.cluster_representatives) - """ - from ..core import drop_constant_arrays - - results = {} - for key, cr in self._results.items(): - # Build selector from key based on dim_names - selector = {dim_name: key[i] for i, dim_name in enumerate(self._dim_names)} - data_slice = data.sel(**selector, drop=True) if selector else data - - # Drop constant arrays and convert to DataFrame - time_varying = drop_constant_arrays(data_slice, dim='time') - df = time_varying.to_dataframe() + self._clustering_result = clustering_result + elif _aggregation_result is not None: + self._clustering_result = _aggregation_result.clustering + elif results is not None: + # Legacy path: accept old ClusteringResults or dict + if isinstance(results, dict): + self._clustering_result = self._clustering_result_from_dict(results) + elif hasattr(results, '_results') and hasattr(results, '_dim_names'): + self._clustering_result = ClusteringResultClass( + time_dim='time', + cluster_dim=['variable'], + slice_dims=list(results._dim_names), + clusterings=dict(results._results), + ) + else: + raise TypeError(f'Cannot create ClusteringResult from {type(results)}') + else: + raise ValueError('Either clustering_result or _aggregation_result must be provided') - # Apply clustering - results[key] = cr.apply(df) + # Resolve unrename_map: if not explicitly provided, infer from slice_dims + # (e.g., '_period' in slice_dims → {'_period': 'period'}) + if _unrename_map: + self._unrename_map = _unrename_map + else: + known_renames = {'_period': 'period', '_cluster': 'cluster'} + self._unrename_map = {k: v for k, v in known_renames.items() if k in self._clustering_result.slice_dims} - return Clustering._from_aggregation_results(results, self._dim_names) + # Flag indicating this was loaded from serialization (missing full AggregationResult data) + self._from_serialization = _aggregation_result is None + self.original_timesteps = original_timesteps if original_timesteps is not None else pd.DatetimeIndex([]) -class Clustering: - """Clustering information for a FlowSystem. + # Ensure time_coords is set on ClusteringResult (needed for disaggregate) + if self._clustering_result.time_coords is None and len(self.original_timesteps) > 0: + object.__setattr__(self._clustering_result, 'time_coords', self.original_timesteps) - Thin wrapper around tsam 3.0's AggregationResult objects, providing: - 1. Multi-dimensional access for (period, scenario) combinations - 2. Structure properties (n_clusters, dims, coords, cluster_assignments) - 3. JSON persistence via ClusteringResults + @staticmethod + def _clustering_result_from_dict(d: dict) -> ClusteringResult: + """Create ClusteringResult from serialized dict.""" + from tsam_xarray import ClusteringResult as ClusteringResultClass - Use ``sel()`` to access individual tsam AggregationResult objects for - detailed analysis (cluster_representatives, accuracy, plotting). + return ClusteringResultClass.from_dict(d) - Attributes: - results: ClusteringResults for structure access (works after JSON load). - original_timesteps: Original timesteps before clustering. - dims: Dimension names, e.g., ('period', 'scenario'). - coords: Coordinate values, e.g., {'period': [2024, 2025]}. + # ========================================================================== + # Helper for dim unrenaming + # ========================================================================== - Example: - >>> clustering = fs_clustered.clustering - >>> clustering.n_clusters - 8 - >>> clustering.dims - ('period',) - - # Access tsam AggregationResult for detailed analysis - >>> result = clustering.sel(period=2024) - >>> result.cluster_representatives # DataFrame - >>> result.accuracy # AccuracyMetrics - >>> result.plot.compare() # tsam's built-in plotting - """ + def _unrename(self, da: xr.DataArray) -> xr.DataArray: + """Rename tsam_xarray output dims back to original names (e.g., _period -> period).""" + if not self._unrename_map: + return da + renames = {k: v for k, v in self._unrename_map.items() if k in da.dims} + return da.rename(renames) if renames else da # ========================================================================== - # Core properties (delegated to ClusteringResults) + # Core properties (delegated to ClusteringResult) # ========================================================================== + @property + def clustering_result(self) -> ClusteringResult: + """tsam_xarray ClusteringResult for reuse with apply_clustering().""" + return self._clustering_result + @property def n_clusters(self) -> int: """Number of clusters (typical periods).""" - return self.results.n_clusters + return self._clustering_result.n_clusters @property def timesteps_per_cluster(self) -> int: """Number of timesteps in each cluster.""" - return self.results.timesteps_per_cluster - - @property - def timesteps_per_period(self) -> int: - """Alias for timesteps_per_cluster.""" - return self.timesteps_per_cluster + return self._clustering_result.n_timesteps_per_period @property def n_original_clusters(self) -> int: """Number of original periods (before clustering).""" - return self.results.n_original_periods - - @property - def dim_names(self) -> list[str]: - """Names of extra dimensions, e.g., ['period', 'scenario'].""" - return self.results.dim_names - - @property - def dims(self) -> tuple[str, ...]: - """Dimension names as tuple (xarray-like).""" - return self.results.dims - - @property - def coords(self) -> dict[str, list]: - """Coordinate values for each dimension (xarray-like). - - Returns: - Dict mapping dimension names to lists of coordinate values. - - Example: - >>> clustering.coords - {'period': [2024, 2025], 'scenario': ['low', 'high']} - """ - return self.results.coords - - def sel( - self, - period: int | str | None = None, - scenario: str | None = None, - ) -> AggregationResult: - """Select AggregationResult by period and/or scenario. - - Access individual tsam AggregationResult objects for detailed analysis. - - Note: - This method is only available before saving/loading the FlowSystem. - After IO (to_dataset/from_dataset or to_json), the full AggregationResult - data is not preserved. Use `results.sel()` for structure-only access - after loading. - - Args: - period: Period value (e.g., 2024). Required if clustering has periods. - scenario: Scenario name (e.g., 'high'). Required if clustering has scenarios. - - Returns: - The tsam AggregationResult for the specified combination. - Access its properties like `cluster_representatives`, `accuracy`, etc. - - Raises: - KeyError: If no result found for the specified combination. - ValueError: If accessed on a Clustering loaded from JSON/NetCDF. - - Example: - >>> result = clustering.sel(period=2024, scenario='high') - >>> result.cluster_representatives # DataFrame with aggregated data - >>> result.accuracy # AccuracyMetrics - >>> result.plot.compare() # tsam's built-in comparison plot - """ - self._require_full_data('sel()') - # Build key from provided args in dim order - key_parts = [] - if 'period' in self._dim_names: - if period is None: - raise KeyError(f"'period' is required. Available: {self.coords.get('period', [])}") - key_parts.append(period) - if 'scenario' in self._dim_names: - if scenario is None: - raise KeyError(f"'scenario' is required. Available: {self.coords.get('scenario', [])}") - key_parts.append(scenario) - key = tuple(key_parts) - if key not in self._aggregation_results: - raise KeyError(f'No result found for period={period}, scenario={scenario}') - return self._aggregation_results[key] - - @property - def is_segmented(self) -> bool: - """Whether intra-period segmentation was used. - - Segmented systems have variable timestep durations within each cluster, - where each segment represents a different number of original timesteps. - """ - return self.results.n_segments is not None + return self._clustering_result.n_original_periods @property def n_segments(self) -> int | None: """Number of segments per cluster, or None if not segmented.""" - return self.results.n_segments + return self._clustering_result.n_segments @property - def cluster_assignments(self) -> xr.DataArray: - """Mapping from original periods to cluster IDs. - - Returns: - DataArray with dims [original_cluster] or [original_cluster, period?, scenario?]. - """ - return self.results.cluster_assignments + def is_segmented(self) -> bool: + """Whether intra-period segmentation was used.""" + return self._clustering_result.n_segments is not None @property - def n_representatives(self) -> int: - """Number of representative timesteps after clustering.""" - if self.is_segmented: - return self.n_clusters * self.n_segments - return self.n_clusters * self.timesteps_per_cluster + def dim_names(self) -> list[str]: + """Names of extra dimensions, e.g., ['period', 'scenario'].""" + return [self._unrename_map.get(d, d) for d in self._clustering_result.slice_dims] # ========================================================================== - # Derived properties + # DataArray properties (delegated to ClusteringResult with unrename) # ========================================================================== @property - def cluster_occurrences(self) -> xr.DataArray: - """Count of how many original periods each cluster represents. - - Returns: - DataArray with dims [cluster] or [cluster, period?, scenario?]. - """ - return self.results.cluster_occurrences - - @property - def representative_weights(self) -> xr.DataArray: - """Weight for each cluster (number of original periods it represents). - - This is the same as cluster_occurrences but named for API consistency. - Used as cluster_weight in FlowSystem. - """ - return self.cluster_occurrences.rename('representative_weights') - - @functools.cached_property - def timestep_mapping(self) -> xr.DataArray: - """Mapping from original timesteps to representative timestep indices. - - Each value indicates which representative timestep index (0 to n_representatives-1) - corresponds to each original timestep. - - Note: This property is cached for performance since it's accessed frequently - during expand() operations. - """ - return self._build_timestep_mapping() - - @property - def metrics(self) -> xr.Dataset: - """Clustering quality metrics (RMSE, MAE, etc.). - - Returns: - Dataset with dims [time_series, period?, scenario?], or empty Dataset if no metrics. - """ - if self._metrics is None: - return xr.Dataset() - return self._metrics - - @property - def cluster_start_positions(self) -> np.ndarray: - """Integer positions where clusters start in reduced timesteps. + def cluster_assignments(self) -> xr.DataArray: + """Mapping from original periods to cluster IDs. Returns: - 1D array: [0, T, 2T, ...] where T = timesteps_per_cluster (or n_segments if segmented). + DataArray with dims [original_cluster, period?, scenario?]. """ - if self.is_segmented: - n_timesteps = self.n_clusters * self.n_segments - return np.arange(0, n_timesteps, self.n_segments) - n_timesteps = self.n_clusters * self.timesteps_per_cluster - return np.arange(0, n_timesteps, self.timesteps_per_cluster) + da = self._clustering_result.cluster_assignments + # Rename tsam_xarray's 'period' dim to our 'original_cluster' convention + # (must happen before _unrename to avoid conflict with _period → period rename) + if 'period' in da.dims: + da = da.rename({'period': 'original_cluster'}) + da = self._unrename(da) + # Ensure original_cluster is first dim (tsam_xarray puts slice dims first) + if 'original_cluster' in da.dims and da.dims[0] != 'original_cluster': + other_dims = [d for d in da.dims if d != 'original_cluster'] + da = da.transpose('original_cluster', *other_dims) + return da @property - def cluster_centers(self) -> xr.DataArray: - """Which original period is the representative (center) for each cluster. + def cluster_occurrences(self) -> xr.DataArray: + """How many original clusters map to each typical cluster. Returns: - DataArray with dims [cluster] containing original period indices. + DataArray with dims [cluster, period?, scenario?]. """ - return self.results.cluster_centers + return self._unrename(self._clustering_result.cluster_occurrences) @property def segment_assignments(self) -> xr.DataArray | None: - """For each timestep within a cluster, which intra-period segment it belongs to. - - Only available if segmentation was configured during clustering. + """For each timestep within a cluster, which segment it belongs to. Returns: - DataArray with dims [cluster, time] or None if no segmentation. + DataArray with dims [cluster, time, period?, scenario?], or None if not segmented. """ - return self.results.segment_assignments + result = self._clustering_result.segment_assignments + if result is None: + return None + # tsam_xarray uses 'timestep', we use 'time' + if 'timestep' in result.dims: + result = result.rename({'timestep': 'time'}) + return self._unrename(result) @property def segment_durations(self) -> xr.DataArray | None: - """Duration of each intra-period segment in hours. - - Only available if segmentation was configured during clustering. - - Returns: - DataArray with dims [cluster, segment] or None if no segmentation. - """ - return self.results.segment_durations - - @property - def segment_centers(self) -> xr.DataArray | None: - """Center of each intra-period segment. - - Only available if segmentation was configured during clustering. + """Duration of each segment in timesteps. Returns: - DataArray with dims [cluster, segment] or None if no segmentation. + DataArray with dims [cluster, segment, period?, scenario?], or None if not segmented. """ - return self.results.segment_centers + result = self._clustering_result.segment_durations + if result is None: + return None + # tsam_xarray uses 'timestep', we use 'segment' + if 'timestep' in result.dims: + result = result.rename({'timestep': 'segment'}) + return self._unrename(result) # ========================================================================== # Methods # ========================================================================== - def expand_data( - self, - aggregated: xr.DataArray, - original_time: pd.DatetimeIndex | None = None, - ) -> xr.DataArray: - """Expand aggregated data back to original timesteps. - - Uses the timestep_mapping to map each original timestep to its - representative value from the aggregated data. Fully vectorized using - xarray's advanced indexing - no loops over period/scenario dimensions. + def disaggregate(self, data: xr.DataArray) -> xr.DataArray: + """Expand clustered data back to original timesteps. - Args: - aggregated: DataArray with aggregated (cluster, time) or (time,) dimension. - original_time: Original time coordinates. Defaults to self.original_timesteps. - - Returns: - DataArray expanded to original timesteps. - """ - if original_time is None: - original_time = self.original_timesteps + Delegates to tsam_xarray's ClusteringResult.disaggregate(). Handles + the dim rename from flixopt's ``(cluster, time)`` to tsam_xarray's + ``(cluster, timestep)`` convention. - timestep_mapping = self.timestep_mapping # Already multi-dimensional DataArray - - if 'cluster' not in aggregated.dims: - # No cluster dimension: use mapping directly as time index - expanded = aggregated.isel(time=timestep_mapping) - else: - # Has cluster dimension: compute cluster and time indices from mapping - # For segmented systems, time dimension is n_segments, not timesteps_per_cluster - if self.is_segmented and self.n_segments is not None: - time_dim_size = self.n_segments - else: - time_dim_size = self.timesteps_per_cluster - - cluster_indices = timestep_mapping // time_dim_size - time_indices = timestep_mapping % time_dim_size - - # xarray's advanced indexing handles broadcasting across period/scenario dims - expanded = aggregated.isel(cluster=cluster_indices, time=time_indices) - - # Clean up: drop coordinate artifacts from isel, then rename original_time -> time - # The isel operation may leave 'cluster' and 'time' as non-dimension coordinates - expanded = expanded.drop_vars(['cluster', 'time'], errors='ignore') - expanded = expanded.rename({'original_time': 'time'}).assign_coords(time=original_time) - - return expanded.transpose('time', ...).assign_attrs(aggregated.attrs) - - def build_expansion_divisor( - self, - original_time: pd.DatetimeIndex | None = None, - ) -> xr.DataArray: - """Build divisor for correcting segment totals when expanding to hourly. - - For segmented systems, each segment value is a total that gets repeated N times - when expanded to hourly resolution (where N = segment duration in timesteps). - This divisor allows converting those totals back to hourly rates during expansion. - - For each original timestep, returns the number of original timesteps that map - to the same (cluster, segment) - i.e., the segment duration in timesteps. - - Fully vectorized using xarray's advanced indexing - no loops over period/scenario. - - Args: - original_time: Original time coordinates. Defaults to self.original_timesteps. - - Returns: - DataArray with dims ['time'] or ['time', 'period'?, 'scenario'?] containing - the number of timesteps in each segment, aligned to original timesteps. - """ - if not self.is_segmented or self.n_segments is None: - raise ValueError('build_expansion_divisor requires a segmented clustering') - - if original_time is None: - original_time = self.original_timesteps - - timestep_mapping = self.timestep_mapping # Already multi-dimensional - segment_durations = self.results.segment_durations # [cluster, segment, period?, scenario?] - - # Decode cluster and segment indices from timestep_mapping - # For segmented systems, encoding is: cluster_id * n_segments + segment_idx - time_dim_size = self.n_segments - cluster_indices = timestep_mapping // time_dim_size - segment_indices = timestep_mapping % time_dim_size # This IS the segment index - - # Get duration for each segment directly - # segment_durations[cluster, segment] -> duration - divisor = segment_durations.isel(cluster=cluster_indices, segment=segment_indices) - - # Clean up coordinates and rename - divisor = divisor.drop_vars(['cluster', 'time', 'segment'], errors='ignore') - divisor = divisor.rename({'original_time': 'time'}).assign_coords(time=original_time) - - return divisor.transpose('time', ...).rename('expansion_divisor') - - def get_result( - self, - period: Any = None, - scenario: Any = None, - ) -> TsamClusteringResult: - """Get the tsam ClusteringResult for a specific (period, scenario). + For non-segmented systems, values are repeated for each timestep in the period. + For segmented systems, values are placed at segment boundaries with NaN + elsewhere — use ``.ffill()``, ``.interpolate_na()``, or ``.fillna()`` + on the result. Args: - period: Period label (if applicable). - scenario: Scenario label (if applicable). + data: DataArray with ``(cluster, time)`` or ``(cluster, segment)`` dims. Returns: - The tsam ClusteringResult for the specified combination. + DataArray with ``time`` dim restored to original timesteps. """ - return self.results.sel(period=period, scenario=scenario) + # Rename flixopt dim names to tsam_xarray's 'timestep' convention + flixopt_to_tsam = {'time': 'timestep', 'segment': 'timestep'} + renames_to_tsam = {k: v for k, v in flixopt_to_tsam.items() if k in data.dims} + if renames_to_tsam: + data = data.rename(renames_to_tsam) + # Rename period/scenario dims to internal names (_period, _scenario) + reverse_unrename = {v: k for k, v in self._unrename_map.items()} + renames = {k: v for k, v in reverse_unrename.items() if k in data.dims} + if renames: + data = data.rename(renames) + result = self._clustering_result.disaggregate(data) + return self._unrename(result) def apply( self, - data: pd.DataFrame, - period: Any = None, - scenario: Any = None, - ) -> AggregationResult: + data: xr.DataArray, + ) -> TsamXarrayAggregationResult: """Apply the saved clustering to new data. Args: - data: DataFrame with time series data to cluster. - period: Period label (if applicable). - scenario: Scenario label (if applicable). + data: DataArray with time series data to cluster. Returns: - tsam AggregationResult with the clustering applied. + tsam_xarray AggregationResult with the clustering applied. """ - return self.results.sel(period=period, scenario=scenario).apply(data) + return self._clustering_result.apply(data) + + # ========================================================================== + # Serialization + # ========================================================================== def to_json(self, path: str | Path) -> None: """Save the clustering for reuse. - Uses ClusteringResults.to_dict() which preserves full tsam ClusteringResult. Can be loaded later with Clustering.from_json() and used with flow_system.transform.apply_clustering(). @@ -962,7 +288,7 @@ def to_json(self, path: str | Path) -> None: path: Path to save the JSON file. """ data = { - 'results': self.results.to_dict(), + 'clustering_result': self._clustering_result.to_dict(), 'original_timesteps': [ts.isoformat() for ts in self.original_timesteps], } @@ -977,8 +303,8 @@ def from_json( ) -> Clustering: """Load a clustering from JSON. - The loaded Clustering has full apply() support because ClusteringResult - is fully preserved via tsam's serialization. + The loaded Clustering has full apply() and disaggregate() support + because ClusteringResult is fully preserved via serialization. Args: path: Path to the JSON file. @@ -991,286 +317,59 @@ def from_json( with open(path) as f: data = json.load(f) - results = ClusteringResults.from_dict(data['results']) + # Support both new format (clustering_result) and legacy format (results) + if 'clustering_result' in data: + clustering_result = data['clustering_result'] + elif 'results' in data: + clustering_result = data['results'] # Legacy format, handled by __init__ + else: + raise ValueError('JSON file must contain "clustering_result" or "results" key') if original_timesteps is None: original_timesteps = pd.DatetimeIndex([pd.Timestamp(ts) for ts in data['original_timesteps']]) return cls( - results=results, + clustering_result=clustering_result, original_timesteps=original_timesteps, ) - # ========================================================================== - # Visualization - # ========================================================================== - - @property - def plot(self) -> ClusteringPlotAccessor: - """Access plotting methods for clustering visualization. - - Returns: - ClusteringPlotAccessor with compare(), heatmap(), and clusters() methods. - """ - return ClusteringPlotAccessor(self) - - # ========================================================================== - # Private helpers - # ========================================================================== - - def _build_timestep_mapping(self) -> xr.DataArray: - """Build timestep_mapping DataArray.""" - n_original = len(self.original_timesteps) - original_time_coord = self.original_timesteps.rename('original_time') - return self.results._build_property_array( - lambda cr: _build_timestep_mapping(cr, n_original), - base_dims=['original_time'], - base_coords={'original_time': original_time_coord}, - name='timestep_mapping', - ) - - def _create_reference_structure(self, include_original_data: bool = True) -> tuple[dict, dict[str, xr.DataArray]]: + def _create_reference_structure(self) -> tuple[dict, dict[str, xr.DataArray]]: """Create serialization structure for to_dataset(). - Args: - include_original_data: Whether to include original_data in serialization. - Set to False for smaller files when plot.compare() isn't needed after IO. - Defaults to True. - Returns: Tuple of (reference_dict, arrays_dict). """ - arrays = {} - - # Collect original_data arrays - # Rename 'time' to 'original_time' to avoid conflict with clustered FlowSystem's time coord - original_data_refs = None - if include_original_data and self.original_data is not None: - original_data_refs = [] - # Use variables for faster access (avoids _construct_dataarray overhead) - variables = self.original_data.variables - for name in self.original_data.data_vars: - var = variables[name] - ref_name = f'original_data|{name}' - # Rename time dim to avoid xarray alignment issues - if 'time' in var.dims: - new_dims = tuple('original_time' if d == 'time' else d for d in var.dims) - arrays[ref_name] = xr.Variable(new_dims, var.values, attrs=var.attrs) - else: - arrays[ref_name] = var - original_data_refs.append(f':::{ref_name}') - - # NOTE: aggregated_data is NOT serialized - it's identical to the FlowSystem's - # main data arrays and would be redundant. After loading, aggregated_data is - # reconstructed from the FlowSystem's dataset. - - # Collect metrics arrays - metrics_refs = None - if self._metrics is not None: - metrics_refs = [] - # Use variables for faster access (avoids _construct_dataarray overhead) - metrics_vars = self._metrics.variables - for name in self._metrics.data_vars: - ref_name = f'metrics|{name}' - arrays[ref_name] = metrics_vars[name] - metrics_refs.append(f':::{ref_name}') - reference = { '__class__': 'Clustering', - 'results': self.results.to_dict(), # Full ClusteringResults serialization + 'clustering_result': self._clustering_result.to_dict(), 'original_timesteps': [ts.isoformat() for ts in self.original_timesteps], - '_original_data_refs': original_data_refs, - '_metrics_refs': metrics_refs, } - - return reference, arrays - - def __init__( - self, - results: ClusteringResults | dict | None = None, - original_timesteps: pd.DatetimeIndex | list[str] | None = None, - original_data: xr.Dataset | None = None, - aggregated_data: xr.Dataset | None = None, - _metrics: xr.Dataset | None = None, - # These are for reconstruction from serialization - _original_data_refs: list[str] | None = None, - _metrics_refs: list[str] | None = None, - # Internal: AggregationResult dict for full data access - _aggregation_results: dict[tuple, AggregationResult] | None = None, - _dim_names: list[str] | None = None, - ): - """Initialize Clustering object. - - Args: - results: ClusteringResults instance, or dict from to_dict() (for deserialization). - Not needed if _aggregation_results is provided. - original_timesteps: Original timesteps before clustering. - original_data: Original dataset before clustering (for expand/plotting). - aggregated_data: Aggregated dataset after clustering (for plotting). - After loading from file, this is reconstructed from FlowSystem data. - _metrics: Pre-computed metrics dataset. - _original_data_refs: Internal: resolved DataArrays from serialization. - _metrics_refs: Internal: resolved DataArrays from serialization. - _aggregation_results: Internal: dict of AggregationResult for full data access. - _dim_names: Internal: dimension names when using _aggregation_results. - """ - # Handle ISO timestamp strings from serialization - if ( - isinstance(original_timesteps, list) - and len(original_timesteps) > 0 - and isinstance(original_timesteps[0], str) - ): - original_timesteps = pd.DatetimeIndex([pd.Timestamp(ts) for ts in original_timesteps]) - - # Store AggregationResults if provided (full data access) - self._aggregation_results = _aggregation_results - self._dim_names = _dim_names or [] - - # Handle results - only needed for serialization path - if results is not None: - if isinstance(results, dict): - results = ClusteringResults.from_dict(results) - self._results_cache = results - else: - self._results_cache = None - - # Flag indicating this was loaded from serialization (missing full AggregationResult data) - self._from_serialization = _aggregation_results is None and results is not None - - self.original_timesteps = original_timesteps if original_timesteps is not None else pd.DatetimeIndex([]) - self._metrics = _metrics - - # Handle reconstructed data from refs (list of DataArrays) - if _original_data_refs is not None and isinstance(_original_data_refs, list): - # These are resolved DataArrays from the structure resolver - if all(isinstance(da, xr.DataArray) for da in _original_data_refs): - # Rename 'original_time' back to 'time' and strip 'original_data|' prefix - data_vars = {} - for da in _original_data_refs: - if 'original_time' in da.dims: - da = da.rename({'original_time': 'time'}) - # Strip 'original_data|' prefix from name (added during serialization) - name = da.name - if name.startswith('original_data|'): - name = name[14:] # len('original_data|') = 14 - data_vars[name] = da.rename(name) - self.original_data = xr.Dataset(data_vars) - else: - self.original_data = original_data - else: - self.original_data = original_data - - self.aggregated_data = aggregated_data - - if _metrics_refs is not None and isinstance(_metrics_refs, list): - if all(isinstance(da, xr.DataArray) for da in _metrics_refs): - # Strip 'metrics|' prefix from name (added during serialization) - data_vars = {} - for da in _metrics_refs: - name = da.name - if name.startswith('metrics|'): - name = name[8:] # len('metrics|') = 8 - data_vars[name] = da.rename(name) - self._metrics = xr.Dataset(data_vars) - - @property - def results(self) -> ClusteringResults: - """ClusteringResults for structure access (derived from AggregationResults or cached).""" - if self._results_cache is not None: - return self._results_cache - if self._aggregation_results is not None: - # Derive from AggregationResults (cached on first access) - self._results_cache = ClusteringResults( - {k: r.clustering for k, r in self._aggregation_results.items()}, - self._dim_names, - ) - return self._results_cache - raise ValueError('No results available - neither AggregationResults nor ClusteringResults set') - - @classmethod - def _from_aggregation_results( - cls, - aggregation_results: dict[tuple, AggregationResult], - dim_names: list[str], - original_timesteps: pd.DatetimeIndex | None = None, - original_data: xr.Dataset | None = None, - ) -> Clustering: - """Create Clustering from AggregationResult dict. - - This is the primary way to create a Clustering with full data access. - Called by ClusteringResults.apply() and TransformAccessor. - - Args: - aggregation_results: Dict mapping (period, scenario) tuples to AggregationResult. - dim_names: Dimension names, e.g., ['period', 'scenario']. - original_timesteps: Original timesteps (optional, for expand). - original_data: Original dataset (optional, for plotting). - - Returns: - Clustering with full AggregationResult access. - """ - return cls( - original_timesteps=original_timesteps, - original_data=original_data, - _aggregation_results=aggregation_results, - _dim_names=dim_names, - ) + return reference, {} # ========================================================================== - # Iteration over AggregationResults (for direct access to tsam results) + # Access to tsam_xarray AggregationResult # ========================================================================== - def __iter__(self): - """Iterate over (key, AggregationResult) pairs. + @property + def aggregation_result(self) -> TsamXarrayAggregationResult: + """The tsam_xarray AggregationResult for full data access. + + Only available before serialization. After loading from file, + use clustering_result for structure-only access. Raises: - ValueError: If accessed on a Clustering loaded from JSON. + ValueError: If accessed on a Clustering loaded from JSON/NetCDF. """ - self._require_full_data('iteration') - return iter(self._aggregation_results.items()) + self._require_full_data('aggregation_result') + return self._aggregation_result def __len__(self) -> int: """Number of (period, scenario) combinations.""" - if self._aggregation_results is not None: - return len(self._aggregation_results) - return len(list(self.results.keys())) - - def __getitem__(self, key: tuple) -> AggregationResult: - """Get AggregationResult by (period, scenario) key. - - Raises: - ValueError: If accessed on a Clustering loaded from JSON. - """ - self._require_full_data('item access') - return self._aggregation_results[key] - - def items(self): - """Iterate over (key, AggregationResult) pairs. - - Raises: - ValueError: If accessed on a Clustering loaded from JSON. - """ - self._require_full_data('items()') - return self._aggregation_results.items() - - def keys(self): - """Iterate over (period, scenario) keys.""" - if self._aggregation_results is not None: - return self._aggregation_results.keys() - return self.results.keys() - - def values(self): - """Iterate over AggregationResult objects. - - Raises: - ValueError: If accessed on a Clustering loaded from JSON. - """ - self._require_full_data('values()') - return self._aggregation_results.values() + return len(self._clustering_result.clusterings) def _require_full_data(self, operation: str) -> None: """Raise error if full AggregationResult data is not available.""" - if self._from_serialization: + if self._from_serialization or self._aggregation_result is None: raise ValueError( f'{operation} requires full AggregationResult data, ' f'but this Clustering was loaded from JSON. ' @@ -1287,376 +386,6 @@ def __repr__(self) -> str: ) -class ClusteringPlotAccessor: - """Plot accessor for Clustering objects. - - Provides visualization methods for comparing original vs aggregated data - and understanding the clustering structure. - """ - - def __init__(self, clustering: Clustering): - self._clustering = clustering - - def compare( - self, - kind: str = 'timeseries', - variables: str | list[str] | None = None, - *, - select: SelectType | None = None, - colors: ColorType | None = None, - show: bool | None = None, - data_only: bool = False, - **plotly_kwargs: Any, - ) -> PlotResult: - """Compare original vs aggregated data. - - Args: - kind: Type of comparison plot. - - 'timeseries': Time series comparison (default) - - 'duration_curve': Sorted duration curve comparison - variables: Variable(s) to plot. Can be a string, list of strings, - or None to plot all time-varying variables. - select: xarray-style selection dict, e.g. {'scenario': 'Base Case'}. - colors: Color specification (colorscale name, color list, or label-to-color dict). - show: Whether to display the figure. - Defaults to CONFIG.Plotting.default_show. - data_only: If True, skip figure creation and return only data. - **plotly_kwargs: Additional arguments passed to plotly (e.g., color, line_dash, - facet_col, facet_row). Defaults: x='time'/'duration', color='variable', - line_dash='representation', symbol=None. - - Returns: - PlotResult containing the comparison figure and underlying data. - """ - import plotly.graph_objects as go - - from ..config import CONFIG - from ..plot_result import PlotResult - from ..statistics_accessor import _apply_selection - - if kind not in ('timeseries', 'duration_curve'): - raise ValueError(f"Unknown kind '{kind}'. Use 'timeseries' or 'duration_curve'.") - - clustering = self._clustering - if clustering.original_data is None or clustering.aggregated_data is None: - raise ValueError('No original/aggregated data available for comparison') - - resolved_variables = self._resolve_variables(variables) - - # Build Dataset with variables as data_vars - data_vars = {} - for var in resolved_variables: - original = clustering.original_data[var] - clustered = clustering.expand_data(clustering.aggregated_data[var]) - combined = xr.concat([original, clustered], dim=pd.Index(['Original', 'Clustered'], name='representation')) - data_vars[var] = combined - ds = xr.Dataset(data_vars) - - ds = _apply_selection(ds, select) - - if kind == 'duration_curve': - sorted_vars = {} - # Use variables for faster access (avoids _construct_dataarray overhead) - variables = ds.variables - rep_values = ds.coords['representation'].values - rep_idx = {rep: i for i, rep in enumerate(rep_values)} - for var in ds.data_vars: - data = variables[var].values - for rep in rep_values: - # Direct numpy indexing instead of .sel() - values = np.sort(data[rep_idx[rep]].flatten())[::-1] - sorted_vars[(var, rep)] = values - # Get length from first sorted array - n = len(next(iter(sorted_vars.values()))) - ds = xr.Dataset( - { - var: xr.DataArray( - [sorted_vars[(var, r)] for r in ['Original', 'Clustered']], - dims=['representation', 'duration'], - coords={'representation': ['Original', 'Clustered'], 'duration': range(n)}, - ) - for var in resolved_variables - } - ) - - title = ( - ( - 'Original vs Clustered' - if len(resolved_variables) > 1 - else f'Original vs Clustered: {resolved_variables[0]}' - ) - if kind == 'timeseries' - else ('Duration Curve' if len(resolved_variables) > 1 else f'Duration Curve: {resolved_variables[0]}') - ) - - # Early return for data_only mode - if data_only: - return PlotResult(data=ds, figure=go.Figure()) - - # Apply slot defaults - defaults = { - 'x': 'duration' if kind == 'duration_curve' else 'time', - 'color': 'variable', - 'line_dash': 'representation', - 'line_dash_map': {'Original': 'dot', 'Clustered': 'solid'}, - 'symbol': None, # Block symbol slot - } - _apply_slot_defaults(plotly_kwargs, defaults) - - color_kwargs = _build_color_kwargs(colors, list(ds.data_vars)) - fig = ds.plotly.line( - title=title, - **color_kwargs, - **plotly_kwargs, - ) - fig.update_yaxes(matches=None) - fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1])) - - plot_result = PlotResult(data=ds, figure=fig) - - if show is None: - show = CONFIG.Plotting.default_show - if show: - plot_result.show() - - return plot_result - - def _get_time_varying_variables(self) -> list[str]: - """Get list of time-varying variables from original data that also exist in aggregated data.""" - if self._clustering.original_data is None: - return [] - # Get variables that exist in both original and aggregated data - aggregated_vars = ( - set(self._clustering.aggregated_data.data_vars) - if self._clustering.aggregated_data is not None - else set(self._clustering.original_data.data_vars) - ) - return [ - name - for name in self._clustering.original_data.data_vars - if name in aggregated_vars - and 'time' in self._clustering.original_data[name].dims - and not np.isclose( - self._clustering.original_data[name].min(), - self._clustering.original_data[name].max(), - ) - ] - - def _resolve_variables(self, variables: str | list[str] | None) -> list[str]: - """Resolve variables parameter to a list of valid variable names.""" - time_vars = self._get_time_varying_variables() - if not time_vars: - raise ValueError('No time-varying variables found') - - if variables is None: - return time_vars - elif isinstance(variables, str): - if variables not in time_vars: - raise ValueError(f"Variable '{variables}' not found. Available: {time_vars}") - return [variables] - else: - invalid = [v for v in variables if v not in time_vars] - if invalid: - raise ValueError(f'Variables {invalid} not found. Available: {time_vars}') - return list(variables) - - def heatmap( - self, - *, - select: SelectType | None = None, - colors: str | list[str] | None = None, - show: bool | None = None, - data_only: bool = False, - **plotly_kwargs: Any, - ) -> PlotResult: - """Plot cluster assignments over time as a heatmap timeline. - - Shows which cluster each timestep belongs to as a horizontal color bar. - The x-axis is time, color indicates cluster assignment. This visualization - aligns with time series data, making it easy to correlate cluster - assignments with other plots. - - For multi-period/scenario data, uses faceting and/or animation. - - Args: - select: xarray-style selection dict, e.g. {'scenario': 'Base Case'}. - colors: Colorscale name (str) or list of colors for heatmap coloring. - Dicts are not supported for heatmaps. - Defaults to plotly template's sequential colorscale. - show: Whether to display the figure. - Defaults to CONFIG.Plotting.default_show. - data_only: If True, skip figure creation and return only data. - **plotly_kwargs: Additional arguments passed to plotly (e.g., facet_col, animation_frame). - - Returns: - PlotResult containing the heatmap figure and cluster assignment data. - The data has 'cluster' variable with time dimension, matching original timesteps. - """ - import plotly.graph_objects as go - - from ..config import CONFIG - from ..plot_result import PlotResult - from ..statistics_accessor import _apply_selection - - clustering = self._clustering - cluster_assignments = clustering.cluster_assignments - timesteps_per_cluster = clustering.timesteps_per_cluster - original_time = clustering.original_timesteps - - if select: - cluster_assignments = _apply_selection(cluster_assignments.to_dataset(name='cluster'), select)['cluster'] - - # Expand cluster_assignments to per-timestep - extra_dims = [d for d in cluster_assignments.dims if d != 'original_cluster'] - expanded_values = np.repeat(cluster_assignments.values, timesteps_per_cluster, axis=0) - - coords = {'time': original_time} - coords.update({d: cluster_assignments.coords[d].values for d in extra_dims}) - cluster_da = xr.DataArray(expanded_values, dims=['time'] + extra_dims, coords=coords) - cluster_da.name = 'cluster' - - # Early return for data_only mode - if data_only: - return PlotResult(data=xr.Dataset({'cluster': cluster_da}), figure=go.Figure()) - - heatmap_da = cluster_da.expand_dims('y', axis=-1).assign_coords(y=['Cluster']) - heatmap_da.name = 'cluster_assignment' - heatmap_da = heatmap_da.transpose('time', 'y', ...) - - # Use plotly.imshow for heatmap - # Only pass color_continuous_scale if explicitly provided (template handles default) - if colors is not None: - plotly_kwargs.setdefault('color_continuous_scale', colors) - fig = heatmap_da.plotly.imshow( - title='Cluster Assignments', - aspect='auto', - **plotly_kwargs, - ) - - fig.update_yaxes(showticklabels=False) - fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1])) - - # Data is exactly what we plotted (without dummy y dimension) - data = xr.Dataset({'cluster': cluster_da}) - plot_result = PlotResult(data=data, figure=fig) - - if show is None: - show = CONFIG.Plotting.default_show - if show: - plot_result.show() - - return plot_result - - def clusters( - self, - variables: str | list[str] | None = None, - *, - select: SelectType | None = None, - colors: ColorType | None = None, - show: bool | None = None, - data_only: bool = False, - **plotly_kwargs: Any, - ) -> PlotResult: - """Plot each cluster's typical period profile. - - Shows each cluster as a separate faceted subplot with all variables - colored differently. Useful for understanding what each cluster represents. - - Args: - variables: Variable(s) to plot. Can be a string, list of strings, - or None to plot all time-varying variables. - select: xarray-style selection dict, e.g. {'scenario': 'Base Case'}. - colors: Color specification (colorscale name, color list, or label-to-color dict). - show: Whether to display the figure. - Defaults to CONFIG.Plotting.default_show. - data_only: If True, skip figure creation and return only data. - **plotly_kwargs: Additional arguments passed to plotly (e.g., color, facet_col, - facet_col_wrap). Defaults: x='time', color='variable', symbol=None. - - Returns: - PlotResult containing the figure and underlying data. - """ - import plotly.graph_objects as go - - from ..config import CONFIG - from ..plot_result import PlotResult - from ..statistics_accessor import _apply_selection - - clustering = self._clustering - if clustering.aggregated_data is None: - raise ValueError('No aggregated data available') - - aggregated_data = _apply_selection(clustering.aggregated_data, select) - resolved_variables = self._resolve_variables(variables) - - n_clusters = clustering.n_clusters - timesteps_per_cluster = clustering.timesteps_per_cluster - cluster_occurrences = clustering.cluster_occurrences - - # Build cluster labels - occ_extra_dims = [d for d in cluster_occurrences.dims if d != 'cluster'] - if occ_extra_dims: - cluster_labels = [f'Cluster {c}' for c in range(n_clusters)] - else: - cluster_labels = [ - f'Cluster {c} (×{int(cluster_occurrences.sel(cluster=c).values)})' for c in range(n_clusters) - ] - - data_vars = {} - for var in resolved_variables: - da = aggregated_data[var] - if 'cluster' in da.dims: - data_by_cluster = da.values - else: - data_by_cluster = da.values.reshape(n_clusters, timesteps_per_cluster) - data_vars[var] = xr.DataArray( - data_by_cluster, - dims=['cluster', 'time'], - coords={'cluster': cluster_labels, 'time': range(timesteps_per_cluster)}, - ) - - ds = xr.Dataset(data_vars) - - # Early return for data_only mode (include occurrences in result) - if data_only: - data_vars['occurrences'] = cluster_occurrences - return PlotResult(data=xr.Dataset(data_vars), figure=go.Figure()) - - title = 'Clusters' if len(resolved_variables) > 1 else f'Clusters: {resolved_variables[0]}' - - # Apply slot defaults - defaults = { - 'x': 'time', - 'color': 'variable', - 'symbol': None, # Block symbol slot - } - _apply_slot_defaults(plotly_kwargs, defaults) - - color_kwargs = _build_color_kwargs(colors, list(ds.data_vars)) - fig = ds.plotly.line( - title=title, - **color_kwargs, - **plotly_kwargs, - ) - fig.update_yaxes(matches=None) - fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1])) - - data_vars['occurrences'] = cluster_occurrences - result_data = xr.Dataset(data_vars) - plot_result = PlotResult(data=result_data, figure=fig) - - if show is None: - show = CONFIG.Plotting.default_show - if show: - plot_result.show() - - return plot_result - - -# Backwards compatibility alias -AggregationResults = Clustering - - def _register_clustering_classes(): """Register clustering classes for IO.""" from ..structure import CLASS_REGISTRY diff --git a/flixopt/core.py b/flixopt/core.py index aca380f5e..c2a32349d 100644 --- a/flixopt/core.py +++ b/flixopt/core.py @@ -40,32 +40,11 @@ class TimeSeriesData(xr.DataArray): def __init__( self, *args: Any, - clustering_group: str | None = None, - clustering_weight: float | None = None, **kwargs: Any, ): - """ - Args: - *args: Arguments passed to DataArray - clustering_group: Clustering group name. Use this when multiple time series should share the same - clustering weight (1/n where n is the number of series in the group). Mutually exclusive with clustering_weight. - clustering_weight: Clustering weight (0-1). Use this to assign a specific weight to a single time series. - Mutually exclusive with clustering_group. - **kwargs: Additional arguments passed to DataArray - """ - - if (clustering_group is not None) and (clustering_weight is not None): - raise ValueError('Use either clustering_group or clustering_weight, not both') - # Let xarray handle all the initialization complexity super().__init__(*args, **kwargs) - # Add our metadata to attrs after initialization - if clustering_group is not None: - self.attrs['clustering_group'] = clustering_group - if clustering_weight is not None: - self.attrs['clustering_weight'] = clustering_weight - # Always mark as TimeSeriesData self.attrs['__timeseries_data__'] = True @@ -81,33 +60,16 @@ def fit_to_coords( da = DataConverter.to_dataarray(self.data, coords=coords) return self.__class__( da, - clustering_group=self.clustering_group, - clustering_weight=self.clustering_weight, name=name if name is not None else self.name, ) - @property - def clustering_group(self) -> str | None: - return self.attrs.get('clustering_group') - - @property - def clustering_weight(self) -> float | None: - return self.attrs.get('clustering_weight') - @classmethod def from_dataarray( cls, da: xr.DataArray, - clustering_group: str | None = None, - clustering_weight: float | None = None, ): """Create TimeSeriesData from DataArray, extracting metadata from attrs.""" - final_clustering_group = clustering_group if clustering_group is not None else da.attrs.get('clustering_group') - final_clustering_weight = ( - clustering_weight if clustering_weight is not None else da.attrs.get('clustering_weight') - ) - - return cls(da, clustering_group=final_clustering_group, clustering_weight=final_clustering_weight) + return cls(da) @classmethod def is_timeseries_data(cls, obj) -> bool: @@ -115,13 +77,7 @@ def is_timeseries_data(cls, obj) -> bool: return isinstance(obj, xr.DataArray) and obj.attrs.get('__timeseries_data__', False) def __repr__(self): - clustering_info = [] - if self.clustering_group: - clustering_info.append(f"clustering_group='{self.clustering_group}'") - if self.clustering_weight is not None: - clustering_info.append(f'clustering_weight={self.clustering_weight}') - - info_str = f'TimeSeriesData({", ".join(clustering_info)})' if clustering_info else 'TimeSeriesData' + info_str = 'TimeSeriesData' return f'{info_str}\n{super().__repr__()}' diff --git a/flixopt/flow_system.py b/flixopt/flow_system.py index e838c5480..df71e2ff5 100644 --- a/flixopt/flow_system.py +++ b/flixopt/flow_system.py @@ -696,7 +696,7 @@ def _create_reference_structure(self) -> tuple[dict, dict[str, xr.DataArray]]: return reference_structure, all_extracted_arrays - def to_dataset(self, include_solution: bool = True, include_original_data: bool = True) -> xr.Dataset: + def to_dataset(self, include_solution: bool = True) -> xr.Dataset: """ Convert the FlowSystem to an xarray Dataset. Ensures FlowSystem is connected before serialization. @@ -714,10 +714,6 @@ def to_dataset(self, include_solution: bool = True, include_original_data: bool include_solution: Whether to include the optimization solution in the dataset. Defaults to True. Set to False to get only the FlowSystem structure without solution data (useful for copying or saving templates). - include_original_data: Whether to include clustering.original_data in the dataset. - Defaults to True. Set to False for smaller files (~38% reduction) when - clustering.plot.compare() isn't needed after loading. The core workflow - (optimize → expand) works without original_data. Returns: xr.Dataset: Dataset containing all DataArrays with structure in attributes @@ -734,7 +730,7 @@ def to_dataset(self, include_solution: bool = True, include_original_data: bool base_ds = super().to_dataset() # Add FlowSystem-specific data (solution, clustering, metadata) - return fx_io.flow_system_to_dataset(self, base_ds, include_solution, include_original_data) + return fx_io.flow_system_to_dataset(self, base_ds, include_solution) @classmethod def from_dataset(cls, ds: xr.Dataset) -> FlowSystem: @@ -766,7 +762,6 @@ def to_netcdf( path: str | pathlib.Path, compression: int = 5, overwrite: bool = False, - include_original_data: bool = True, ): """ Save the FlowSystem to a NetCDF file. @@ -779,9 +774,6 @@ def to_netcdf( path: The path to the netCDF file. Parent directories are created if they don't exist. compression: The compression level to use when saving the file (0-9). overwrite: If True, overwrite existing file. If False, raise error if file exists. - include_original_data: Whether to include clustering.original_data in the file. - Defaults to True. Set to False for smaller files (~38% reduction) when - clustering.plot.compare() isn't needed after loading. Raises: FileExistsError: If overwrite=False and file already exists. @@ -801,7 +793,7 @@ def to_netcdf( self.name = path.stem try: - ds = self.to_dataset(include_original_data=include_original_data) + ds = self.to_dataset() fx_io.save_dataset_to_netcdf(ds, path, compression=compression) logger.info(f'Saved FlowSystem to {path}') except Exception as e: diff --git a/flixopt/io.py b/flixopt/io.py index c9ce26919..20d302204 100644 --- a/flixopt/io.py +++ b/flixopt/io.py @@ -1858,16 +1858,9 @@ def _restore_clustering( clustering = fs_cls._resolve_reference_structure(clustering_structure, clustering_arrays) flow_system.clustering = clustering - # Reconstruct aggregated_data from FlowSystem's main data arrays - if clustering.aggregated_data is None and main_var_names: - from .core import drop_constant_arrays - - main_vars = {name: arrays_dict[name] for name in main_var_names} - clustering.aggregated_data = drop_constant_arrays(xr.Dataset(main_vars), dim='time') - - # Restore cluster_weight from clustering's representative_weights - if hasattr(clustering, 'representative_weights'): - flow_system.cluster_weight = clustering.representative_weights + # Restore cluster_weight from clustering's cluster_occurrences + if hasattr(clustering, 'cluster_occurrences'): + flow_system.cluster_weight = clustering.cluster_occurrences.rename('cluster_weight') @staticmethod def _restore_metadata( @@ -1904,7 +1897,6 @@ def to_dataset( flow_system: FlowSystem, base_dataset: xr.Dataset, include_solution: bool = True, - include_original_data: bool = True, ) -> xr.Dataset: """Convert FlowSystem-specific data to dataset. @@ -1915,7 +1907,6 @@ def to_dataset( flow_system: The FlowSystem to serialize base_dataset: Dataset from parent class with basic structure include_solution: Whether to include optimization solution - include_original_data: Whether to include clustering.original_data Returns: Complete dataset with all FlowSystem data @@ -1931,7 +1922,7 @@ def to_dataset( ds = cls._add_carriers_to_dataset(ds, flow_system._carriers) # Add clustering - ds = cls._add_clustering_to_dataset(ds, flow_system.clustering, include_original_data) + ds = cls._add_clustering_to_dataset(ds, flow_system.clustering) # Add variable categories ds = cls._add_variable_categories_to_dataset(ds, flow_system._variable_categories) @@ -1996,17 +1987,13 @@ def _add_clustering_to_dataset( cls, ds: xr.Dataset, clustering: Any, - include_original_data: bool, ) -> xr.Dataset: """Add clustering object to dataset.""" if clustering is not None: - clustering_ref, clustering_arrays = clustering._create_reference_structure( - include_original_data=include_original_data - ) - # Add clustering arrays with prefix using batch assignment - # (individual ds[name] = arr assignments are slow) - prefixed_arrays = {f'{cls.CLUSTERING_PREFIX}{name}': arr for name, arr in clustering_arrays.items()} - ds = ds.assign(prefixed_arrays) + clustering_ref, clustering_arrays = clustering._create_reference_structure() + if clustering_arrays: + prefixed_arrays = {f'{cls.CLUSTERING_PREFIX}{name}': arr for name, arr in clustering_arrays.items()} + ds = ds.assign(prefixed_arrays) ds.attrs['clustering'] = json.dumps(clustering_ref, ensure_ascii=False) return ds @@ -2064,7 +2051,6 @@ def flow_system_to_dataset( flow_system: FlowSystem, base_dataset: xr.Dataset, include_solution: bool = True, - include_original_data: bool = True, ) -> xr.Dataset: """Convert FlowSystem-specific data to dataset. @@ -2075,7 +2061,6 @@ def flow_system_to_dataset( flow_system: The FlowSystem to serialize base_dataset: Dataset from parent class with basic structure include_solution: Whether to include optimization solution - include_original_data: Whether to include clustering.original_data Returns: Complete dataset with all FlowSystem data @@ -2083,4 +2068,4 @@ def flow_system_to_dataset( See Also: FlowSystemDatasetIO: Class containing the implementation """ - return FlowSystemDatasetIO.to_dataset(flow_system, base_dataset, include_solution, include_original_data) + return FlowSystemDatasetIO.to_dataset(flow_system, base_dataset, include_solution) diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index f7a3698cc..b200fa9cb 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -28,102 +28,37 @@ logger = logging.getLogger('flixopt') -def _combine_dataarray_slices( - slices: list[xr.DataArray], - base_dims: list[str], - extra_dims: list[str], - name: str | None = None, -) -> xr.DataArray: - """Combine DataArray slices with extra dimensions into a single DataArray. - - Args: - slices: List of DataArrays, each with extra dims already expanded. - base_dims: Base dimension names (e.g., ['cluster', 'time']). - extra_dims: Extra dimension names (e.g., ['period', 'scenario']). - name: Optional name for the result. - - Returns: - Combined DataArray with dims [*base_dims, *extra_dims]. - """ - if len(slices) == 1: - result = slices[0] - else: - combined = xr.combine_by_coords(slices) - # combine_by_coords returns Dataset when DataArrays have names - if isinstance(combined, xr.Dataset): - result = list(combined.data_vars.values())[0] - else: - result = combined - - # Ensure consistent dimension order for both single and multi-slice paths - result = result.transpose(*base_dims, *extra_dims) - - if name is not None: - result = result.rename(name) - return result - - -def _expand_dims_for_key(da: xr.DataArray, dim_names: list[str], key: tuple) -> xr.DataArray: - """Add dimensions to a DataArray based on key values. - - Args: - da: DataArray without extra dimensions. - dim_names: Names of dimensions to add (e.g., ['period', 'scenario']). - key: Tuple of coordinate values matching dim_names. - - Returns: - DataArray with extra dimensions added. - """ - for dim_name, coord_val in zip(dim_names, key, strict=True): - da = da.expand_dims({dim_name: [coord_val]}) - return da - - class _ReducedFlowSystemBuilder: - """Builds a reduced FlowSystem from tsam aggregation results. + """Builds a reduced FlowSystem from a tsam_xarray AggregationResult. This class encapsulates the construction of reduced FlowSystem datasets, - pre-computing shared coordinates and providing methods for building - each component (weights, typical periods, segment durations, metrics). + extracting cluster representatives, weights, and metrics from the + tsam_xarray result. Args: fs: The original FlowSystem being reduced. - aggregation_results: Dict mapping key tuples to tsam AggregationResult. + agg_result: tsam_xarray AggregationResult with DataArray-based results. timesteps_per_cluster: Number of timesteps per cluster. dt: Hours per timestep. - dim_names: Names of extra dimensions (e.g., ['period', 'scenario']). """ def __init__( self, fs: FlowSystem, - aggregation_results: dict[tuple, Any], + agg_result: Any, # tsam_xarray.AggregationResult timesteps_per_cluster: int, dt: float, - dim_names: list[str], + unrename_map: dict[str, str] | None = None, ): self._fs = fs - self._aggregation_results = aggregation_results + self._agg_result = agg_result self._timesteps_per_cluster = timesteps_per_cluster self._dt = dt - self._dim_names = dim_names - - # Extract info from first result (all should be consistent) - first_result = next(iter(aggregation_results.values())) - self._n_reduced_timesteps = len(first_result.cluster_representatives) - self._n_clusters = first_result.n_clusters - self._is_segmented = first_result.n_segments is not None - self._n_segments = first_result.n_segments - - # Validate all results have consistent structure - for key, result in aggregation_results.items(): - if result.n_clusters != self._n_clusters: - key_str = dict(zip(dim_names, key, strict=False)) if dim_names else key - raise ValueError( - f'Inconsistent cluster counts across periods/scenarios: ' - f'{key_str} has {result.n_clusters} clusters, but expected {self._n_clusters}. ' - f'This can happen when ExtremeConfig does not preserve cluster counts.' - ) + self._unrename_map = unrename_map or {} + + self._n_clusters = agg_result.n_clusters + self._is_segmented = agg_result.n_segments is not None + self._n_segments = agg_result.n_segments # Pre-compute coordinates self._cluster_coords = np.arange(self._n_clusters) @@ -142,135 +77,65 @@ def __init__( self._base_coords = {'cluster': self._cluster_coords, 'time': self._time_coords} - def _expand_and_combine( - self, - data_per_key: dict[tuple, xr.DataArray], - base_dims: list[str], - name: str | None = None, - ) -> xr.DataArray: - """Expand dims for each key and combine slices. - - Args: - data_per_key: Dict mapping keys to DataArrays without extra dims. - base_dims: Base dimension names (e.g., ['cluster'] or ['cluster', 'time']). - name: Optional name for the result. - - Returns: - Combined DataArray with dims [*base_dims, *dim_names]. - """ - slices = [_expand_dims_for_key(da, self._dim_names, key) for key, da in data_per_key.items()] - return _combine_dataarray_slices(slices, base_dims, self._dim_names, name=name) + def _unrename(self, da: xr.DataArray) -> xr.DataArray: + """Rename tsam_xarray output dims back to original names (e.g., _period -> period).""" + renames = {k: v for k, v in self._unrename_map.items() if k in da.dims} + return da.rename(renames) if renames else da def build_cluster_weights(self) -> xr.DataArray: - """Build cluster_weight DataArray from aggregation results. + """Build cluster_weight DataArray from aggregation result. Returns: - DataArray with dims [cluster, *dim_names]. + DataArray with dims [cluster, period?, scenario?]. """ - data_per_key = {} - for key, result in self._aggregation_results.items(): - weights = np.array([result.cluster_weights.get(c, 0) for c in range(self._n_clusters)]) - data_per_key[key] = xr.DataArray(weights, dims=['cluster'], coords={'cluster': self._cluster_coords}) - return self._expand_and_combine(data_per_key, ['cluster'], name='cluster_weight') + weights = self._agg_result.cluster_weights.rename(cluster='cluster') + return self._unrename(weights.rename('cluster_weight')) def build_typical_periods(self) -> dict[str, xr.DataArray]: - """Build typical periods DataArrays with (cluster, time, *dim_names) shape. + """Build typical periods DataArrays with (cluster, time, ...) shape. Returns: - Dict mapping column names to combined DataArrays. + Dict mapping column names to DataArrays. """ - column_slices: dict[str, dict[tuple, xr.DataArray]] = {} - - for key, tsam_result in self._aggregation_results.items(): - typical_df = tsam_result.cluster_representatives - for col in typical_df.columns: - series = typical_df[col] - if self._is_segmented: - # Segmented: MultiIndex (cluster, segment_step, segment_duration) - # Drop duration level and unstack by segment step - unstacked = series.droplevel('Segment Duration').unstack(level='Segment Step') - else: - # Non-segmented: MultiIndex (cluster, timestep) - unstacked = series.unstack(level='timestep') - da = xr.DataArray(unstacked.values, dims=['cluster', 'time'], coords=self._base_coords) - column_slices.setdefault(col, {})[key] = da - - return { - col: self._expand_and_combine(data_per_key, ['cluster', 'time']) - for col, data_per_key in column_slices.items() - } + representatives = self._agg_result.cluster_representatives + # representatives has dims: (cluster, timestep, variable, _period?, scenario?) + # We need to split by variable and rename timestep -> time + result = {} + # Exclude known dims (including renamed variants like _period, _cluster) + known_dims = {'cluster', 'timestep', 'period', 'scenario'} | set(self._unrename_map.keys()) + unknown_dims = [d for d in representatives.dims if d not in known_dims] + assert len(unknown_dims) == 1, ( + f'Expected exactly 1 variable dim, got {unknown_dims} (known: {known_dims}, all: {representatives.dims})' + ) + variable_dim = unknown_dims[0] + for var_name in representatives.coords[variable_dim].values: + da = representatives.sel({variable_dim: var_name}, drop=True) + # Rename timestep -> time and assign our coordinates + da = da.rename({'timestep': 'time'}) + da = da.assign_coords(cluster=self._cluster_coords, time=self._time_coords) + # Ensure cluster and time are first two dims + other_dims = [d for d in da.dims if d not in ('cluster', 'time')] + da = da.transpose('cluster', 'time', *other_dims) + result[str(var_name)] = self._unrename(da) + return result def build_segment_durations(self) -> xr.DataArray: """Build timestep_duration DataArray from segment durations. Returns: - DataArray with dims [cluster, time, *dim_names]. - - Raises: - ValueError: If not a segmented system. + DataArray with dims [cluster, time, period?, scenario?]. """ if not self._is_segmented: raise ValueError('build_segment_durations() requires a segmented system') - data_per_key = {} - for key, tsam_result in self._aggregation_results.items(): - seg_durs = tsam_result.segment_durations - data = np.array( - [[seg_durs[c][s] * self._dt for s in range(self._n_segments)] for c in range(self._n_clusters)] - ) - data_per_key[key] = xr.DataArray(data, dims=['cluster', 'time'], coords=self._base_coords) - - return self._expand_and_combine(data_per_key, ['cluster', 'time'], name='timestep_duration') - - def build_metrics(self) -> xr.Dataset: - """Build clustering metrics Dataset from aggregation results. - - Returns: - Dataset with RMSE, MAE, RMSE_duration metrics. - """ - # Convert accuracy to DataFrames, filtering out failures - metrics_dfs: dict[tuple, pd.DataFrame] = {} - for key, result in self._aggregation_results.items(): - try: - metrics_dfs[key] = _accuracy_to_dataframe(result.accuracy) - except Exception as e: - logger.warning(f'Failed to compute clustering metrics for {key}: {e}') - metrics_dfs[key] = pd.DataFrame() - - non_empty_metrics = {k: v for k, v in metrics_dfs.items() if not v.empty} - - if not non_empty_metrics: - return xr.Dataset() - - # Single slice case - if len(metrics_dfs) == 1 and len(non_empty_metrics) == 1: - metrics_df = next(iter(non_empty_metrics.values())) - return xr.Dataset( - { - col: xr.DataArray( - metrics_df[col].values, - dims=['time_series'], - coords={'time_series': metrics_df.index}, - ) - for col in metrics_df.columns - } - ) - - # Multi-dim case - all periods have same time series - sample_df = next(iter(non_empty_metrics.values())) - time_series_index = list(sample_df.index) - data_vars = {} - - for metric in sample_df.columns: - data_per_key = {} - for key, df in metrics_dfs.items(): - values = np.full(len(time_series_index), np.nan) if df.empty else df[metric].values - data_per_key[key] = xr.DataArray( - values, dims=['time_series'], coords={'time_series': time_series_index} - ) - data_vars[metric] = self._expand_and_combine(data_per_key, ['time_series'], name=metric) - - return xr.Dataset(data_vars) + seg_durs = self._agg_result.segment_durations + # Convert from timestep counts to hours + da = seg_durs * self._dt + # Rename dims to match our convention + da = da.rename({'timestep': 'time'}) + da = da.assign_coords(cluster=self._cluster_coords, time=self._time_coords) + other_dims = [d for d in da.dims if d not in ('cluster', 'time')] + return self._unrename(da.transpose('cluster', 'time', *other_dims).rename('timestep_duration')) def build_reduced_dataset(self, ds: xr.Dataset, typical_das: dict[str, xr.DataArray]) -> xr.Dataset: """Build the reduced dataset with (cluster, time) structure. @@ -284,6 +149,8 @@ def build_reduced_dataset(self, ds: xr.Dataset, typical_das: dict[str, xr.DataAr """ from .core import TimeSeriesData + n_reduced_timesteps = self._n_clusters * self._n_time_points + ds_new_vars = {} variables = ds.variables coord_cache = {k: ds.coords[k].values for k in ds.coords} @@ -298,7 +165,7 @@ def build_reduced_dataset(self, ds: xr.Dataset, typical_das: dict[str, xr.DataAr # Time-dependent but constant: reshape to (cluster, time, ...) time_idx = var.dims.index('time') slices = [slice(None)] * len(var.dims) - slices[time_idx] = slice(0, self._n_reduced_timesteps) + slices[time_idx] = slice(0, n_reduced_timesteps) sliced_values = var.values[tuple(slices)] other_dims = [d for d in var.dims if d != 'time'] @@ -337,13 +204,11 @@ def build(self, ds: xr.Dataset) -> FlowSystem: Reduced FlowSystem with clustering metadata attached. """ from .clustering import Clustering - from .core import drop_constant_arrays from .flow_system import FlowSystem # Build all components cluster_weight = self.build_cluster_weights() typical_das = self.build_typical_periods() - metrics = self.build_metrics() ds_new = self.build_reduced_dataset(ds, typical_das) # Add segment durations if segmented @@ -374,34 +239,13 @@ def build(self, ds: xr.Dataset) -> FlowSystem: # Create Clustering object with full AggregationResult access reduced_fs.clustering = Clustering( original_timesteps=self._fs.timesteps, - original_data=drop_constant_arrays(ds, dim='time'), - aggregated_data=drop_constant_arrays(ds_new, dim='time'), - _metrics=metrics if metrics.data_vars else None, - _aggregation_results=self._aggregation_results, - _dim_names=self._dim_names, + _aggregation_result=self._agg_result, + _unrename_map=self._unrename_map, ) return reduced_fs -def _accuracy_to_dataframe(accuracy: Any) -> pd.DataFrame: - """Convert tsam ClusteringAccuracy to DataFrame with metrics. - - Args: - accuracy: tsam ClusteringAccuracy object. - - Returns: - DataFrame with RMSE, MAE, RMSE_duration columns indexed by time series name. - """ - return pd.DataFrame( - { - 'RMSE': accuracy.rmse, - 'MAE': accuracy.mae, - 'RMSE_duration': accuracy.rmse_duration, - } - ) - - class _Expander: """Handles expansion of clustered FlowSystem to original timesteps. @@ -419,8 +263,6 @@ def __init__(self, fs: FlowSystem, clustering: Clustering): # Pre-compute clustering dimensions self._timesteps_per_cluster = clustering.timesteps_per_cluster - self._n_segments = clustering.n_segments - self._time_dim_size = self._n_segments if self._n_segments else self._timesteps_per_cluster self._n_clusters = clustering.n_clusters self._n_original_clusters = clustering.n_original_clusters @@ -439,69 +281,16 @@ def __init__(self, fs: FlowSystem, clustering: Clustering): self._n_original_clusters - 1, ) - # Build variable category sets - self._variable_categories = getattr(fs, '_variable_categories', {}) - if self._variable_categories: - self._state_vars = {name for name, cat in self._variable_categories.items() if cat in EXPAND_INTERPOLATE} - self._first_timestep_vars = { - name for name, cat in self._variable_categories.items() if cat in EXPAND_FIRST_TIMESTEP - } - self._segment_total_vars = {name for name, cat in self._variable_categories.items() if cat in EXPAND_DIVIDE} - else: - # Fallback to pattern matching for old FlowSystems without categories - self._state_vars = set() - self._first_timestep_vars = set() - self._segment_total_vars = self._build_segment_total_varnames() if clustering.is_segmented else set() + # Build variable category sets from registered categories + variable_categories = fs._variable_categories + self._state_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_INTERPOLATE} + self._first_timestep_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_FIRST_TIMESTEP} + self._segment_total_vars = {name for name, cat in variable_categories.items() if cat in EXPAND_DIVIDE} - # Build expansion divisor for segmented systems + # Pre-compute expansion divisor for segmented systems (segment durations on original time) self._expansion_divisor = None if clustering.is_segmented: - self._expansion_divisor = clustering.build_expansion_divisor(original_time=self._original_timesteps) - - def _is_state_variable(self, var_name: str) -> bool: - """Check if variable is a state variable requiring interpolation.""" - return var_name in self._state_vars or (not self._variable_categories and var_name.endswith('|charge_state')) - - def _is_first_timestep_variable(self, var_name: str) -> bool: - """Check if variable is a first-timestep-only variable (startup/shutdown).""" - return var_name in self._first_timestep_vars or ( - not self._variable_categories and (var_name.endswith('|startup') or var_name.endswith('|shutdown')) - ) - - def _build_segment_total_varnames(self) -> set[str]: - """Build segment total variable names - BACKWARDS COMPATIBILITY FALLBACK. - - This method is only used when variable_categories is empty (old FlowSystems - saved before category registration was implemented). New FlowSystems use - the VariableCategory registry with EXPAND_DIVIDE categories (PER_TIMESTEP, SHARE). - - Returns: - Set of variable names that should be divided by expansion divisor. - """ - segment_total_vars: set[str] = set() - effect_names = list(self._fs.effects.keys()) - - # 1. Per-timestep totals for each effect - for effect in effect_names: - segment_total_vars.add(f'{effect}(temporal)|per_timestep') - - # 2. Flow contributions to effects - for flow_label in self._fs.flows: - for effect in effect_names: - segment_total_vars.add(f'{flow_label}->{effect}(temporal)') - - # 3. Component contributions to effects - for component_label in self._fs.components: - for effect in effect_names: - segment_total_vars.add(f'{component_label}->{effect}(temporal)') - - # 4. Effect-to-effect contributions - for target_effect_name, target_effect in self._fs.effects.items(): - if target_effect.share_from_temporal: - for source_effect_name in target_effect.share_from_temporal: - segment_total_vars.add(f'{source_effect_name}(temporal)->{target_effect_name}(temporal)') - - return segment_total_vars + self._expansion_divisor = clustering.disaggregate(clustering.segment_durations).ffill(dim='time') def _append_final_state(self, expanded: xr.DataArray, da: xr.DataArray) -> xr.DataArray: """Append final state value from original data to expanded data.""" @@ -516,113 +305,15 @@ def _append_final_state(self, expanded: xr.DataArray, da: xr.DataArray) -> xr.Da extra_val = extra_val.expand_dims(time=[self._original_timesteps_extra[-1]]) return xr.concat([expanded, extra_val], dim='time') - def _interpolate_charge_state_segmented(self, da: xr.DataArray) -> xr.DataArray: - """Interpolate charge_state values within segments for segmented systems. - - For segmented systems, charge_state has values at segment boundaries (n_segments+1). - This method interpolates between start and end boundary values to show the - actual charge trajectory as the storage charges/discharges. - - Args: - da: charge_state DataArray with dims (cluster, time) where time has n_segments+1 entries. - - Returns: - Interpolated charge_state with dims (time, ...) for original timesteps. - """ - clustering = self._clustering - - # Get multi-dimensional properties from Clustering - segment_assignments = clustering.results.segment_assignments - segment_durations = clustering.results.segment_durations - position_within_segment = clustering.results.position_within_segment - cluster_assignments = clustering.cluster_assignments - - # Compute original period index and position within period - original_period_indices = np.minimum( - np.arange(self._n_original_timesteps) // self._timesteps_per_cluster, - self._n_original_clusters - 1, - ) - positions_in_period = np.arange(self._n_original_timesteps) % self._timesteps_per_cluster - - # Create DataArrays for indexing - original_period_da = xr.DataArray(original_period_indices, dims=['original_time']) - position_in_period_da = xr.DataArray(positions_in_period, dims=['original_time']) - - # Map original period to cluster - cluster_indices = cluster_assignments.isel(original_cluster=original_period_da) - - # Get segment index and position for each original timestep - seg_indices = segment_assignments.isel(cluster=cluster_indices, time=position_in_period_da) - positions = position_within_segment.isel(cluster=cluster_indices, time=position_in_period_da) - durations = segment_durations.isel(cluster=cluster_indices, segment=seg_indices) - - # Calculate interpolation factor: position within segment (0 to 1) - factor = xr.where(durations > 1, (positions + 0.5) / durations, 0.5) - - # Get start and end boundary values from charge_state - start_vals = da.isel(cluster=cluster_indices, time=seg_indices) - end_vals = da.isel(cluster=cluster_indices, time=seg_indices + 1) - - # Linear interpolation - interpolated = start_vals + (end_vals - start_vals) * factor - - # Clean up coordinate artifacts and rename - interpolated = interpolated.drop_vars(['cluster', 'time', 'segment'], errors='ignore') - interpolated = interpolated.rename({'original_time': 'time'}).assign_coords(time=self._original_timesteps) - - return interpolated.transpose('time', ...).assign_attrs(da.attrs) - - def _expand_first_timestep_only(self, da: xr.DataArray) -> xr.DataArray: - """Expand binary event variables to first timestep of each segment only. - - For segmented systems, binary event variables like startup and shutdown indicate - that an event occurred somewhere in the segment. When expanded, the event is placed - at the first timestep of each segment, with zeros elsewhere. - - Args: - da: Binary event DataArray with dims including (cluster, time). - - Returns: - Expanded DataArray with event values only at first timestep of each segment. - """ - clustering = self._clustering - - # First expand normally (repeats values) - expanded = clustering.expand_data(da, original_time=self._original_timesteps) - - # Build mask: True only at first timestep of each segment - position_within_segment = clustering.results.position_within_segment - cluster_assignments = clustering.cluster_assignments - - # Compute original period index and position within period - original_period_indices = np.minimum( - np.arange(self._n_original_timesteps) // self._timesteps_per_cluster, - self._n_original_clusters - 1, - ) - positions_in_period = np.arange(self._n_original_timesteps) % self._timesteps_per_cluster - - # Create DataArrays for indexing - original_period_da = xr.DataArray(original_period_indices, dims=['original_time']) - position_in_period_da = xr.DataArray(positions_in_period, dims=['original_time']) - - # Map to cluster and get position within segment - cluster_indices = cluster_assignments.isel(original_cluster=original_period_da) - pos_in_segment = position_within_segment.isel(cluster=cluster_indices, time=position_in_period_da) - - # Clean up and create mask - pos_in_segment = pos_in_segment.drop_vars(['cluster', 'time'], errors='ignore') - pos_in_segment = pos_in_segment.rename({'original_time': 'time'}).assign_coords(time=self._original_timesteps) - - # First timestep of segment has position 0 - is_first = pos_in_segment == 0 - - # Apply mask: keep value at first timestep, zero elsewhere - result = xr.where(is_first, expanded, 0) - return result.assign_attrs(da.attrs) - def expand_dataarray(self, da: xr.DataArray, var_name: str = '', is_solution: bool = False) -> xr.DataArray: """Expand a DataArray from clustered to original timesteps. + Uses clustering.disaggregate() as the core expansion, then applies + post-processing based on variable category: + - State variables (segmented): interpolate within segments + - First-timestep variables (segmented): value at segment start, zero elsewhere + - Segment totals: divide by segment duration for hourly rate + Args: da: DataArray to expand. var_name: Variable name for category-based expansion handling. @@ -636,19 +327,30 @@ def expand_dataarray(self, da: xr.DataArray, var_name: str = '', is_solution: bo clustering = self._clustering has_cluster_dim = 'cluster' in da.dims - is_state = self._is_state_variable(var_name) and has_cluster_dim - is_first_timestep = self._is_first_timestep_variable(var_name) and has_cluster_dim + is_state = var_name in self._state_vars and has_cluster_dim + is_first_timestep = var_name in self._first_timestep_vars and has_cluster_dim is_segment_total = is_solution and var_name in self._segment_total_vars - # Choose expansion method - if is_state and clustering.is_segmented: - expanded = self._interpolate_charge_state_segmented(da) - elif is_first_timestep and is_solution and clustering.is_segmented: - return self._expand_first_timestep_only(da) - else: - expanded = clustering.expand_data(da, original_time=self._original_timesteps) - if is_segment_total and self._expansion_divisor is not None: - expanded = expanded / self._expansion_divisor + # Solution variables have n+1 timesteps (extra boundary value). + # Strip it before disaggregating — it will be appended back for state variables. + expected_time = clustering.n_segments if clustering.is_segmented else clustering.timesteps_per_cluster + has_extra = has_cluster_dim and da.sizes.get('time', 0) > expected_time + da_for_disagg = da.isel(time=slice(None, expected_time)) if has_extra else da + + # Disaggregate: map (cluster, time) back to original time axis. + # For non-segmented: values are repeated. For segmented: NaN between boundaries. + expanded = clustering.disaggregate(da_for_disagg) + + # Post-processing for segmented systems + if clustering.is_segmented and has_cluster_dim: + if is_state: + expanded = expanded.interpolate_na(dim='time') + elif is_first_timestep and is_solution: + return expanded.fillna(0).assign_attrs(da.attrs) + else: + expanded = expanded.ffill(dim='time') + if is_segment_total and self._expansion_divisor is not None: + expanded = expanded / self._expansion_divisor # State variables need final state appended if is_state: @@ -822,8 +524,10 @@ def expand_flow_system(self) -> FlowSystem: n_combinations = (len(self._fs.periods) if has_periods else 1) * ( len(self._fs.scenarios) if has_scenarios else 1 ) - n_reduced_timesteps = self._n_clusters * self._time_dim_size - segmented_info = f' ({self._n_segments} segments)' if self._n_segments else '' + n_segments = self._clustering.n_segments + time_dim_size = n_segments if n_segments else self._timesteps_per_cluster + n_reduced_timesteps = self._n_clusters * time_dim_size + segmented_info = f' ({n_segments} segments)' if n_segments else '' logger.info( f'Expanded FlowSystem from {n_reduced_timesteps} to {self._n_original_timesteps} timesteps ' f'({self._n_clusters} clusters{segmented_info}' @@ -866,81 +570,6 @@ def __init__(self, flow_system: FlowSystem) -> None: """ self._fs = flow_system - @staticmethod - def _calculate_clustering_weights(ds) -> dict[str, float]: - """Calculate weights for clustering based on dataset attributes.""" - from collections import Counter - - import numpy as np - - groups = [da.attrs.get('clustering_group') for da in ds.data_vars.values() if 'clustering_group' in da.attrs] - group_counts = Counter(groups) - - # Calculate weight for each group (1/count) - group_weights = {group: 1 / count for group, count in group_counts.items()} - - weights = {} - variables = ds.variables - for name in ds.data_vars: - var_attrs = variables[name].attrs - clustering_group = var_attrs.get('clustering_group') - group_weight = group_weights.get(clustering_group) - if group_weight is not None: - weights[name] = group_weight - else: - weights[name] = var_attrs.get('clustering_weight', 1) - - if np.all(np.isclose(list(weights.values()), 1, atol=1e-6)): - logger.debug('All Clustering weights were set to 1') - - return weights - - @staticmethod - def _build_cluster_config_with_weights( - cluster: ClusterConfig | None, - auto_weights: dict[str, float], - available_columns: set[str] | None = None, - ) -> ClusterConfig: - """Merge auto-calculated weights into ClusterConfig. - - Args: - cluster: Optional user-provided ClusterConfig. - auto_weights: Automatically calculated weights based on data variance. - available_columns: Column names present in the clustering DataFrame. - If provided, weights are filtered to only include these columns. - This prevents tsam errors when some time series are dropped - (e.g., constant arrays removed before clustering). - - Returns: - ClusterConfig with weights set (either user-provided or auto-calculated). - """ - from tsam import ClusterConfig - - # Determine weights: user-provided take priority over auto-calculated - if cluster is not None and cluster.weights is not None: - weights = dict(cluster.weights) - else: - weights = auto_weights - - # Filter weights to only include columns present in the clustering data - if available_columns is not None: - weights = {name: w for name, w in weights.items() if name in available_columns} - - # No ClusterConfig provided - use defaults with weights - if cluster is None: - return ClusterConfig(weights=weights) - - # ClusterConfig provided - use its settings with (possibly filtered) weights - return ClusterConfig( - method=cluster.method, - representation=cluster.representation, - weights=weights, - normalize_column_means=cluster.normalize_column_means, - use_duration_curves=cluster.use_duration_curves, - include_period_sums=cluster.include_period_sums, - solver=cluster.solver, - ) - def sel( self, time: str | slice | list[str] | pd.Timestamp | pd.DatetimeIndex | None = None, @@ -1465,90 +1094,10 @@ def fix_sizes( return new_fs - def clustering_data( - self, - period: Any | None = None, - scenario: Any | None = None, - ) -> xr.Dataset: - """ - Get the time-varying data that would be used for clustering. - - This method extracts only the data arrays that vary over time, which is - the data that clustering algorithms use to identify typical periods. - Constant arrays (same value for all timesteps) are excluded since they - don't contribute to pattern identification. - - Use this to inspect or pre-process the data before clustering, or to - understand which variables influence the clustering result. - - Args: - period: Optional period label to select. If None and the FlowSystem - has multiple periods, returns data for all periods. - scenario: Optional scenario label to select. If None and the FlowSystem - has multiple scenarios, returns data for all scenarios. - - Returns: - xr.Dataset containing only time-varying data arrays. The dataset - includes arrays like demand profiles, price profiles, and other - time series that vary over the time dimension. - - Examples: - Inspect clustering input data: - - >>> data = flow_system.transform.clustering_data() - >>> print(f'Variables used for clustering: {list(data.data_vars)}') - >>> data['HeatDemand(Q)|fixed_relative_profile'].plot() - - Get data for a specific period/scenario: - - >>> data_2024 = flow_system.transform.clustering_data(period=2024) - >>> data_high = flow_system.transform.clustering_data(scenario='high') - - Convert to DataFrame for external tools: - - >>> df = flow_system.transform.clustering_data().to_dataframe() - """ - from .core import drop_constant_arrays - - if not self._fs.connected_and_transformed: - self._fs.connect_and_transform() - - ds = self._fs.to_dataset(include_solution=False) - - # Build selector for period/scenario - selector = {} - if period is not None: - selector['period'] = period - if scenario is not None: - selector['scenario'] = scenario - - # Apply selection if specified - if selector: - ds = ds.sel(**selector, drop=True) - - # Filter to only time-varying arrays - result = drop_constant_arrays(ds, dim='time') - - # Guard against empty dataset (all variables are constant) - if not result.data_vars: - selector_info = f' for {selector}' if selector else '' - raise ValueError( - f'No time-varying data found{selector_info}. ' - f'All variables are constant over time. Check your period/scenario filter or input data.' - ) - - # Remove attrs for cleaner output - result.attrs = {} - for var in result.data_vars: - result[var].attrs = {} - - return result - def cluster( self, n_clusters: int, cluster_duration: str | float, - data_vars: list[str] | None = None, cluster: ClusterConfig | None = None, extremes: ExtremeConfig | None = None, segments: SegmentConfig | None = None, @@ -1580,16 +1129,11 @@ def cluster( n_clusters: Number of clusters (typical periods) to extract (e.g., 8 typical days). cluster_duration: Duration of each cluster. Can be a pandas-style string ('1D', '24h', '6h') or a numeric value in hours. - data_vars: Optional list of variable names to use for clustering. If specified, - only these variables are used to determine cluster assignments, but the - clustering is then applied to ALL time-varying data in the FlowSystem. - Use ``transform.clustering_data()`` to see available variables. - Example: ``data_vars=['HeatDemand(Q)|fixed_relative_profile']`` to cluster - based only on heat demand patterns. cluster: Optional tsam ``ClusterConfig`` object specifying clustering algorithm, - representation method, and weights. If None, uses default settings (hierarchical - clustering with medoid representation) and automatically calculated weights - based on data variance. + representation method, and weights. Use ``weights={var: 0}`` to exclude + specific variables from influencing cluster assignments while still + aggregating them. If None, uses default settings (hierarchical clustering + with medoid representation). extremes: Optional tsam ``ExtremeConfig`` object specifying how to handle extreme periods (peaks). Use this to ensure peak demand days are captured. Example: ``ExtremeConfig(method='new_cluster', max_value=['demand'])``. @@ -1632,16 +1176,18 @@ def cluster( ... ) >>> fs_clustered.optimize(solver) - Clustering based on specific variables only: + Clustering based on specific variables only (zero-weight the rest): - >>> # See available variables for clustering - >>> print(flow_system.transform.clustering_data().data_vars) - >>> - >>> # Cluster based only on demand profile + >>> from tsam import ClusterConfig >>> fs_clustered = flow_system.transform.cluster( ... n_clusters=8, ... cluster_duration='1D', - ... data_vars=['HeatDemand(Q)|fixed_relative_profile'], + ... cluster=ClusterConfig( + ... weights={ + ... 'HeatDemand(Q)|fixed_relative_profile': 1, + ... 'GasSource(Gas)|costs|per_flow_hour': 0, # ignored for clustering + ... } + ... ), ... ) Note: @@ -1651,10 +1197,7 @@ def cluster( - For seasonal storage (e.g., hydrogen, thermal storage), set ``Storage.cluster_mode='intercluster'`` or ``'intercluster_cyclic'`` """ - import tsam - - from .clustering import ClusteringResults - from .core import drop_constant_arrays + import tsam_xarray # Parse cluster_duration to hours hours_per_cluster = ( @@ -1677,50 +1220,19 @@ def cluster( has_periods = self._fs.periods is not None has_scenarios = self._fs.scenarios is not None - # Determine iteration dimensions - periods = list(self._fs.periods) if has_periods else [None] - scenarios = list(self._fs.scenarios) if has_scenarios else [None] - ds = self._fs.to_dataset(include_solution=False) - # Validate and prepare data_vars for clustering - if data_vars is not None: - missing = set(data_vars) - set(ds.data_vars) - if missing: - raise ValueError( - f'data_vars not found in FlowSystem: {missing}. ' - f'Available time-varying variables can be found via transform.clustering_data().' - ) - ds_for_clustering = ds[list(data_vars)] - else: - ds_for_clustering = ds - - # Validate user-provided weight keys against the selected clustering input - if cluster is not None and cluster.weights is not None: - selected_vars = set(ds_for_clustering.data_vars) - unknown = sorted(set(cluster.weights) - selected_vars) - if unknown: - raise ValueError( - f'ClusterConfig weights reference unknown variables: {unknown}. ' - f'Available variables can be found via transform.clustering_data().' - ) + # Only keep variables with a time dimension for clustering + ds_for_clustering = ds[[name for name in ds.data_vars if 'time' in ds[name].dims]] - # Filter constant arrays once on the full dataset (not per slice) - # This ensures all slices have the same variables for consistent metrics - ds_for_clustering = drop_constant_arrays(ds_for_clustering, dim='time') - - # Guard against empty dataset after removing constant arrays if not ds_for_clustering.data_vars: - filter_info = f'data_vars={data_vars}' if data_vars else 'all variables' - raise ValueError( - f'No time-varying data found for clustering ({filter_info}). ' - f'All variables are constant over time. Check your data_vars filter or input data.' - ) + raise ValueError('No time-varying data found for clustering. Check your input data.') # Validate tsam_kwargs doesn't override explicit parameters reserved_tsam_keys = { 'n_clusters', 'period_duration', # exposed as cluster_duration + 'temporal_resolution', # computed automatically 'timestep_duration', # computed automatically 'cluster', 'segments', @@ -1738,9 +1250,8 @@ def cluster( ) # Validate ExtremeConfig compatibility with multi-period/scenario systems - # Only method='replace' reliably produces consistent cluster counts across all slices. - total_slices = len(periods) * len(scenarios) - if total_slices > 1 and extremes is not None: + has_slices = has_periods or has_scenarios + if has_slices and extremes is not None: if extremes.method != 'replace': raise ValueError( f"ExtremeConfig method='{extremes.method}' is not supported for multi-period " @@ -1749,75 +1260,90 @@ def cluster( "ExtremeConfig(..., method='replace')" ) - # Build dim_names and clean key helper - dim_names: list[str] = [] + # Rename reserved dimension names to avoid conflict with tsam_xarray + # tsam_xarray reserves: 'period', 'cluster', 'timestep' + reserved_renames = {'period': '_period', 'cluster': '_cluster'} + # Check against full ds dims (period/cluster may only exist as coords, not in ds_for_clustering) + rename_map = {k: v for k, v in reserved_renames.items() if k in ds.dims} + unrename_map = {v: k for k, v in rename_map.items()} + + if rename_map: + # Only rename dims that exist in each dataset + clustering_renames = {k: v for k, v in rename_map.items() if k in ds_for_clustering.dims} + if clustering_renames: + ds_for_clustering = ds_for_clustering.rename(clustering_renames) + ds = ds.rename(rename_map) + + # Stack Dataset into a single DataArray with 'variable' dimension + da_for_clustering = ds_for_clustering.to_dataarray(dim='variable') + + # Ensure period/scenario dimensions are present in the DataArray + # even if the data doesn't vary across them (tsam_xarray needs them for slicing) + extra_dims = [] if has_periods: - dim_names.append('period') + extra_dims.append(rename_map.get('period', 'period')) if has_scenarios: - dim_names.append('scenario') - - def to_clean_key(period_label, scenario_label) -> tuple: - """Convert (period, scenario) to clean key based on which dims exist.""" - key_parts = [] - if has_periods: - key_parts.append(period_label) - if has_scenarios: - key_parts.append(scenario_label) - return tuple(key_parts) - - # Cluster each (period, scenario) combination using tsam directly - aggregation_results: dict[tuple, Any] = {} - - for period_label in periods: - for scenario_label in scenarios: - key = to_clean_key(period_label, scenario_label) - selector = {k: v for k, v in [('period', period_label), ('scenario', scenario_label)] if v is not None} - - # Select data slice for clustering - ds_slice = ds_for_clustering.sel(**selector, drop=True) if selector else ds_for_clustering - df_for_clustering = ds_slice.to_dataframe() - - if selector: - logger.info(f'Clustering {", ".join(f"{k}={v}" for k, v in selector.items())}...') - - # Suppress tsam warning about minimal value constraints (informational, not actionable) - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*') - - # Build ClusterConfig with auto-calculated weights, filtered to available columns - clustering_weights = self._calculate_clustering_weights(ds_slice) - cluster_config = self._build_cluster_config_with_weights( - cluster, clustering_weights, available_columns=set(df_for_clustering.columns) - ) - - # Perform clustering based on selected data_vars (or all if not specified) - aggregation_results[key] = tsam.aggregate( - df_for_clustering, - n_clusters=n_clusters, - period_duration=hours_per_cluster, - temporal_resolution=dt, - cluster=cluster_config, - extremes=extremes, - segments=segments, - preserve_column_means=preserve_column_means, - rescale_exclude_columns=rescale_exclude_columns, - round_decimals=round_decimals, - numerical_tolerance=numerical_tolerance, - **tsam_kwargs, - ) - - # If data_vars was specified, apply clustering to FULL data - if data_vars is not None: - # Build ClusteringResults from subset clustering - clustering_results = ClusteringResults( - {k: r.clustering for k, r in aggregation_results.items()}, - dim_names, + extra_dims.append(rename_map.get('scenario', 'scenario')) + for dim_name in extra_dims: + if dim_name not in da_for_clustering.dims and dim_name in ds.dims: + # Drop as non-dim coordinate first (to_dataarray may keep it as scalar coord) + if dim_name in da_for_clustering.coords: + da_for_clustering = da_for_clustering.drop_vars(dim_name) + da_for_clustering = da_for_clustering.expand_dims({dim_name: ds.coords[dim_name].values}) + + # Pass user-specified weights to tsam_xarray (validates unknown keys) + if cluster is not None and cluster.weights is not None: + weights = dict(cluster.weights) + else: + weights = {} + + # Build tsam_kwargs with explicit parameters + tsam_kwargs_full = { + 'period_duration': hours_per_cluster, + 'temporal_resolution': dt, + 'extremes': extremes, + 'segments': segments, + 'preserve_column_means': preserve_column_means, + 'rescale_exclude_columns': rescale_exclude_columns, + 'round_decimals': round_decimals, + 'numerical_tolerance': numerical_tolerance, + **tsam_kwargs, + } + + # Pass cluster config settings (without weights, which go to tsam_xarray directly) + if cluster is not None: + from tsam import ClusterConfig + + cluster_config = ClusterConfig( + method=cluster.method, + representation=cluster.representation, + normalize_column_means=cluster.normalize_column_means, + use_duration_curves=cluster.use_duration_curves, + include_period_sums=cluster.include_period_sums, + solver=cluster.solver, ) - # Apply to full data and replace results - aggregation_results = dict(clustering_results.apply(ds)) + tsam_kwargs_full['cluster'] = cluster_config + + # Suppress tsam warning about minimal value constraints (informational, not actionable) + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*') + + # Single call: tsam_xarray handles (period, scenario) slicing automatically + agg_result = tsam_xarray.aggregate( + da_for_clustering, + time_dim='time', + cluster_dim='variable', + n_clusters=n_clusters, + weights=weights, + **tsam_kwargs_full, + ) + + # Rename reserved dims back to original names in the dataset + if unrename_map: + ds = ds.rename(unrename_map) # Build and return the reduced FlowSystem - builder = _ReducedFlowSystemBuilder(self._fs, aggregation_results, timesteps_per_cluster, dt, dim_names) + builder = _ReducedFlowSystemBuilder(self._fs, agg_result, timesteps_per_cluster, dt, unrename_map) return builder.build(ds) def apply_clustering( @@ -1864,7 +1390,6 @@ def apply_clustering( # Get timesteps_per_cluster from the clustering object (survives serialization) timesteps_per_cluster = clustering.timesteps_per_cluster - dim_names = clustering.results.dim_names ds = self._fs.to_dataset(include_solution=False) @@ -1877,20 +1402,54 @@ def apply_clustering( f'FlowSystem has {current_timesteps} timesteps, but clustering expects ' f'{expected_timesteps} timesteps ({clustering.n_original_clusters} clusters × ' f'{clustering.timesteps_per_cluster} timesteps/cluster). ' - f'Ensure self._fs.timesteps matches the original data used for clustering.results.apply(ds).' + f'Ensure self._fs.timesteps matches the original data used for clustering.' ) - # Apply existing clustering to all (period, scenario) combinations at once + # Rename reserved dimension names to avoid conflict with tsam_xarray + reserved_renames = {'period': '_period', 'cluster': '_cluster'} + rename_map = {k: v for k, v in reserved_renames.items() if k in ds.dims} + unrename_map = {v: k for k, v in rename_map.items()} + + if rename_map: + ds = ds.rename(rename_map) + + # Apply existing clustering to full data logger.info('Applying clustering...') with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=UserWarning, message='.*minimal value.*exceeds.*') - agg_results = clustering.results.apply(ds) + da_full = ds.to_dataarray(dim='variable') + + # Ensure extra dims are present in DataArray + for _orig_name, renamed in rename_map.items(): + if renamed not in da_full.dims and renamed in ds.dims: + if renamed in da_full.coords: + da_full = da_full.drop_vars(renamed) + da_full = da_full.expand_dims({renamed: ds.coords[renamed].values}) + + # Get clustering result with correct dim names for the renamed data + from tsam_xarray import ClusteringResult as ClusteringResultClass + + cr_result = clustering.clustering_result + # Map dim names to renamed versions (e.g., period → _period) + slice_dims = [rename_map.get(d, d) for d in clustering.dim_names] + cr_result = ClusteringResultClass( + time_dim='time', + cluster_dim=['variable'], + slice_dims=slice_dims, + clusterings=dict(cr_result.clusterings), + ) + # TODO(tsam_xarray): Same workaround as in cluster() above — remove + # once tsam_xarray handles mismatched weights in apply(). + for cr in cr_result.clusterings.values(): + object.__setattr__(cr, 'weights', {}) + agg_result = cr_result.apply(da_full) - # Convert AggregationResults to dict format - aggregation_results = dict(agg_results) + # Rename back + if unrename_map: + ds = ds.rename(unrename_map) # Build and return the reduced FlowSystem - builder = _ReducedFlowSystemBuilder(self._fs, aggregation_results, timesteps_per_cluster, dt, dim_names) + builder = _ReducedFlowSystemBuilder(self._fs, agg_result, timesteps_per_cluster, dt, unrename_map) return builder.build(ds) def _validate_for_expansion(self) -> Clustering: diff --git a/mkdocs.yml b/mkdocs.yml index e827a5d89..ea81f4487 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -79,9 +79,6 @@ nav: - Clustering: - Introduction: notebooks/08c-clustering.ipynb - Storage Modes: notebooks/08c2-clustering-storage-modes.ipynb - - Multi-Period: notebooks/08d-clustering-multiperiod.ipynb - - Segmentation: notebooks/08f-clustering-segmentation.ipynb - - Internals: notebooks/08e-clustering-internals.ipynb - Results: - Plotting: notebooks/09-plotting-and-data-access.ipynb diff --git a/pyproject.toml b/pyproject.toml index 0431d1833..4762ff7d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,8 @@ network_viz = [ # Full feature set (everything except dev tools) full = [ - "tsam >= 3.1.2, < 4", # Time series aggregation for clustering (3.0.0 and 3.1.0 yanked) + "tsam_xarray >= 0.5.1, < 1", # Time series aggregation for clustering (wraps tsam) + "tsam >= 3.1.2, < 4", # Directly imported for ClusterConfig, ExtremeConfig, SegmentConfig "pyvis==0.3.2", # Visualizing FlowSystem Network "scipy >= 1.15.1, < 2", # Used by tsam. Prior versions have conflict with highspy. See https://github.com/scipy/scipy/issues/22257 "gurobipy >= 10.0.0, < 14; python_version < '3.14'", # No Python 3.14 wheels yet (expected Q1 2026) @@ -77,7 +78,7 @@ full = [ # Development tools and testing dev = [ - "tsam==3.1.2", # Time series aggregation for clustering + "tsam_xarray>=0.5.1", # Time series aggregation for clustering (wraps tsam) "pytest==9.0.2", "pytest-xdist==3.8.0", "nbformat==5.10.4", diff --git a/tests/conftest.py b/tests/conftest.py index 84b137c84..b3950cc35 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -566,11 +566,11 @@ def flow_system_long(): thermal_load_ts, electrical_load_ts = ( fx.TimeSeriesData(thermal_load), - fx.TimeSeriesData(electrical_load, clustering_weight=0.7), + fx.TimeSeriesData(electrical_load), ) p_feed_in, p_sell = ( - fx.TimeSeriesData(-(p_el - 0.5), clustering_group='p_el'), - fx.TimeSeriesData(p_el + 0.5, clustering_group='p_el'), + fx.TimeSeriesData(-(p_el - 0.5)), + fx.TimeSeriesData(p_el + 0.5), ) flow_system = fx.FlowSystem(pd.DatetimeIndex(data.index)) diff --git a/tests/deprecated/conftest.py b/tests/deprecated/conftest.py index efa9fa119..ff0538073 100644 --- a/tests/deprecated/conftest.py +++ b/tests/deprecated/conftest.py @@ -562,11 +562,11 @@ def flow_system_long(): thermal_load_ts, electrical_load_ts = ( fx.TimeSeriesData(thermal_load), - fx.TimeSeriesData(electrical_load, clustering_weight=0.7), + fx.TimeSeriesData(electrical_load), ) p_feed_in, p_sell = ( - fx.TimeSeriesData(-(p_el - 0.5), clustering_group='p_el'), - fx.TimeSeriesData(p_el + 0.5, clustering_group='p_el'), + fx.TimeSeriesData(-(p_el - 0.5)), + fx.TimeSeriesData(p_el + 0.5), ) flow_system = fx.FlowSystem(pd.DatetimeIndex(data.index)) diff --git a/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py b/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py index bbb03f06b..95797888e 100644 --- a/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py +++ b/tests/deprecated/examples/03_Optimization_modes/example_optimization_modes.py @@ -58,9 +58,9 @@ def get_solutions(optimizations: list, variable: str) -> xr.Dataset: # TimeSeriesData objects TS_heat_demand = fx.TimeSeriesData(heat_demand) - TS_electricity_demand = fx.TimeSeriesData(electricity_demand, clustering_weight=0.7) - TS_electricity_price_sell = fx.TimeSeriesData(-(electricity_price - 0.5), clustering_group='p_el') - TS_electricity_price_buy = fx.TimeSeriesData(electricity_price + 0.5, clustering_group='p_el') + TS_electricity_demand = fx.TimeSeriesData(electricity_demand) + TS_electricity_price_sell = fx.TimeSeriesData(-(electricity_price - 0.5)) + TS_electricity_price_buy = fx.TimeSeriesData(electricity_price + 0.5) flow_system = fx.FlowSystem(timesteps) flow_system.add_elements( diff --git a/tests/test_clustering/test_base.py b/tests/test_clustering/test_base.py index 81afc2a97..f69de4cdf 100644 --- a/tests/test_clustering/test_base.py +++ b/tests/test_clustering/test_base.py @@ -5,8 +5,29 @@ import pytest import xarray as xr -from flixopt.clustering import Clustering, ClusteringResults -from flixopt.clustering.base import _build_timestep_mapping, _cluster_occurrences +from flixopt.clustering import Clustering + +tsam_xarray = pytest.importorskip('tsam_xarray') + + +def _make_clustering_result(clusterings: dict, dim_names: list[str]): + """Create a ClusteringResult from a dict of tsam ClusteringResult-like objects.""" + return tsam_xarray.ClusteringResult( + time_dim='time', + cluster_dim=['variable'], + slice_dims=dim_names, + clusterings=clusterings, + ) + + +def _make_clustering(clusterings: dict, dim_names: list[str], n_timesteps: int | None = None): + """Create a Clustering from mock ClusteringResult objects.""" + cr_result = _make_clustering_result(clusterings, dim_names) + first = next(iter(clusterings.values())) + if n_timesteps is None: + n_timesteps = first.n_original_periods * first.n_timesteps_per_period + original_timesteps = pd.date_range('2024-01-01', periods=n_timesteps, freq='h') + return Clustering(clustering_result=cr_result, original_timesteps=original_timesteps) class TestHelperFunctions: @@ -22,194 +43,28 @@ class MockClusteringResult: n_timesteps_per_period = 24 cluster_assignments = (0, 1, 0, 1, 2, 0) period_duration = 24.0 - n_segments = None # None indicates non-segmented - segment_assignments = None # None indicates non-segmented - - def to_dict(self): - return { - 'n_clusters': self.n_clusters, - 'n_original_periods': self.n_original_periods, - 'n_timesteps_per_period': self.n_timesteps_per_period, - 'cluster_assignments': list(self.cluster_assignments), - 'period_duration': self.period_duration, - } - - def apply(self, data): - """Mock apply method.""" - return {'applied': True} + n_segments = None + segment_assignments = None + cluster_centers = (0, 1, 4) return MockClusteringResult() def test_cluster_occurrences(self, mock_clustering_result): - """Test _cluster_occurrences helper.""" - occurrences = _cluster_occurrences(mock_clustering_result) + """Test cluster_occurrences via Clustering.""" + clustering = _make_clustering({(): mock_clustering_result}, []) + occurrences = clustering.cluster_occurrences # cluster 0: 3 occurrences (indices 0, 2, 5) # cluster 1: 2 occurrences (indices 1, 3) # cluster 2: 1 occurrence (index 4) - np.testing.assert_array_equal(occurrences, [3, 2, 1]) - - def test_build_timestep_mapping(self, mock_clustering_result): - """Test _build_timestep_mapping helper.""" - mapping = _build_timestep_mapping(mock_clustering_result, n_timesteps=144) - assert len(mapping) == 144 - - # First 24 timesteps should map to cluster 0's representative (0-23) - np.testing.assert_array_equal(mapping[:24], np.arange(24)) - - # Second 24 timesteps (period 1 -> cluster 1) should map to cluster 1's representative (24-47) - np.testing.assert_array_equal(mapping[24:48], np.arange(24, 48)) - - -class TestClusteringResults: - """Tests for ClusteringResults collection class.""" - - @pytest.fixture - def mock_clustering_result_factory(self): - """Factory for creating mock ClusteringResult objects.""" - - def create_result(cluster_assignments, n_timesteps_per_period=24): - class MockClusteringResult: - n_clusters = max(cluster_assignments) + 1 if cluster_assignments else 0 - n_original_periods = len(cluster_assignments) - period_duration = 24.0 - n_segments = None # None indicates non-segmented - segment_assignments = None # None indicates non-segmented - - def __init__(self, assignments, n_timesteps): - self.cluster_assignments = tuple(assignments) - self.n_timesteps_per_period = n_timesteps - - def to_dict(self): - return { - 'n_clusters': self.n_clusters, - 'n_original_periods': self.n_original_periods, - 'n_timesteps_per_period': self.n_timesteps_per_period, - 'cluster_assignments': list(self.cluster_assignments), - 'period_duration': self.period_duration, - } - - def apply(self, data): - return {'applied': True} - - return MockClusteringResult(cluster_assignments, n_timesteps_per_period) - - return create_result - - def test_single_result(self, mock_clustering_result_factory): - """Test ClusteringResults with single result.""" - cr = mock_clustering_result_factory([0, 1, 0]) - results = ClusteringResults({(): cr}, dim_names=[]) - - assert results.n_clusters == 2 - assert results.timesteps_per_cluster == 24 - assert len(results) == 1 - - def test_multi_period_results(self, mock_clustering_result_factory): - """Test ClusteringResults with multiple periods.""" - cr_2020 = mock_clustering_result_factory([0, 1, 0]) - cr_2030 = mock_clustering_result_factory([1, 0, 1]) - - results = ClusteringResults( - {(2020,): cr_2020, (2030,): cr_2030}, - dim_names=['period'], - ) - - assert results.n_clusters == 2 - assert len(results) == 2 - - # Access by period - assert results.sel(period=2020) is cr_2020 - assert results.sel(period=2030) is cr_2030 - - def test_dims_property(self, mock_clustering_result_factory): - """Test dims property returns tuple (xarray-like).""" - cr = mock_clustering_result_factory([0, 1, 0]) - results = ClusteringResults({(): cr}, dim_names=[]) - assert results.dims == () - - cr_2020 = mock_clustering_result_factory([0, 1, 0]) - cr_2030 = mock_clustering_result_factory([1, 0, 1]) - results = ClusteringResults( - {(2020,): cr_2020, (2030,): cr_2030}, - dim_names=['period'], - ) - assert results.dims == ('period',) - - def test_coords_property(self, mock_clustering_result_factory): - """Test coords property returns dict (xarray-like).""" - cr_2020 = mock_clustering_result_factory([0, 1, 0]) - cr_2030 = mock_clustering_result_factory([1, 0, 1]) - results = ClusteringResults( - {(2020,): cr_2020, (2030,): cr_2030}, - dim_names=['period'], - ) - assert results.coords == {'period': [2020, 2030]} - - def test_sel_method(self, mock_clustering_result_factory): - """Test sel() method (xarray-like selection).""" - cr_2020 = mock_clustering_result_factory([0, 1, 0]) - cr_2030 = mock_clustering_result_factory([1, 0, 1]) - results = ClusteringResults( - {(2020,): cr_2020, (2030,): cr_2030}, - dim_names=['period'], - ) - assert results.sel(period=2020) is cr_2020 - assert results.sel(period=2030) is cr_2030 - - def test_sel_invalid_key_raises(self, mock_clustering_result_factory): - """Test sel() raises KeyError for invalid key.""" - cr = mock_clustering_result_factory([0, 1, 0]) - results = ClusteringResults({(2020,): cr}, dim_names=['period']) - - with pytest.raises(KeyError): - results.sel(period=2030) - - def test_isel_method(self, mock_clustering_result_factory): - """Test isel() method (xarray-like integer selection).""" - cr_2020 = mock_clustering_result_factory([0, 1, 0]) - cr_2030 = mock_clustering_result_factory([1, 0, 1]) - results = ClusteringResults( - {(2020,): cr_2020, (2030,): cr_2030}, - dim_names=['period'], - ) - assert results.isel(period=0) is cr_2020 - assert results.isel(period=1) is cr_2030 - - def test_isel_invalid_index_raises(self, mock_clustering_result_factory): - """Test isel() raises IndexError for out-of-range index.""" - cr = mock_clustering_result_factory([0, 1, 0]) - results = ClusteringResults({(2020,): cr}, dim_names=['period']) - - with pytest.raises(IndexError): - results.isel(period=5) - - def test_cluster_assignments_dataarray(self, mock_clustering_result_factory): - """Test cluster_assignments returns correct DataArray.""" - cr = mock_clustering_result_factory([0, 1, 0]) - results = ClusteringResults({(): cr}, dim_names=[]) - - cluster_assignments = results.cluster_assignments - assert isinstance(cluster_assignments, xr.DataArray) - assert 'original_cluster' in cluster_assignments.dims - np.testing.assert_array_equal(cluster_assignments.values, [0, 1, 0]) - - def test_cluster_occurrences_dataarray(self, mock_clustering_result_factory): - """Test cluster_occurrences returns correct DataArray.""" - cr = mock_clustering_result_factory([0, 1, 0]) # 2 x cluster 0, 1 x cluster 1 - results = ClusteringResults({(): cr}, dim_names=[]) - - occurrences = results.cluster_occurrences - assert isinstance(occurrences, xr.DataArray) - assert 'cluster' in occurrences.dims - np.testing.assert_array_equal(occurrences.values, [2, 1]) + np.testing.assert_array_equal(occurrences.values, [3, 2, 1]) class TestClustering: - """Tests for Clustering dataclass.""" + """Tests for Clustering class.""" @pytest.fixture - def basic_cluster_results(self): - """Create basic ClusteringResults for testing.""" + def mock_cr(self): + """Create a mock tsam ClusteringResult.""" class MockClusteringResult: n_clusters = 3 @@ -217,33 +72,16 @@ class MockClusteringResult: n_timesteps_per_period = 24 cluster_assignments = (0, 1, 0, 1, 2, 0) period_duration = 24.0 - n_segments = None # None indicates non-segmented - segment_assignments = None # None indicates non-segmented + n_segments = None + segment_assignments = None + cluster_centers = (0, 1, 4) - def to_dict(self): - return { - 'n_clusters': self.n_clusters, - 'n_original_periods': self.n_original_periods, - 'n_timesteps_per_period': self.n_timesteps_per_period, - 'cluster_assignments': list(self.cluster_assignments), - 'period_duration': self.period_duration, - } - - def apply(self, data): - return {'applied': True} - - mock_cr = MockClusteringResult() - return ClusteringResults({(): mock_cr}, dim_names=[]) + return MockClusteringResult() @pytest.fixture - def basic_clustering(self, basic_cluster_results): + def basic_clustering(self, mock_cr): """Create a basic Clustering instance for testing.""" - original_timesteps = pd.date_range('2024-01-01', periods=144, freq='h') - - return Clustering( - results=basic_cluster_results, - original_timesteps=original_timesteps, - ) + return _make_clustering({(): mock_cr}, []) def test_basic_creation(self, basic_clustering): """Test basic Clustering creation.""" @@ -251,10 +89,6 @@ def test_basic_creation(self, basic_clustering): assert basic_clustering.timesteps_per_cluster == 24 assert basic_clustering.n_original_clusters == 6 - def test_n_representatives(self, basic_clustering): - """Test n_representatives property.""" - assert basic_clustering.n_representatives == 72 # 3 * 24 - def test_cluster_occurrences(self, basic_clustering): """Test cluster_occurrences property returns correct values.""" occurrences = basic_clustering.cluster_occurrences @@ -265,39 +99,6 @@ def test_cluster_occurrences(self, basic_clustering): assert occurrences.sel(cluster=1).item() == 2 assert occurrences.sel(cluster=2).item() == 1 - def test_representative_weights(self, basic_clustering): - """Test representative_weights is same as cluster_occurrences.""" - weights = basic_clustering.representative_weights - occurrences = basic_clustering.cluster_occurrences - xr.testing.assert_equal( - weights.drop_vars('cluster', errors='ignore'), - occurrences.drop_vars('cluster', errors='ignore'), - ) - - def test_timestep_mapping(self, basic_clustering): - """Test timestep_mapping property.""" - mapping = basic_clustering.timestep_mapping - assert isinstance(mapping, xr.DataArray) - assert 'original_time' in mapping.dims - assert len(mapping) == 144 # Original timesteps - - def test_metrics(self, basic_clustering): - """Test metrics property returns empty Dataset when no metrics.""" - metrics = basic_clustering.metrics - assert isinstance(metrics, xr.Dataset) - # No metrics provided, so should be empty - assert len(metrics.data_vars) == 0 - - def test_cluster_start_positions(self, basic_clustering): - """Test cluster_start_positions property.""" - positions = basic_clustering.cluster_start_positions - np.testing.assert_array_equal(positions, [0, 24, 48]) - - def test_empty_results_raises(self): - """Test that empty results raises ValueError.""" - with pytest.raises(ValueError, match='cannot be empty'): - ClusteringResults({}, dim_names=[]) - def test_repr(self, basic_clustering): """Test string representation.""" repr_str = repr(basic_clustering) @@ -305,12 +106,16 @@ def test_repr(self, basic_clustering): assert '6 periods' in repr_str assert '3 clusters' in repr_str + def test_dim_names_no_extra(self, basic_clustering): + """Test dim_names with no extra dimensions.""" + assert basic_clustering.dim_names == [] + class TestClusteringMultiDim: """Tests for Clustering with period/scenario dimensions.""" @pytest.fixture - def mock_clustering_result_factory(self): + def mock_cr_factory(self): """Factory for creating mock ClusteringResult objects.""" def create_result(cluster_assignments, n_timesteps_per_period=24): @@ -318,167 +123,28 @@ class MockClusteringResult: n_clusters = max(cluster_assignments) + 1 if cluster_assignments else 0 n_original_periods = len(cluster_assignments) period_duration = 24.0 - n_segments = None # None indicates non-segmented - segment_assignments = None # None indicates non-segmented + n_segments = None + segment_assignments = None + cluster_centers = tuple(range(max(cluster_assignments) + 1)) if cluster_assignments else () def __init__(self, assignments, n_timesteps): self.cluster_assignments = tuple(assignments) self.n_timesteps_per_period = n_timesteps - def to_dict(self): - return { - 'n_clusters': self.n_clusters, - 'n_original_periods': self.n_original_periods, - 'n_timesteps_per_period': self.n_timesteps_per_period, - 'cluster_assignments': list(self.cluster_assignments), - 'period_duration': self.period_duration, - } - - def apply(self, data): - return {'applied': True} - return MockClusteringResult(cluster_assignments, n_timesteps_per_period) return create_result - def test_multi_period_clustering(self, mock_clustering_result_factory): + def test_multi_period_clustering(self, mock_cr_factory): """Test Clustering with multiple periods.""" - cr_2020 = mock_clustering_result_factory([0, 1, 0]) - cr_2030 = mock_clustering_result_factory([1, 0, 1]) + cr_2020 = mock_cr_factory([0, 1, 0]) + cr_2030 = mock_cr_factory([1, 0, 1]) - results = ClusteringResults( + clustering = _make_clustering( {(2020,): cr_2020, (2030,): cr_2030}, - dim_names=['period'], - ) - original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h') - - clustering = Clustering( - results=results, - original_timesteps=original_timesteps, + ['period'], ) assert clustering.n_clusters == 2 assert 'period' in clustering.cluster_occurrences.dims - - def test_get_result(self, mock_clustering_result_factory): - """Test get_result method.""" - cr = mock_clustering_result_factory([0, 1, 0]) - results = ClusteringResults({(): cr}, dim_names=[]) - original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h') - - clustering = Clustering( - results=results, - original_timesteps=original_timesteps, - ) - - retrieved = clustering.get_result() - assert retrieved is cr - - def test_get_result_invalid_key(self, mock_clustering_result_factory): - """Test get_result with invalid key raises KeyError.""" - cr = mock_clustering_result_factory([0, 1, 0]) - results = ClusteringResults({(2020,): cr}, dim_names=['period']) - original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h') - - clustering = Clustering( - results=results, - original_timesteps=original_timesteps, - ) - - with pytest.raises(KeyError): - clustering.get_result(period=2030) - - -class TestClusteringPlotAccessor: - """Tests for ClusteringPlotAccessor.""" - - @pytest.fixture - def clustering_with_data(self): - """Create Clustering with original and aggregated data.""" - - class MockClusteringResult: - n_clusters = 2 - n_original_periods = 3 - n_timesteps_per_period = 24 - cluster_assignments = (0, 1, 0) - period_duration = 24.0 - - def to_dict(self): - return { - 'n_clusters': self.n_clusters, - 'n_original_periods': self.n_original_periods, - 'n_timesteps_per_period': self.n_timesteps_per_period, - 'cluster_assignments': list(self.cluster_assignments), - 'period_duration': self.period_duration, - } - - def apply(self, data): - return {'applied': True} - - mock_cr = MockClusteringResult() - results = ClusteringResults({(): mock_cr}, dim_names=[]) - - original_timesteps = pd.date_range('2024-01-01', periods=72, freq='h') - - original_data = xr.Dataset( - { - 'col1': xr.DataArray(np.random.randn(72), dims=['time'], coords={'time': original_timesteps}), - } - ) - aggregated_data = xr.Dataset( - { - 'col1': xr.DataArray( - np.random.randn(2, 24), - dims=['cluster', 'time'], - coords={'cluster': [0, 1], 'time': pd.date_range('2000-01-01', periods=24, freq='h')}, - ), - } - ) - - return Clustering( - results=results, - original_timesteps=original_timesteps, - original_data=original_data, - aggregated_data=aggregated_data, - ) - - def test_plot_accessor_exists(self, clustering_with_data): - """Test that plot accessor is available.""" - assert hasattr(clustering_with_data, 'plot') - assert hasattr(clustering_with_data.plot, 'compare') - assert hasattr(clustering_with_data.plot, 'heatmap') - assert hasattr(clustering_with_data.plot, 'clusters') - - def test_compare_requires_data(self): - """Test compare() raises when no data available.""" - - class MockClusteringResult: - n_clusters = 2 - n_original_periods = 2 - n_timesteps_per_period = 24 - cluster_assignments = (0, 1) - period_duration = 24.0 - - def to_dict(self): - return { - 'n_clusters': self.n_clusters, - 'n_original_periods': self.n_original_periods, - 'n_timesteps_per_period': self.n_timesteps_per_period, - 'cluster_assignments': list(self.cluster_assignments), - 'period_duration': self.period_duration, - } - - def apply(self, data): - return {'applied': True} - - mock_cr = MockClusteringResult() - results = ClusteringResults({(): mock_cr}, dim_names=[]) - original_timesteps = pd.date_range('2024-01-01', periods=48, freq='h') - - clustering = Clustering( - results=results, - original_timesteps=original_timesteps, - ) - - with pytest.raises(ValueError, match='No original/aggregated data'): - clustering.plot.compare() + assert clustering.dim_names == ['period'] diff --git a/tests/test_clustering/test_cluster_reduce_expand.py b/tests/test_clustering/test_cluster_reduce_expand.py index 679307fba..d8c9cbf74 100644 --- a/tests/test_clustering/test_cluster_reduce_expand.py +++ b/tests/test_clustering/test_cluster_reduce_expand.py @@ -915,8 +915,8 @@ def test_extremes_append_with_segments(self, solver_fixture, timesteps_8_days): n_clusters = fs_clustered.clustering.n_clusters assert n_clusters >= 2 - # n_representatives = n_clusters * n_segments - assert fs_clustered.clustering.n_representatives == n_clusters * 6 + # n_clusters * n_segments + assert n_clusters * fs_clustered.clustering.n_segments == n_clusters * 6 # Verify optimization works fs_clustered.optimize(solver_fixture) @@ -930,157 +930,6 @@ def test_extremes_append_with_segments(self, solver_fixture, timesteps_8_days): assert int(fs_clustered.clustering.cluster_occurrences.sum()) == 8 -# ==================== Data Vars Parameter Tests ==================== - - -class TestDataVarsParameter: - """Tests for data_vars parameter in cluster() method.""" - - def test_cluster_with_data_vars_subset(self, timesteps_8_days): - """Test clustering with a subset of variables.""" - # Create system with multiple time-varying data - hours = len(timesteps_8_days) - demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15 - price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 # Different pattern - - fs = fx.FlowSystem(timesteps_8_days) - fs.add_elements( - fx.Bus('Heat'), - fx.Bus('Gas'), - fx.Effect('costs', '€', is_standard=True, is_objective=True), - fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), - fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]), - fx.linear_converters.Boiler( - 'Boiler', - thermal_efficiency=0.9, - fuel_flow=fx.Flow('Q_fu', bus='Gas'), - thermal_flow=fx.Flow('Q_th', bus='Heat'), - ), - ) - - # Cluster based only on demand profile (not price) - fs_reduced = fs.transform.cluster( - n_clusters=2, - cluster_duration='1D', - data_vars=['HeatDemand(Q)|fixed_relative_profile'], - ) - - # Should have clustered structure - assert len(fs_reduced.timesteps) == 24 - assert len(fs_reduced.clusters) == 2 - - def test_data_vars_validation_error(self, timesteps_8_days): - """Test that invalid data_vars raises ValueError.""" - fs = create_simple_system(timesteps_8_days) - - with pytest.raises(ValueError, match='data_vars not found'): - fs.transform.cluster( - n_clusters=2, - cluster_duration='1D', - data_vars=['NonExistentVariable'], - ) - - def test_data_vars_preserves_all_flowsystem_data(self, timesteps_8_days): - """Test that clustering with data_vars preserves all FlowSystem variables.""" - # Create system with multiple time-varying data - hours = len(timesteps_8_days) - demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15 - price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 - - fs = fx.FlowSystem(timesteps_8_days) - fs.add_elements( - fx.Bus('Heat'), - fx.Bus('Gas'), - fx.Effect('costs', '€', is_standard=True, is_objective=True), - fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), - fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]), - fx.linear_converters.Boiler( - 'Boiler', - thermal_efficiency=0.9, - fuel_flow=fx.Flow('Q_fu', bus='Gas'), - thermal_flow=fx.Flow('Q_th', bus='Heat'), - ), - ) - - # Cluster based only on demand profile - fs_reduced = fs.transform.cluster( - n_clusters=2, - cluster_duration='1D', - data_vars=['HeatDemand(Q)|fixed_relative_profile'], - ) - - # Both demand and price should be preserved in the reduced FlowSystem - ds = fs_reduced.to_dataset() - assert 'HeatDemand(Q)|fixed_relative_profile' in ds.data_vars - assert 'GasSource(Gas)|costs|per_flow_hour' in ds.data_vars - - def test_data_vars_optimization_works(self, solver_fixture, timesteps_8_days): - """Test that FlowSystem clustered with data_vars can be optimized.""" - hours = len(timesteps_8_days) - demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15 - price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 - - fs = fx.FlowSystem(timesteps_8_days) - fs.add_elements( - fx.Bus('Heat'), - fx.Bus('Gas'), - fx.Effect('costs', '€', is_standard=True, is_objective=True), - fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), - fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]), - fx.linear_converters.Boiler( - 'Boiler', - thermal_efficiency=0.9, - fuel_flow=fx.Flow('Q_fu', bus='Gas'), - thermal_flow=fx.Flow('Q_th', bus='Heat'), - ), - ) - - fs_reduced = fs.transform.cluster( - n_clusters=2, - cluster_duration='1D', - data_vars=['HeatDemand(Q)|fixed_relative_profile'], - ) - - # Should optimize successfully - fs_reduced.optimize(solver_fixture) - assert fs_reduced.solution is not None - assert 'Boiler(Q_th)|flow_rate' in fs_reduced.solution - - def test_data_vars_with_multiple_variables(self, timesteps_8_days): - """Test clustering with multiple selected variables.""" - hours = len(timesteps_8_days) - demand = np.sin(np.linspace(0, 4 * np.pi, hours)) * 10 + 15 - price = np.cos(np.linspace(0, 4 * np.pi, hours)) * 0.02 + 0.05 - - fs = fx.FlowSystem(timesteps_8_days) - fs.add_elements( - fx.Bus('Heat'), - fx.Bus('Gas'), - fx.Effect('costs', '€', is_standard=True, is_objective=True), - fx.Sink('HeatDemand', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), - fx.Source('GasSource', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=price)]), - fx.linear_converters.Boiler( - 'Boiler', - thermal_efficiency=0.9, - fuel_flow=fx.Flow('Q_fu', bus='Gas'), - thermal_flow=fx.Flow('Q_th', bus='Heat'), - ), - ) - - # Cluster based on both demand and price - fs_reduced = fs.transform.cluster( - n_clusters=2, - cluster_duration='1D', - data_vars=[ - 'HeatDemand(Q)|fixed_relative_profile', - 'GasSource(Gas)|costs|per_flow_hour', - ], - ) - - assert len(fs_reduced.timesteps) == 24 - assert len(fs_reduced.clusters) == 2 - - # ==================== Segmentation Tests ==================== @@ -1249,28 +1098,6 @@ def test_segmented_statistics_after_expand(self, solver_fixture, timesteps_8_day flow_rates = stats.flow_rates assert 'time' in flow_rates.dims - def test_segmented_timestep_mapping_uses_segment_assignments(self, timesteps_8_days): - """Test that timestep_mapping correctly maps original timesteps to segments.""" - from tsam import SegmentConfig - - fs = create_simple_system(timesteps_8_days) - - fs_segmented = fs.transform.cluster( - n_clusters=2, - cluster_duration='1D', - segments=SegmentConfig(n_segments=6), - ) - - mapping = fs_segmented.clustering.timestep_mapping - - # Mapping should have original timestep count - assert len(mapping.values) == 192 - - # Each mapped value should be in valid range: [0, n_clusters * n_segments) - max_valid_idx = 2 * 6 - 1 # n_clusters * n_segments - 1 - assert mapping.min().item() >= 0 - assert mapping.max().item() <= max_valid_idx - @pytest.mark.parametrize('freq', ['1h', '2h']) def test_segmented_total_effects_match_solution(self, solver_fixture, freq): """Test that total_effects matches solution Cost after expand with segmentation. @@ -1449,13 +1276,6 @@ def test_segmented_expand_maps_correctly_per_period(self, solver_fixture, timest fs_segmented.optimize(solver_fixture) - # Get the timestep_mapping which should be multi-dimensional - mapping = fs_segmented.clustering.timestep_mapping - - # Mapping should have period dimension - assert 'period' in mapping.dims - assert mapping.sizes['period'] == 2 - # Expand and verify each period has correct number of timesteps fs_expanded = fs_segmented.transform.expand() flow_var = 'Boiler(Q_th)|flow_rate' diff --git a/tests/test_clustering/test_clustering_io.py b/tests/test_clustering/test_clustering_io.py index 0e2200885..93769b167 100644 --- a/tests/test_clustering/test_clustering_io.py +++ b/tests/test_clustering/test_clustering_io.py @@ -70,13 +70,9 @@ def test_clustering_to_dataset_has_clustering_attrs(self, simple_system_8_days): ds = fs_clustered.to_dataset(include_solution=False) - # Check that clustering attrs are present + # Check that clustering attrs are present (serialized as JSON string) assert 'clustering' in ds.attrs - # Check that clustering arrays are present with prefix - clustering_vars = [name for name in ds.data_vars if name.startswith('clustering|')] - assert len(clustering_vars) > 0 - def test_clustering_roundtrip_preserves_clustering_object(self, simple_system_8_days): """Clustering object should be restored after roundtrip.""" from flixopt.clustering import Clustering @@ -124,17 +120,6 @@ def test_clustering_roundtrip_preserves_original_timesteps(self, simple_system_8 # check_names=False because index name may be lost during serialization pd.testing.assert_index_equal(fs_restored.clustering.original_timesteps, original_timesteps, check_names=False) - def test_clustering_roundtrip_preserves_timestep_mapping(self, simple_system_8_days): - """Timestep mapping should be preserved after roundtrip.""" - fs = simple_system_8_days - fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D') - original_mapping = fs_clustered.clustering.timestep_mapping.values.copy() - - ds = fs_clustered.to_dataset(include_solution=False) - fs_restored = fx.FlowSystem.from_dataset(ds) - - np.testing.assert_array_equal(fs_restored.clustering.timestep_mapping.values, original_mapping) - class TestClusteringWithSolutionRoundtrip: """Test that clustering with solution survives roundtrip.""" @@ -623,23 +608,23 @@ def test_cluster_assignments_preserved_after_roundtrip(self, system_with_periods # cluster_assignments should be exactly preserved xr.testing.assert_equal(original_cluster_assignments, fs_restored.clustering.cluster_assignments) - def test_results_preserved_after_load(self, system_with_periods_and_scenarios, tmp_path): - """ClusteringResults should be preserved after loading (via ClusteringResults.to_dict()).""" + def test_clustering_result_preserved_after_load(self, system_with_periods_and_scenarios, tmp_path): + """ClusteringResult should be preserved after loading.""" fs = system_with_periods_and_scenarios fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D') - # Before save, results exists - assert fs_clustered.clustering.results is not None + # Before save, clustering_result exists + assert fs_clustered.clustering.clustering_result is not None # Roundtrip nc_path = tmp_path / 'multi_dim_clustering.nc' fs_clustered.to_netcdf(nc_path) fs_restored = fx.FlowSystem.from_netcdf(nc_path) - # After load, results should be reconstructed - assert fs_restored.clustering.results is not None - # The restored results should have the same structure - assert len(fs_restored.clustering.results) == len(fs_clustered.clustering.results) + # After load, clustering_result should be reconstructed + assert fs_restored.clustering.clustering_result is not None + # The restored clustering should have the same structure + assert len(fs_restored.clustering) == len(fs_clustered.clustering) def test_derived_properties_work_after_load(self, system_with_periods_and_scenarios, tmp_path): """Derived properties should work correctly after loading (computed from cluster_assignments).""" @@ -676,8 +661,8 @@ def test_apply_clustering_after_load(self, system_with_periods_and_scenarios, tm # Load the full FlowSystem with clustering fs_loaded = fx.FlowSystem.from_netcdf(nc_path) clustering_loaded = fs_loaded.clustering - # ClusteringResults should be fully preserved after load - assert clustering_loaded.results is not None + # ClusteringResult should be fully preserved after load + assert clustering_loaded.clustering_result is not None # Create a fresh FlowSystem (copy the original, unclustered one) fs_fresh = fs.copy() diff --git a/tests/test_clustering/test_expansion_regression.py b/tests/test_clustering/test_expansion_regression.py new file mode 100644 index 000000000..1bce3b4e7 --- /dev/null +++ b/tests/test_clustering/test_expansion_regression.py @@ -0,0 +1,157 @@ +"""Regression tests for cluster → optimize → expand numerical equivalence. + +These tests verify that the expanded solution values match known reference +values, catching any changes in the clustering/expansion pipeline. +""" + +import numpy as np +import pandas as pd +import pytest + +import flixopt as fx + +tsam = pytest.importorskip('tsam') + + +@pytest.fixture +def system_with_storage(): + """System with storage (tests charge_state) and effects (tests segment totals).""" + ts = pd.date_range('2020-01-01', periods=192, freq='h') # 8 days + demand = np.sin(np.linspace(0, 16 * np.pi, 192)) * 10 + 15 + + fs = fx.FlowSystem(ts) + fs.add_elements( + fx.Bus('Heat'), + fx.Bus('Gas'), + fx.Effect('costs', '€', is_standard=True, is_objective=True), + fx.Sink('D', inputs=[fx.Flow('Q', bus='Heat', fixed_relative_profile=demand, size=1)]), + fx.Source('G', outputs=[fx.Flow('Gas', bus='Gas', effects_per_flow_hour=0.05)]), + fx.linear_converters.Boiler( + 'B', + thermal_efficiency=0.9, + fuel_flow=fx.Flow('Q_fu', bus='Gas'), + thermal_flow=fx.Flow('Q_th', bus='Heat'), + ), + fx.Storage( + 'S', + capacity_in_flow_hours=50, + initial_charge_state=0.5, + charging=fx.Flow('in', bus='Heat', size=10), + discharging=fx.Flow('out', bus='Heat', size=10), + ), + ) + return fs + + +class TestNonSegmentedExpansion: + """Test that non-segmented cluster → expand produces correct values.""" + + def test_expanded_objective_matches(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D') + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + assert fs_e.solution['objective'].item() == pytest.approx(160.0, abs=1e-6) + + def test_expanded_flow_rates(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D') + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + sol = fs_e.solution + assert float(np.nansum(sol['B(Q_th)|flow_rate'].values)) == pytest.approx(2880.0, abs=1e-6) + assert float(np.nansum(sol['D(Q)|flow_rate'].values)) == pytest.approx(2880.0, abs=1e-6) + assert float(np.nansum(sol['G(Gas)|flow_rate'].values)) == pytest.approx(3200.0, abs=1e-6) + + def test_expanded_costs(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D') + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + sol = fs_e.solution + assert float(np.nansum(sol['costs(temporal)|per_timestep'].values)) == pytest.approx(160.0, abs=1e-6) + assert float(np.nansum(sol['G(Gas)->costs(temporal)'].values)) == pytest.approx(160.0, abs=1e-6) + + def test_expanded_storage(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D') + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + sol = fs_e.solution + # Storage dispatch varies by solver — check charge_state is non-trivial + assert float(np.nansum(sol['S|charge_state'].values)) > 0 + # Net discharge should be ~0 (balanced storage) + assert float(np.nansum(sol['S|netto_discharge'].values)) == pytest.approx(0, abs=1e-4) + + def test_expanded_shapes(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster(n_clusters=2, cluster_duration='1D') + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + sol = fs_e.solution + # 192 original timesteps + 1 extra boundary = 193 + for name in sol.data_vars: + if 'time' in sol[name].dims: + assert sol[name].sizes['time'] == 193, f'{name} has wrong time size' + + +class TestSegmentedExpansion: + """Test that segmented cluster → expand produces correct values.""" + + def test_expanded_objective_matches(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster( + n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6) + ) + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + assert fs_e.solution['objective'].item() == pytest.approx(160.0, abs=1e-6) + + def test_expanded_flow_rates(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster( + n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6) + ) + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + sol = fs_e.solution + assert float(np.nansum(sol['B(Q_th)|flow_rate'].values)) == pytest.approx(2880.0, abs=1e-6) + assert float(np.nansum(sol['D(Q)|flow_rate'].values)) == pytest.approx(2880.0, abs=1e-6) + assert float(np.nansum(sol['G(Gas)|flow_rate'].values)) == pytest.approx(3200.0, abs=1e-6) + + def test_expanded_costs(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster( + n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6) + ) + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + sol = fs_e.solution + assert float(np.nansum(sol['costs(temporal)|per_timestep'].values)) == pytest.approx(160.0, abs=1e-6) + assert float(np.nansum(sol['G(Gas)->costs(temporal)'].values)) == pytest.approx(160.0, abs=1e-6) + + def test_expanded_shapes(self, system_with_storage, solver_fixture): + fs_c = system_with_storage.transform.cluster( + n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6) + ) + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + sol = fs_e.solution + for name in sol.data_vars: + if 'time' in sol[name].dims: + assert sol[name].sizes['time'] == 193, f'{name} has wrong time size' + + def test_no_nans_in_expanded_flow_rates(self, system_with_storage, solver_fixture): + """Segmented expansion must ffill — no NaNs in flow rates (except extra boundary).""" + fs_c = system_with_storage.transform.cluster( + n_clusters=2, cluster_duration='1D', segments=tsam.SegmentConfig(n_segments=6) + ) + fs_c.optimize(solver_fixture) + fs_e = fs_c.transform.expand() + + sol = fs_e.solution + for name in ['B(Q_th)|flow_rate', 'D(Q)|flow_rate', 'G(Gas)|flow_rate']: + # Exclude last timestep (extra boundary, may be NaN for non-state variables) + vals = sol[name].isel(time=slice(None, -1)) + assert not vals.isnull().any(), f'{name} has NaN values after expansion' diff --git a/tests/test_clustering/test_integration.py b/tests/test_clustering/test_integration.py index f5d23c691..c424ceca2 100644 --- a/tests/test_clustering/test_integration.py +++ b/tests/test_clustering/test_integration.py @@ -122,97 +122,6 @@ def test_weights_with_cluster_weight(self): np.testing.assert_array_almost_equal(fs.temporal_weight.values, expected.values) -class TestClusteringData: - """Tests for FlowSystem.transform.clustering_data method.""" - - def test_clustering_data_method_exists(self): - """Test that transform.clustering_data method exists.""" - fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=48, freq='h')) - - assert hasattr(fs.transform, 'clustering_data') - assert callable(fs.transform.clustering_data) - - def test_clustering_data_returns_dataset(self): - """Test that clustering_data returns an xr.Dataset.""" - from flixopt import Bus, Flow, Sink, Source - - n_hours = 48 - fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h')) - - # Add components with time-varying data - demand_data = np.sin(np.linspace(0, 4 * np.pi, n_hours)) + 2 - bus = Bus('electricity') - source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)]) - sink = Sink( - 'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)] - ) - fs.add_elements(source, sink, bus) - - clustering_data = fs.transform.clustering_data() - - assert isinstance(clustering_data, xr.Dataset) - - def test_clustering_data_contains_only_time_varying(self): - """Test that clustering_data returns only time-varying data.""" - from flixopt import Bus, Flow, Sink, Source - - n_hours = 48 - fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h')) - - # Add components with time-varying and constant data - demand_data = np.sin(np.linspace(0, 4 * np.pi, n_hours)) + 2 - bus = Bus('electricity') - source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)]) - sink = Sink( - 'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)] - ) - fs.add_elements(source, sink, bus) - - clustering_data = fs.transform.clustering_data() - - # Should contain the demand profile - assert 'demand(demand_out)|fixed_relative_profile' in clustering_data.data_vars - - # All arrays should have 'time' dimension - for var in clustering_data.data_vars: - assert 'time' in clustering_data[var].dims - - def test_clustering_data_with_periods(self): - """Test clustering_data with multi-period system.""" - from flixopt import Bus, Effect, Flow, Sink, Source - - n_hours = 48 - periods = pd.Index([2024, 2030], name='period') - fs = FlowSystem( - timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'), - periods=periods, - ) - - # Add components - demand_data = xr.DataArray( - np.random.rand(n_hours, 2), - dims=['time', 'period'], - coords={'time': fs.timesteps, 'period': periods}, - ) - bus = Bus('electricity') - effect = Effect('costs', '€', is_objective=True) - source = Source('grid', outputs=[Flow('grid_in', bus='electricity', size=100)]) - sink = Sink( - 'demand', inputs=[Flow('demand_out', bus='electricity', size=100, fixed_relative_profile=demand_data)] - ) - fs.add_elements(source, sink, bus, effect) - - # Get data for specific period - data_2024 = fs.transform.clustering_data(period=2024) - - # Should not have period dimension (it was selected) - assert 'period' not in data_2024.dims - - # Get data for all periods - data_all = fs.transform.clustering_data() - assert 'period' in data_all.dims - - class TestClusterMethod: """Tests for FlowSystem.transform.cluster method.""" @@ -302,15 +211,6 @@ def test_hierarchical_is_deterministic(self, basic_flow_system): # Hierarchical clustering should produce identical cluster orders xr.testing.assert_equal(fs1.clustering.cluster_assignments, fs2.clustering.cluster_assignments) - def test_metrics_available(self, basic_flow_system): - """Test that clustering metrics are available after clustering.""" - fs_clustered = basic_flow_system.transform.cluster(n_clusters=2, cluster_duration='1D') - - assert fs_clustered.clustering.metrics is not None - assert isinstance(fs_clustered.clustering.metrics, xr.Dataset) - assert 'time_series' in fs_clustered.clustering.metrics.dims - assert len(fs_clustered.clustering.metrics.data_vars) > 0 - def test_representation_method_parameter(self, basic_flow_system): """Test that representation method via ClusterConfig works.""" from tsam import ClusterConfig @@ -338,118 +238,26 @@ def test_tsam_kwargs_passthrough(self, basic_flow_system): def test_unknown_weight_keys_raise(self, basic_flow_system): """Test that unknown keys in ClusterConfig.weights raise ValueError. - Regression test: weight keys that don't match any variable in the - FlowSystem are likely typos and should be caught early with a clear - error message. + tsam_xarray validates weight keys and raises ValueError for unknown coords. """ from tsam import ClusterConfig # Get actual clustering column names - clustering_data = basic_flow_system.transform.clustering_data() - real_columns = list(clustering_data.data_vars) + ds = basic_flow_system.to_dataset(include_solution=False) + real_columns = [n for n in ds.data_vars if 'time' in ds[n].dims] # Build weights with real keys + extra bogus keys weights = {col: 1.0 for col in real_columns} weights['nonexistent_variable'] = 0.5 weights['another_missing_col'] = 0.3 - with pytest.raises(ValueError, match='unknown variables'): - basic_flow_system.transform.cluster( - n_clusters=2, - cluster_duration='1D', - cluster=ClusterConfig(weights=weights), - ) - - def test_weight_keys_excluded_by_data_vars_raise(self, basic_flow_system): - """Test that weight keys excluded by the data_vars allow-list raise ValueError. - - A variable may exist on the FlowSystem but be intentionally omitted from - the clustering input via data_vars. Weights referencing such excluded - variables should be rejected. - """ - from tsam import ClusterConfig - - ds = basic_flow_system.to_dataset(include_solution=False) - clustering_columns = list(basic_flow_system.transform.clustering_data().data_vars) - excluded_var = sorted(set(ds.data_vars) - set(clustering_columns))[0] - - # Weight references both a selected var and an excluded var - weights = {clustering_columns[0]: 1.0, excluded_var: 0.5} - - with pytest.raises(ValueError, match='unknown variables'): + with pytest.raises(ValueError, match='unknown'): basic_flow_system.transform.cluster( n_clusters=2, cluster_duration='1D', - data_vars=clustering_columns, cluster=ClusterConfig(weights=weights), ) - def test_extra_weight_keys_filtered_with_constant_column(self): - """Test that weights for constant (dropped) columns are filtered out. - - When a time series is constant over time it is removed before clustering. - User-provided weights referencing such columns must be silently dropped. - """ - pytest.importorskip('tsam') - from tsam import ClusterConfig - - from flixopt import Bus, Flow, Sink, Source - from flixopt.core import TimeSeriesData - - n_hours = 168 # 7 days - fs = FlowSystem(timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h')) - - demand_data = np.sin(np.linspace(0, 14 * np.pi, n_hours)) + 2 - bus = Bus('electricity') - grid_flow = Flow('grid_in', bus='electricity', size=100) - # One varying profile, one constant profile - demand_flow = Flow( - 'demand_out', - bus='electricity', - size=100, - fixed_relative_profile=TimeSeriesData(demand_data / 100), - ) - constant_flow = Flow( - 'constant_out', - bus='electricity', - size=50, - fixed_relative_profile=TimeSeriesData(np.full(n_hours, 0.8)), - ) - source = Source('grid', outputs=[grid_flow]) - sink = Sink('demand', inputs=[demand_flow]) - constant_sink = Sink('constant_load', inputs=[constant_flow]) - fs.add_elements(source, sink, constant_sink, bus) - - # Use to_dataset() to get ALL columns including the constant one - # (clustering_data() already strips constants, so it wouldn't test the path) - all_data = fs.to_dataset(include_solution=False) - all_columns = set(all_data.data_vars) - clustering_columns = set(fs.transform.clustering_data().data_vars) - - # Identify constant columns: variables with a single unique value across time - constant_columns = set() - for name in all_data.data_vars: - var = all_data[name] - if 'time' not in var.dims or np.nanmax(var.values) - np.nanmin(var.values) < 1e-10: - constant_columns.add(name) - - assert len(constant_columns) > 0, 'Test requires at least one constant column' - assert constant_columns <= all_columns, 'Constant columns must be in the full dataset' - for col in constant_columns: - assert col not in clustering_columns, f'Constant column {col!r} should not be in clustering_data()' - - # Build weights that reference ALL columns including the constant one - # that will be dropped — these are valid variables, just constant over time - weights = {col: 1.0 for col in all_columns} - - # Must not raise: constant columns are silently filtered, not rejected - fs_clustered = fs.transform.cluster( - n_clusters=2, - cluster_duration='1D', - cluster=ClusterConfig(weights=weights), - ) - assert len(fs_clustered.clusters) == 2 - def test_unknown_weight_keys_raise_multiperiod(self): """Test that unknown weight keys raise ValueError in multi-period clustering.""" pytest.importorskip('tsam') @@ -477,11 +285,11 @@ def test_unknown_weight_keys_raise_multiperiod(self): sink = Sink('demand', inputs=[demand_flow]) fs.add_elements(source, sink, bus) - clustering_data = fs.transform.clustering_data() - weights = {col: 1.0 for col in clustering_data.data_vars} + ds = fs.to_dataset(include_solution=False) + weights = {n: 1.0 for n in ds.data_vars if 'time' in ds[n].dims} weights['nonexistent_period_var'] = 0.7 - with pytest.raises(ValueError, match='unknown variables'): + with pytest.raises(ValueError, match='unknown'): fs.transform.cluster( n_clusters=2, cluster_duration='1D', @@ -519,8 +327,8 @@ def test_valid_weight_keys_multiperiod(self): sink = Sink('demand', inputs=[demand_flow]) fs.add_elements(source, sink, bus) - clustering_data = fs.transform.clustering_data() - weights = {col: 1.0 for col in clustering_data.data_vars} + ds = fs.to_dataset(include_solution=False) + weights = {n: 1.0 for n in ds.data_vars if 'time' in ds[n].dims} fs_clustered = fs.transform.cluster( n_clusters=2, @@ -528,36 +336,6 @@ def test_valid_weight_keys_multiperiod(self): cluster=ClusterConfig(weights=weights), ) assert len(fs_clustered.clusters) == 2 - assert 'period' in fs_clustered.clustering.metrics.dims - - def test_metrics_with_periods(self): - """Test that metrics have period dimension for multi-period FlowSystems.""" - pytest.importorskip('tsam') - from flixopt import Bus, Flow, Sink, Source - from flixopt.core import TimeSeriesData - - n_hours = 168 # 7 days - fs = FlowSystem( - timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'), - periods=pd.Index([2025, 2030], name='period'), - ) - - demand_data = np.sin(np.linspace(0, 14 * np.pi, n_hours)) + 2 - bus = Bus('electricity') - grid_flow = Flow('grid_in', bus='electricity', size=100) - demand_flow = Flow( - 'demand_out', bus='electricity', size=100, fixed_relative_profile=TimeSeriesData(demand_data / 100) - ) - source = Source('grid', outputs=[grid_flow]) - sink = Sink('demand', inputs=[demand_flow]) - fs.add_elements(source, sink, bus) - - fs_clustered = fs.transform.cluster(n_clusters=2, cluster_duration='1D') - - # Metrics should have period dimension - assert fs_clustered.clustering.metrics is not None - assert 'period' in fs_clustered.clustering.metrics.dims - assert len(fs_clustered.clustering.metrics.period) == 2 class TestClusteringModuleImports: diff --git a/tests/test_clustering/test_multiperiod_extremes.py b/tests/test_clustering/test_multiperiod_extremes.py index 973efe79d..01cb7d026 100644 --- a/tests/test_clustering/test_multiperiod_extremes.py +++ b/tests/test_clustering/test_multiperiod_extremes.py @@ -522,9 +522,9 @@ def test_append_with_segments(self, solver_fixture, timesteps_8_days): assert fs_clustered.clustering.is_segmented is True assert fs_clustered.clustering.n_segments == 4 - # n_representatives = n_clusters * n_segments + # n_clusters * n_segments n_clusters = fs_clustered.clustering.n_clusters - assert fs_clustered.clustering.n_representatives == n_clusters * 4 + assert n_clusters * fs_clustered.clustering.n_segments == n_clusters * 4 fs_clustered.optimize(solver_fixture) assert fs_clustered.solution is not None @@ -987,22 +987,3 @@ def test_cluster_occurrences_sum_to_original(self, timesteps_8_days, periods_2): f'Occurrences for period {period} with n_clusters={n_clusters}: ' f'{int(period_occurrences.sum())} != 8' ) - - def test_timestep_mapping_valid_range(self, timesteps_8_days, periods_2): - """Test that timestep_mapping values are within valid range.""" - fs = create_multiperiod_system_with_different_profiles(timesteps_8_days, periods_2) - - fs_clustered = fs.transform.cluster(n_clusters=3, cluster_duration='1D') - - mapping = fs_clustered.clustering.timestep_mapping - - # Mapping values should be in [0, n_clusters * timesteps_per_cluster - 1] - max_valid = 3 * 24 - 1 # n_clusters * timesteps_per_cluster - 1 - assert mapping.min().item() >= 0 - assert mapping.max().item() <= max_valid - - # Each period should have valid mappings - for period in periods_2: - period_mapping = mapping.sel(period=period) - assert period_mapping.min().item() >= 0 - assert period_mapping.max().item() <= max_valid diff --git a/tests/utilities/test_dataconverter.py b/tests/utilities/test_dataconverter.py index f9f2df889..0909b3d25 100644 --- a/tests/utilities/test_dataconverter.py +++ b/tests/utilities/test_dataconverter.py @@ -478,7 +478,7 @@ class TestTimeSeriesDataConversion: def test_timeseries_data_basic(self, time_coords): """TimeSeriesData should work like DataArray.""" data_array = xr.DataArray([10, 20, 30, 40, 50], coords={'time': time_coords}, dims='time') - ts_data = TimeSeriesData(data_array, clustering_group='test') + ts_data = TimeSeriesData(data_array) result = DataConverter.to_dataarray(ts_data, coords={'time': time_coords})