
Commit 7a4280d

FBumann and claude authored
perf: Optimize clustering and I/O (4.4x faster segmented clustering) (#579)
* perf: Use ds.variables to avoid _construct_dataarray overhead

  Optimize several functions by using ds.variables instead of iterating over
  data_vars.items() or accessing ds[name], which triggers slow
  _construct_dataarray calls.

  Changes:
  - io.py: save_dataset_to_netcdf, load_dataset_from_netcdf, _reduce_constant_arrays
  - structure.py: from_dataset (use coord_cache pattern)
  - core.py: drop_constant_arrays (use numpy operations)

* perf: Optimize clustering serialization with ds.variables

  Use ds.variables for faster access in clustering/base.py:
  - _create_reference_structure: original_data and metrics iteration
  - compare plot: duration_curve generation with direct numpy indexing

* perf: Use batch assignment for clustering arrays (24x speedup)

  _add_clustering_to_dataset was slow due to 210 individual ds[name] = arr
  assignments. Each triggers xarray's expensive dataset_update_method.
  Changed to batch assignment with ds.assign(dict):
  - Before: ~2600ms for to_dataset with clustering
  - After: ~109ms for to_dataset with clustering

* perf: Use ds.variables in _build_reduced_dataset (12% faster)

  Avoided _construct_dataarray overhead by:
  - Using ds.variables instead of ds.data_vars.items()
  - Using numpy slicing instead of .isel()
  - Passing attrs dict directly instead of DataArray

  cluster() benchmark:
  - Before: ~10.1s
  - After: ~8.9s

* perf: Use numpy reshape in _build_typical_das (4.4x faster)

  Eliminated 451,856 slow pandas .loc calls by using numpy reshape for
  segmented clustering data instead of iterating per-cluster.

  cluster() with segments benchmark (50 clusters, 4 segments):
  - Before: ~93.7s
  - After: ~21.1s
  - Speedup: 4.4x

* fix: Multiple clustering and IO bug fixes

  - benchmark_io_performance.py: Add Gurobi → HiGHS solver fallback
  - components.py: Fix storage decay to use sum (not mean) for hours per cluster
  - flow_system.py: Add RangeIndex validation requiring explicit timestep_duration
  - io.py: Include auxiliary coordinates in _fast_get_dataarray
  - transform_accessor.py: Add empty dataset guard after drop_constant_arrays
  - transform_accessor.py: Fix timestep_mapping indexing for segmented clustering

* perf: Use ds.variables pattern in expand() (2.2x faster)

  Replace data_vars.items() iteration with the ds.variables pattern to avoid
  slow _construct_dataarray calls (5502 calls × ~1.5ms each).
  - Before: 3.73s
  - After: 1.72s

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 79d0e5e commit 7a4280d

8 files changed

Lines changed: 194 additions & 77 deletions
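
The recurring optimization across these commits is replacing ds[name] and ds.data_vars.items() access with ds.variables. A minimal sketch on a toy dataset (names and sizes are illustrative, not flixopt data) shows the two patterns side by side: ds[name] goes through xarray's _construct_dataarray and builds a full DataArray with coordinates attached, while ds.variables[name] returns the underlying Variable directly.

import numpy as np
import xarray as xr

# Toy dataset with many variables, roughly the kind of workload these commits target
ds = xr.Dataset(
    {f'var_{i}': ('time', np.random.rand(1_000)) for i in range(200)},
    coords={'time': np.arange(1_000)},
)

# Slow pattern: every ds[name] access goes through xarray's _construct_dataarray
maxima_slow = {name: float(ds[name].max()) for name in ds.data_vars}

# Fast pattern: ds.variables hands back the raw Variable objects without coordinate wiring
variables = ds.variables
maxima_fast = {name: float(variables[name].values.max()) for name in ds.data_vars}

assert maxima_slow == maxima_fast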


benchmarks/benchmark_io_performance.py

Lines changed: 12 additions & 1 deletion
@@ -142,7 +142,18 @@ def run_io_benchmarks(
 
     print('\n2. Clustering and solving...')
     fs_clustered = fs.transform.cluster(n_clusters=n_clusters, cluster_duration='1D')
-    fs_clustered.optimize(fx.solvers.GurobiSolver())
+
+    # Try Gurobi first, fall back to HiGHS if not available
+    try:
+        solver = fx.solvers.GurobiSolver()
+        fs_clustered.optimize(solver)
+    except Exception as e:
+        if 'gurobi' in str(e).lower() or 'license' in str(e).lower():
+            print(f' Gurobi not available ({e}), falling back to HiGHS...')
+            solver = fx.solvers.HighsSolver()
+            fs_clustered.optimize(solver)
+        else:
+            raise
 
     print('\n3. Expanding...')
     fs_expanded = fs_clustered.transform.expand()

flixopt/clustering/base.py

Lines changed: 21 additions & 8 deletions
@@ -1113,12 +1113,17 @@ def _create_reference_structure(self, include_original_data: bool = True) -> tup
         original_data_refs = None
         if include_original_data and self.original_data is not None:
             original_data_refs = []
-            for name, da in self.original_data.data_vars.items():
+            # Use variables for faster access (avoids _construct_dataarray overhead)
+            variables = self.original_data.variables
+            for name in self.original_data.data_vars:
+                var = variables[name]
                 ref_name = f'original_data|{name}'
                 # Rename time dim to avoid xarray alignment issues
-                if 'time' in da.dims:
-                    da = da.rename({'time': 'original_time'})
-                arrays[ref_name] = da
+                if 'time' in var.dims:
+                    new_dims = tuple('original_time' if d == 'time' else d for d in var.dims)
+                    arrays[ref_name] = xr.Variable(new_dims, var.values, attrs=var.attrs)
+                else:
+                    arrays[ref_name] = var
                 original_data_refs.append(f':::{ref_name}')
 
         # NOTE: aggregated_data is NOT serialized - it's identical to the FlowSystem's
@@ -1129,9 +1134,11 @@ def _create_reference_structure(self, include_original_data: bool = True) -> tup
         metrics_refs = None
         if self._metrics is not None:
             metrics_refs = []
-            for name, da in self._metrics.data_vars.items():
+            # Use variables for faster access (avoids _construct_dataarray overhead)
+            metrics_vars = self._metrics.variables
+            for name in self._metrics.data_vars:
                 ref_name = f'metrics|{name}'
-                arrays[ref_name] = da
+                arrays[ref_name] = metrics_vars[name]
                 metrics_refs.append(f':::{ref_name}')
 
         reference = {
@@ -1415,9 +1422,15 @@ def compare(
 
         if kind == 'duration_curve':
             sorted_vars = {}
+            # Use variables for faster access (avoids _construct_dataarray overhead)
+            variables = ds.variables
+            rep_values = ds.coords['representation'].values
+            rep_idx = {rep: i for i, rep in enumerate(rep_values)}
             for var in ds.data_vars:
-                for rep in ds.coords['representation'].values:
-                    values = np.sort(ds[var].sel(representation=rep).values.flatten())[::-1]
+                data = variables[var].values
+                for rep in rep_values:
+                    # Direct numpy indexing instead of .sel()
+                    values = np.sort(data[rep_idx[rep]].flatten())[::-1]
                     sorted_vars[(var, rep)] = values
             # Get length from first sorted array
             n = len(next(iter(sorted_vars.values())))
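
A self-contained sketch of the duration-curve pattern above, using a made-up variable and toy data; it assumes, as the direct indexing data[rep_idx[rep]] in the diff does, that 'representation' is the leading dimension.

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'heat_demand': (('representation', 'time'), np.random.rand(2, 8760))},
    coords={'representation': ['original', 'clustered']},
)

# .sel() would construct a DataArray per (variable, representation) pair;
# indexing the raw numpy array once per variable avoids that overhead.
rep_values = ds.coords['representation'].values
rep_idx = {rep: i for i, rep in enumerate(rep_values)}

sorted_vars = {}
for var in ds.data_vars:
    data = ds.variables[var].values
    for rep in rep_values:
        sorted_vars[(var, rep)] = np.sort(data[rep_idx[rep]].flatten())[::-1]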

flixopt/components.py

Lines changed: 3 additions & 3 deletions
@@ -1505,11 +1505,11 @@ def _add_linking_constraints(
 
         # Apply self-discharge decay factor (1-loss)^hours to soc_before per Eq. 5
         # relative_loss_per_hour is per-hour, so we need total hours per cluster
-        # Use sum over time to handle both regular and segmented systems
+        # Use sum over time to get total duration (handles both regular and segmented systems)
         # Keep as DataArray to respect per-period/scenario values
         rel_loss = _scalar_safe_reduce(self.element.relative_loss_per_hour, 'time', 'mean')
-        hours_per_cluster = _scalar_safe_reduce(self._model.timestep_duration, 'time', 'mean')
-        decay_n = (1 - rel_loss) ** hours_per_cluster
+        total_hours_per_cluster = _scalar_safe_reduce(self._model.timestep_duration, 'time', 'sum')
+        decay_n = (1 - rel_loss) ** total_hours_per_cluster
 
         lhs = soc_after - soc_before * decay_n - delta_soc_ordered
         self.add_constraints(lhs == 0, short_name='link')
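
A short numeric sketch (values are illustrative) of why the decay exponent needs the total cluster duration via 'sum' rather than the 'mean' timestep duration: at hourly resolution the mean is one hour no matter how long the cluster is, so the old code barely decayed the state of charge.

relative_loss_per_hour = 0.01          # 1 % self-discharge per hour (made-up value)
timestep_duration_hours = [1.0] * 24   # one representative day at hourly resolution

mean_hours = sum(timestep_duration_hours) / len(timestep_duration_hours)  # 1.0 h
total_hours = sum(timestep_duration_hours)                                # 24.0 h

decay_mean = (1 - relative_loss_per_hour) ** mean_hours    # ~0.990, far too little decay
decay_total = (1 - relative_loss_per_hour) ** total_hours  # ~0.786, full-day decay

# The linking constraint spans an entire cluster, so the state of charge must
# decay over all 24 hours between linked points, not over a single timestep.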

flixopt/core.py

Lines changed: 12 additions & 5 deletions
@@ -629,17 +629,24 @@ def drop_constant_arrays(
         Dataset with constant variables removed.
     """
     drop_vars = []
+    # Use ds.variables for faster access (avoids _construct_dataarray overhead)
+    variables = ds.variables
 
-    for name, da in ds.data_vars.items():
+    for name in ds.data_vars:
+        var = variables[name]
         # Skip variables without the dimension
-        if dim not in da.dims:
+        if dim not in var.dims:
             if drop_arrays_without_dim:
                 drop_vars.append(name)
             continue
 
-        # Check if variable is constant along the dimension (ptp < atol)
-        ptp = da.max(dim, skipna=True) - da.min(dim, skipna=True)
-        if (ptp < atol).all().item():
+        # Check if variable is constant along the dimension using numpy (ptp < atol)
+        axis = var.dims.index(dim)
+        data = var.values
+        # Use numpy operations directly for speed
+        with np.errstate(invalid='ignore'):  # Ignore NaN warnings
+            ptp = np.nanmax(data, axis=axis) - np.nanmin(data, axis=axis)
+        if np.all(ptp < atol):
             drop_vars.append(name)
 
     if drop_vars:
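
A minimal sketch of the numpy-based constancy test on a toy dataset (variable names are hypothetical): the peak-to-peak range along the chosen dimension is compared against atol, mirroring the logic above.

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        'efficiency': (('time', 'scenario'), np.full((8760, 3), 0.9)),  # constant along time
        'demand': (('time', 'scenario'), np.random.rand(8760, 3)),      # varies along time
    }
)

dim, atol = 'time', 1e-12
for name in ds.data_vars:
    var = ds.variables[name]
    axis = var.dims.index(dim)
    with np.errstate(invalid='ignore'):  # as in the diff: silence NaN-related warnings
        ptp = np.nanmax(var.values, axis=axis) - np.nanmin(var.values, axis=axis)
    print(name, 'constant along', dim, '->', bool(np.all(ptp < atol)))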

flixopt/flow_system.py

Lines changed: 6 additions & 0 deletions
@@ -214,6 +214,12 @@ def __init__(
         elif computed_timestep_duration is not None:
             self.timestep_duration = self.fit_to_model_coords('timestep_duration', computed_timestep_duration)
         else:
+            # RangeIndex (segmented systems) requires explicit timestep_duration
+            if isinstance(self.timesteps, pd.RangeIndex):
+                raise ValueError(
+                    'timestep_duration is required when using RangeIndex timesteps (segmented systems). '
+                    'Provide timestep_duration explicitly or use DatetimeIndex timesteps.'
+                )
             self.timestep_duration = None
 
         # Cluster weight for cluster() optimization (default 1.0)
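
The new check exists because a pd.RangeIndex carries no time unit, so per-timestep durations cannot be derived from it, whereas a DatetimeIndex can yield them from consecutive timestamps. A pandas-only sketch of that difference, independent of the FlowSystem constructor:

import numpy as np
import pandas as pd

datetime_steps = pd.date_range('2024-01-01', periods=4, freq='h')
# Durations (in hours) are derivable from consecutive timestamps
durations = np.diff(datetime_steps.values).astype('timedelta64[s]').astype(float) / 3600
print(durations)  # [1. 1. 1.]

range_steps = pd.RangeIndex(4)  # segmented systems use plain integer positions
# np.diff(range_steps) gives [1, 1, 1], but that "1" has no time unit attached,
# so a timestep_duration must be passed explicitly when timesteps are a RangeIndex.
print(np.diff(range_steps))  # [1 1 1]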

flixopt/io.py

Lines changed: 53 additions & 28 deletions
@@ -561,14 +561,18 @@ def save_dataset_to_netcdf(
     ds.attrs = {'attrs': json.dumps(ds.attrs)}
 
     # Convert all DataArray attrs to JSON strings
-    for var_name, data_var in ds.data_vars.items():
-        if data_var.attrs:  # Only if there are attrs
-            ds[var_name].attrs = {'attrs': json.dumps(data_var.attrs)}
+    # Use ds.variables to avoid slow _construct_dataarray calls
+    variables = ds.variables
+    for var_name in ds.data_vars:
+        var = variables[var_name]
+        if var.attrs:  # Only if there are attrs
+            var.attrs = {'attrs': json.dumps(var.attrs)}
 
     # Also handle coordinate attrs if they exist
-    for coord_name, coord_var in ds.coords.items():
-        if hasattr(coord_var, 'attrs') and coord_var.attrs:
-            ds[coord_name].attrs = {'attrs': json.dumps(coord_var.attrs)}
+    for coord_name in ds.coords:
+        var = variables[coord_name]
+        if var.attrs:
+            var.attrs = {'attrs': json.dumps(var.attrs)}
 
     # Suppress numpy binary compatibility warnings from netCDF4 (numpy 1->2 transition)
     with warnings.catch_warnings():
@@ -602,25 +606,38 @@ def _reduce_constant_arrays(ds: xr.Dataset) -> xr.Dataset:
         Dataset with constant dimensions reduced.
     """
     new_data_vars = {}
+    variables = ds.variables
+
+    for name in ds.data_vars:
+        var = variables[name]
+        dims = var.dims
+        data = var.values
 
-    for name, da in ds.data_vars.items():
-        if not da.dims or da.size == 0:
-            new_data_vars[name] = da
+        if not dims or data.size == 0:
+            new_data_vars[name] = var
             continue
 
-        # Try to reduce each dimension
-        reduced = da
-        for dim in list(da.dims):
-            if dim not in reduced.dims:
+        # Try to reduce each dimension using numpy operations
+        reduced_data = data
+        reduced_dims = list(dims)
+
+        for _axis, dim in enumerate(dims):
+            if dim not in reduced_dims:
                 continue  # Already removed
-            # Check if constant along this dimension
-            first_slice = reduced.isel({dim: 0})
-            is_constant = (reduced == first_slice).all()
+
+            current_axis = reduced_dims.index(dim)
+            # Check if constant along this axis using numpy
+            first_slice = np.take(reduced_data, 0, axis=current_axis)
+            # Broadcast first_slice to compare
+            expanded = np.expand_dims(first_slice, axis=current_axis)
+            is_constant = np.allclose(reduced_data, expanded, equal_nan=True)
+
             if is_constant:
                 # Remove this dimension by taking first slice
-                reduced = first_slice
+                reduced_data = first_slice
+                reduced_dims.pop(current_axis)
 
-        new_data_vars[name] = reduced
+        new_data_vars[name] = xr.Variable(tuple(reduced_dims), reduced_data, attrs=var.attrs)
 
     return xr.Dataset(new_data_vars, coords=ds.coords, attrs=ds.attrs)
 
@@ -754,14 +771,18 @@ def load_dataset_from_netcdf(path: str | pathlib.Path) -> xr.Dataset:
     ds.attrs = json.loads(ds.attrs['attrs'])
 
     # Restore DataArray attrs (before unstacking, as stacked vars have no individual attrs)
-    for var_name, data_var in ds.data_vars.items():
-        if 'attrs' in data_var.attrs:
-            ds[var_name].attrs = json.loads(data_var.attrs['attrs'])
+    # Use ds.variables to avoid slow _construct_dataarray calls
+    variables = ds.variables
+    for var_name in ds.data_vars:
+        var = variables[var_name]
+        if 'attrs' in var.attrs:
+            var.attrs = json.loads(var.attrs['attrs'])
 
     # Restore coordinate attrs
-    for coord_name, coord_var in ds.coords.items():
-        if hasattr(coord_var, 'attrs') and 'attrs' in coord_var.attrs:
-            ds[coord_name].attrs = json.loads(coord_var.attrs['attrs'])
+    for coord_name in ds.coords:
+        var = variables[coord_name]
+        if 'attrs' in var.attrs:
+            var.attrs = json.loads(var.attrs['attrs'])
 
     # Unstack variables if they were stacked during saving
     # Detection: check if any dataset dimension starts with '__stacked__'
@@ -1577,7 +1598,10 @@ def _fast_get_dataarray(ds: xr.Dataset, name: str, coord_cache: dict[str, xr.Dat
             Constructed DataArray
         """
         variable = ds.variables[name]
-        coords = {k: coord_cache[k] for k in variable.dims if k in coord_cache}
+        var_dims = set(variable.dims)
+        # Include coordinates whose dims are a subset of the variable's dims
+        # This preserves both dimension coordinates and auxiliary coordinates
+        coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)}
         return xr.DataArray(variable, coords=coords, name=name)
 
     @staticmethod
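
The subset test above matters when a coordinate spans only part of a variable's dimensions, i.e. an auxiliary coordinate. A sketch with a hypothetical auxiliary coordinate shows what a dims-only filter would drop:

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'demand': (('time', 'scenario'), np.random.rand(48, 2))},
    coords={
        'time': np.arange(48),
        'scenario': ['low', 'high'],
        'cluster_label': ('time', np.repeat([0, 1], 24)),  # auxiliary coordinate on 'time'
    },
)

coord_cache = {k: ds.coords[k] for k in ds.coords}
variable = ds.variables['demand']
var_dims = set(variable.dims)

# A dims-only filter would keep 'time' and 'scenario' but silently drop 'cluster_label';
# the subset test keeps every coordinate whose dims fit inside the variable's dims.
coords = {k: v for k, v in coord_cache.items() if set(v.dims).issubset(var_dims)}
da = xr.DataArray(variable, coords=coords, name='demand')
print(sorted(da.coords))  # ['cluster_label', 'scenario', 'time']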
@@ -1865,9 +1889,10 @@ def _add_clustering_to_dataset(
             clustering_ref, clustering_arrays = clustering._create_reference_structure(
                 include_original_data=include_original_data
             )
-            # Add clustering arrays with prefix
-            for name, arr in clustering_arrays.items():
-                ds[f'{cls.CLUSTERING_PREFIX}{name}'] = arr
+            # Add clustering arrays with prefix using batch assignment
+            # (individual ds[name] = arr assignments are slow)
+            prefixed_arrays = {f'{cls.CLUSTERING_PREFIX}{name}': arr for name, arr in clustering_arrays.items()}
+            ds = ds.assign(prefixed_arrays)
             ds.attrs['clustering'] = json.dumps(clustering_ref)
 
         return ds
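
A toy-scale sketch of the batch-assignment pattern (array count and names are illustrative): a single assign() call merges all new variables in one dataset update instead of re-running xarray's merge machinery per item.

import numpy as np
import xarray as xr

new_arrays = {
    f'clustering|array_{i}': xr.DataArray(np.random.rand(24), dims='time')
    for i in range(210)
}

ds = xr.Dataset()

# Slow: every item assignment re-runs xarray's dataset update/merge logic
# for name, arr in new_arrays.items():
#     ds[name] = arr

# Fast: one assign() call, one merge
ds = ds.assign(new_arrays)
print(len(ds.data_vars))  # 210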

flixopt/structure.py

Lines changed: 11 additions & 1 deletion
@@ -1116,7 +1116,17 @@ def from_dataset(cls, ds: xr.Dataset) -> Interface:
         reference_structure.pop('__class__', None)
 
         # Create arrays dictionary from dataset variables
-        arrays_dict = {name: array for name, array in ds.data_vars.items()}
+        # Use ds.variables with coord_cache for faster DataArray construction
+        variables = ds.variables
+        coord_cache = {k: ds.coords[k] for k in ds.coords}
+        arrays_dict = {
+            name: xr.DataArray(
+                variables[name],
+                coords={k: coord_cache[k] for k in variables[name].dims if k in coord_cache},
+                name=name,
+            )
+            for name in ds.data_vars
+        }
 
         # Resolve all references using the centralized method
         resolved_params = cls._resolve_reference_structure(reference_structure, arrays_dict)
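
A self-contained sketch of the coord_cache pattern on a toy dataset: fetch the coordinates once, then build each DataArray from its raw Variable rather than through ds[name]; the result matches what ds[name] would return, minus the construction overhead.

import numpy as np
import pandas as pd
import xarray as xr

ds = xr.Dataset(
    {f'param_{i}': ('time', np.random.rand(48)) for i in range(100)},
    coords={'time': pd.date_range('2024-01-01', periods=48, freq='h')},
)

variables = ds.variables
coord_cache = {k: ds.coords[k] for k in ds.coords}

arrays_dict = {
    name: xr.DataArray(
        variables[name],
        coords={k: coord_cache[k] for k in variables[name].dims if k in coord_cache},
        name=name,
    )
    for name in ds.data_vars
}

assert arrays_dict['param_0'].identical(ds['param_0'])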
