1111import pandas as pd
1212import pyarrow as pa
1313from dask .dataframe .dask_expr ._collection import new_collection
14+ from dask .dataframe .dask_expr ._expr import no_default as dsk_no_default
1415from nested_pandas .series .dtype import NestedDtype
1516from nested_pandas .series .packer import pack , pack_flat , pack_lists
1617from pandas ._libs import lib
@@ -731,7 +732,7 @@ def sort_values(
731732 meta = self ._meta ,
732733 )
733734
734- def reduce (self , func , * args , meta = None , ** kwargs ) -> NestedFrame :
735+ def reduce (self , func , * args , meta = dsk_no_default , infer_nesting = True , ** kwargs ) -> NestedFrame :
735736 """
736737 Takes a function and applies it to each top-level row of the NestedFrame.
737738
@@ -751,7 +752,15 @@ def reduce(self, func, *args, meta=None, **kwargs) -> NestedFrame:
751752 Positional arguments to pass to the function, the first *args should be the names of the
752753 columns to apply the function to.
753754 meta : dataframe or series-like, optional
754- The dask meta of the output.
755+ The dask meta of the output. If not provided, dask will try to
756+ infer the metadata. This may lead to unexpected results, so
757+ providing meta is recommended.
758+ infer_nesting : bool, default True
759+ If True, the function will pack output columns into nested
760+ structures based on column names adhering to a nested naming
761+ scheme. E.g. "nested.b" and "nested.c" will be packed into a column
762+ called "nested" with columns "b" and "c". If False, all outputs
763+ will be returned as base columns.
755764 kwargs : keyword arguments, optional
756765 Keyword arguments to pass to the function.
757766
@@ -773,6 +782,26 @@ def reduce(self, func, *args, meta=None, **kwargs) -> NestedFrame:
773782 >>> '''reduce will return a NestedFrame with two columns'''
774783 >>> return {"sum_col1": sum(col1), "sum_col2": sum(col2)}
775784
785+ When using nesting inference (infer_nesting=True), the output may
786+ contain nested columns. In such cases, the meta should be provided with
787+ the appropriate dtype for these columns. For example, the following
788+ function, which produces a nested column "lc":
789+
790+ >>> def complex_output(flux):
791+ >>> return {"max_flux": np.max(flux),
792+ >>> "lc.flux_quantiles": np.quantile(flux, [0.1, 0.2, 0.3, 0.4, 0.5]),
793+ >>> "lc.labels": [0.1, 0.2, 0.3, 0.4, 0.5]}
794+
795+ Would require the following meta:
796+
797+ >>> # create a NestedDtype for the nested column "lc"
798+ >>> from nested_pandas.series.dtype import NestedDtype
799+ >>> lc_dtype = NestedDtype(pa.struct([pa.field("flux_quantiles", pa.list_(pa.float64())),
800+ >>> pa.field("labels", pa.list_(pa.float64()))]))
801+ >>> # use the lc_dtype in meta creation
802+ >>> result_meta = npd.NestedFrame({'max_flux':pd.Series([], dtype='float'),
803+ >>> 'lc':pd.Series([], dtype=lc_dtype)})
804+
776805 """
777806
778807 # Handle meta shorthands to produce nestedframe output
@@ -787,7 +816,9 @@ def reduce(self, func, *args, meta=None, **kwargs) -> NestedFrame:
787816 # apply nested_pandas reduce via map_partitions
788817 # wrap the partition in a npd.NestedFrame call for:
789818 # https://github.com/lincc-frameworks/nested-dask/issues/21
790- return self .map_partitions (lambda x : npd .NestedFrame (x ).reduce (func , * args , ** kwargs ), meta = meta )
819+ return self .map_partitions (
820+ lambda x : npd .NestedFrame (x ).reduce (func , * args , infer_nesting = infer_nesting , ** kwargs ), meta = meta
821+ )
791822
792823 def to_parquet (self , path , by_layer = True , ** kwargs ) -> None :
793824 """Creates parquet file(s) with the data of a NestedFrame, either
0 commit comments