Closes #5175: overloads for in1d

1RyanK · 1RyanK · commit 3ba7c4267cae · 2026-01-27T07:08:15.000-05:00
diff --git a/arkouda/numpy/pdarraysetops.py b/arkouda/numpy/pdarraysetops.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Sequence, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Literal, Sequence, Tuple, TypeVar, Union, cast, overload
 
 import numpy as np
 
@@ -19,7 +19,7 @@
 
 
 if TYPE_CHECKING:
-    from arkouda.numpy.pdarraycreation import array, zeros, zeros_like
+    from arkouda.numpy.pdarraycreation import array, zeros_like
     from arkouda.numpy.strings import Strings
     from arkouda.pandas.categorical import Categorical
 else:
@@ -94,6 +94,7 @@ def _in1d_single(
     array([False True])
     """
     from arkouda.client import generic_msg
+    from arkouda.numpy.pdarraycreation import zeros
     from arkouda.numpy.strings import Strings
     from arkouda.pandas.categorical import Categorical as Categorical_
 
@@ -138,22 +139,46 @@ def _in1d_single(
         raise TypeError("Both pda1 and pda2 must be pdarray, Strings, or Categorical")
 
 
+@overload
+def in1d(
+    A: groupable,
+    B: groupable,
+    assume_unique: bool = ...,
+    symmetric: Literal[False] = ...,
+    invert: bool = ...,
+) -> pdarray: ...
+
+
+@overload
+def in1d(
+    A: groupable,
+    B: groupable,
+    assume_unique: bool = ...,
+    symmetric: Literal[True] = ...,
+    invert: bool = ...,
+) -> Tuple[pdarray, pdarray]: ...
+
+
 @typechecked
 def in1d(
     A: groupable,
     B: groupable,
     assume_unique: bool = False,
     symmetric: bool = False,
     invert: bool = False,
-) -> groupable:
+) -> Union[pdarray, Tuple[pdarray, pdarray]]:
     """
     Test whether each element of a 1-D array is also present in a second array.
 
-    Returns a boolean array the same length as `A` that is True
-    where an element of `A` is in `B` and False otherwise.
+    If ``symmetric=False`` (default), returns a boolean pdarray of the same
+    length as ``A`` indicating whether each element of ``A`` is in ``B``.
+
+    If ``symmetric=True``, returns a tuple ``(maskA, maskB)`` where:
+
+      * ``maskA[i]`` is True iff ``A[i]`` is in ``B``
+      * ``maskB[j]`` is True iff ``B[j]`` is in ``A``
 
-    Supports multi-level, i.e. test if rows of a are in the set of rows of b.
-    But note that multi-dimensional pdarrays are not supported.
+    If ``invert=True``, the returned mask(s) are logically inverted.
 
     Parameters
     ----------
@@ -223,7 +248,7 @@ def in1d(
             raise TypeError("If A is pdarray, B must also be pdarray")
         elif isinstance(B, (pdarray, Strings, Categorical_)):
             if symmetric:
-                return _in1d_single(A, B), _in1d_single(B, A, invert)
+                return _in1d_single(A, B, invert), _in1d_single(B, A, invert)
             return _in1d_single(A, B, invert)
         else:
             raise TypeError(
@@ -260,18 +285,25 @@ def in1d(
     if assume_unique:
         # Deinterleave truth into a and b domains
         if symmetric:
-            return truth[isa], truth[~isa] if not invert else ~truth[isa], ~truth[~isa]
+            aout = truth[isa]
+            bout = truth[~isa]
+            if invert:
+                return ~aout, ~bout
+            return aout, bout
         else:
-            return truth[isa] if not invert else ~truth[isa]
+            aout = truth[isa]
+            return ~aout if invert else aout
     else:
         # If didn't start unique, first need to deinterleave into ua domain,
         # then broadcast to a domain
         atruth = ag.broadcast(truth[isa], permute=True)
         if symmetric:
             btruth = bg.broadcast(truth[~isa], permute=True)
-            return atruth, btruth if not invert else ~atruth, ~btruth
+            if invert:
+                return ~atruth, ~btruth
+            return atruth, btruth
         else:
-            return atruth if not invert else ~atruth
+            return ~atruth if invert else atruth
 
 
 def in1dmulti(a, b, assume_unique=False, symmetric=False):
diff --git a/arkouda/numpy/util.py b/arkouda/numpy/util.py
@@ -991,6 +991,9 @@ def map(
     TypeError
         If `mapping` is not of type `dict` or `Series`.
         If `values` is not of type `pdarray`, `Categorical`, or `Strings`.
+    ValueError
+        If a mapping with tuple keys has inconsistent lengths, or if a MultiIndex
+        mapping has a different number of levels than the GroupBy keys.
 
     Examples
     --------
@@ -1012,29 +1015,97 @@ def map(
     from arkouda.numpy.pdarraysetops import in1d
     from arkouda.numpy.strings import Strings
     from arkouda.pandas.categorical import Categorical
+    from arkouda.pandas.index import MultiIndex
 
     keys = values
     gb = GroupBy(keys, dropna=False)
     gb_keys = gb.unique_keys
 
+    # helper: number of unique keys (works for single key or tuple-of-keys)
+    nuniq = gb_keys[0].size if isinstance(gb_keys, tuple) else gb_keys.size
+
+    # Fast-path: empty mapping => everything is missing
+    if (isinstance(mapping, dict) and len(mapping) == 0) or (
+        isinstance(mapping, Series) and len(mapping.index) == 0
+    ):
+        if not isinstance(values, (Strings, Categorical)):
+            fillvals = full(nuniq, np.nan, values.dtype)
+        else:
+            fillvals = full(nuniq, "null")
+        return broadcast(gb.segments, fillvals, permutation=gb.permutation)
+
     if isinstance(mapping, dict):
-        mapping = Series([array(list(mapping.keys())), array(list(mapping.values()))])
+        # Build mapping as a Series with an Index/MultiIndex (avoid rank>1 arrays)
+        m_keys = list(mapping.keys())
+        m_vals = list(mapping.values())
+
+        k0 = m_keys[0]
+        if isinstance(k0, tuple):
+            # validate tuple keys
+            if not all(isinstance(k, tuple) for k in m_keys):
+                raise TypeError("Mixed key types in mapping dict (tuple and non-tuple).")
+            n = len(k0)
+            if not all(len(k) == n for k in m_keys):
+                raise ValueError("All tuple keys in mapping dict must have the same length.")
+
+            cols = list(zip(*m_keys))  # transpose list[tuple] -> list[level]
+            idx = MultiIndex([array(col) for col in cols])
+            mapping = Series(array(m_vals), index=idx)
+        else:
+            mapping = Series(array(m_vals), index=array(m_keys))
 
     if isinstance(mapping, Series):
-        xtra_keys = gb_keys[in1d(gb_keys, mapping.index.values, invert=True)]
+        # Normalize mapping index keys into a "groupable" (single array OR tuple-of-arrays)
+        mindex = mapping.index
+        if isinstance(mindex, MultiIndex):
+            mkeys = tuple(mindex.index)
+        else:
+            mkeys = mindex.values
 
-        if xtra_keys.size > 0:
-            if not isinstance(mapping.values, (Strings, Categorical)):
-                nans = full(xtra_keys.size, np.nan, mapping.values.dtype)
-            else:
-                nans = full(xtra_keys.size, "null")
+        if isinstance(gb_keys, tuple) and isinstance(mkeys, tuple):
+            if len(gb_keys) != len(mkeys):
+                raise ValueError(
+                    f"Mapping MultiIndex has {len(mkeys)} levels but GroupBy has {len(gb_keys)} keys"
+                )
+
+        mask = in1d(gb_keys, mkeys, invert=True)
+
+        # Compute extra keys + extra size without mixing tuple/non-tuple assignments
+        if isinstance(gb_keys, tuple):
+            xtra_keys_t = tuple(k[mask] for k in gb_keys)
+            xtra_size = xtra_keys_t[0].size if len(xtra_keys_t) > 0 else 0
+
+            if xtra_size > 0:
+                if not isinstance(mapping.values, (Strings, Categorical)):
+                    nans = full(xtra_size, np.nan, mapping.values.dtype)
+                else:
+                    nans = full(xtra_size, "null")
+
+                # Convert any categorical levels to strings, level-by-level
+                xtra_keys_t = tuple(
+                    k.to_strings() if isinstance(k, Categorical) else k for k in xtra_keys_t
+                )
+
+                xtra_series = Series(nans, index=MultiIndex(list(xtra_keys_t)))
+                mapping = Series.concat([mapping, xtra_series])
+
+        else:
+            xtra_keys_s = gb_keys[mask]
+            xtra_size = xtra_keys_s.size
+
+            if xtra_size > 0:
+                if not isinstance(mapping.values, (Strings, Categorical)):
+                    nans = full(xtra_size, np.nan, mapping.values.dtype)
+                else:
+                    nans = full(xtra_size, "null")
 
-            if isinstance(xtra_keys, Categorical):
-                xtra_keys = xtra_keys.to_strings()
+                if isinstance(xtra_keys_s, Categorical):
+                    xtra_keys_s = xtra_keys_s.to_strings()
 
-            xtra_series = Series(nans, index=xtra_keys)
-            mapping = Series.concat([mapping, xtra_series])
+                xtra_series = Series(nans, index=xtra_keys_s)
+                mapping = Series.concat([mapping, xtra_series])
 
+        # Align mapping to gb_keys
         if isinstance(gb_keys, Categorical):
             mapping = mapping[gb_keys.to_strings()]
         else:
diff --git a/arkouda/pandas/groupbyclass.py b/arkouda/pandas/groupbyclass.py
@@ -86,6 +86,7 @@
 
 groupable_element_type = Union[pdarray, Strings, "Categorical"]
 groupable = Union[groupable_element_type, Sequence[groupable_element_type]]
+
 # Note: we won't be typechecking GroupBy until we can figure out a way to handle
 # the circular import with Categorical
 
diff --git a/arkouda/pandas/join.py b/arkouda/pandas/join.py
@@ -18,7 +18,7 @@
 from arkouda.numpy.pdarrayclass import create_pdarray, pdarray
 from arkouda.numpy.pdarraysetops import concatenate, in1d
 from arkouda.pandas.categorical import Categorical
-from arkouda.pandas.groupbyclass import GroupBy, broadcast
+from arkouda.pandas.groupbyclass import GroupBy, broadcast, groupable_element_type
 
 
 if TYPE_CHECKING:
@@ -198,8 +198,8 @@ def compute_join_size(a: pdarray, b: pdarray) -> Tuple[int, int]:
     ua, asize = bya.size()
     byb = GroupBy(b)
     ub, bsize = byb.size()
-    afact = asize[in1d(ua, ub)]
-    bfact = bsize[in1d(ub, ua)]
+    afact = asize[in1d(cast(groupable_element_type, ua), cast(groupable_element_type, ub))]
+    bfact = bsize[in1d(cast(groupable_element_type, ub), cast(groupable_element_type, ua))]
     nelem = (afact * bfact).sum()
     nbytes = 3 * 8 * nelem
     return nelem, nbytes
diff --git a/arkouda/pandas/series.py b/arkouda/pandas/series.py
@@ -19,7 +19,7 @@
 from arkouda.numpy.pdarrayclass import RegistrationError, any, argmaxk, create_pdarray, pdarray
 from arkouda.numpy.pdarraysetops import argsort, concatenate, in1d, indexof1d
 from arkouda.numpy.util import get_callback, is_float
-from arkouda.pandas.groupbyclass import GroupBy, groupable_element_type
+from arkouda.pandas.groupbyclass import GroupBy, groupable, groupable_element_type
 from arkouda.pandas.index import Index, MultiIndex
 
 
@@ -429,6 +429,7 @@ def __setitem__(
         """
         from arkouda.numpy.pdarraycreation import array
         from arkouda.numpy.strings import Strings
+        from arkouda.pandas.categorical import Categorical
 
         val = self.validate_val(val)
         key = self.validate_key(key)
@@ -440,7 +441,23 @@ def __setitem__(
         if is_supported_scalar(key):
             indices = self.index == key
         else:
-            indices = in1d(self.index.values, key)
+            # mypy: key may be scalar/SegArray/etc, but in1d only accepts groupables
+            if not isinstance(key, (pdarray, Strings, Categorical, list, tuple)):
+                raise TypeError(f"Unsupported key type for membership test: {type(key)}")
+
+            # validate_key converts Python lists/tuples in many paths; handle any that remain here.
+            # A tuple sized to the MultiIndex levels uses lookup; a leftover list becomes a pdarray.
+            if (
+                isinstance(self.index, MultiIndex)
+                and isinstance(key, tuple)
+                and len(key) == self.index.nlevels
+            ):
+                indices = self.index.lookup(key)  # returns boolean mask
+            else:
+                if isinstance(key, list):
+                    key = array(key)
+                indices = in1d(self.index.values, cast(groupable, key))
+
         tf, counts = GroupBy(indices).size()
         update_count = counts[1] if len(counts) == 2 else 0
         if update_count == 0:
@@ -614,10 +631,28 @@ def isin(self, lst: Union[pdarray, Strings, List]) -> Series:
             and False otherwise.
 
         """
+        from arkouda.numpy.pdarraycreation import array
+        from arkouda.numpy.strings import Strings
+        from arkouda.pandas.categorical import Categorical
+
         if isinstance(lst, list):
             lst = array(lst)
 
-        boolean = in1d(self.values, lst)
+        # mypy: lst/self.values can be a wider union (SegArray/Any) at type level.
+        # At runtime, in1d only supports pdarray/Strings/Categorical (or sequences of those).
+        if not isinstance(self.values, (pdarray, Strings, Categorical)):
+            raise TypeError(f"in1d not supported for Series values type: {type(self.values)}")
+
+        if not isinstance(lst, (pdarray, Strings, Categorical, list, tuple)):
+            raise TypeError(f"in1d not supported for list type: {type(lst)}")
+
+        if isinstance(lst, (list, tuple)):
+            lst = array(lst)
+
+        boolean = in1d(
+            cast(groupable_element_type, self.values),
+            cast(groupable_element_type, lst),
+        )
         return Series(data=boolean, index=self.index)
 
     @typechecked
diff --git a/tests/numpy/setops_test.py b/tests/numpy/setops_test.py
@@ -238,6 +238,43 @@ def test_in1d_multiarray_categorical(self, size):
         stringsTwo = ak.Categorical(ak.array(["String {}".format(i % 2) for i in range(10)]))
         assert [(x % 3) < 2 for x in range(10)] == ak.in1d(stringsOne, stringsTwo).tolist()
 
+    @pytest.mark.requires_chapel_module("In1dMsg")
+    def test_in1d_symmetric(self):
+        # Duplicates to exercise assume_unique=False (GroupBy/broadcast path)
+        a = ak.array([1, 2, 2, 3, 4])
+        b = ak.array([2, 4, 4, 5])
+
+        def exp_in(x, y):
+            yset = set(y)
+            return [xi in yset for xi in x]
+
+        a_list = a.to_ndarray().tolist()
+        b_list = b.to_ndarray().tolist()
+
+        # assume_unique=False path (duplicates allowed; should match membership semantics)
+        am2, bm2 = ak.in1d(a, b, assume_unique=False, symmetric=True, invert=False)
+        assert am2.tolist() == exp_in(a_list, b_list)
+        assert bm2.tolist() == exp_in(b_list, a_list)
+
+        am2_i, bm2_i = ak.in1d(a, b, assume_unique=False, symmetric=True, invert=True)
+        assert am2_i.tolist() == [not v for v in exp_in(a_list, b_list)]
+        assert bm2_i.tolist() == [not v for v in exp_in(b_list, a_list)]
+
+        # assume_unique=True path (inputs must be unique for this branch to be valid)
+        au = ak.array([1, 2, 3, 4])
+        bu = ak.array([2, 4, 5])
+
+        au_list = au.to_ndarray().tolist()
+        bu_list = bu.to_ndarray().tolist()
+
+        am, bm = ak.in1d(au, bu, assume_unique=True, symmetric=True, invert=False)
+        assert am.tolist() == exp_in(au_list, bu_list)
+        assert bm.tolist() == exp_in(bu_list, au_list)
+
+        am_i, bm_i = ak.in1d(au, bu, assume_unique=True, symmetric=True, invert=True)
+        assert am_i.tolist() == [not v for v in exp_in(au_list, bu_list)]
+        assert bm_i.tolist() == [not v for v in exp_in(bu_list, au_list)]
+
     @pytest.mark.parametrize("size", pytest.prob_size)
     @pytest.mark.parametrize("dtype", INTEGRAL_TYPES)
     def test_intersect1d_multiarray_numeric_types(self, size, dtype):
diff --git a/tests/pandas/series_test.py b/tests/pandas/series_test.py