Skip to content

Commit 3d0ef27

Browse files
committed
Closes #5272: alignment tests for arkouda.numpy.pdarraysetops
1 parent 8a95e0c commit 3d0ef27

3 files changed

Lines changed: 336 additions & 3 deletions

File tree

arkouda/numpy/pdarraysetops.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020

2121
if TYPE_CHECKING:
22-
from arkouda.numpy.pdarraycreation import array, zeros, zeros_like
22+
from arkouda.numpy.pdarraycreation import array, zeros_like
2323
from arkouda.numpy.strings import Strings
2424
from arkouda.pandas.categorical import Categorical
2525
else:
@@ -94,17 +94,18 @@ def _in1d_single(
9494
array([False True])
9595
"""
9696
from arkouda.client import generic_msg
97+
from arkouda.numpy.pdarraycreation import zeros as ak_zeros
9798
from arkouda.numpy.strings import Strings
9899
from arkouda.pandas.categorical import Categorical as Categorical_
99100

100101
if isinstance(pda1, pdarray) or isinstance(pda1, Strings) or isinstance(pda1, Categorical_):
101102
# While isinstance(thing, type) can be called on a tuple of types,
102103
# this causes an issue with mypy for unknown reasons.
103104
if pda1.size == 0:
104-
return zeros(0, dtype=akbool)
105+
return ak_zeros(0, dtype=akbool)
105106
if isinstance(pda2, pdarray) or isinstance(pda2, Strings) or isinstance(pda2, Categorical_):
106107
if pda2.size == 0:
107-
return zeros(pda1.size, dtype=akbool)
108+
return ak_zeros(pda1.size, dtype=akbool)
108109
if hasattr(pda1, "categories"):
109110
x = cast(Categorical_, pda1).in1d(pda2)
110111
return x if not invert else ~x

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ testpaths =
5252
tests/numpy/err_test.py
5353
tests/numpy/manipulation_functions_test.py
5454
tests/numpy/alignment_verification/operators_alignment.py
55+
tests/numpy/alignment_verification/pdarraysetops_alignment.py
5556
tests/numpy/numeric_test.py
5657
tests/numpy/numpy_test.py
5758
tests/numpy/pdarrayclass_test.py
Lines changed: 331 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,331 @@
1+
import numpy as np
2+
import pytest
3+
4+
import arkouda as ak
5+
6+
7+
def _as_np(a):
8+
"""Convert arkouda pdarray/Strings to numpy ndarray."""
9+
# pdarray.to_ndarray exists; Strings.to_ndarray exists too in arkouda
10+
return a.to_ndarray()
11+
12+
13+
def _np_struct_from_cols(cols: list[np.ndarray]) -> np.ndarray:
14+
"""
15+
Build a NumPy structured array representing "rows" from multiple 1D columns.
16+
This lets us use np.union1d/intersect1d/setdiff1d/setxor1d on rows.
17+
"""
18+
assert len(cols) >= 1
19+
n = len(cols[0])
20+
for c in cols[1:]:
21+
assert len(c) == n
22+
23+
dtype = [(f"f{i}", cols[i].dtype) for i in range(len(cols))]
24+
out = np.empty(n, dtype=dtype)
25+
for i, c in enumerate(cols):
26+
out[f"f{i}"] = c
27+
return out
28+
29+
30+
def _np_setop_rows(op, A_cols, B_cols):
    """
    NumPy reference for a multi-column set operation.

    Each side's columns are packed into a structured array (one element per
    row), ``op`` is applied to the two packed arrays, and the result is sorted
    and split back into columns.

    op: one of np.union1d, np.intersect1d, np.setdiff1d, np.setxor1d
    Returns a list with one numpy array per column of ``A_cols``.
    """
    left_rows = _np_struct_from_cols(A_cols)
    right_rows = _np_struct_from_cols(B_cols)

    # Explicit sort keeps the reference in a deterministic row order,
    # mirroring arkouda's "sorted unique" intent.
    ordered = np.sort(op(left_rows, right_rows))

    # Unpack field f{i} back into column i.
    return [ordered[f"f{i}"] for i in range(len(A_cols))]
46+
47+
48+
@pytest.mark.requires_chapel_module("In1dMsg")
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
@pytest.mark.parametrize("n", [0, 1, 2, 10, 100])
def test_in1d_matches_numpy(dtype, n):
    """Compare ak.in1d (plain and inverted) with NumPy membership testing.

    ``a`` has ``n`` values and ``b`` has ``max(n // 2, 1)`` values, all drawn
    from [0, 20) with a fixed seed. The NumPy reference uses ``np.isin``
    because ``np.in1d`` is deprecated since NumPy 2.0; for 1-D inputs the two
    give the same result.
    """
    rng = np.random.default_rng(12345)
    a_np = rng.integers(0, 20, size=n, dtype=np.int64)
    b_np = rng.integers(0, 20, size=max(n // 2, 1), dtype=np.int64)

    # Values are non-negative, so the uint64 cast keeps them unchanged.
    if dtype == ak.uint64:
        a_np = a_np.astype(np.uint64, copy=False)
        b_np = b_np.astype(np.uint64, copy=False)

    a = ak.array(a_np)
    b = ak.array(b_np)

    got = ak.in1d(a, b)
    exp = np.isin(a_np, b_np, assume_unique=False, invert=False)

    assert np.array_equal(_as_np(got), exp)

    got_inv = ak.in1d(a, b, invert=True)
    exp_inv = np.isin(a_np, b_np, assume_unique=False, invert=True)
    assert np.array_equal(_as_np(got_inv), exp_inv)
72+
73+
74+
@pytest.mark.requires_chapel_module("In1dMsg")
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
def test_in1d_symmetric_matches_numpy(dtype):
    """Check ak.in1d(..., symmetric=True) against two NumPy membership tests.

    The symmetric call returns a pair; the first element is compared with
    "a in b" and the second with "b in a". The NumPy reference uses
    ``np.isin`` because ``np.in1d`` is deprecated since NumPy 2.0.
    """
    rng = np.random.default_rng(2468)
    a_np = rng.integers(0, 30, size=50, dtype=np.int64)
    b_np = rng.integers(0, 30, size=40, dtype=np.int64)
    if dtype == ak.uint64:
        a_np = a_np.astype(np.uint64, copy=False)
        b_np = b_np.astype(np.uint64, copy=False)

    a = ak.array(a_np)
    b = ak.array(b_np)

    got_a, got_b = ak.in1d(a, b, symmetric=True)
    exp_a = np.isin(a_np, b_np)
    exp_b = np.isin(b_np, a_np)
    assert np.array_equal(_as_np(got_a), exp_a)
    assert np.array_equal(_as_np(got_b), exp_b)
92+
93+
94+
@pytest.mark.requires_chapel_module("In1dMsg")
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
def test_in1d_assume_unique_raises_when_not_unique(dtype):
    """ak.in1d on sequences of arrays must raise NonUniqueError for duplicate input.

    The inputs are wrapped in one-element lists on purpose: per the original
    author's note, the single-pdarray path goes through _in1d_single, which
    does not validate uniqueness, while the multi-array path does when
    assume_unique=True.
    """
    from arkouda.numpy.alignment import NonUniqueError

    np_dtype = np.uint64 if dtype == ak.uint64 else np.int64

    # The first array deliberately repeats the value 1.
    with_duplicates = ak.array(np.array([1, 1, 2, 3], dtype=np_dtype))
    other = ak.array(np.array([1, 2, 4], dtype=np_dtype))

    with pytest.raises(NonUniqueError):
        ak.in1d([with_duplicates], [other], assume_unique=True)
117+
118+
119+
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
@pytest.mark.parametrize("n1,n2", [(0, 0), (0, 10), (10, 0), (10, 10), (50, 40)])
def test_union1d_matches_numpy(dtype, n1, n2):
    """ak.union1d on single arrays must equal np.union1d for the same data.

    Cases where exactly one input is empty are marked xfail at runtime; the
    message records the known bug and its issue number.
    """
    left_only_empty = n1 == 0 and n2 > 0
    right_only_empty = n2 == 0 and n1 > 0
    if left_only_empty or right_only_empty:
        pytest.xfail(
            "Known bug: ak.union1d returns non-unique/unsorted when one input is empty; "
            "should match np.union1d (sorted unique). Issue #5273."
        )

    rng = np.random.default_rng(999)
    a_np = rng.integers(0, 25, size=n1, dtype=np.int64)
    b_np = rng.integers(0, 25, size=n2, dtype=np.int64)
    if dtype == ak.uint64:
        a_np, b_np = (arr.astype(np.uint64, copy=False) for arr in (a_np, b_np))

    result = ak.union1d(ak.array(a_np), ak.array(b_np))
    reference = np.union1d(a_np, b_np)
    assert np.array_equal(_as_np(result), reference)
141+
142+
143+
@pytest.mark.xfail(
    reason="Known bug: ak.union1d returns non-unique/unsorted when one input is empty; "
    "should match np.union1d (sorted unique). Issue #5273.",
    strict=False,
)
def test_union1d_empty_left_matches_numpy():
    """Fixed-data reproduction of the empty-left-input union1d case.

    The right-hand array is unsorted and contains repeated values, so a result
    equal to np.union1d must be both sorted and de-duplicated. Marked as a
    non-strict xfail, so the test is allowed to pass once the bug is fixed.
    """
    empty_np = np.array([], dtype=np.int64)
    b_np = np.array([20, 19, 4, 4, 4, 17, 2, 18, 3, 4], dtype=np.int64)
    got = ak.union1d(ak.array(empty_np), ak.array(b_np))
    assert np.array_equal(_as_np(got), np.union1d(empty_np, b_np))
152+
153+
154+
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
@pytest.mark.parametrize("assume_unique", [False, True])
def test_intersect1d_matches_numpy(dtype, assume_unique):
    """ak.intersect1d must equal np.intersect1d with the same assume_unique flag.

    When assume_unique is True, both inputs are de-duplicated with np.unique
    first so that the flag's precondition holds for both libraries.
    """
    rng = np.random.default_rng(2024)
    a_np = rng.integers(0, 40, size=100, dtype=np.int64)
    b_np = rng.integers(0, 40, size=80, dtype=np.int64)
    if dtype == ak.uint64:
        a_np = a_np.astype(np.uint64, copy=False)
        b_np = b_np.astype(np.uint64, copy=False)

    # Pick the operands once; both libraries then see exactly the same data.
    if assume_unique:
        a_in, b_in = np.unique(a_np), np.unique(b_np)
    else:
        a_in, b_in = a_np, b_np

    a_ak = ak.array(a_in)
    b_ak = ak.array(b_in)

    got = ak.intersect1d(a_ak, b_ak, assume_unique=assume_unique)
    exp = np.intersect1d(a_in, b_in, assume_unique=assume_unique)

    assert np.array_equal(_as_np(got), exp)
181+
182+
183+
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
@pytest.mark.parametrize("assume_unique", [False, True])
def test_setdiff1d_matches_numpy(dtype, assume_unique):
    """ak.setdiff1d (a minus b) must equal np.setdiff1d with the same flag.

    When assume_unique is True, both inputs are de-duplicated with np.unique
    before being handed to either library.
    """
    rng = np.random.default_rng(777)
    a_np = rng.integers(0, 50, size=120, dtype=np.int64)
    b_np = rng.integers(0, 50, size=70, dtype=np.int64)

    if dtype == ak.uint64:
        a_np = a_np.astype(np.uint64, copy=False)
        b_np = b_np.astype(np.uint64, copy=False)

    if assume_unique:
        a_in, b_in = np.unique(a_np), np.unique(b_np)
    else:
        a_in, b_in = a_np, b_np

    got = ak.setdiff1d(ak.array(a_in), ak.array(b_in), assume_unique=assume_unique)
    exp = np.setdiff1d(a_in, b_in, assume_unique=assume_unique)

    assert np.array_equal(_as_np(got), exp)
205+
206+
207+
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
@pytest.mark.parametrize("assume_unique", [False, True])
def test_setxor1d_matches_numpy(dtype, assume_unique):
    """ak.setxor1d must equal np.setxor1d with the same assume_unique flag.

    When assume_unique is True, both inputs are de-duplicated with np.unique
    before being handed to either library.
    """
    rng = np.random.default_rng(31415)
    a_np = rng.integers(0, 60, size=100, dtype=np.int64)
    b_np = rng.integers(0, 60, size=90, dtype=np.int64)
    if dtype == ak.uint64:
        a_np = a_np.astype(np.uint64, copy=False)
        b_np = b_np.astype(np.uint64, copy=False)

    if assume_unique:
        a_in, b_in = np.unique(a_np), np.unique(b_np)
    else:
        a_in, b_in = a_np, b_np

    got = ak.setxor1d(ak.array(a_in), ak.array(b_in), assume_unique=assume_unique)
    exp = np.setxor1d(a_in, b_in, assume_unique=assume_unique)

    assert np.array_equal(_as_np(got), exp)
228+
229+
230+
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
def test_concatenate_ordered_matches_numpy(dtype):
    """ak.concatenate(..., ordered=True) must equal np.concatenate element for element.

    The pieces have sizes 0, 5, 1 and 10, so an empty piece and a
    single-element piece are both included.
    """
    rng = np.random.default_rng(123)
    as_unsigned = dtype == ak.uint64

    pieces = []
    for size in (0, 5, 1, 10):
        piece = rng.integers(0, 100, size=size, dtype=np.int64)
        pieces.append(piece.astype(np.uint64, copy=False) if as_unsigned else piece)

    joined = ak.concatenate([ak.array(piece) for piece in pieces], ordered=True)
    reference = np.concatenate(pieces, axis=0)

    assert np.array_equal(_as_np(joined), reference)
242+
243+
244+
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
def test_concatenate_unordered_is_multiset_equal(dtype):
    """ak.concatenate(..., ordered=False) must hold the same values as np.concatenate.

    Element order is not checked: both sides are sorted before comparison,
    which makes this a multiset equality test.
    """
    rng = np.random.default_rng(456)
    as_unsigned = dtype == ak.uint64

    pieces = []
    for size in (3, 7, 0, 9):
        piece = rng.integers(0, 50, size=size, dtype=np.int64)
        pieces.append(piece.astype(np.uint64, copy=False) if as_unsigned else piece)

    joined = ak.concatenate([ak.array(piece) for piece in pieces], ordered=False)
    reference = np.concatenate(pieces, axis=0)

    # The unordered result may interleave the pieces, so compare sorted copies.
    got_sorted = np.sort(_as_np(joined))
    exp_sorted = np.sort(reference)
    assert np.array_equal(got_sorted, exp_sorted)
257+
258+
259+
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
def test_multiarray_union_intersect_setdiff_setxor_align(dtype):
    """Multi-array (row-wise) set operations must match a NumPy structured-array reference.

    Two-column "rows" are built for each side. Each of union1d, intersect1d,
    setdiff1d and setxor1d is run on the sequences of arkouda arrays and
    compared column by column with the output of ``_np_setop_rows``.
    """
    rng = np.random.default_rng(8888)

    # 2-column "rows": 60 rows on the A side, 55 rows on the B side.
    n1, n2 = 60, 55
    a_cols = [rng.integers(0, 20, size=n1, dtype=np.int64) for _ in range(2)]
    b_cols = [rng.integers(0, 20, size=n2, dtype=np.int64) for _ in range(2)]

    if dtype == ak.uint64:
        a_cols = [col.astype(np.uint64, copy=False) for col in a_cols]
        b_cols = [col.astype(np.uint64, copy=False) for col in b_cols]

    A = [ak.array(col) for col in a_cols]
    B = [ak.array(col) for col in b_cols]

    # (arkouda op, numpy reference op, extra keyword arguments for the arkouda call)
    cases = (
        (ak.union1d, np.union1d, {}),
        (ak.intersect1d, np.intersect1d, {"assume_unique": False}),
        (ak.setdiff1d, np.setdiff1d, {"assume_unique": False}),
        (ak.setxor1d, np.setxor1d, {"assume_unique": False}),
    )
    for ak_op, np_op, extra in cases:
        got = ak_op(A, B, **extra)
        exp = _np_setop_rows(np_op, a_cols, b_cols)
        assert np.array_equal(_as_np(got[0]), exp[0])
        assert np.array_equal(_as_np(got[1]), exp[1])
299+
300+
301+
@pytest.mark.parametrize("dtype", [ak.int64, ak.uint64])
def test_indexof1d_all_occurrences_remove_missing(dtype):
    """ak.indexof1d must list, per query value, every matching index in ``space``.

    The reference walks the query in order and appends the indices of all
    occurrences of each value in ``space``; query values with no occurrence
    contribute nothing.
    """
    rng = np.random.default_rng(13579)
    space_np = rng.integers(0, 10, size=50, dtype=np.int64)
    query_np = rng.integers(0, 10, size=20, dtype=np.int64)

    # Shift the query out of the space's value range so most entries are missing ...
    query_np = (query_np + 50).astype(np.int64)

    # ... then overwrite the first five with values known to be present.
    query_np[:5] = space_np[:5]

    if dtype == ak.uint64:
        space_np = space_np.astype(np.uint64, copy=False)
        query_np = query_np.astype(np.uint64, copy=False)

    got_np = _as_np(ak.indexof1d(ak.array(query_np), ak.array(space_np)))

    # Flatten all hit positions in query order; a query without hits adds nothing.
    expected_positions = [
        pos for value in query_np for pos in np.nonzero(space_np == value)[0].tolist()
    ]
    exp = np.array(expected_positions, dtype=np.int64)

    assert np.array_equal(got_np, exp)

0 commit comments

Comments
 (0)