From 5740546dd8dada20d353c4d2f1e6700bddbdf240 Mon Sep 17 00:00:00 2001 From: Gabriel Silva Simoes Date: Sat, 10 Jan 2026 18:50:20 +0000 Subject: [PATCH 1/3] GH-40053: [C++][Python] Preserve dict key order when inferring struct type When converting Python dictionaries to PyArrow arrays, struct fields were previously sorted alphabetically due to the use of std::map. This change preserves the original dictionary key insertion order, which is the expected behavior since Python 3.7+ guarantees dict ordering. The fix replaces std::map with a vector + unordered_map combination, following the same pattern used in Arrow's JSON parser. This maintains O(1) lookup performance while preserving insertion order. --- python/pyarrow/src/arrow/python/inference.cc | 28 ++++++++++------- python/pyarrow/tests/test_convert_builtin.py | 33 ++++++++++++++++---- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index e5714862e41..0298636634e 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -22,8 +22,8 @@ #include #include -#include #include +#include #include #include @@ -704,15 +704,19 @@ class TypeInferrer { Py_TYPE(key_obj)->tp_name, "'"); } // Get or create visitor for this key - auto it = struct_inferrers_.find(key); - if (it == struct_inferrers_.end()) { - it = struct_inferrers_ - .insert( - std::make_pair(key, TypeInferrer(pandas_null_sentinels_, - validate_interval_, make_unions_))) - .first; + TypeInferrer* visitor; + auto it = struct_field_index_.find(key); + if (it == struct_field_index_.end()) { + // New field - add to vector and index + size_t new_index = struct_inferrers_.size(); + struct_inferrers_.emplace_back( + key, TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); + struct_field_index_.emplace(key, new_index); + visitor = &struct_inferrers_.back().second; + } else { + // Existing field - retrieve from vector + visitor = &struct_inferrers_[it->second].second; } - TypeInferrer* visitor = &it->second; // We ignore termination signals from child visitors for now // @@ -730,7 +734,8 @@ class TypeInferrer { Status GetStructType(std::shared_ptr* out) { std::vector> fields; - for (auto&& it : struct_inferrers_) { + fields.reserve(struct_inferrers_.size()); + for (auto& it : struct_inferrers_) { std::shared_ptr field_type; RETURN_NOT_OK(it.second.GetType(&field_type)); fields.emplace_back(field(it.first, field_type)); @@ -762,7 +767,8 @@ class TypeInferrer { int64_t numpy_dtype_count_; int64_t interval_count_; std::unique_ptr list_inferrer_; - std::map struct_inferrers_; + std::vector> struct_inferrers_; + std::unordered_map struct_field_index_; std::shared_ptr scalar_type_; // If we observe a strongly-typed value in e.g. a NumPy array, we can store diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index f1461a302db..a61f3ef0149 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -71,10 +71,10 @@ def __int__(self): def check_struct_type(ty, expected): """ - Check a struct type is as expected, but not taking order into account. + Check a struct type is as expected, including field order. """ assert pa.types.is_struct(ty) - assert set(ty) == set(expected) + assert list(ty) == list(expected) def test_iterable_types(): @@ -2014,21 +2014,25 @@ def test_struct_from_dicts_inference(): assert arr.to_pylist() == data # With omitted values + # Field order is determined by first occurrence: a, c from first dict, then b from fourth data = [{'a': 5, 'c': True}, None, {}, {'a': None, 'b': 'bar'}] - expected = [{'a': 5, 'b': None, 'c': True}, + expected_type_omitted = pa.struct([pa.field('a', pa.int64()), + pa.field('c', pa.bool_()), + pa.field('b', pa.string())]) + expected = [{'a': 5, 'c': True, 'b': None}, None, - {'a': None, 'b': None, 'c': None}, - {'a': None, 'b': 'bar', 'c': None}] + {'a': None, 'c': None, 'b': None}, + {'a': None, 'c': None, 'b': 'bar'}] arr = pa.array(data) data_as_ndarray = np.empty(len(data), dtype=object) data_as_ndarray[:] = data arr2 = pa.array(data) - check_struct_type(arr.type, expected_type) + check_struct_type(arr.type, expected_type_omitted) assert arr.to_pylist() == expected assert arr.equals(arr2) @@ -2054,6 +2058,23 @@ def test_struct_from_dicts_inference(): pa.array([1, {'a': 2}]) +def test_struct_from_dicts_field_order(): + # GH-40053: Struct fields should preserve dictionary key insertion order + data = [{'b': 2, 'a': 1}, {'b': 4, 'a': 3}] + arr = pa.array(data) + expected_type = pa.struct([('b', pa.int64()), ('a', pa.int64())]) + assert arr.type == expected_type + assert arr.to_pylist() == data + + # Nested structs also preserve order + data = [{'b': {'y': 1, 'x': 2}, 'a': 3}] + arr = pa.array(data) + expected_type = pa.struct([('b', pa.struct([('y', pa.int64()), + ('x', pa.int64())])), + ('a', pa.int64())]) + assert arr.type == expected_type + + def test_structarray_from_arrays_coerce(): # ARROW-1706 ints = [None, 2, 3] From 2509788afdcd31ead0fe165c4297dff4e1e9120f Mon Sep 17 00:00:00 2001 From: Gabriel Silva Simoes Date: Mon, 12 Jan 2026 20:29:32 +0000 Subject: [PATCH 2/3] Address PR review --- python/pyarrow/src/arrow/python/inference.cc | 2 +- python/pyarrow/tests/test_convert_builtin.py | 32 +++----------------- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index 0298636634e..06cb4694831 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -711,7 +711,7 @@ class TypeInferrer { size_t new_index = struct_inferrers_.size(); struct_inferrers_.emplace_back( key, TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); - struct_field_index_.emplace(key, new_index); + struct_field_index_.emplace(std::move(key), new_index); visitor = &struct_inferrers_.back().second; } else { // Existing field - retrieve from vector diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index a61f3ef0149..6af0e5f9187 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -69,14 +69,6 @@ def __int__(self): 1/0 # MARKER -def check_struct_type(ty, expected): - """ - Check a struct type is as expected, including field order. - """ - assert pa.types.is_struct(ty) - assert list(ty) == list(expected) - - def test_iterable_types(): arr1 = pa.array(StrangeIterable([0, 1, 2, 3])) arr2 = pa.array((0, 1, 2, 3)) @@ -2010,11 +2002,11 @@ def test_struct_from_dicts_inference(): {'a': 6, 'b': 'bar', 'c': False}] arr = pa.array(data) - check_struct_type(arr.type, expected_type) + assert arr.type == expected_type assert arr.to_pylist() == data # With omitted values - # Field order is determined by first occurrence: a, c from first dict, then b from fourth + # GH-40053: Field order is determined by first occurrence: a, c from first dict, then b from fourth data = [{'a': 5, 'c': True}, None, {}, @@ -2032,7 +2024,7 @@ def test_struct_from_dicts_inference(): data_as_ndarray[:] = data arr2 = pa.array(data) - check_struct_type(arr.type, expected_type_omitted) + assert arr.type == expected_type_omitted assert arr.to_pylist() == expected assert arr.equals(arr2) @@ -2046,6 +2038,7 @@ def test_struct_from_dicts_inference(): {'a': None, 'b': 'bar'}] arr = pa.array(data) + assert arr.type == expected_type assert arr.to_pylist() == data # Edge cases @@ -2058,23 +2051,6 @@ def test_struct_from_dicts_inference(): pa.array([1, {'a': 2}]) -def test_struct_from_dicts_field_order(): - # GH-40053: Struct fields should preserve dictionary key insertion order - data = [{'b': 2, 'a': 1}, {'b': 4, 'a': 3}] - arr = pa.array(data) - expected_type = pa.struct([('b', pa.int64()), ('a', pa.int64())]) - assert arr.type == expected_type - assert arr.to_pylist() == data - - # Nested structs also preserve order - data = [{'b': {'y': 1, 'x': 2}, 'a': 3}] - arr = pa.array(data) - expected_type = pa.struct([('b', pa.struct([('y', pa.int64()), - ('x', pa.int64())])), - ('a', pa.int64())]) - assert arr.type == expected_type - - def test_structarray_from_arrays_coerce(): # ARROW-1706 ints = [None, 2, 3] From 3f3ca3374cb2c24ed671963c8f0bab2ec88b4143 Mon Sep 17 00:00:00 2001 From: Gabriel Silva Simoes Date: Mon, 12 Jan 2026 20:44:51 +0000 Subject: [PATCH 3/3] Address CI failures --- python/pyarrow/table.pxi | 28 ++++++++++---------- python/pyarrow/tests/test_convert_builtin.py | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9136f252980..8e258e38afe 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3581,9 +3581,9 @@ cdef class RecordBatch(_Tabular): >>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'}, ... {'year': 2022, 'n_legs': 4}]) >>> pa.RecordBatch.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 + n_legs animals year + 0 2 Parrot NaN + 1 4 None 2022.0 """ cdef: shared_ptr[CRecordBatch] c_record_batch @@ -4468,18 +4468,18 @@ cdef class Table(_Tabular): ... names = ["a", "month"]) >>> table pyarrow.Table - a: struct - child 0, animals: string - child 1, n_legs: int64 + a: struct + child 0, n_legs: int64 + child 1, animals: string child 2, year: int64 month: int64 ---- a: [ -- is_valid: all not null - -- child 0 type: string - ["Parrot",null] - -- child 1 type: int64 + -- child 0 type: int64 [2,4] + -- child 1 type: string + ["Parrot",null] -- child 2 type: int64 [null,2022]] month: [[4,6]] @@ -4488,13 +4488,13 @@ cdef class Table(_Tabular): >>> table.flatten() pyarrow.Table - a.animals: string a.n_legs: int64 + a.animals: string a.year: int64 month: int64 ---- - a.animals: [["Parrot",null]] a.n_legs: [[2,4]] + a.animals: [["Parrot",null]] a.year: [[null,2022]] month: [[4,6]] """ @@ -4936,9 +4936,9 @@ cdef class Table(_Tabular): >>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'}, ... {'year': 2022, 'n_legs': 4}]) >>> pa.Table.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 + n_legs animals year + 0 2 Parrot NaN + 1 4 None 2022.0 """ if isinstance(struct_array, Array): return Table.from_batches([RecordBatch.from_struct_array(struct_array)]) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 6af0e5f9187..c10ae0f62b4 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -2006,7 +2006,7 @@ def test_struct_from_dicts_inference(): assert arr.to_pylist() == data # With omitted values - # GH-40053: Field order is determined by first occurrence: a, c from first dict, then b from fourth + # GH-40053: Field order follows first occurrence (a, c, then b) data = [{'a': 5, 'c': True}, None, {},