diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index e5714862e41..06cb4694831 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -22,8 +22,8 @@ #include <algorithm> #include <limits> -#include <map> #include <string> +#include <unordered_map> #include <utility> #include <vector> @@ -704,15 +704,19 @@ class TypeInferrer { Py_TYPE(key_obj)->tp_name, "'"); } // Get or create visitor for this key - auto it = struct_inferrers_.find(key); - if (it == struct_inferrers_.end()) { - it = struct_inferrers_ - .insert( - std::make_pair(key, TypeInferrer(pandas_null_sentinels_, - validate_interval_, make_unions_))) - .first; + TypeInferrer* visitor; + auto it = struct_field_index_.find(key); + if (it == struct_field_index_.end()) { + // New field - add to vector and index + size_t new_index = struct_inferrers_.size(); + struct_inferrers_.emplace_back( + key, TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); + struct_field_index_.emplace(std::move(key), new_index); + visitor = &struct_inferrers_.back().second; + } else { + // Existing field - retrieve from vector + visitor = &struct_inferrers_[it->second].second; } - TypeInferrer* visitor = &it->second; // We ignore termination signals from child visitors for now // @@ -730,7 +734,8 @@ class TypeInferrer { Status GetStructType(std::shared_ptr<DataType>* out) { std::vector<std::shared_ptr<Field>> fields; - for (auto&& it : struct_inferrers_) { + fields.reserve(struct_inferrers_.size()); + for (auto& it : struct_inferrers_) { std::shared_ptr<DataType> field_type; RETURN_NOT_OK(it.second.GetType(&field_type)); fields.emplace_back(field(it.first, field_type)); @@ -762,7 +767,8 @@ class TypeInferrer { int64_t numpy_dtype_count_; int64_t interval_count_; std::unique_ptr<TypeInferrer> list_inferrer_; - std::map<std::string, TypeInferrer> struct_inferrers_; + std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_; + std::unordered_map<std::string, size_t> struct_field_index_; std::shared_ptr<DataType> scalar_type_; // If we observe a strongly-typed value in e.g. 
a NumPy array, we can store diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9136f252980..8e258e38afe 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3581,9 +3581,9 @@ cdef class RecordBatch(_Tabular): >>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'}, ... {'year': 2022, 'n_legs': 4}]) >>> pa.RecordBatch.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 + n_legs animals year + 0 2 Parrot NaN + 1 4 None 2022.0 """ cdef: shared_ptr[CRecordBatch] c_record_batch @@ -4468,18 +4468,18 @@ cdef class Table(_Tabular): ... names = ["a", "month"]) >>> table pyarrow.Table - a: struct<animals: string, n_legs: int64, year: int64> - child 0, animals: string - child 1, n_legs: int64 + a: struct<n_legs: int64, animals: string, year: int64> + child 0, n_legs: int64 + child 1, animals: string child 2, year: int64 month: int64 ---- a: [ -- is_valid: all not null - -- child 0 type: string - ["Parrot",null] - -- child 1 type: int64 + -- child 0 type: int64 [2,4] + -- child 1 type: string + ["Parrot",null] -- child 2 type: int64 [null,2022]] month: [[4,6]] @@ -4488,13 +4488,13 @@ cdef class Table(_Tabular): >>> table.flatten() pyarrow.Table - a.animals: string a.n_legs: int64 + a.animals: string a.year: int64 month: int64 ---- - a.animals: [["Parrot",null]] a.n_legs: [[2,4]] + a.animals: [["Parrot",null]] a.year: [[null,2022]] month: [[4,6]] """ @@ -4936,9 +4936,9 @@ cdef class Table(_Tabular): >>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'}, ... 
{'year': 2022, 'n_legs': 4}]) >>> pa.Table.from_struct_array(struct).to_pandas() - animals n_legs year - 0 Parrot 2 NaN - 1 None 4 2022.0 + n_legs animals year + 0 2 Parrot NaN + 1 4 None 2022.0 """ if isinstance(struct_array, Array): return Table.from_batches([RecordBatch.from_struct_array(struct_array)]) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index f1461a302db..c10ae0f62b4 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -69,14 +69,6 @@ def __int__(self): 1/0 # MARKER -def check_struct_type(ty, expected): - """ - Check a struct type is as expected, but not taking order into account. - """ - assert pa.types.is_struct(ty) - assert set(ty) == set(expected) - - def test_iterable_types(): arr1 = pa.array(StrangeIterable([0, 1, 2, 3])) arr2 = pa.array((0, 1, 2, 3)) @@ -2010,25 +2002,29 @@ def test_struct_from_dicts_inference(): {'a': 6, 'b': 'bar', 'c': False}] arr = pa.array(data) - check_struct_type(arr.type, expected_type) + assert arr.type == expected_type assert arr.to_pylist() == data # With omitted values + # GH-40053: Field order follows first occurrence (a, c, then b) data = [{'a': 5, 'c': True}, None, {}, {'a': None, 'b': 'bar'}] - expected = [{'a': 5, 'b': None, 'c': True}, + expected_type_omitted = pa.struct([pa.field('a', pa.int64()), + pa.field('c', pa.bool_()), + pa.field('b', pa.string())]) + expected = [{'a': 5, 'c': True, 'b': None}, None, - {'a': None, 'b': None, 'c': None}, - {'a': None, 'b': 'bar', 'c': None}] + {'a': None, 'c': None, 'b': None}, + {'a': None, 'c': None, 'b': 'bar'}] arr = pa.array(data) data_as_ndarray = np.empty(len(data), dtype=object) data_as_ndarray[:] = data arr2 = pa.array(data) - check_struct_type(arr.type, expected_type) + assert arr.type == expected_type_omitted assert arr.to_pylist() == expected assert arr.equals(arr2) @@ -2042,6 +2038,7 @@ def test_struct_from_dicts_inference(): {'a': None, 
'b': 'bar'}] arr = pa.array(data) + assert arr.type == expected_type assert arr.to_pylist() == data # Edge cases