Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions python/pyarrow/src/arrow/python/inference.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@

#include <algorithm>
#include <limits>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -704,15 +704,19 @@ class TypeInferrer {
Py_TYPE(key_obj)->tp_name, "'");
}
// Get or create visitor for this key
auto it = struct_inferrers_.find(key);
if (it == struct_inferrers_.end()) {
it = struct_inferrers_
.insert(
std::make_pair(key, TypeInferrer(pandas_null_sentinels_,
validate_interval_, make_unions_)))
.first;
TypeInferrer* visitor;
auto it = struct_field_index_.find(key);
if (it == struct_field_index_.end()) {
// New field - add to vector and index
size_t new_index = struct_inferrers_.size();
struct_inferrers_.emplace_back(
key, TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
struct_field_index_.emplace(key, new_index);
Comment thread
gabrielsimoes marked this conversation as resolved.
Outdated
visitor = &struct_inferrers_.back().second;
} else {
// Existing field - retrieve from vector
visitor = &struct_inferrers_[it->second].second;
}
TypeInferrer* visitor = &it->second;

// We ignore termination signals from child visitors for now
//
Expand All @@ -730,7 +734,8 @@ class TypeInferrer {

Status GetStructType(std::shared_ptr<DataType>* out) {
std::vector<std::shared_ptr<Field>> fields;
for (auto&& it : struct_inferrers_) {
fields.reserve(struct_inferrers_.size());
for (auto& it : struct_inferrers_) {
std::shared_ptr<DataType> field_type;
RETURN_NOT_OK(it.second.GetType(&field_type));
fields.emplace_back(field(it.first, field_type));
Expand Down Expand Up @@ -762,7 +767,8 @@ class TypeInferrer {
int64_t numpy_dtype_count_;
int64_t interval_count_;
std::unique_ptr<TypeInferrer> list_inferrer_;
std::map<std::string, TypeInferrer> struct_inferrers_;
std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_;
std::unordered_map<std::string, size_t> struct_field_index_;
std::shared_ptr<DataType> scalar_type_;

// If we observe a strongly-typed value in e.g. a NumPy array, we can store
Expand Down
33 changes: 27 additions & 6 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ def __int__(self):

def check_struct_type(ty, expected):
    """
    Check a struct type is as expected, including field order.

    Both `ty` and `expected` are iterated and the resulting sequences are
    compared element by element, so they must match pairwise and in the
    same order.
    """
    assert pa.types.is_struct(ty)
    # A list comparison is order-sensitive and already implies the
    # order-insensitive set comparison, so a separate set() check is redundant.
    assert list(ty) == list(expected)


def test_iterable_types():
Expand Down Expand Up @@ -2014,21 +2014,25 @@ def test_struct_from_dicts_inference():
assert arr.to_pylist() == data

# With omitted values
# Field order is determined by first occurrence: a, c from first dict, then b from fourth
data = [{'a': 5, 'c': True},
None,
{},
{'a': None, 'b': 'bar'}]
expected = [{'a': 5, 'b': None, 'c': True},
expected_type_omitted = pa.struct([pa.field('a', pa.int64()),
pa.field('c', pa.bool_()),
pa.field('b', pa.string())])
expected = [{'a': 5, 'c': True, 'b': None},
None,
{'a': None, 'b': None, 'c': None},
{'a': None, 'b': 'bar', 'c': None}]
{'a': None, 'c': None, 'b': None},
{'a': None, 'c': None, 'b': 'bar'}]

arr = pa.array(data)
data_as_ndarray = np.empty(len(data), dtype=object)
data_as_ndarray[:] = data
arr2 = pa.array(data)

check_struct_type(arr.type, expected_type)
check_struct_type(arr.type, expected_type_omitted)
Comment thread
gabrielsimoes marked this conversation as resolved.
Outdated
assert arr.to_pylist() == expected
assert arr.equals(arr2)

Expand All @@ -2054,6 +2058,23 @@ def test_struct_from_dicts_inference():
pa.array([1, {'a': 2}])


def test_struct_from_dicts_field_order():
    # GH-40053: Struct fields should preserve dictionary key insertion order
    # Keys are deliberately given in non-alphabetical order ('b' before 'a')
    # so that a sorted field order would be detected as a failure.
    data = [{'b': 2, 'a': 1}, {'b': 4, 'a': 3}]
    arr = pa.array(data)
    expected_type = pa.struct([('b', pa.int64()), ('a', pa.int64())])
    assert arr.type == expected_type
    assert arr.to_pylist() == data

    # Nested structs also preserve order
    data = [{'b': {'y': 1, 'x': 2}, 'a': 3}]
    arr = pa.array(data)
    expected_type = pa.struct([('b', pa.struct([('y', pa.int64()),
                                                ('x', pa.int64())])),
                               ('a', pa.int64())])
    assert arr.type == expected_type
    # Values must round-trip for the nested case as well
    assert arr.to_pylist() == data


def test_structarray_from_arrays_coerce():
# ARROW-1706
ints = [None, 2, 3]
Expand Down
Loading