28 changes: 17 additions & 11 deletions python/pyarrow/src/arrow/python/inference.cc
@@ -22,8 +22,8 @@

#include <algorithm>
#include <limits>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

@@ -704,15 +704,19 @@ class TypeInferrer {
Py_TYPE(key_obj)->tp_name, "'");
}
// Get or create visitor for this key
auto it = struct_inferrers_.find(key);
if (it == struct_inferrers_.end()) {
it = struct_inferrers_
.insert(
std::make_pair(key, TypeInferrer(pandas_null_sentinels_,
validate_interval_, make_unions_)))
.first;
TypeInferrer* visitor;
auto it = struct_field_index_.find(key);
if (it == struct_field_index_.end()) {
// New field - add to vector and index
size_t new_index = struct_inferrers_.size();
struct_inferrers_.emplace_back(
key, TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
struct_field_index_.emplace(std::move(key), new_index);
visitor = &struct_inferrers_.back().second;
} else {
// Existing field - retrieve from vector
visitor = &struct_inferrers_[it->second].second;
}
TypeInferrer* visitor = &it->second;

// We ignore termination signals from child visitors for now
//
@@ -730,7 +734,8 @@

Status GetStructType(std::shared_ptr<DataType>* out) {
std::vector<std::shared_ptr<Field>> fields;
for (auto&& it : struct_inferrers_) {
fields.reserve(struct_inferrers_.size());
for (auto& it : struct_inferrers_) {
std::shared_ptr<DataType> field_type;
RETURN_NOT_OK(it.second.GetType(&field_type));
fields.emplace_back(field(it.first, field_type));
@@ -762,7 +767,8 @@
int64_t numpy_dtype_count_;
int64_t interval_count_;
std::unique_ptr<TypeInferrer> list_inferrer_;
std::map<std::string, TypeInferrer> struct_inferrers_;
std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_;
std::unordered_map<std::string, size_t> struct_field_index_;
std::shared_ptr<DataType> scalar_type_;

// If we observe a strongly-typed value in e.g. a NumPy array, we can store
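The C++ change above swaps the sorted std::map for a vector of (name, inferrer) pairs plus an unordered_map from field name to vector index, so struct fields are kept in first-seen order while still allowing lookup by name. A minimal Python sketch of the same insertion-ordered pattern (illustrative only; the class and method names are hypothetical, not part of the patch):

# Hypothetical sketch of the list + name->index pattern used in TypeInferrer;
# it is not part of the patch, only an illustration of the data-structure change.

class StructFieldCollector:
    def __init__(self):
        self._fields = []   # (name, values) pairs in first-seen order
        self._index = {}    # field name -> position in self._fields

    def visit(self, name, value):
        pos = self._index.get(name)
        if pos is None:
            # New field: append to the ordered list and record its index.
            self._index[name] = len(self._fields)
            self._fields.append((name, [value]))
        else:
            # Existing field: look it up by index and update in place.
            self._fields[pos][1].append(value)

    def field_order(self):
        return [name for name, _ in self._fields]

collector = StructFieldCollector()
for record in [{'a': 5, 'c': True}, {'a': None, 'b': 'bar'}]:
    for key, value in record.items():
        collector.visit(key, value)

print(collector.field_order())  # ['a', 'c', 'b']: first-occurrence order, not sorted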
28 changes: 14 additions & 14 deletions python/pyarrow/table.pxi
@@ -3581,9 +3581,9 @@ cdef class RecordBatch(_Tabular):
>>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
... {'year': 2022, 'n_legs': 4}])
>>> pa.RecordBatch.from_struct_array(struct).to_pandas()
animals n_legs year
0 Parrot 2 NaN
1 None 4 2022.0
n_legs animals year
0 2 Parrot NaN
1 4 None 2022.0
"""
cdef:
shared_ptr[CRecordBatch] c_record_batch
@@ -4468,18 +4468,18 @@ cdef class Table(_Tabular):
... names = ["a", "month"])
>>> table
pyarrow.Table
a: struct<animals: string, n_legs: int64, year: int64>
child 0, animals: string
child 1, n_legs: int64
a: struct<n_legs: int64, animals: string, year: int64>
child 0, n_legs: int64
child 1, animals: string
child 2, year: int64
month: int64
----
a: [
-- is_valid: all not null
-- child 0 type: string
["Parrot",null]
-- child 1 type: int64
-- child 0 type: int64
[2,4]
-- child 1 type: string
["Parrot",null]
-- child 2 type: int64
[null,2022]]
month: [[4,6]]
@@ -4488,13 +4488,13 @@

>>> table.flatten()
pyarrow.Table
a.animals: string
a.n_legs: int64
a.animals: string
a.year: int64
month: int64
----
a.animals: [["Parrot",null]]
a.n_legs: [[2,4]]
a.animals: [["Parrot",null]]
a.year: [[null,2022]]
month: [[4,6]]
"""
@@ -4936,9 +4936,9 @@ cdef class Table(_Tabular):
>>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
... {'year': 2022, 'n_legs': 4}])
>>> pa.Table.from_struct_array(struct).to_pandas()
animals n_legs year
0 Parrot 2 NaN
1 None 4 2022.0
n_legs animals year
0 2 Parrot NaN
1 4 None 2022.0
"""
if isinstance(struct_array, Array):
return Table.from_batches([RecordBatch.from_struct_array(struct_array)])
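The docstring updates in table.pxi reflect the user-visible effect: struct fields inferred from Python dicts now appear in first-occurrence order rather than alphabetical order. A quick check consistent with the updated examples (output transcribed from the docstrings above):

import pyarrow as pa

struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'},
                   {'year': 2022, 'n_legs': 4}])
print(struct.type)
# struct<n_legs: int64, animals: string, year: int64>
# 'n_legs' and 'animals' come first because they appear in the first dict;
# 'year' is appended when it is first seen in the second dict.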
23 changes: 10 additions & 13 deletions python/pyarrow/tests/test_convert_builtin.py
@@ -69,14 +69,6 @@ def __int__(self):
1/0 # MARKER


def check_struct_type(ty, expected):
"""
Check a struct type is as expected, but not taking order into account.
"""
assert pa.types.is_struct(ty)
assert set(ty) == set(expected)


def test_iterable_types():
arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
arr2 = pa.array((0, 1, 2, 3))
@@ -2010,25 +2002,29 @@ def test_struct_from_dicts_inference():
{'a': 6, 'b': 'bar', 'c': False}]

arr = pa.array(data)
check_struct_type(arr.type, expected_type)
assert arr.type == expected_type
assert arr.to_pylist() == data

# With omitted values
# GH-40053: Field order follows first occurrence (a, c, then b)
data = [{'a': 5, 'c': True},
None,
{},
{'a': None, 'b': 'bar'}]
expected = [{'a': 5, 'b': None, 'c': True},
expected_type_omitted = pa.struct([pa.field('a', pa.int64()),
pa.field('c', pa.bool_()),
pa.field('b', pa.string())])
expected = [{'a': 5, 'c': True, 'b': None},
None,
{'a': None, 'b': None, 'c': None},
{'a': None, 'b': 'bar', 'c': None}]
{'a': None, 'c': None, 'b': None},
{'a': None, 'c': None, 'b': 'bar'}]

arr = pa.array(data)
data_as_ndarray = np.empty(len(data), dtype=object)
data_as_ndarray[:] = data
arr2 = pa.array(data)

check_struct_type(arr.type, expected_type)
assert arr.type == expected_type_omitted
assert arr.to_pylist() == expected
assert arr.equals(arr2)

@@ -2042,6 +2038,7 @@
{'a': None, 'b': 'bar'}]
arr = pa.array(data)

assert arr.type == expected_type
assert arr.to_pylist() == data

# Edge cases
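The updated test pins the exact inferred type, including the omitted-values case where the order follows first occurrence across records (a, c, then b). The sketch below restates that check outside the test file, to show why the removed set-based check_struct_type helper could not catch ordering regressions (assumed to run against a build containing this change):

import pyarrow as pa

data = [{'a': 5, 'c': True}, None, {}, {'a': None, 'b': 'bar'}]
arr = pa.array(data)

# Order-sensitive comparison: 'a' and 'c' come from the first dict,
# 'b' is appended when it first appears in the last dict.
expected_type = pa.struct([pa.field('a', pa.int64()),
                           pa.field('c', pa.bool_()),
                           pa.field('b', pa.string())])
assert arr.type == expected_type

# The old helper only compared field sets, so any permutation of the
# fields would have passed:
assert set(arr.type) == set(expected_type)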