fix(arrow-array): disallow creating MapArray with nullable key field#10272
fix(arrow-array): disallow creating MapArray with nullable key field#10272rluvaton wants to merge 10 commits into
MapArray with nullable key field#10272Conversation
Jefffrey
left a comment
There was a problem hiding this comment.
there's also another path through arraydata
#[test]
fn test123() {
let keys = Arc::new(Int32Array::from(vec![Some(1), None])) as ArrayRef;
let values = Arc::new(StringArray::from(vec!["a", "b"]));
let struct_array = StructArray::from(vec![
(Arc::new(Field::new("key", DataType::Int32, true)), keys),
(Arc::new(Field::new("values", DataType::Utf8, true)), values),
]);
let a = ArrayData::builder(DataType::Map(
Field::new("entries", struct_array.data_type().clone(), false).into(),
false,
))
.len(1)
.add_buffer(
OffsetBuffer::<i32>::from_lengths(vec![2])
.into_inner()
.into(),
)
.add_child_data(struct_array.into_data())
.build()
.unwrap();
let a = MapArray::from(a);
dbg!(a);
}- currently succeeds in building a map array
|
|
||
| // The Arrow spec requires the "key" field to be non-nullable | ||
| // <https://github.com/apache/arrow/blob/98347d233f03bcf4d116d77f1769d498902b1fc8/format/Schema.fbs#L138> | ||
| if entries.fields()[0].is_nullable() { |
There was a problem hiding this comment.
Should we check the data type? Or maybe just the actual contents of the array (null_count() > 0)?
There was a problem hiding this comment.
I worry that the data_type is not always set super precisely (as in there may be some code that claims the key is "nullable" for convenience, but in reality the key is not nullable)
🤔
There was a problem hiding this comment.
i feel its better to be strict/explicit in this case
for reference, in the builder we also check nullability already:
arrow-rs/arrow-array/src/builder/map_builder.rs
Lines 243 to 247 in b1de629
There was a problem hiding this comment.
We already check that entries data type equal to field data type
Which struct is not nullable so it should enforce the no nulls
This is why I did not add it
|
Thank you for this PR @rluvaton |
|
@Jefffrey When looking at the code looks like there are a lot of missing validations, added them as well with tests |
|
This contain another fix that we panic instead of returning an error on |
| @@ -2954,13 +2954,13 @@ mod tests { | |||
| let bin_view_array = Arc::new(BinaryViewArray::from_iter(bin_values)); | |||
| let utf8_view_array = Arc::new(StringViewArray::from_iter(utf8_values)); | |||
|
|
|||
| let key_dict_keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]); | |||
| let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 2, 0, 2, 3]); | |||
| let key_dict_array = DictionaryArray::new(key_dict_keys, utf8_view_array.clone()); | |||
| #[allow(deprecated)] | |||
| let keys_field = Arc::new(Field::new_dict( | |||
| "keys", | |||
| DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8View)), | |||
| true, | |||
| false, | |||
| 1, | |||
| false, | |||
| )); | |||
There was a problem hiding this comment.
both tests failed,
the pr that disallowed nullable entries keys kept the nullable key even if not allowed on purpose
but I looked at the original pr that added that test and it did not seem to be on purpose that the keys have nulls:
so I modified the test
| // Test Map<Null, Null> - both keys and values are Null type | ||
| #[test] | ||
| fn test_map_null_keys_and_null_values() { | ||
| let null_keys = Arc::new(NullArray::new(3)) as ArrayRef; | ||
| let null_values = Arc::new(NullArray::new(3)) as ArrayRef; | ||
|
|
||
| let offsets = OffsetBuffer::new(vec![0, 1, 1, 3].into()); | ||
| let entries_fields = vec![ | ||
| Arc::new(Field::new("keys", DataType::Null, true)), | ||
| Arc::new(Field::new("values", DataType::Null, true)), | ||
| ]; | ||
| let struct_field = Arc::new(Field::new( | ||
| "entries", | ||
| DataType::Struct(entries_fields.clone().into()), | ||
| false, | ||
| )); | ||
| let entries = StructArray::new(entries_fields.into(), vec![null_keys, null_values], None); | ||
|
|
||
| let map: ArrayRef = Arc::new(MapArray::new(struct_field, offsets, entries, None, false)); | ||
|
|
||
| let converter = RowConverter::new(vec![SortField::new(map.data_type().clone())]).unwrap(); | ||
| let rows = converter.convert_columns(&[Arc::clone(&map)]).unwrap(); | ||
| let back = converter.convert_rows(&rows).unwrap(); | ||
| assert_eq!(back.len(), 1); | ||
| back[0].to_data().validate_full().unwrap(); | ||
| assert_eq!(&map, &back[0]); | ||
| } |
There was a problem hiding this comment.
The test I created here is invalid since keys cant be null.
did not modify to only have null values, since the next test (Test Map<Utf8, Null> all empty maps) check that already
…e_validate` feature is on (#10282) # Which issue does this PR close? N/A # Rationale for this change the caller of `try_new` is responsible for handling errors, so it should not panic even when `force_validate` feature is on. the added benefit is that validation is not called twice on `try_new` when `force_validate` feature is on # What changes are included in this PR? use builder validation call # Are these changes tested? yes # Are there any user-facing changes? error instead of panic ----- Found while testing: - #10272
# Conflicts: # arrow-data/src/data.rs
|
@Jefffrey can you please merge? |
Which issue does this PR close?
MapArray::try_newallows to create nullable keys even though the spec disallow it #10268Rationale for this change
we should be unable to create invalid MapArray according to the spec
What changes are included in this PR?
added validation that key field is not nullable
Are these changes tested?
yes
Are there any user-facing changes?
yes, it will now fail for nullable key field