@@ -265,6 +265,14 @@ pub(crate) fn merge_dictionary_values<K: ArrowDictionaryKeyType>(
265265 let mut indices = Vec :: with_capacity ( num_values) ;
266266
267267 // Compute the mapping for each dictionary
268+ //
269+ // Mathematical invariant for dictionary keys:
270+ // For key type K::Native, the maximum number of distinct values is (K::Native::MAX as usize) + 1
271+ // - u8: valid keys 0..=255, max cardinality = 256
272+ // - u16: valid keys 0..=65535, max cardinality = 65,536
273+ //
274+ // The insertion condition is: indices.len() <= K::Native::MAX as usize
275+ // Insertion must fail when: indices.len() > K::Native::MAX as usize
268276 let key_mappings = dictionaries
269277 . iter ( )
270278 . enumerate ( )
@@ -275,12 +283,18 @@ pub(crate) fn merge_dictionary_values<K: ArrowDictionaryKeyType>(
275283
276284 for ( value_idx, value) in values {
277285 mapping[ value_idx] =
278- * interner. intern ( value, || match K :: Native :: from_usize ( indices. len ( ) ) {
279- Some ( idx) => {
280- indices. push ( ( dictionary_idx, value_idx) ) ;
281- Ok ( idx)
282- }
283- None => Err ( ArrowError :: DictionaryKeyOverflowError ) ,
286+ * interner. intern ( value, || -> Result < K :: Native , ArrowError > {
287+ let next_idx = indices. len ( ) ;
288+
289+ // Explicit boundary check: ensure the next index can be represented by the key type
290+ // This check happens BEFORE pushing, allowing the full valid range:
291+ // - For u8: indices 0..=255 (256 total values) are valid
292+ // - For u16: indices 0..=65535 (65,536 total values) are valid
293+ let key = K :: Native :: from_usize ( next_idx)
294+ . ok_or_else ( || ArrowError :: DictionaryKeyOverflowError ) ?;
295+
296+ indices. push ( ( dictionary_idx, value_idx) ) ;
297+ Ok ( key)
284298 } ) ?;
285299 }
286300 Ok ( mapping)
@@ -378,7 +392,11 @@ mod tests {
378392 use arrow_array:: cast:: as_string_array;
379393 use arrow_array:: types:: Int8Type ;
380394 use arrow_array:: types:: Int32Type ;
381- use arrow_array:: { DictionaryArray , Int8Array , Int32Array , StringArray } ;
395+ use arrow_array:: types:: UInt8Type ;
396+ use arrow_array:: types:: UInt16Type ;
397+ use arrow_array:: {
398+ DictionaryArray , Int8Array , Int32Array , StringArray , UInt8Array , UInt16Array ,
399+ } ;
382400 use arrow_buffer:: { BooleanBuffer , Buffer , NullBuffer , OffsetBuffer } ;
383401 use std:: sync:: Arc ;
384402
@@ -527,4 +545,109 @@ mod tests {
527545 let expected = StringArray :: from ( vec ! [ "b" ] ) ;
528546 assert_eq ! ( merged. values. as_ref( ) , & expected) ;
529547 }
548+
#[test]
fn test_merge_u8_boundary_256_values() {
    // Boundary case: a u8 key can take the values 0..=255, so a dictionary with
    // 256 distinct entries is the largest one that must still merge successfully.
    let labels: Vec<String> = (0..256).map(|n| format!("v{}", n)).collect();
    // One key per value, covering every representable u8 exactly once.
    let key_values: Vec<u8> = (0..=u8::MAX).collect();
    let dict = DictionaryArray::<UInt8Type>::try_new(
        UInt8Array::from(key_values),
        Arc::new(StringArray::from(labels)),
    )
    .unwrap();

    let merged = merge_dictionary_values(&[&dict], None).unwrap();

    // Every input value survives the merge ...
    assert_eq!(
        merged.values.len(),
        256,
        "Should support exactly 256 values for u8"
    );
    // ... and the single input dictionary gets one mapping entry per value.
    assert_eq!(merged.key_mappings.len(), 1);
    assert_eq!(merged.key_mappings[0].len(), 256);
}
566+
#[test]
fn test_merge_u8_overflow_257_values() {
    // Two dictionaries with no value in common that together hold 257 distinct
    // strings: one more than a u8 key can address, so the merge must be rejected.

    // First dictionary: "a0".."a127" (128 values), each referenced by one key.
    let values1 = StringArray::from((0..128).map(|i| format!("a{}", i)).collect::<Vec<_>>());
    let keys1 = UInt8Array::from((0..128u8).collect::<Vec<_>>());
    let dict1 = DictionaryArray::<UInt8Type>::try_new(keys1, Arc::new(values1)).unwrap();

    // Second dictionary: "b0".."b128" (129 values), each referenced by one key.
    let values2 = StringArray::from((0..129).map(|i| format!("b{}", i)).collect::<Vec<_>>());
    let keys2 = UInt8Array::from((0..129u8).collect::<Vec<_>>());
    let dict2 = DictionaryArray::<UInt8Type>::try_new(keys2, Arc::new(values2)).unwrap();

    let result = merge_dictionary_values(&[&dict1, &dict2], None);

    // A single pattern check verifies both that the merge failed and that it
    // failed with the overflow error rather than some unrelated one.
    assert!(
        matches!(result, Err(ArrowError::DictionaryKeyOverflowError)),
        "Should fail with 257 distinct values for u8"
    );
}
588+
#[test]
fn test_merge_u8_with_overlap() {
    // Values shared between dictionaries must not be counted twice, otherwise
    // this merge would report a false overflow:
    //   dict1: val0..val149   (150 values)
    //   dict2: val100..val249 (150 values, val100..val149 shared with dict1)
    // Distinct strings: 150 + 100 = 250, which fits in the u8 key range.
    // Note: Interner is best-effort, so actual count may be slightly higher due to
    // hash collisions; hence the merged size is bounded rather than pinned to 250.
    let values1 = StringArray::from((0..150).map(|i| format!("val{}", i)).collect::<Vec<_>>());
    let keys1 = UInt8Array::from((0..150u8).collect::<Vec<_>>());
    let dict1 = DictionaryArray::<UInt8Type>::try_new(keys1, Arc::new(values1)).unwrap();

    // Second dict: val100..val249 (overlaps on val100..val149, adds val150..val249)
    let values2 =
        StringArray::from((100..250).map(|i| format!("val{}", i)).collect::<Vec<_>>());
    let keys2 = UInt8Array::from((0..150u8).collect::<Vec<_>>());
    let dict2 = DictionaryArray::<UInt8Type>::try_new(keys2, Arc::new(values2)).unwrap();

    // `expect` reports the underlying error if the merge unexpectedly fails.
    let merged = merge_dictionary_values(&[&dict1, &dict2], None)
        .expect("Should succeed with ~250 distinct values (within u8 range)");

    let merged_len = merged.values.len();
    // Lower bound: all 250 distinct input strings are referenced by a key, so
    // none of them may be missing from the merged values.
    assert!(
        merged_len >= 250,
        "Merged dictionary dropped values: {} < 250",
        merged_len
    );
    // Upper bound: the result must still be addressable by u8 keys.
    assert!(merged_len <= 256, "Should not exceed u8 maximum");
}
614+
#[test]
fn test_merge_u16_boundary_65536_values() {
    // Boundary case: a u16 key can take the values 0..=65535, so a dictionary with
    // 65,536 distinct entries is the largest one that must still merge successfully.
    let labels: Vec<String> = (0..65536).map(|n| format!("v{}", n)).collect();
    // One key per value, covering every representable u16 exactly once.
    let key_values: Vec<u16> = (0..=u16::MAX).collect();
    let dict = DictionaryArray::<UInt16Type>::try_new(
        UInt16Array::from(key_values),
        Arc::new(StringArray::from(labels)),
    )
    .unwrap();

    let merged = merge_dictionary_values(&[&dict], None).unwrap();

    // Every input value survives the merge ...
    assert_eq!(
        merged.values.len(),
        65536,
        "Should support exactly 65,536 values for u16"
    );
    // ... and the single input dictionary gets one mapping entry per value.
    assert_eq!(merged.key_mappings.len(), 1);
    assert_eq!(merged.key_mappings[0].len(), 65536);
}
632+
#[test]
fn test_merge_u16_overflow_65537_values() {
    // Two dictionaries with no value in common that together hold 65,537 distinct
    // strings: one more than a u16 key can address, so the merge must be rejected.

    // First dictionary: "a0".."a32767" (32,768 values), each referenced by one key.
    let values1 = StringArray::from((0..32768).map(|i| format!("a{}", i)).collect::<Vec<_>>());
    let keys1 = UInt16Array::from((0..32768u16).collect::<Vec<_>>());
    let dict1 = DictionaryArray::<UInt16Type>::try_new(keys1, Arc::new(values1)).unwrap();

    // Second dictionary: "b0".."b32768" (32,769 values), each referenced by one key.
    let values2 = StringArray::from((0..32769).map(|i| format!("b{}", i)).collect::<Vec<_>>());
    let keys2 = UInt16Array::from((0..32769u16).collect::<Vec<_>>());
    let dict2 = DictionaryArray::<UInt16Type>::try_new(keys2, Arc::new(values2)).unwrap();

    let result = merge_dictionary_values(&[&dict1, &dict2], None);

    // A single pattern check verifies both that the merge failed and that it
    // failed with the overflow error rather than some unrelated one.
    assert!(
        matches!(result, Err(ArrowError::DictionaryKeyOverflowError)),
        "Should fail with 65,537 distinct values for u16"
    );
}
530653}
0 commit comments