@@ -275,12 +275,12 @@ pub(crate) fn merge_dictionary_values<K: ArrowDictionaryKeyType>(
275275
276276 for ( value_idx, value) in values {
277277 mapping[ value_idx] =
278- * interner. intern ( value, || match K :: Native :: from_usize ( indices . len ( ) ) {
279- Some ( idx ) => {
280- indices . push ( ( dictionary_idx , value_idx ) ) ;
281- Ok ( idx )
282- }
283- None => Err ( ArrowError :: DictionaryKeyOverflowError ) ,
278+ * interner. intern ( value, || -> Result < K :: Native , ArrowError > {
279+ let next_idx = indices . len ( ) ;
280+ let key = K :: Native :: from_usize ( next_idx )
281+ . ok_or_else ( || ArrowError :: DictionaryKeyOverflowError ) ? ;
282+ indices . push ( ( dictionary_idx , value_idx ) ) ;
283+ Ok ( key )
284284 } ) ?;
285285 }
286286 Ok ( mapping)
@@ -378,7 +378,11 @@ mod tests {
378378 use arrow_array:: cast:: as_string_array;
379379 use arrow_array:: types:: Int8Type ;
380380 use arrow_array:: types:: Int32Type ;
381- use arrow_array:: { DictionaryArray , Int8Array , Int32Array , StringArray } ;
381+ use arrow_array:: types:: UInt8Type ;
382+ use arrow_array:: types:: UInt16Type ;
383+ use arrow_array:: {
384+ DictionaryArray , Int8Array , Int32Array , StringArray , UInt8Array , UInt16Array ,
385+ } ;
382386 use arrow_buffer:: { BooleanBuffer , Buffer , NullBuffer , OffsetBuffer } ;
383387 use std:: sync:: Arc ;
384388
@@ -527,4 +531,109 @@ mod tests {
527531 let expected = StringArray :: from ( vec ! [ "b" ] ) ;
528532 assert_eq ! ( merged. values. as_ref( ) , & expected) ;
529533 }
534+
#[test]
fn test_merge_u8_boundary_256_values() {
    // Boundary case: a u8 key can address 0..=255, so a dictionary with one
    // distinct string per key (256 in total) is the largest that must merge.
    let labels: Vec<String> = (0..256).map(|i| format!("v{}", i)).collect();
    let every_key: Vec<u8> = (0..=u8::MAX).collect();
    let dict = DictionaryArray::<UInt8Type>::try_new(
        UInt8Array::from(every_key),
        Arc::new(StringArray::from(labels)),
    )
    .unwrap();

    let merged = merge_dictionary_values(&[&dict], None).unwrap();

    assert_eq!(
        merged.values.len(),
        256,
        "Should support exactly 256 values for u8"
    );
    // One input dictionary yields one mapping, with a slot per input value.
    assert_eq!(merged.key_mappings.len(), 1);
    assert_eq!(merged.key_mappings[0].len(), 256);
}
552+
#[test]
fn test_merge_u8_overflow_257_values() {
    // 128 "a*" strings and 129 "b*" strings have nothing in common, so merging
    // them needs 257 entries: one more than a u8 key can address.
    let a_values: Vec<String> = (0..128).map(|i| format!("a{}", i)).collect();
    let a_keys: Vec<u8> = (0..128).collect();
    let dict_a = DictionaryArray::<UInt8Type>::try_new(
        UInt8Array::from(a_keys),
        Arc::new(StringArray::from(a_values)),
    )
    .unwrap();

    let b_values: Vec<String> = (0..129).map(|i| format!("b{}", i)).collect();
    let b_keys: Vec<u8> = (0..129).collect();
    let dict_b = DictionaryArray::<UInt8Type>::try_new(
        UInt8Array::from(b_keys),
        Arc::new(StringArray::from(b_values)),
    )
    .unwrap();

    let outcome = merge_dictionary_values(&[&dict_a, &dict_b], None);

    assert!(
        outcome.is_err(),
        "Should fail with 257 distinct values for u8"
    );
    // The failure has to be the key-overflow error specifically.
    assert!(matches!(
        outcome,
        Err(ArrowError::DictionaryKeyOverflowError)
    ));
}
574+
#[test]
fn test_merge_u8_with_overlap() {
    // Overlapping dictionaries must not trigger a false overflow.
    //   dict1: 150 values (val0..val149)
    //   dict2: 150 values (val100..val249), sharing val100..val149 with dict1
    // Distinct strings: 150 + 100 = 250, which fits in a u8 key space.
    // Note: the interner is best-effort, so the merged count may be slightly
    // higher than 250 when a shared value is not recognised as a duplicate.
    let values1 = StringArray::from((0..150).map(|i| format!("val{}", i)).collect::<Vec<_>>());
    let keys1 = UInt8Array::from((0..150u8).collect::<Vec<_>>());
    let dict1 = DictionaryArray::<UInt8Type>::try_new(keys1, Arc::new(values1)).unwrap();

    let values2 = StringArray::from((100..250).map(|i| format!("val{}", i)).collect::<Vec<_>>());
    let keys2 = UInt8Array::from((0..150u8).collect::<Vec<_>>());
    let dict2 = DictionaryArray::<UInt8Type>::try_new(keys2, Arc::new(values2)).unwrap();

    let result = merge_dictionary_values(&[&dict1, &dict2], None);
    assert!(
        result.is_ok(),
        "Should succeed with ~250 distinct values (within u8 range)"
    );
    let merged = result.unwrap();

    // Every key of both inputs is referenced, so all 250 distinct strings have
    // to be present; an upper bound alone is always true for a successful
    // u8-keyed merge and would not catch dropped values.
    assert!(
        merged.values.len() >= 250,
        "Should keep every distinct value"
    );
    assert!(merged.values.len() <= 256, "Should not exceed u8 maximum");

    // One mapping per input dictionary, each with a slot per input value.
    assert_eq!(merged.key_mappings.len(), 2);
    assert_eq!(merged.key_mappings[0].len(), 150);
    assert_eq!(merged.key_mappings[1].len(), 150);
}
600+
#[test]
fn test_merge_u16_boundary_65536_values() {
    // Boundary case: a u16 key can address 0..=65535, so a dictionary with one
    // distinct string per key (65,536 in total) is the largest that must merge.
    let labels: Vec<String> = (0..65536).map(|i| format!("v{}", i)).collect();
    let every_key: Vec<u16> = (0..=u16::MAX).collect();
    let dict = DictionaryArray::<UInt16Type>::try_new(
        UInt16Array::from(every_key),
        Arc::new(StringArray::from(labels)),
    )
    .unwrap();

    let merged = merge_dictionary_values(&[&dict], None).unwrap();

    assert_eq!(
        merged.values.len(),
        65536,
        "Should support exactly 65,536 values for u16"
    );
    // One input dictionary yields one mapping, with a slot per input value.
    assert_eq!(merged.key_mappings.len(), 1);
    assert_eq!(merged.key_mappings[0].len(), 65536);
}
618+
#[test]
fn test_merge_u16_overflow_65537_values() {
    // 32,768 "a*" strings and 32,769 "b*" strings have nothing in common, so
    // merging them needs 65,537 entries: one more than a u16 key can address.
    let a_values: Vec<String> = (0..32768).map(|i| format!("a{}", i)).collect();
    let a_keys: Vec<u16> = (0..32768).collect();
    let dict_a = DictionaryArray::<UInt16Type>::try_new(
        UInt16Array::from(a_keys),
        Arc::new(StringArray::from(a_values)),
    )
    .unwrap();

    let b_values: Vec<String> = (0..32769).map(|i| format!("b{}", i)).collect();
    let b_keys: Vec<u16> = (0..32769).collect();
    let dict_b = DictionaryArray::<UInt16Type>::try_new(
        UInt16Array::from(b_keys),
        Arc::new(StringArray::from(b_values)),
    )
    .unwrap();

    let outcome = merge_dictionary_values(&[&dict_a, &dict_b], None);

    assert!(
        outcome.is_err(),
        "Should fail with 65,537 distinct values for u16"
    );
    // The failure has to be the key-overflow error specifically.
    assert!(matches!(
        outcome,
        Err(ArrowError::DictionaryKeyOverflowError)
    ));
}
530639}
0 commit comments