@@ -288,25 +288,40 @@ def convert_arrow_field_to_python(field, column_metadata=None):
288288 )
289289 return value
290290 return None
291+ # Handle Doris LARGEINT type (Arrow utf8 -> Python int)
292+ elif doris_type in (b'LARGEINT' , 'LARGEINT' ):
293+ if pa .types .is_string (field .type ) or pa .types .is_large_string (field .type ):
294+ value = field .as_py ()
295+ if value is not None :
296+ try :
297+ return int (value )
298+ except (ValueError , TypeError ) as e :
299+ logging .warning (
300+ "Failed to convert string '%s' to int (LARGEINT): %s" , value , e
301+ )
302+ return value
303+ return None
291304
292305 return field .as_py ()
293306
294307
295- def convert_python_to_arrow_value (value , output_type = None ):
308+ def convert_python_to_arrow_value (value , output_type = None , output_metadata = None ):
296309 """
297310 Convert Python value back to Arrow-compatible value.
298311
299- This function handles the reverse conversion of IP addresses :
312+ This function handles the reverse conversion of special types :
300313 - ipaddress.IPv4Address -> int (with uint32 to int32 conversion)
301314 - ipaddress.IPv6Address -> str (for Arrow utf8)
315+ - Python int -> str (for LARGEINT, which uses Arrow utf8)
302316
303317 Type Safety:
304318 For IPv4/IPv6 return types, MUST return ipaddress objects.
305319 Returning raw integers or strings will raise TypeError.
306320
307321 Args:
308322 value: Python value to convert (can be single value or iterable)
309- output_type: Optional Arrow DataType with metadata
323+ output_type: Optional Arrow DataType
324+ output_metadata: Optional metadata dict from the output Arrow field
310325
311326 Returns:
312327 Arrow-compatible value
@@ -316,14 +331,23 @@ def convert_python_to_arrow_value(value, output_type=None):
316331
317332 is_ipv4_output = False
318333 is_ipv6_output = False
334+ is_largeint_output = False
319335
320- if output_type is not None and hasattr (output_type , 'metadata' ) and output_type .metadata :
336+ # Check output_metadata (from field metadata, passed explicitly)
337+ metadata = output_metadata
338+ # Fallback: check output_type.metadata (for compound types like struct fields)
339+ if metadata is None and output_type is not None and hasattr (output_type , 'metadata' ) and output_type .metadata :
340+ metadata = output_type .metadata
341+
342+ if metadata :
321343 # Arrow metadata keys can be either bytes or str depending on how they were created
322- doris_type = output_type . metadata .get (b'doris_type' ) or output_type . metadata .get ('doris_type' )
344+ doris_type = metadata .get (b'doris_type' ) or metadata .get ('doris_type' )
323345 if doris_type in (b'IPV4' , 'IPV4' ):
324346 is_ipv4_output = True
325347 elif doris_type in (b'IPV6' , 'IPV6' ):
326348 is_ipv6_output = True
349+ elif doris_type in (b'LARGEINT' , 'LARGEINT' ):
350+ is_largeint_output = True
327351
328352 # Convert IPv4Address back to int
329353 if isinstance (value , ipaddress .IPv4Address ):
@@ -333,6 +357,10 @@ def convert_python_to_arrow_value(value, output_type=None):
333357 if isinstance (value , ipaddress .IPv6Address ):
334358 return str (value )
335359
360+ # Convert Python int back to str for LARGEINT (Arrow uses utf8 for LARGEINT)
361+ if is_largeint_output and isinstance (value , int ):
362+ return str (value )
363+
336364 # IPv4 output must return IPv4Address objects
337365 if is_ipv4_output and isinstance (value , int ):
338366 raise TypeError (
@@ -352,10 +380,10 @@ def convert_python_to_arrow_value(value, output_type=None):
352380 # For list types, recursively convert elements
353381 if output_type and pa .types .is_list (output_type ):
354382 element_type = output_type .value_type
355- return [convert_python_to_arrow_value (v , element_type ) for v in value ]
383+ return [convert_python_to_arrow_value (v , element_type , output_metadata ) for v in value ]
356384 else :
357385 # No type info, just recurse without type
358- return [convert_python_to_arrow_value (v , None ) for v in value ]
386+ return [convert_python_to_arrow_value (v , None , output_metadata ) for v in value ]
359387
360388 # Handle tuple values (could be struct data)
361389 if isinstance (value , tuple ):
@@ -373,7 +401,7 @@ def convert_python_to_arrow_value(value, output_type=None):
373401 else :
374402 # Not a struct type, treat as regular tuple and recurse without type
375403 return tuple (convert_python_to_arrow_value (v , None ) for v in value )
376-
404+
377405 if isinstance (value , dict ):
378406 # For map types, convert keys and values recursively
379407 if output_type and pa .types .is_map (output_type ):
@@ -393,7 +421,7 @@ def convert_python_to_arrow_value(value, output_type=None):
393421 for k , v in value .items ()]
394422
395423 if isinstance (value , pd .Series ):
396- return value .apply (lambda v : convert_python_to_arrow_value (v , output_type ))
424+ return value .apply (lambda v : convert_python_to_arrow_value (v , output_type , output_metadata ))
397425
398426 return value
399427
@@ -473,6 +501,7 @@ def __init__(
473501 input_types : pa .Schema ,
474502 output_type : pa .DataType ,
475503 client_type : int ,
504+ output_metadata : Optional [dict ] = None ,
476505 ) -> None :
477506 """
478507 Initialize Python UDF metadata.
@@ -488,6 +517,7 @@ def __init__(
488517 input_types: PyArrow schema for input parameters
489518 output_type: PyArrow data type for return value
490519 client_type: 0 for UDF, 1 for UDAF, 2 for UDTF
520+ output_metadata: Optional metadata dict from the output Arrow field
491521 """
492522 self .name = name
493523 self .symbol = symbol
@@ -499,6 +529,7 @@ def __init__(
499529 self .input_types = input_types
500530 self .output_type = output_type
501531 self .client_type = ClientType (client_type )
532+ self .output_metadata = output_metadata
502533
503534 def is_udf (self ) -> bool :
504535 """Check if this is a UDF (User-Defined Function)."""
@@ -627,7 +658,7 @@ def _scalar_call(self, record_batch: pa.RecordBatch) -> pa.Array:
627658 f"please check the always_nullable property in create function statement, "
628659 f"it should be true"
629660 )
630- result .append (convert_python_to_arrow_value (res , self .python_udf_meta .output_type ))
661+ result .append (convert_python_to_arrow_value (res , self .python_udf_meta .output_type , self . python_udf_meta . output_metadata ))
631662 except Exception as e :
632663 logging .error (
633664 "Error in scalar UDF execution at row %s: %s\n Args: %s\n Traceback: %s" ,
@@ -697,7 +728,7 @@ def _vectorized_call(self, record_batch: pa.RecordBatch) -> pa.Array:
697728 )
698729 raise RuntimeError (f"Error in vectorized UDF: { e } " ) from e
699730
700- result = convert_python_to_arrow_value (result , self .python_udf_meta .output_type )
731+ result = convert_python_to_arrow_value (result , self .python_udf_meta .output_type , self . python_udf_meta . output_metadata )
701732
702733 # Convert result to PyArrow Array
703734 result_array = None
@@ -1614,6 +1645,7 @@ def parse_python_udf_meta(
16141645 return None
16151646
16161647 output_type = output_schema .field (0 ).type
1648+ output_metadata = output_schema .field (0 ).metadata
16171649
16181650 python_udf_meta = PythonUDFMeta (
16191651 name = name ,
@@ -1626,6 +1658,7 @@ def parse_python_udf_meta(
16261658 input_types = input_schema ,
16271659 output_type = output_type ,
16281660 client_type = client_type ,
1661+ output_metadata = output_metadata ,
16291662 )
16301663
16311664 return python_udf_meta
@@ -1887,13 +1920,14 @@ def _handle_udaf_finalize(
18871920 place_id : int ,
18881921 output_type : pa .DataType ,
18891922 state_manager : UDAFStateManager ,
1923+ output_metadata : Optional [dict ] = None ,
18901924 ) -> pa .RecordBatch :
18911925 """Handle UDAF FINALIZE operation.
18921926
18931927 Returns: [result: output_type] (null if failed)
18941928 """
18951929 try :
1896- result = convert_python_to_arrow_value (state_manager .finalize (place_id ), output_type )
1930+ result = convert_python_to_arrow_value (state_manager .finalize (place_id ), output_type , output_metadata )
18971931 except Exception as e :
18981932 logging .error (
18991933 "FINALIZE operation failed for place_id=%s: %s" ,
@@ -2171,7 +2205,8 @@ def _handle_exchange_udaf(
21712205 )
21722206 elif operation_type == UDAFOperationType .FINALIZE :
21732207 result_batch_finalize = self ._handle_udaf_finalize (
2174- place_id , python_udaf_meta .output_type , state_manager
2208+ place_id , python_udaf_meta .output_type , state_manager ,
2209+ python_udaf_meta .output_metadata
21752210 )
21762211 # Serialize the result to binary (including NULL results)
21772212 # NULL is a valid aggregation result, not an error
@@ -2302,7 +2337,8 @@ def _handle_exchange_udtf(
23022337 # Process all input rows and build ListArray
23032338 try :
23042339 response_batch = self ._process_udtf_with_list_array (
2305- udtf_func , input_batch , python_udtf_meta .output_type
2340+ udtf_func , input_batch , python_udtf_meta .output_type ,
2341+ python_udtf_meta .output_metadata
23062342 )
23072343
23082344 # Send the response batch
@@ -2339,6 +2375,7 @@ def _process_udtf_with_list_array(
23392375 udtf_func : Callable ,
23402376 input_batch : pa .RecordBatch ,
23412377 expected_output_type : pa .DataType ,
2378+ output_metadata : Optional [dict ] = None ,
23422379 ) -> pa .RecordBatch :
23432380 """
23442381 Process UDTF function on all input rows and generate a ListArray.
@@ -2347,6 +2384,7 @@ def _process_udtf_with_list_array(
23472384 udtf_func: The UDTF function to call
23482385 input_batch: Input RecordBatch with N rows
23492386 expected_output_type: Expected Arrow type for output data
2387+ output_metadata: Optional metadata dict from the output Arrow field
23502388
23512389 Returns:
23522390 RecordBatch with a single ListArray column where each element
@@ -2424,7 +2462,7 @@ def _process_udtf_with_list_array(
24242462
24252463 all_results .append (row_outputs )
24262464
2427- all_results = convert_python_to_arrow_value (all_results , expected_output_type )
2465+ all_results = convert_python_to_arrow_value (all_results , expected_output_type , output_metadata )
24282466
24292467 try :
24302468 list_array = pa .array (all_results , type = pa .list_ (expected_output_type ))
0 commit comments