Fix decimal handling and timestamp tests for Python SDK

plusplusjiajia · plusplusjiajia · commit f2a606976a5e · 2026-04-13T22:24:12.000+08:00
diff --git a/paimon-python/pypaimon/table/row/generic_row.py b/paimon-python/pypaimon/table/row/generic_row.py
@@ -16,38 +16,41 @@
 # limitations under the License.
 ################################################################################
 
+import calendar
+import decimal
 import struct
+from dataclasses import dataclass
 from datetime import date, datetime, time, timedelta
 from decimal import Decimal
 from typing import Any, List, Union
 
-from dataclasses import dataclass
-
 from pypaimon.schema.data_types import AtomicType, DataField, DataType
 from pypaimon.table.row.binary_row import BinaryRow
-from pypaimon.table.row.internal_row import InternalRow, RowKind
 from pypaimon.table.row.blob import BlobData
+from pypaimon.table.row.internal_row import InternalRow, RowKind
 
+_DECIMAL_CTX = decimal.Context(prec=100, rounding=decimal.ROUND_HALF_UP)
 
-def _decimal_to_unscaled(d: Decimal, scale: int) -> int:
-    """Convert a Decimal to its unscaled integer value without precision loss.
-    Raises ArithmeticError if the value has more fractional digits than scale."""
-    sign, digits, exponent = d.as_tuple()
+
+def _decimal_to_unscaled_with_check(d: Decimal, precision: int, scale: int):
+    """Round decimal with HALF_UP, check precision overflow, and return unscaled value.
+    Returns (0, True) on overflow, (unscaled_int, False) on success."""
+    rounded = d.quantize(Decimal(10) ** -scale, context=_DECIMAL_CTX)
+    sign, digits, exponent = rounded.as_tuple()
+    # Precision overflow check
+    if rounded != 0 and len(digits) > precision:
+        return 0, True
     int_digits = int(''.join(str(x) for x in digits)) if digits != (0,) else 0
     shift = exponent + scale
     if shift >= 0:
         unscaled = int_digits * (10 ** shift)
     else:
-        divisor = 10 ** (-shift)
-        if int_digits % divisor != 0:
-            raise ArithmeticError(
-                f"Decimal {d} has more fractional digits than scale {scale}")
-        unscaled = int_digits // divisor
-    return -unscaled if sign else unscaled
+        unscaled = int_digits // (10 ** (-shift))
+    return (-unscaled if sign else unscaled), False
 
 
 def _parse_type_precision_scale(data_type):
-    """Parse precision and scale from type string like DECIMAL(38, 10) or TIMESTAMP(6)."""
+    """Parse precision and scale from a type string such as DECIMAL(38, 10) or TIMESTAMP(6)."""
     type_str = str(data_type)
     if '(' in type_str and ')' in type_str:
         try:
@@ -61,6 +64,28 @@ def _parse_type_precision_scale(data_type):
     return 0, 0
 
 
+_EPOCH = datetime(1970, 1, 1)
+
+
+def _datetime_to_millis_and_nanos(value: datetime):
+    """Convert datetime to (epoch_millis, nano_of_millisecond) without float arithmetic."""
+    epoch_seconds = calendar.timegm(value.timetuple())
+    millis = epoch_seconds * 1000 + value.microsecond // 1000
+    nano_of_millisecond = (value.microsecond % 1000) * 1000
+    return millis, nano_of_millisecond
+
+
+def _millis_nanos_to_datetime(millis: int, nano_of_millisecond: int = 0) -> datetime:
+    """Convert (epoch_millis, nano_of_millisecond) to datetime. Nanos truncated to micros."""
+    total_micros = millis * 1000 + nano_of_millisecond // 1000
+    seconds = total_micros // 1_000_000
+    micros = total_micros % 1_000_000
+    if micros < 0:
+        seconds -= 1
+        micros += 1_000_000
+    return _EPOCH + timedelta(seconds=seconds, microseconds=micros)
+
+
 @dataclass
 class GenericRow(InternalRow):
 
@@ -271,26 +296,43 @@ def _unscaled_to_decimal(cls, unscaled_value: int, scale: int) -> Decimal:
         return Decimal((sign, digits, -scale))
 
     @classmethod
-    def _parse_decimal(cls, bytes_data: bytes, base_offset: int, field_offset: int, data_type: DataType) -> Decimal:
+    def _parse_decimal(cls, bytes_data: bytes, base_offset: int, field_offset: int, data_type: DataType):
         precision, scale = _parse_type_precision_scale(data_type)
+        if precision <= 0:
+            raise ValueError(f"Decimal requires precision > 0, got {precision}")
         if precision <= 18:
-            # Compact format: unscaled long stored directly in fixed part
+            # Compact: unscaled long in fixed part
             unscaled_long = struct.unpack('<q', bytes_data[field_offset:field_offset + 8])[0]
             return cls._unscaled_to_decimal(unscaled_long, scale)
         else:
-            # Non-compact format: fixed part has (cursor << 32) | byte_length
+            # Non-compact: (cursor << 32 | byte_length) in fixed part, bytes in var area
             offset_and_len = struct.unpack('<q', bytes_data[field_offset:field_offset + 8])[0]
             cursor = (offset_and_len >> 32) & 0xFFFFFFFF
             byte_length = offset_and_len & 0xFFFFFFFF
             var_offset = base_offset + cursor
             unscaled_bytes = bytes_data[var_offset:var_offset + byte_length]
             unscaled_value = int.from_bytes(unscaled_bytes, byteorder='big', signed=True)
-            return cls._unscaled_to_decimal(unscaled_value, scale)
+            # Precision overflow: return None (null) instead of the value
+            result = cls._unscaled_to_decimal(unscaled_value, scale)
+            _, digits, _ = result.as_tuple()
+            if result != 0 and len(digits) > precision:
+                return None
+            return result
 
     @classmethod
     def _parse_timestamp(cls, bytes_data: bytes, base_offset: int, field_offset: int, data_type: DataType) -> datetime:
-        millis = struct.unpack('<q', bytes_data[field_offset:field_offset + 8])[0]
-        return datetime.fromtimestamp(millis / 1000.0, tz=None)
+        precision, _ = _parse_type_precision_scale(data_type)
+        if precision <= 3:
+            # Compact: epoch millis in fixed part
+            millis = struct.unpack('<q', bytes_data[field_offset:field_offset + 8])[0]
+            return _millis_nanos_to_datetime(millis)
+        else:
+            # Non-compact: (cursor << 32 | nanoOfMillisecond) in fixed part, millis in var area
+            offset_and_nanos = struct.unpack('<q', bytes_data[field_offset:field_offset + 8])[0]
+            nano_of_millisecond = offset_and_nanos & 0xFFFFFFFF
+            sub_offset = (offset_and_nanos >> 32) & 0xFFFFFFFF
+            millis = struct.unpack('<q', bytes_data[base_offset + sub_offset:base_offset + sub_offset + 8])[0]
+            return _millis_nanos_to_datetime(millis, nano_of_millisecond)
 
     @classmethod
     def _parse_date(cls, bytes_data: bytes, field_offset: int) -> date:
@@ -339,17 +381,39 @@ def to_bytes(cls, row: Union[GenericRow, BinaryRow]) -> bytes:
                 raise ValueError(f"BinaryRow only support AtomicType yet, meet {field.type.__class__}")
 
             type_name = field.type.type.upper()
-            is_var_len_type = any(type_name.startswith(p) for p in ['CHAR', 'VARCHAR', 'STRING',
-                                                                     'BINARY', 'VARBINARY', 'BYTES', 'BLOB'])
+            is_var_len_type = any(type_name.startswith(p) for p in [
+                'CHAR', 'VARCHAR', 'STRING', 'BINARY', 'VARBINARY', 'BYTES', 'BLOB'])
             is_decimal_type = type_name.startswith('DECIMAL') or type_name.startswith('NUMERIC')
+            is_timestamp_type = type_name.startswith('TIMESTAMP')
             decimal_precision, decimal_scale = _parse_type_precision_scale(field.type) if is_decimal_type else (0, 0)
             is_high_precision_decimal = is_decimal_type and decimal_precision > 18
-
-            if is_var_len_type or is_high_precision_decimal:
+            timestamp_precision = _parse_type_precision_scale(field.type)[0] if is_timestamp_type else 0
+            is_non_compact_timestamp = is_timestamp_type and timestamp_precision > 3
+
+            # Precision overflow -> null
+            if is_decimal_type and value is not None:
+                d = value if isinstance(value, Decimal) else Decimal(str(value))
+                unscaled_value, overflow = _decimal_to_unscaled_with_check(d, decimal_precision, decimal_scale)
+                if overflow:
+                    cls._set_null_bit(fixed_part, 0, i)
+                    struct.pack_into('<q', fixed_part, field_fixed_offset, 0)
+                    continue
+
+            if is_non_compact_timestamp:
+                # Non-compact: millis in var area, (offset << 32 | nanoOfMilli) in fixed part
+                if value.tzinfo is not None:
+                    raise RuntimeError("datetime tzinfo not supported yet")
+                ts_millis, nano_of_millisecond = _datetime_to_millis_and_nanos(value)
+                var_value_bytes = struct.pack('<q', ts_millis)
+                offset_in_variable_part = current_variable_offset
+                variable_part_data.append(var_value_bytes)
+                current_variable_offset += 8
+                absolute_offset = fixed_part_size + offset_in_variable_part
+                offset_and_nano = (absolute_offset << 32) | nano_of_millisecond
+                struct.pack_into('<q', fixed_part, field_fixed_offset, offset_and_nano)
+            elif is_var_len_type or is_high_precision_decimal:
                 if is_high_precision_decimal:
-                    d = value if isinstance(value, Decimal) else Decimal(str(value))
-                    unscaled_value = _decimal_to_unscaled(d, decimal_scale)
-                    # Convert to big-endian signed bytes (minimal representation)
+                    # Big-endian signed bytes
                     if unscaled_value == 0:
                         value_bytes = b'\x00'
                     else:
@@ -370,7 +434,7 @@ def to_bytes(cls, row: Union[GenericRow, BinaryRow]) -> bytes:
                     header_byte = 0x80 | length
                     fixed_part[field_fixed_offset + 7] = header_byte
                 else:
-                    # Non-compact decimal uses fixed 16 bytes, others use 8-byte alignment
+                    # Non-compact decimal: fixed 16 bytes; others: 8-byte aligned
                     if is_high_precision_decimal:
                         var_length = 16
                     else:
@@ -428,6 +492,11 @@ def _serialize_field_value(cls, value: Any, data_type: AtomicType) -> bytes:
                     f"via the variable-length path in to_bytes(), not _serialize_field_value()")
             return cls._serialize_decimal(value, data_type)
         elif type_name.startswith('TIMESTAMP'):
+            precision = _parse_type_precision_scale(data_type)[0]
+            if precision > 3:
+                raise ValueError(
+                    f"Non-compact timestamp (precision={precision}) must be serialized "
+                    f"via the variable-length path in to_bytes(), not _serialize_field_value()")
             return cls._serialize_timestamp(value)
         elif type_name in ['DATE']:
             return cls._serialize_date(value) + b'\x00' * 4
@@ -466,17 +535,17 @@ def _serialize_double(cls, value: float) -> bytes:
 
     @classmethod
     def _serialize_decimal(cls, value: Decimal, data_type: DataType) -> bytes:
-        """Serialize compact decimal (precision <= 18) as unscaled long in fixed part."""
-        _, scale = _parse_type_precision_scale(data_type)
+        """Compact decimal: unscaled long in fixed part."""
+        precision, scale = _parse_type_precision_scale(data_type)
         d = value if isinstance(value, Decimal) else Decimal(str(value))
-        unscaled_value = _decimal_to_unscaled(d, scale)
+        unscaled_value, _ = _decimal_to_unscaled_with_check(d, precision, scale)
         return struct.pack('<q', unscaled_value)
 
     @classmethod
     def _serialize_timestamp(cls, value: datetime) -> bytes:
         if value.tzinfo is not None:
             raise RuntimeError("datetime tzinfo not supported yet")
-        millis = int(value.timestamp() * 1000)
+        millis, _ = _datetime_to_millis_and_nanos(value)
         return struct.pack('<q', millis)
 
     @classmethod
diff --git a/paimon-python/pypaimon/tests/decimal_test.py b/paimon-python/pypaimon/tests/decimal_test.py
@@ -20,7 +20,8 @@
 from decimal import Decimal
 
 from pypaimon.schema.data_types import AtomicType, DataField
-from pypaimon.table.row.generic_row import GenericRow, GenericRowSerializer, GenericRowDeserializer
+from pypaimon.table.row.generic_row import (GenericRow, GenericRowDeserializer,
+                                            GenericRowSerializer)
 from pypaimon.table.row.row_kind import RowKind
 
 
@@ -113,7 +114,6 @@ def test_decimal_mixed_with_other_types(self):
         self.assertEqual(result.values[3], Decimal("12312455.22"))
         self.assertAlmostEqual(result.values[4], 3.14)
 
-
     def test_decimal_compact_binary_format(self):
         """Verify compact decimal binary layout: unscaled long in fixed part."""
         fields = [DataField(0, "d", AtomicType("DECIMAL(4, 2)"))]
@@ -148,7 +148,7 @@ def test_decimal_not_compact_binary_format(self):
 
         # cursor should point to the variable area (== fixed_part_size)
         self.assertEqual(cursor, fixed_part_size)
-        # variable area should be exactly 16 bytes (matching Java's cursor += 16)
+        # variable area should be exactly 16 bytes
         var_area = data[cursor:]
         self.assertEqual(len(var_area), 16)
         # unscaled bytes are big-endian signed
@@ -157,7 +157,6 @@ def test_decimal_not_compact_binary_format(self):
         # Decimal("5.55000") with scale=5 => unscaled = 555000
         self.assertEqual(unscaled_value, 555000)
 
-
     def test_decimal_boundary_precision(self):
         """Test boundary: DECIMAL(18, ...) is compact, DECIMAL(19, ...) is non-compact."""
         # precision=18: last compact
@@ -197,12 +196,85 @@ def test_decimal_zero_different_scales(self):
                 result = GenericRowDeserializer.from_bytes(serialized, fields)
                 self.assertEqual(result.values[0], val)
 
-    def test_decimal_truncation_raises(self):
-        """Serializing a value with more fractional digits than scale should raise."""
+    def test_decimal_half_up_rounding(self):
+        """Excess fractional digits should be rounded with HALF_UP."""
         fields = [DataField(0, "d", AtomicType("DECIMAL(10, 2)"))]
-        row = GenericRow([Decimal("1.999")], fields, RowKind.INSERT)
-        with self.assertRaises(ArithmeticError):
-            GenericRowSerializer.to_bytes(row)
+
+        test_cases = [
+            (Decimal("1.999"), Decimal("2.00")),    # .999 rounds up
+            (Decimal("1.235"), Decimal("1.24")),    # .235 rounds up (HALF_UP)
+            (Decimal("1.234"), Decimal("1.23")),    # .234 rounds down
+            (Decimal("1.225"), Decimal("1.23")),    # .225 rounds up (HALF_UP)
+            (Decimal("-1.235"), Decimal("-1.24")),   # negative HALF_UP
+        ]
+        for val, expected in test_cases:
+            with self.subTest(value=val):
+                row = GenericRow([val], fields, RowKind.INSERT)
+                serialized = GenericRowSerializer.to_bytes(row)
+                result = GenericRowDeserializer.from_bytes(serialized, fields)
+                self.assertEqual(result.values[0], expected)
+
+    def test_decimal_precision_overflow_returns_null(self):
+        """Values exceeding declared precision should be stored as null."""
+        # DECIMAL(4, 2) can hold at most 2 integer + 2 fractional digits => max 99.99
+        fields = [DataField(0, "d", AtomicType("DECIMAL(4, 2)"))]
+
+        # 999.99 needs 5 digits total, exceeds precision=4
+        row = GenericRow([Decimal("999.99")], fields, RowKind.INSERT)
+        serialized = GenericRowSerializer.to_bytes(row)
+        result = GenericRowDeserializer.from_bytes(serialized, fields)
+        self.assertIsNone(result.values[0])
+
+        # 99.999 rounds to 100.00 (5 digits), also overflows
+        row2 = GenericRow([Decimal("99.999")], fields, RowKind.INSERT)
+        serialized2 = GenericRowSerializer.to_bytes(row2)
+        result2 = GenericRowDeserializer.from_bytes(serialized2, fields)
+        self.assertIsNone(result2.values[0])
+
+        # 99.99 fits exactly in DECIMAL(4, 2)
+        row3 = GenericRow([Decimal("99.99")], fields, RowKind.INSERT)
+        serialized3 = GenericRowSerializer.to_bytes(row3)
+        result3 = GenericRowDeserializer.from_bytes(serialized3, fields)
+        self.assertEqual(result3.values[0], Decimal("99.99"))
+
+    def test_decimal_precision_overflow_high_precision(self):
+        """Precision overflow check also works for non-compact decimals."""
+        # DECIMAL(20, 5) can hold 15 integer + 5 fractional digits
+        fields = [DataField(0, "d", AtomicType("DECIMAL(20, 5)"))]
+
+        # This value fits: 15 integer digits + 5 fractional
+        row = GenericRow([Decimal("123456789012345.12345")], fields, RowKind.INSERT)
+        serialized = GenericRowSerializer.to_bytes(row)
+        result = GenericRowDeserializer.from_bytes(serialized, fields)
+        self.assertEqual(result.values[0], Decimal("123456789012345.12345"))
+
+        # This value overflows: 16 integer digits + 5 fractional = 21 > 20
+        row2 = GenericRow([Decimal("1234567890123456.12345")], fields, RowKind.INSERT)
+        serialized2 = GenericRowSerializer.to_bytes(row2)
+        result2 = GenericRowDeserializer.from_bytes(serialized2, fields)
+        self.assertIsNone(result2.values[0])
+
+    def test_decimal_deserialization_precision_overflow_non_compact(self):
+        """Non-compact decimal deserialization returns None if precision overflows."""
+        # Serialize with DECIMAL(38, 5) which fits, then deserialize as DECIMAL(20, 5)
+        fields_wide = [DataField(0, "d", AtomicType("DECIMAL(38, 5)"))]
+        fields_narrow = [DataField(0, "d", AtomicType("DECIMAL(20, 5)"))]
+
+        # 21 digits total exceeds precision=20
+        row = GenericRow([Decimal("1234567890123456.12345")], fields_wide, RowKind.INSERT)
+        serialized = GenericRowSerializer.to_bytes(row)
+        result = GenericRowDeserializer.from_bytes(serialized, fields_narrow)
+        self.assertIsNone(result.values[0])
+
+    def test_decimal_deserialization_invalid_precision(self):
+        """Deserialization with precision <= 0 raises ValueError."""
+        fields_valid = [DataField(0, "d", AtomicType("DECIMAL(10, 2)"))]
+        row = GenericRow([Decimal("1.23")], fields_valid, RowKind.INSERT)
+        serialized = GenericRowSerializer.to_bytes(row)
+
+        fields_bad = [DataField(0, "d", AtomicType("DECIMAL(0, 2)"))]
+        with self.assertRaises(ValueError):
+            GenericRowDeserializer.from_bytes(serialized, fields_bad)
 
 
 if __name__ == '__main__':
diff --git a/paimon-python/pypaimon/tests/timestamp_test.py b/paimon-python/pypaimon/tests/timestamp_test.py