Skip to content

Commit 5b65553

Browse files
committed
support VARIANT for pypaimon
1 parent b6c0787 commit 5b65553

4 files changed

Lines changed: 25 additions & 18 deletions

File tree

docs/content/pypaimon/python-api.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -736,11 +736,11 @@ from pypaimon.data.generic_variant import GenericVariant
736736
read_builder = table.new_read_builder()
737737
result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits())
738738

739-
for row in result.column("payload").to_pylist():
740-
if row is not None:
741-
gv = GenericVariant.from_dict(row) # wrap raw bytes
742-
print(gv.to_python()) # decode to Python object
743-
print(gv.variant_get("$.city", "string")) # path extraction
739+
for record in result.to_pylist():
740+
if (payload := record["payload"]) is not None:
741+
gv = GenericVariant.from_arrow_struct(payload)
742+
print(gv.to_python()) # decode to Python object
743+
print(gv.variant_get("$.city", "string")) # path extraction
744744
```
745745

746746
**Writing a VARIANT column:**
@@ -773,7 +773,7 @@ table_commit.close()
773773
|:-------|:------------|
774774
| `GenericVariant.from_json(json_str)` | Build from a JSON string |
775775
| `GenericVariant.from_python(obj)` | Build from a Python object (`dict`, `list`, `int`, `str`, …) |
776-
| `GenericVariant.from_dict({"value": b"...", "metadata": b"..."})` | Wrap raw bytes from an Arrow VARIANT struct row |
776+
| `GenericVariant.from_arrow_struct({"value": b"...", "metadata": b"..."})` | Wrap raw bytes from an Arrow VARIANT struct row (read path) |
777777
| `GenericVariant.to_arrow_array([gv1, gv2, None, ...])` | Convert a list of `GenericVariant` (or `None`) to a `pa.StructArray` for writing |
778778
| `gv.to_python()` | Decode to native Python (`dict`, `list`, `int`, `str`, `None`, …) |
779779
| `gv.to_json()` | Decode to a JSON string |
@@ -785,7 +785,7 @@ table_commit.close()
785785
- `VARIANT` is only supported with the Parquet file format. Writing to ORC or Avro raises `NotImplementedError`.
786786
- `VARIANT` cannot be used as a primary key or partition key.
787787
- Shredded VARIANT files (written by Paimon Java with `typed_value` sub-fields) are readable
788-
via the raw `from_dict` path, but the extra fields are not automatically interpreted.
788+
via the raw `from_arrow_struct` path, but the extra fields are not automatically interpreted.
789789

790790
## Predicate
791791

paimon-python/pypaimon/data/generic_variant.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -709,7 +709,7 @@ class GenericVariant:
709709
710710
# Construct from raw bytes (e.g. what to_arrow() returns for a VARIANT column)
711711
row = result.column('payload')[0].as_py() # {'value': bytes, 'metadata': bytes}
712-
v = GenericVariant.from_dict(row)
712+
v = GenericVariant.from_arrow_struct(row)
713713
print(v.to_python()) # {'age': 30, 'city': 'Beijing'}
714714
"""
715715

@@ -741,8 +741,15 @@ def from_python(cls, obj) -> 'GenericVariant':
741741
return builder.result()
742742

743743
@classmethod
744-
def from_dict(cls, d: dict) -> 'GenericVariant':
745-
"""Wrap raw bytes from a PyArrow VARIANT struct: {'value': bytes, 'metadata': bytes}."""
744+
def from_arrow_struct(cls, d: dict) -> 'GenericVariant':
745+
"""Wrap raw bytes from a PyArrow VARIANT struct: {'value': bytes, 'metadata': bytes}.
746+
747+
Use this on the read path after calling ``column.to_pylist()`` on a VARIANT column::
748+
749+
for row in result.column("payload").to_pylist():
750+
if row is not None:
751+
gv = GenericVariant.from_arrow_struct(row)
752+
"""
746753
return cls(bytes(d['value']), bytes(d['metadata']))
747754

748755
@classmethod

paimon-python/pypaimon/tests/e2e/java_py_read_write_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -710,17 +710,17 @@ def test_py_read_variant_table(self):
710710
payload_list = result_sorted.column('payload').to_pylist()
711711

712712
# Row 1: Alice, {"age":30,"city":"Beijing"}
713-
gv_alice = GenericVariant.from_dict(payload_list[id_list.index(1)])
713+
gv_alice = GenericVariant.from_arrow_struct(payload_list[id_list.index(1)])
714714
self.assertEqual(gv_alice.variant_get('$.age', 'int'), 30)
715715
self.assertEqual(gv_alice.variant_get('$.city', 'string'), 'Beijing')
716716

717717
# Row 2: Bob, {"age":25,"city":"Shanghai"}
718-
gv_bob = GenericVariant.from_dict(payload_list[id_list.index(2)])
718+
gv_bob = GenericVariant.from_arrow_struct(payload_list[id_list.index(2)])
719719
self.assertEqual(gv_bob.variant_get('$.age', 'int'), 25)
720720
self.assertEqual(gv_bob.variant_get('$.city', 'string'), 'Shanghai')
721721

722722
# Row 3: Carol, [1,2,3]
723-
gv_carol = GenericVariant.from_dict(payload_list[id_list.index(3)])
723+
gv_carol = GenericVariant.from_arrow_struct(payload_list[id_list.index(3)])
724724
self.assertEqual(gv_carol.to_python(), [1, 2, 3])
725725

726726
print(f"test_py_read_variant_table: verified {result.num_rows} VARIANT rows")

paimon-python/pypaimon/tests/variant_test.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -538,14 +538,14 @@ def test_string_cast_on_int(self):
538538

539539
class TestConstructors(unittest.TestCase):
540540

541-
def test_from_dict_roundtrip(self):
541+
def test_from_arrow_struct_roundtrip(self):
542542
original = GenericVariant.from_json('{"x":1,"y":2}')
543-
restored = GenericVariant.from_dict({'value': original.value(), 'metadata': original.metadata()})
543+
restored = GenericVariant.from_arrow_struct({'value': original.value(), 'metadata': original.metadata()})
544544
self.assertEqual(restored.to_json(), original.to_json())
545545

546-
def test_from_dict_array(self):
546+
def test_from_arrow_struct_array(self):
547547
original = GenericVariant.from_json('[1,2,3]')
548-
restored = GenericVariant.from_dict({'value': original.value(), 'metadata': original.metadata()})
548+
restored = GenericVariant.from_arrow_struct({'value': original.value(), 'metadata': original.metadata()})
549549
self.assertEqual(restored.get_type(), Type.ARRAY)
550550
self.assertEqual(restored.to_python(), [1, 2, 3])
551551

@@ -578,7 +578,7 @@ def test_basic(self):
578578
arr = GenericVariant.to_arrow_array([gv1, gv2])
579579
self.assertIsInstance(arr, pa.StructArray)
580580
self.assertEqual(len(arr), 2)
581-
restored = GenericVariant.from_dict(arr[0].as_py())
581+
restored = GenericVariant.from_arrow_struct(arr[0].as_py())
582582
self.assertEqual(restored.variant_get('$.a', 'int'), 1)
583583

584584
def test_with_nulls(self):

0 commit comments

Comments
 (0)