Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 75 additions & 3 deletions pyiceberg/table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,15 +716,18 @@ def delete(
"""
from pyiceberg.io.pyarrow import ArrowScan, _dataframe_to_data_files, _expression_to_complementary_pyarrow

if isinstance(delete_filter, str):
delete_filter = _parse_row_filter(delete_filter)

if (
self.table_metadata.properties.get(TableProperties.DELETE_MODE, TableProperties.DELETE_MODE_DEFAULT)
== TableProperties.DELETE_MODE_MERGE_ON_READ
):
if self.table_metadata.format_version == 2:
self._delete_merge_on_read(delete_filter, snapshot_properties, case_sensitive, branch)
return
warnings.warn("Merge on read is not yet supported, falling back to copy-on-write", stacklevel=2)

if isinstance(delete_filter, str):
delete_filter = _parse_row_filter(delete_filter)

with self.update_snapshot(snapshot_properties=snapshot_properties, branch=branch).delete() as delete_snapshot:
delete_snapshot.delete_by_predicate(delete_filter, case_sensitive)

Expand Down Expand Up @@ -790,6 +793,75 @@ def delete(
if not delete_snapshot.files_affected and not delete_snapshot.rewrites_needed:
warnings.warn("Delete operation did not match any records", stacklevel=2)

def _delete_merge_on_read(
    self,
    delete_filter: BooleanExpression,
    snapshot_properties: dict[str, str] = EMPTY_DICT,
    case_sensitive: bool = True,
    branch: str | None = MAIN_BRANCH,
) -> None:
    """Delete the rows matching a filter by appending position delete files.

    The data files selected by ``delete_filter`` are planned, then each one is
    re-read with no delete files attached and an always-true filter so that a
    running row ordinal can be assigned to every row. Ordinals of rows that match
    the filter and are not already present in an existing delete file for that
    data file are collected, and ``write_position_delete_file`` is called once per
    affected data file inside a row-delta snapshot update.

    Args:
        delete_filter: Expression selecting the rows to delete.
        snapshot_properties: Custom properties forwarded to the snapshot update.
        case_sensitive: Whether the filter is bound and scanned case-sensitively.
        branch: Reference used for the scan and the commit. When None the scan is
            not pinned to a reference via ``use_ref``.

    Warns:
        UserWarning: If no row position was selected for deletion; nothing is
            committed in that case.
    """
    import pyarrow as pa

    from pyiceberg.io.pyarrow import ArrowScan, _read_all_delete_files, expression_to_pyarrow, write_position_delete_file

    file_scan = self._scan(row_filter=delete_filter, case_sensitive=case_sensitive)
    if branch is not None:
        file_scan = file_scan.use_ref(branch)
    tasks = list(file_scan.plan_files())

    # Look the schema up once; it is needed for binding, filter translation and the raw scan.
    schema = self.table_metadata.schema()
    bound_delete_filter = bind(schema, delete_filter, case_sensitive)
    pyarrow_filter = expression_to_pyarrow(bound_delete_filter, schema)
    deletes_per_file = _read_all_delete_files(self._table.io, tasks)
    positions_by_data_file: dict[DataFile, set[int]] = {}

    # Unfiltered scan: every row has to be seen so that ordinals line up with the
    # row order produced for the data file.
    raw_scan = ArrowScan(
        table_metadata=self.table_metadata,
        io=self._table.io,
        projected_schema=schema,
        row_filter=AlwaysTrue(),
        case_sensitive=case_sensitive,
    )

    for task in tasks:
        # Positions already recorded in existing delete files for this data file.
        already_deleted: set[int] = set()
        for positions in deletes_per_file.get(task.file.file_path, []):
            already_deleted.update(int(pos) for pos in positions.to_pylist())

        next_position = 0
        # Re-plan the file with no delete files attached so ordinals are computed
        # over the rows as stored, not over the rows that survive earlier deletes.
        raw_task = FileScanTask(task.file, delete_files=set(), residual=AlwaysTrue())
        # NOTE(review): assumes batches for a single task arrive in file order — confirm.
        for batch in raw_scan.to_record_batches([raw_task]):
            row_positions = pa.array(range(next_position, next_position + batch.num_rows), type=pa.int64())
            next_position += batch.num_rows

            # Pick a helper column name that cannot clash with a real column.
            position_column_name = "__pyiceberg_position"
            while position_column_name in batch.schema.names:
                position_column_name += "_"

            batch_with_positions = pa.Table.from_batches([batch]).append_column(position_column_name, row_positions)
            matching_positions = batch_with_positions.filter(pyarrow_filter).column(position_column_name).to_pylist()
            positions_to_delete = {int(pos) for pos in matching_positions} - already_deleted

            if positions_to_delete:
                positions_by_data_file.setdefault(task.file, set()).update(positions_to_delete)

    if not positions_by_data_file:
        # This helper is invoked from `delete`, so stacklevel=3 attributes the
        # warning to the code that called `delete` rather than to `delete` itself.
        warnings.warn("Delete operation did not match any records", stacklevel=3)
        return

    counter = itertools.count(0)
    with self.update_snapshot(snapshot_properties=snapshot_properties, branch=branch).row_delta() as producer:
        for data_file, positions in positions_by_data_file.items():
            producer.append_delete_file(
                write_position_delete_file(
                    io=self._table.io,
                    table_metadata=self.table_metadata,
                    referenced_data_file=data_file,
                    positions=sorted(positions),
                    write_uuid=producer.commit_uuid,
                    counter=counter,
                )
            )

def upsert(
self,
df: pa.Table,
Expand Down
Loading