Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions pyiceberg/environment_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from pyiceberg import __version__


class EnvironmentContext:
    """Static holder for a class-level map of environment properties.

    The map starts out with the engine name ("pyiceberg") and the value of
    pyiceberg's ``__version__``. All access goes through the classmethods
    below; constructing an instance always fails.
    """

    # Single dict stored on the class: changes made through put()/remove()
    # are seen by every later get() call.
    _PROPERTIES: dict[str, str] = {"engine-name": "pyiceberg", "engine-version": __version__}

    def __init__(self) -> None:
        """Always raise, because only the classmethods are meant to be used."""
        raise NotImplementedError("EnvironmentContext is a utility class and cannot be instantiated.")

    @classmethod
    def get(cls) -> dict[str, str]:
        """Return a detached copy of the current properties.

        Mutating the returned dict does not affect the shared map.
        """
        return dict(cls._PROPERTIES)

    @classmethod
    def put(cls, key: str, value: str) -> None:
        """Store ``value`` under ``key`` in the shared map, replacing any existing entry."""
        cls._PROPERTIES.update({key: value})

    @classmethod
    def remove(cls, key: str) -> str | None:
        """Delete ``key`` from the shared map.

        Returns:
            The value previously stored under ``key``, or ``None`` if the key was absent.
        """
        previous = cls._PROPERTIES.pop(key, None)
        return previous
4 changes: 4 additions & 0 deletions pyiceberg/table/snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

from pydantic import Field, PrivateAttr, model_serializer

from pyiceberg.environment_context import EnvironmentContext
from pyiceberg.io import FileIO
from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, _manifests
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
Expand Down Expand Up @@ -402,6 +403,9 @@ def _update_totals(total_property: str, added_property: str, removed_property: s
removed_property=REMOVED_EQUALITY_DELETES,
)

for key, value in EnvironmentContext.get().items():
summary[key] = value

return summary


Expand Down
2 changes: 2 additions & 0 deletions tests/integration/test_deletes.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from pyspark.sql import SparkSession

from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.environment_context import EnvironmentContext
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.expressions import AlwaysTrue, EqualTo, LessThanOrEqual
from pyiceberg.manifest import ManifestEntryStatus
Expand Down Expand Up @@ -480,6 +481,7 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
"total-files-size": snapshots[2].summary["total-files-size"],
"total-position-deletes": "1",
"total-records": "4",
**EnvironmentContext.get(),
},
)

Expand Down
54 changes: 32 additions & 22 deletions tests/integration/test_inspect_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from pytest_lazy_fixtures import lf

from pyiceberg.catalog import Catalog
from pyiceberg.environment_context import EnvironmentContext
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.expressions import (
And,
Expand All @@ -53,6 +54,11 @@
TimestamptzType,
)


def with_environment_context_tuples(summary: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Return ``summary`` followed by the current environment-context entries.

    A new list is built; ``summary`` itself is left unmodified.
    """
    environment_entries = list(EnvironmentContext.get().items())
    return summary + environment_entries


TABLE_SCHEMA = Schema(
NestedField(field_id=1, name="bool", field_type=BooleanType(), required=False),
NestedField(field_id=2, name="string", field_type=StringType(), required=False),
Expand Down Expand Up @@ -267,30 +273,34 @@ def test_inspect_snapshots(
assert file_size > 0

# Append
assert df["summary"][0].as_py() == [
("added-files-size", str(file_size)),
("added-data-files", "1"),
("added-records", "3"),
("total-data-files", "1"),
("total-delete-files", "0"),
("total-records", "3"),
("total-files-size", str(file_size)),
("total-position-deletes", "0"),
("total-equality-deletes", "0"),
]
assert df["summary"][0].as_py() == with_environment_context_tuples(
[
("added-files-size", str(file_size)),
("added-data-files", "1"),
("added-records", "3"),
("total-data-files", "1"),
("total-delete-files", "0"),
("total-records", "3"),
("total-files-size", str(file_size)),
("total-position-deletes", "0"),
("total-equality-deletes", "0"),
]
)

# Delete
assert df["summary"][1].as_py() == [
("removed-files-size", str(file_size)),
("deleted-data-files", "1"),
("deleted-records", "3"),
("total-data-files", "0"),
("total-delete-files", "0"),
("total-records", "0"),
("total-files-size", "0"),
("total-position-deletes", "0"),
("total-equality-deletes", "0"),
]
assert df["summary"][1].as_py() == with_environment_context_tuples(
[
("removed-files-size", str(file_size)),
("deleted-data-files", "1"),
("deleted-records", "3"),
("total-data-files", "0"),
("total-delete-files", "0"),
("total-records", "0"),
("total-files-size", "0"),
("total-position-deletes", "0"),
("total-equality-deletes", "0"),
]
)

lhs = spark.table(f"{identifier}.snapshots").toPandas()
rhs = df.to_pandas()
Expand Down
184 changes: 99 additions & 85 deletions tests/integration/test_writes/test_partitioned_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from pyiceberg.types import (
StringType,
)
from utils import TABLE_SCHEMA, _create_table
from utils import TABLE_SCHEMA, _create_table, with_environment_context


@pytest.mark.integration
Expand Down Expand Up @@ -487,95 +487,109 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro
file_size = int(summaries[0]["added-files-size"])
assert file_size > 0

assert summaries[0] == {
"changed-partition-count": "3",
"added-data-files": "3",
"added-files-size": str(file_size),
"added-records": "3",
"total-data-files": "3",
"total-delete-files": "0",
"total-equality-deletes": "0",
"total-files-size": str(file_size),
"total-position-deletes": "0",
"total-records": "3",
}
assert summaries[0] == with_environment_context(
{
"changed-partition-count": "3",
"added-data-files": "3",
"added-files-size": str(file_size),
"added-records": "3",
"total-data-files": "3",
"total-delete-files": "0",
"total-equality-deletes": "0",
"total-files-size": str(file_size),
"total-position-deletes": "0",
"total-records": "3",
}
)

assert summaries[1] == {
"changed-partition-count": "3",
"added-data-files": "3",
"added-files-size": str(file_size),
"added-records": "3",
"total-data-files": "6",
"total-delete-files": "0",
"total-equality-deletes": "0",
"total-files-size": str(file_size * 2),
"total-position-deletes": "0",
"total-records": "6",
}
assert summaries[2] == {
"removed-files-size": str(file_size * 2),
"changed-partition-count": "3",
"total-equality-deletes": "0",
"deleted-data-files": "6",
"total-position-deletes": "0",
"total-delete-files": "0",
"deleted-records": "6",
"total-files-size": "0",
"total-data-files": "0",
"total-records": "0",
}
assert summaries[3] == {
"changed-partition-count": "3",
"added-data-files": "3",
"total-equality-deletes": "0",
"added-records": "3",
"total-position-deletes": "0",
"added-files-size": str(file_size),
"total-delete-files": "0",
"total-files-size": str(file_size),
"total-data-files": "3",
"total-records": "3",
}
assert summaries[4] == {
"changed-partition-count": "3",
"added-data-files": "3",
"total-equality-deletes": "0",
"added-records": "3",
"total-position-deletes": "0",
"added-files-size": str(file_size),
"total-delete-files": "0",
"total-files-size": str(file_size * 2),
"total-data-files": "6",
"total-records": "6",
}
assert summaries[1] == with_environment_context(
{
"changed-partition-count": "3",
"added-data-files": "3",
"added-files-size": str(file_size),
"added-records": "3",
"total-data-files": "6",
"total-delete-files": "0",
"total-equality-deletes": "0",
"total-files-size": str(file_size * 2),
"total-position-deletes": "0",
"total-records": "6",
}
)
assert summaries[2] == with_environment_context(
{
"removed-files-size": str(file_size * 2),
"changed-partition-count": "3",
"total-equality-deletes": "0",
"deleted-data-files": "6",
"total-position-deletes": "0",
"total-delete-files": "0",
"deleted-records": "6",
"total-files-size": "0",
"total-data-files": "0",
"total-records": "0",
}
)
assert summaries[3] == with_environment_context(
{
"changed-partition-count": "3",
"added-data-files": "3",
"total-equality-deletes": "0",
"added-records": "3",
"total-position-deletes": "0",
"added-files-size": str(file_size),
"total-delete-files": "0",
"total-files-size": str(file_size),
"total-data-files": "3",
"total-records": "3",
}
)
assert summaries[4] == with_environment_context(
{
"changed-partition-count": "3",
"added-data-files": "3",
"total-equality-deletes": "0",
"added-records": "3",
"total-position-deletes": "0",
"added-files-size": str(file_size),
"total-delete-files": "0",
"total-files-size": str(file_size * 2),
"total-data-files": "6",
"total-records": "6",
}
)
assert "removed-files-size" in summaries[5]
assert "total-files-size" in summaries[5]
assert summaries[5] == {
"removed-files-size": summaries[5]["removed-files-size"],
"changed-partition-count": "2",
"total-equality-deletes": "0",
"deleted-data-files": "4",
"total-position-deletes": "0",
"total-delete-files": "0",
"deleted-records": "4",
"total-files-size": summaries[5]["total-files-size"],
"total-data-files": "2",
"total-records": "2",
}
assert summaries[5] == with_environment_context(
{
"removed-files-size": summaries[5]["removed-files-size"],
"changed-partition-count": "2",
"total-equality-deletes": "0",
"deleted-data-files": "4",
"total-position-deletes": "0",
"total-delete-files": "0",
"deleted-records": "4",
"total-files-size": summaries[5]["total-files-size"],
"total-data-files": "2",
"total-records": "2",
}
)
assert "added-files-size" in summaries[6]
assert "total-files-size" in summaries[6]
assert summaries[6] == {
"changed-partition-count": "2",
"added-data-files": "2",
"total-equality-deletes": "0",
"added-records": "2",
"total-position-deletes": "0",
"added-files-size": summaries[6]["added-files-size"],
"total-delete-files": "0",
"total-files-size": summaries[6]["total-files-size"],
"total-data-files": "4",
"total-records": "4",
}
assert summaries[6] == with_environment_context(
{
"changed-partition-count": "2",
"added-data-files": "2",
"total-equality-deletes": "0",
"added-records": "2",
"total-position-deletes": "0",
"added-files-size": summaries[6]["added-files-size"],
"total-delete-files": "0",
"total-files-size": summaries[6]["total-files-size"],
"total-data-files": "4",
"total-records": "4",
}
)


@pytest.mark.integration
Expand Down
Loading