Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/Storages/ObjectStorage/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -513,9 +513,11 @@ std::pair<DB::ObjectStoragePtr, std::string> resolveObjectStorageForPath(
normalized_path = "gs://" + target_decomposed.authority + "/" + target_decomposed.key;
}
S3::URI s3_uri(normalized_path);

std::string key_to_use = s3_uri.key;


// Use key (parsed without URI decoding) so that percent-encoded
// characters in object keys (e.g. %2F in Iceberg partition paths) are preserved.
std::string key_to_use = target_decomposed.key;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Parse object key with S3::URI for HTTP URLs

Deriving key_to_use from target_decomposed.key regresses path-style S3 locations such as https://s3.amazonaws.com/<bucket>/<key> (and MinIO-style http(s)://endpoint/<bucket>/<key>): SchemeAuthorityKey leaves the bucket inside the key, while bucket selection is already taken from s3_uri.bucket. As a result, reads use a bucket-prefixed key against a storage already scoped to that bucket, so objects are looked up under the wrong path and valid files can return 404.

Useful? React with 👍 / 👎.


bool use_base_storage = false;
if (base_storage->getType() == ObjectStorageType::S3)
{
Expand Down
48 changes: 47 additions & 1 deletion tests/integration/test_database_iceberg/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,52 @@ def test_table_with_slash(started_cluster):
assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}`") == "\\N\tAAPL\t193.24\t193.31\t('bot')\n"


def test_partition_value_with_slash(started_cluster):
    """Reading a table whose partition value contains '/' (stored as %2F in the
    object key) must keep the percent-encoding intact."""
    node = started_cluster.instances["node1"]

    run_id = f"test_partition_slash_{uuid.uuid4()}"
    root_namespace = f"{run_id}_namespace"
    table_name = f"{run_id}_table"

    # Identity-partition on the string column "symbol": the value "us/west"
    # then produces the path segment symbol=us%2Fwest in the data file key.
    spec = PartitionSpec(
        PartitionField(
            source_id=2, field_id=1000, transform=IdentityTransform(), name="symbol"
        )
    )

    catalog = load_catalog_impl(started_cluster)
    catalog.create_namespace(root_namespace)
    table = create_table(
        catalog,
        root_namespace,
        table_name,
        DEFAULT_SCHEMA,
        partition_spec=spec,
        sort_order=DEFAULT_SORT_ORDER,
    )

    # A single row whose partition value contains a slash (S3 key gets %2F).
    row = {
        "datetime": datetime.now(),
        "symbol": "us/west",
        "bid": 100.0,
        "ask": 101.0,
        "details": {"created_by": "test"},
    }
    table.append(pa.Table.from_pylist([row]))

    create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME)
    qualified = f"{CATALOG_NAME}.`{root_namespace}.{table_name}`"
    assert 1 == int(node.query(f"SELECT count() FROM {qualified}"))
    assert "us/west" in node.query(f"SELECT symbol FROM {qualified}")


def test_cluster_select(started_cluster):
node1 = started_cluster.instances["node1"]
node2 = started_cluster.instances["node2"]
Expand Down Expand Up @@ -665,7 +711,7 @@ def test_cluster_select(started_cluster):
assert len(cluster_secondary_queries) == 1

assert node2.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`", settings={"parallel_replicas_for_cluster_engines":1, 'enable_parallel_replicas': 2, 'cluster_for_parallel_replicas': 'cluster_simple', 'parallel_replicas_for_cluster_engines' : 1}) == 'pablo\n'

def test_not_specified_catalog_type(started_cluster):
node = started_cluster.instances["node1"]
settings = {
Expand Down
Loading