From 9927b3753f0a56530f5ab618c4205b825ac470e9 Mon Sep 17 00:00:00 2001 From: Hiren Date: Wed, 25 Mar 2026 13:13:25 -0400 Subject: [PATCH 1/3] feat: add serialization helpers (to_dict/to_yaml) to BaseSearchIndex Addresses all review feedback on #525: - Move to_dict/to_yaml/_sanitize_redis_url to BaseSearchIndex (DRY) - Fix password-only URL sanitization (redis://:pass@host pattern) - Handle _redis_url=None gracefully - Fix from_dict to restore _redis_url and _connection_kwargs (round-trip) - Fix from_yaml to also extract connection metadata via from_dict - Add overwrite param to to_yaml (raises FileExistsError when False) - Import yaml at module level (not inline) - Filter sensitive connection_kwargs (only ssl_cert_reqs passes through) - Add comprehensive test suite (16 tests, no Docker required) - to_dict: schema-only, connection, password sanitization, None URL, sensitive kwargs, async - to_yaml: basic write, overwrite guard, async - round-trip: dict, dict+connection, async+connection, yaml+connection, connection_kwargs --- redisvl/index/index.py | 95 +++++++++++- tests/unit/test_index_serialization.py | 192 +++++++++++++++++++++++++ 2 files changed, 283 insertions(+), 4 deletions(-) create mode 100644 tests/unit/test_index_serialization.py diff --git a/redisvl/index/index.py b/redisvl/index/index.py index 649aa215..194ee72a 100644 --- a/redisvl/index/index.py +++ b/redisvl/index/index.py @@ -4,6 +4,7 @@ import time import warnings import weakref +from pathlib import Path from math import ceil from typing import ( TYPE_CHECKING, @@ -28,6 +29,7 @@ from redis.asyncio import Redis as AsyncRedis from redis.asyncio.cluster import RedisCluster as AsyncRedisCluster from redis.cluster import RedisCluster +import yaml from redisvl.query.hybrid import HybridQuery from redisvl.query.query import VectorQuery @@ -331,10 +333,15 @@ def storage_type(self) -> StorageType: hash or json.""" return self.schema.index.storage_type + @classmethod @classmethod def from_yaml(cls, schema_path: str, **kwargs): """Create a SearchIndex from a YAML schema file. + If the YAML file contains ``_redis_url`` or ``_connection_kwargs`` + keys (produced by ``to_yaml(include_connection=True)``), they are + automatically passed to the constructor. + Args: schema_path (str): Path to the YAML schema file. @@ -347,15 +354,18 @@ def from_yaml(cls, schema_path: str, **kwargs): index = SearchIndex.from_yaml("schemas/schema.yaml", redis_url="redis://localhost:6379") """ - schema = IndexSchema.from_yaml(schema_path) - return cls(schema=schema, **kwargs) + with open(schema_path) as f: + data = yaml.safe_load(f) or {} + return cls.from_dict(data, **kwargs) @classmethod def from_dict(cls, schema_dict: Dict[str, Any], **kwargs): """Create a SearchIndex from a dictionary. Args: - schema_dict (Dict[str, Any]): A dictionary containing the schema. + schema_dict (Dict[str, Any]): A dictionary containing the schema + and optionally ``_redis_url`` and ``_connection_kwargs`` + (produced by ``to_dict(include_connection=True)``). Returns: SearchIndex: A RedisVL SearchIndex object. @@ -376,9 +386,86 @@ def from_dict(cls, schema_dict: Dict[str, Any], **kwargs): }, redis_url="redis://localhost:6379") """ - schema = IndexSchema.from_dict(schema_dict) + # Extract connection metadata so it reaches the constructor + # instead of being silently dropped by IndexSchema validation. + copy = dict(schema_dict) + if "_redis_url" in copy: + kwargs.setdefault("redis_url", copy.pop("_redis_url")) + if "_connection_kwargs" in copy: + kwargs.setdefault("connection_kwargs", copy.pop("_connection_kwargs")) + + schema = IndexSchema.from_dict(copy) return cls(schema=schema, **kwargs) + def _sanitize_redis_url(self) -> Optional[str]: + """Sanitize a Redis URL by masking passwords. + + Handles both ``user:password@`` and ``:password@`` URL patterns. + + Returns: + Optional[str]: The sanitized URL, or None if ``_redis_url`` + is not set. + """ + if not self._redis_url: + return None + from urllib.parse import urlparse + + parsed = urlparse(self._redis_url) + if parsed.password or parsed.username: + host_info = parsed.hostname or "" + if parsed.port: + host_info += f":{parsed.port}" + netloc = f"{parsed.username}:****@{host_info}" if parsed.username else f":****@{host_info}" + return parsed._replace(netloc=netloc).geturl() + return self._redis_url + + def to_dict(self, include_connection: bool = False) -> Dict[str, Any]: + """Serialize the index configuration to a dictionary. + + Args: + include_connection: Whether to include connection parameters + (Redis URL and connection kwargs). Passwords are never + serialized. Defaults to False. + + Returns: + A dictionary representation of the index configuration. + """ + config = self.schema.to_dict() + + if include_connection: + if self._redis_url is not None: + config["_redis_url"] = self._sanitize_redis_url() + safe_keys = {"ssl_cert_reqs"} + if self._connection_kwargs: + config["_connection_kwargs"] = { + k: v for k, v in self._connection_kwargs.items() + if k in safe_keys + } + + return config + + def to_yaml( + self, file_path: str, include_connection: bool = False, overwrite: bool = True + ) -> None: + """Serialize the index configuration to a YAML file. + + Args: + file_path: Destination path for the YAML file. + include_connection: Whether to include connection parameters. + Passwords are never serialized. Defaults to False. + overwrite: Whether to overwrite an existing file. If False, + raises FileExistsError when the file exists. Defaults to True. + + Raises: + FileExistsError: If *file_path* already exists and + *overwrite* is False. + """ + fp = Path(file_path).resolve() + if fp.exists() and not overwrite: + raise FileExistsError(f"File {file_path} already exists.") + with open(fp, "w") as f: + yaml.dump(self.to_dict(include_connection=include_connection), f) + def disconnect(self): """Disconnect from the Redis database.""" raise NotImplementedError("This method should be implemented by subclasses.") diff --git a/tests/unit/test_index_serialization.py b/tests/unit/test_index_serialization.py new file mode 100644 index 00000000..e5d592c6 --- /dev/null +++ b/tests/unit/test_index_serialization.py @@ -0,0 +1,192 @@ +import os +import tempfile + +import pytest +import yaml + +from redisvl.index.index import AsyncSearchIndex, SearchIndex +from redisvl.schema.schema import IndexSchema + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +SAMPLE_SCHEMA = { + "index": { + "name": "test-index", + "prefix": "doc", + "storage_type": "hash", + }, + "fields": [ + {"name": "title", "type": "tag"}, + {"name": "body", "type": "text"}, + { + "name": "vector", + "type": "vector", + "attrs": { + "dims": 3, + "algorithm": "flat", + "datatype": "float32", + }, + }, + ], +} + + +def _make_index(cls=SearchIndex, **conn): + schema = IndexSchema.from_dict(SAMPLE_SCHEMA) + return cls(schema=schema, **conn) + + +# --------------------------------------------------------------------------- +# to_dict tests +# --------------------------------------------------------------------------- + + +class TestToDict: + def test_schema_only(self): + idx = _make_index() + d = idx.to_dict() + assert d["index"]["name"] == "test-index" + assert d["index"]["prefix"] == "doc" + assert len(d["fields"]) == 3 + assert "_redis_url" not in d + assert "_connection_kwargs" not in d + + def test_include_connection_with_url(self): + idx = _make_index(redis_url="redis://:secret@localhost:6379") + d = idx.to_dict(include_connection=True) + assert d["_redis_url"] == "redis://:****@localhost:6379" + + def test_include_connection_password_only_url(self): + """Password-only URLs (:pass@host) are sanitized correctly.""" + idx = _make_index(redis_url="redis://:mysecret@localhost:6379") + d = idx.to_dict(include_connection=True) + assert d["_redis_url"] == "redis://:****@localhost:6379" + + def test_include_connection_user_pass_url(self): + idx = _make_index(redis_url="redis://admin:secret@localhost:6379") + d = idx.to_dict(include_connection=True) + assert d["_redis_url"] == "redis://admin:****@localhost:6379" + + def test_include_connection_no_url(self): + """When initialized with a client, _redis_url is None — omit it.""" + idx = _make_index() + d = idx.to_dict(include_connection=True) + assert "_redis_url" not in d + + def test_include_connection_filters_sensitive_kwargs(self): + idx = _make_index( + redis_url="redis://localhost:6379", + connection_kwargs={"password": "s3cret", "ssl_cert_reqs": "required"}, + ) + d = idx.to_dict(include_connection=True) + # password should NOT leak + assert "s3cret" not in d["_connection_kwargs"] + assert d["_connection_kwargs"] == {"ssl_cert_reqs": "required"} + + def test_async_index_to_dict(self): + idx = _make_index(cls=AsyncSearchIndex, redis_url="redis://localhost:6379") + d = idx.to_dict(include_connection=True) + assert d["_redis_url"] == "redis://localhost:6379" + + +# --------------------------------------------------------------------------- +# to_yaml tests +# --------------------------------------------------------------------------- + + +class TestToYaml: + def test_writes_yaml(self): + idx = _make_index() + with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f: + path = f.name + try: + idx.to_yaml(path) + with open(path) as f: + data = yaml.safe_load(f) + assert data["index"]["name"] == "test-index" + finally: + os.unlink(path) + + def test_overwrite_false_raises(self): + idx = _make_index() + with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f: + path = f.name + try: + idx.to_yaml(path) + with pytest.raises(FileExistsError): + idx.to_yaml(path, overwrite=False) + finally: + os.unlink(path) + + def test_overwrite_true(self): + idx = _make_index() + with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f: + path = f.name + try: + idx.to_yaml(path) + idx.to_yaml(path, overwrite=True) # should not raise + finally: + os.unlink(path) + + def test_async_to_yaml(self): + idx = _make_index(cls=AsyncSearchIndex) + with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f: + path = f.name + try: + idx.to_yaml(path) + with open(path) as f: + data = yaml.safe_load(f) + assert data["index"]["name"] == "test-index" + finally: + os.unlink(path) + + +# --------------------------------------------------------------------------- +# Round-trip tests +# --------------------------------------------------------------------------- + + +class TestRoundTrip: + def test_roundtrip_dict_schema_only(self): + idx = _make_index() + d = idx.to_dict() + restored = SearchIndex.from_dict(d) + assert restored.schema.index.name == "test-index" + + def test_roundtrip_dict_with_connection(self): + idx = _make_index(redis_url="redis://localhost:6379") + d = idx.to_dict(include_connection=True) + restored = SearchIndex.from_dict(d) + assert restored._redis_url == "redis://localhost:6379" + assert restored.schema.index.name == "test-index" + + def test_roundtrip_dict_async_with_connection(self): + idx = _make_index(cls=AsyncSearchIndex, redis_url="redis://localhost:6379") + d = idx.to_dict(include_connection=True) + restored = AsyncSearchIndex.from_dict(d) + assert restored._redis_url == "redis://localhost:6379" + + def test_roundtrip_yaml(self): + idx = _make_index(redis_url="redis://localhost:6379") + with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f: + path = f.name + try: + idx.to_yaml(path, include_connection=True) + restored = SearchIndex.from_yaml(path) + assert restored.schema.index.name == "test-index" + assert restored._redis_url == "redis://localhost:6379" + finally: + os.unlink(path) + + def test_roundtrip_dict_with_connection_kwargs(self): + idx = _make_index( + redis_url="redis://localhost:6379", + connection_kwargs={"ssl_cert_reqs": "optional"}, + ) + d = idx.to_dict(include_connection=True) + restored = SearchIndex.from_dict(d) + assert restored._redis_url == "redis://localhost:6379" + assert restored._connection_kwargs == {"ssl_cert_reqs": "optional"} From 6163ce4775d9ecaa91aea620325a927139890abd Mon Sep 17 00:00:00 2001 From: Hiren Date: Wed, 25 Mar 2026 13:19:19 -0400 Subject: [PATCH 2/3] fix: address Copilot review feedback - Remove duplicate @classmethod on from_yaml - Add path validation/resolution in from_yaml - Only mask URLs with password (not username-only URLs) - Add sort_keys=False to yaml.dump for stable output - Add test for username-only URL sanitization --- redisvl/index/index.py | 23 +++++++++++++++++------ tests/unit/test_index_serialization.py | 6 ++++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/redisvl/index/index.py b/redisvl/index/index.py index 194ee72a..19c42b45 100644 --- a/redisvl/index/index.py +++ b/redisvl/index/index.py @@ -333,7 +333,6 @@ def storage_type(self) -> StorageType: hash or json.""" return self.schema.index.storage_type - @classmethod @classmethod def from_yaml(cls, schema_path: str, **kwargs): """Create a SearchIndex from a YAML schema file. @@ -354,8 +353,16 @@ def from_yaml(cls, schema_path: str, **kwargs): index = SearchIndex.from_yaml("schemas/schema.yaml", redis_url="redis://localhost:6379") """ - with open(schema_path) as f: - data = yaml.safe_load(f) or {} + path_obj = Path(schema_path) + try: + resolved_path = path_obj.resolve() + except Exception as exc: + raise ValueError(f"Invalid schema path: {schema_path}") from exc + try: + with resolved_path.open() as f: + data = yaml.safe_load(f) or {} + except FileNotFoundError as exc: + raise FileNotFoundError(f"Schema file not found: {resolved_path}") from exc return cls.from_dict(data, **kwargs) @classmethod @@ -411,11 +418,15 @@ def _sanitize_redis_url(self) -> Optional[str]: from urllib.parse import urlparse parsed = urlparse(self._redis_url) - if parsed.password or parsed.username: + # Only mask when a password component is present (including + # empty-string passwords). Username-only URLs like + # ``redis://user@host:6379`` are left unchanged. + if parsed.password is not None: host_info = parsed.hostname or "" if parsed.port: host_info += f":{parsed.port}" - netloc = f"{parsed.username}:****@{host_info}" if parsed.username else f":****@{host_info}" + user_part = f"{parsed.username}:" if parsed.username is not None else ":" + netloc = f"{user_part}****@{host_info}" return parsed._replace(netloc=netloc).geturl() return self._redis_url @@ -464,7 +475,7 @@ def to_yaml( if fp.exists() and not overwrite: raise FileExistsError(f"File {file_path} already exists.") with open(fp, "w") as f: - yaml.dump(self.to_dict(include_connection=include_connection), f) + yaml.dump(self.to_dict(include_connection=include_connection), f, sort_keys=False) def disconnect(self): """Disconnect from the Redis database.""" diff --git a/tests/unit/test_index_serialization.py b/tests/unit/test_index_serialization.py index e5d592c6..52110d9f 100644 --- a/tests/unit/test_index_serialization.py +++ b/tests/unit/test_index_serialization.py @@ -70,6 +70,12 @@ def test_include_connection_user_pass_url(self): d = idx.to_dict(include_connection=True) assert d["_redis_url"] == "redis://admin:****@localhost:6379" + def test_include_connection_username_only_url(self): + """Username-only URLs (no password) are left unchanged.""" + idx = _make_index(redis_url="redis://readonly@localhost:6379") + d = idx.to_dict(include_connection=True) + assert d["_redis_url"] == "redis://readonly@localhost:6379" + def test_include_connection_no_url(self): """When initialized with a client, _redis_url is None — omit it.""" idx = _make_index() From 617583551ca5ae26862d1a05a0d15e0e51a4bc9c Mon Sep 17 00:00:00 2001 From: Hiren Date: Wed, 25 Mar 2026 13:39:30 -0400 Subject: [PATCH 3/3] fix: skip sanitized URLs on from_dict round-trip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sanitized URLs (containing ****) are for inspection only — restoring them as real redis_url would cause silent auth failures. - from_dict/from_yaml now skip _redis_url when it contains **** - Updated to_dict/to_yaml docstrings to clarify inspection-only behavior - Added test_roundtrip_dict_sanitized_url_skipped (18/18 passing) --- redisvl/index/index.py | 15 +++++++++++---- tests/unit/test_index_serialization.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/redisvl/index/index.py b/redisvl/index/index.py index 19c42b45..8158a56e 100644 --- a/redisvl/index/index.py +++ b/redisvl/index/index.py @@ -397,7 +397,11 @@ def from_dict(cls, schema_dict: Dict[str, Any], **kwargs): # instead of being silently dropped by IndexSchema validation. copy = dict(schema_dict) if "_redis_url" in copy: - kwargs.setdefault("redis_url", copy.pop("_redis_url")) + url = copy.pop("_redis_url") + # Skip sanitized URLs (contain ****) to avoid silent auth + # failures — these are for inspection only, not round-trip. + if url and "****" not in url: + kwargs.setdefault("redis_url", url) if "_connection_kwargs" in copy: kwargs.setdefault("connection_kwargs", copy.pop("_connection_kwargs")) @@ -435,8 +439,10 @@ def to_dict(self, include_connection: bool = False) -> Dict[str, Any]: Args: include_connection: Whether to include connection parameters - (Redis URL and connection kwargs). Passwords are never - serialized. Defaults to False. + (Redis URL and connection kwargs). Passwords are masked + (``****``). The serialized URL is for **inspection + only** — ``from_dict`` will not restore sanitized URLs. + Defaults to False. Returns: A dictionary representation of the index configuration. @@ -463,7 +469,8 @@ def to_yaml( Args: file_path: Destination path for the YAML file. include_connection: Whether to include connection parameters. - Passwords are never serialized. Defaults to False. + Passwords are masked (``****``) and the URL is for + **inspection only**. Defaults to False. overwrite: Whether to overwrite an existing file. If False, raises FileExistsError when the file exists. Defaults to True. diff --git a/tests/unit/test_index_serialization.py b/tests/unit/test_index_serialization.py index 52110d9f..a241bb23 100644 --- a/tests/unit/test_index_serialization.py +++ b/tests/unit/test_index_serialization.py @@ -196,3 +196,13 @@ def test_roundtrip_dict_with_connection_kwargs(self): restored = SearchIndex.from_dict(d) assert restored._redis_url == "redis://localhost:6379" assert restored._connection_kwargs == {"ssl_cert_reqs": "optional"} + + def test_roundtrip_dict_sanitized_url_skipped(self): + """Sanitized URLs (containing ****) should not be restored — they'd + cause auth failures.""" + idx = _make_index(redis_url="redis://:secret@localhost:6379") + d = idx.to_dict(include_connection=True) + assert d["_redis_url"] == "redis://:****@localhost:6379" + restored = SearchIndex.from_dict(d) + # The sanitized URL should be silently skipped, not restored + assert restored._redis_url is None