diff --git a/.gitignore b/.gitignore index cd8005815..1a2a5d6c1 100644 --- a/.gitignore +++ b/.gitignore @@ -234,3 +234,17 @@ tests/data # Local working directory (personal scripts, docs, tools) local/ +nitin_docs/ +nitin_scripts/ + +# Local notebooks (kept for development, not committed) +docs/user_guide/13_index_migrations.ipynb + +# Migration temp files (generated by rvl migrate commands) +migration_plan.yaml +migration_report.yaml +schema_patch.yaml + +# Benchmark artifacts +tests/benchmarks/charts/ +tests/benchmarks/results_*.json diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..3004e0514 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,165 @@ +# AGENTS.md - RedisVL Project Context + +## Frequently Used Commands + +```bash +# Development workflow +make install # Install dependencies +make format # Format code (black + isort) +make check-types # Run mypy type checking +make lint # Run all linting (format + types) +make test # Run tests (no external APIs) +make test-all # Run all tests (includes API tests) +make check # Full check (lint + test) + +# Redis setup +make redis-start # Start Redis container +make redis-stop # Stop Redis container + +# Documentation +make docs-build # Build documentation +make docs-serve # Serve docs locally +``` + +Pre-commit hooks are also configured, which you should +run before you commit: +```bash +pre-commit run --all-files +``` + +## Important Architectural Patterns + +### Async/Sync Dual Interfaces +- Most core classes have both sync and async versions (e.g., `SearchIndex` / `AsyncSearchIndex`) +- Follow existing patterns when adding new functionality + +### Schema-Driven Design +```python +# Index schemas define structure +schema = IndexSchema.from_yaml("schema.yaml") +index = SearchIndex(schema, redis_url="redis://localhost:6379") +``` + +## Critical Rules + +### Do Not Modify +- **CRITICAL**: Do not change this line unless explicitly asked: + ```python + token.strip().strip(",").replace(""", "").replace(""", 
"").lower() + ``` + +### Git Operations +**CRITICAL**: NEVER use `git push` or attempt to push to remote repositories. The user will handle all git push operations. + +### Branch and Commit Policy +**IMPORTANT**: Use conventional branch names and conventional commits. + +Branch naming: +- Human-created branches should use `/` +- Automation-created branches may use `codex//` +- Preferred branch types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`, `perf`, `build`, `ci` +- Examples: + - `feat/index-migrator` + - `fix/async-sentinel-pool` + - `docs/index-migrator-benchmarking` + - `codex/feat/index-migrator` + +Commit messages: +- Use Conventional Commits: `(optional-scope): ` +- Preferred commit types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`, `perf`, `build`, `ci` +- Examples: + - `feat(migrate): add drop recreate planning docs` + - `docs(index-migrator): add benchmarking guidance` + - `fix(cli): validate migrate plan inputs` + +### Code Quality +**IMPORTANT**: Always run `make format` before committing code to ensure proper formatting and linting compliance. + +### README.md Maintenance +**IMPORTANT**: DO NOT modify README.md unless explicitly requested. + +**If you need to document something, use these alternatives:** +- Development info → CONTRIBUTING.md +- API details → docs/ directory +- Examples → docs/examples/ +- Project memory (explicit preferences, directives, etc.) 
→ AGENTS.md + +## Code Style Preferences + +### Import Organization +- **Prefer module-level imports** by default for clarity and standard Python conventions +- **Use local/inline imports only when necessary** for specific reasons: + - Avoiding circular import dependencies + - Improving startup time for heavy/optional dependencies + - Lazy loading for performance-critical paths +- When using local imports, add a brief comment explaining why (e.g., `# Local import to avoid circular dependency`) + +### Comments and Output +- **No emojis in code comments or print statements** +- Keep comments professional and focused on technical clarity +- Use emojis sparingly only in user-facing documentation (markdown files), not in Python code + +### General Guidelines +- Follow existing patterns in the RedisVL codebase +- Maintain consistency with the project's established conventions +- Run `make format` before committing to ensure code quality standards + +## Testing Notes +RedisVL uses `pytest` with `testcontainers` for testing. + +- `make test` - unit tests only (no external APIs) +- `make test-all` - run the full suite, including tests that call external APIs +- `pytest --run-api-tests` - explicitly run API-dependent tests (e.g., LangCache, + external vectorizer/reranker providers). These require the appropriate API + keys and environment variables to be set. 
+ +## Project Structure + +``` +redisvl/ +├── cli/ # Command-line interface (rvl command) +├── extensions/ # AI extensions (cache, memory, routing) +│ ├── cache/ # Semantic caching for LLMs +│ ├── llmcache/ # LLM-specific caching +│ ├── message_history/ # Chat history management +│ ├── router/ # Semantic routing +│ └── session_manager/ # Session management +├── index/ # SearchIndex classes (sync/async) +├── query/ # Query builders (Vector, Range, Filter, Count) +├── redis/ # Redis client utilities +├── schema/ # Index schema definitions +└── utils/ # Utilities (vectorizers, rerankers, optimization) + ├── rerank/ # Result reranking + └── vectorize/ # Embedding providers integration +``` + +## Core Components + +### 1. Index Management +- `SearchIndex` / `AsyncSearchIndex` - Main interface for Redis vector indices +- `IndexSchema` - Define index structure with fields (text, tags, vectors, etc.) +- Support for JSON and Hash storage types + +### 2. Query System +- `VectorQuery` - Semantic similarity search +- `RangeQuery` - Vector search within distance range +- `FilterQuery` - Metadata filtering and full-text search +- `CountQuery` - Count matching records +- Etc. + +### 3. AI Extensions +- `SemanticCache` - LLM response caching with semantic similarity +- `EmbeddingsCache` - Cache for vector embeddings +- `MessageHistory` - Chat history with recency/relevancy retrieval +- `SemanticRouter` - Route queries to topics/intents + +### 4. 
Vectorizers (Optional Dependencies) +- OpenAI, Azure OpenAI, Cohere, HuggingFace, Mistral, VoyageAI +- Custom vectorizer support +- Batch processing capabilities + +## Documentation +- Main docs: https://docs.redisvl.com +- Built with Sphinx from `docs/` directory +- Includes API reference and user guides +- Example notebooks in documentation `docs/user_guide/...` diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 54138d77c..6f4afb7f0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -251,12 +251,33 @@ Before suggesting a new feature: ## Pull Request Process -1. **Fork and create a branch**: Create a descriptive branch name (e.g., `fix-search-bug` or `add-vector-similarity`) +1. **Fork and create a branch**: Use a conventional branch name such as `feat/index-migrator`, `fix/search-bug`, or `docs/vectorizer-guide` 2. **Make your changes**: Follow our coding standards and include tests 3. **Test thoroughly**: Ensure your changes work and don't break existing functionality 4. **Update documentation**: Add or update documentation as needed 5. **Submit your PR**: Include a clear description of what your changes do +### Branch Naming and Commit Messages + +We use conventional branch names and Conventional Commits to keep history easy to scan and automate. 
 + +Branch naming: + +- Use `<type>/<short-description>` +- Recommended types: `feat`, `fix`, `docs`, `refactor`, `test`, `chore`, `perf`, `build`, `ci` +- Examples: + - `feat/index-migrator` + - `fix/async-sentinel-pool` + - `docs/migration-benchmarking` + +Commit messages: + +- Use `<type>(optional-scope): <description>` +- Examples: + - `feat(migrate): add drop recreate plan generation` + - `docs(index-migrator): add benchmark guidance` + - `fix(cli): reject unsupported migration diffs` + ### Review Process - The core team reviews Pull Requests regularly diff --git a/docs/api/cli.rst b/docs/api/cli.rst new file mode 100644 index 000000000..4f651a38c --- /dev/null +++ b/docs/api/cli.rst @@ -0,0 +1,614 @@ +********************** +Command Line Interface +********************** + +RedisVL provides a command line interface (CLI) called ``rvl`` for managing vector search indices. The CLI enables you to create, inspect, and delete indices directly from your terminal without writing Python code. + +Installation +============ + +The ``rvl`` command is included when you install RedisVL. + +.. code-block:: bash + + pip install redisvl + +Verify the installation by running: + +.. code-block:: bash + + rvl version + +Connection Configuration +======================== + +The CLI connects to Redis using the following resolution order: + +1. The ``REDIS_URL`` environment variable, if set +2. Explicit connection flags (``--host``, ``--port``, ``--url``) +3. Default values (``localhost:6379``) + +**Connection Flags** + +All commands that interact with Redis accept these optional flags: + +..
list-table:: + :widths: 20 15 50 15 + :header-rows: 1 + + * - Flag + - Type + - Description + - Default + * - ``-u``, ``--url`` + - string + - Full Redis URL (e.g., ``redis://localhost:6379``) + - None + * - ``--host`` + - string + - Redis server hostname + - ``localhost`` + * - ``-p``, ``--port`` + - integer + - Redis server port + - ``6379`` + * - ``--user`` + - string + - Redis username for authentication + - ``default`` + * - ``-a``, ``--password`` + - string + - Redis password for authentication + - Empty + * - ``--ssl`` + - flag + - Enable SSL/TLS encryption + - Disabled + +**Examples** + +Connect using environment variable: + +.. code-block:: bash + + export REDIS_URL="redis://localhost:6379" + rvl index listall + +Connect with explicit host and port: + +.. code-block:: bash + + rvl index listall --host myredis.example.com --port 6380 + +Connect with authentication and SSL: + +.. code-block:: bash + + rvl index listall --user admin --password secret --ssl + +Getting Help +============ + +All commands support the ``-h`` and ``--help`` flags to display usage information. + +.. list-table:: + :widths: 25 75 + :header-rows: 1 + + * - Flag + - Description + * - ``-h``, ``--help`` + - Display usage information for the command + +**Examples** + +.. code-block:: bash + + # Display top-level help + rvl --help + + # Display help for a command group + rvl index --help + + # Display help for a specific subcommand + rvl index create --help + +Running ``rvl`` without any arguments also displays the top-level help message. + +.. tip:: + + For a hands-on tutorial with practical examples, see the :doc:`/user_guide/cli`. + +Commands +======== + +rvl version +----------- + +Display the installed RedisVL version. + +**Syntax** + +.. code-block:: bash + + rvl version [OPTIONS] + +**Options** + +.. 
list-table:: + :widths: 25 75 + :header-rows: 1 + + * - Option + - Description + * - ``-s``, ``--short`` + - Print only the version number without additional formatting + +**Examples** + +.. code-block:: bash + + # Full version output + rvl version + + # Version number only + rvl version --short + +rvl index +--------- + +Manage vector search indices. This command group provides subcommands for creating, inspecting, listing, and removing indices. + +**Syntax** + +.. code-block:: bash + + rvl index [OPTIONS] + +**Subcommands** + +.. list-table:: + :widths: 15 85 + :header-rows: 1 + + * - Subcommand + - Description + * - ``create`` + - Create a new index from a YAML schema file + * - ``info`` + - Display detailed information about an index + * - ``listall`` + - List all existing indices in the Redis instance + * - ``delete`` + - Remove an index while preserving the underlying data + * - ``destroy`` + - Remove an index and delete all associated data + +rvl index create +^^^^^^^^^^^^^^^^ + +Create a new vector search index from a YAML schema definition. + +**Syntax** + +.. code-block:: bash + + rvl index create -s [CONNECTION_OPTIONS] + +**Required Options** + +.. list-table:: + :widths: 25 75 + :header-rows: 1 + + * - Option + - Description + * - ``-s``, ``--schema`` + - Path to the YAML schema file defining the index structure + +**Example** + +.. code-block:: bash + + rvl index create -s schema.yaml + +**Schema File Format** + +The schema file must be valid YAML with the following structure: + +.. code-block:: yaml + + version: '0.1.0' + + index: + name: my_index + prefix: doc + storage_type: hash + + fields: + - name: content + type: text + - name: embedding + type: vector + attrs: + dims: 768 + algorithm: hnsw + distance_metric: cosine + +rvl index info +^^^^^^^^^^^^^^ + +Display detailed information about an existing index, including field definitions and index options. + +**Syntax** + +.. 
code-block:: bash + + rvl index info (-i | -s ) [OPTIONS] + +**Options** + +.. list-table:: + :widths: 25 75 + :header-rows: 1 + + * - Option + - Description + * - ``-i``, ``--index`` + - Name of the index to inspect + * - ``-s``, ``--schema`` + - Path to the schema file (alternative to specifying index name) + +**Example** + +.. code-block:: bash + + rvl index info -i my_index + +**Output** + +The command displays two tables: + +1. **Index Information** containing the index name, storage type, key prefixes, index options, and indexing status +2. **Index Fields** listing each field with its name, attribute, type, and any additional field options + +rvl index listall +^^^^^^^^^^^^^^^^^ + +List all vector search indices in the connected Redis instance. + +**Syntax** + +.. code-block:: bash + + rvl index listall [CONNECTION_OPTIONS] + +**Example** + +.. code-block:: bash + + rvl index listall + +**Output** + +Returns a numbered list of all index names: + +.. code-block:: text + + Indices: + 1. products_index + 2. documents_index + 3. embeddings_index + +rvl index delete +^^^^^^^^^^^^^^^^ + +Remove an index from Redis while preserving the underlying data. Use this when you want to rebuild an index with a different schema without losing your data. + +**Syntax** + +.. code-block:: bash + + rvl index delete (-i | -s ) [CONNECTION_OPTIONS] + +**Options** + +.. list-table:: + :widths: 25 75 + :header-rows: 1 + + * - Option + - Description + * - ``-i``, ``--index`` + - Name of the index to delete + * - ``-s``, ``--schema`` + - Path to the schema file (alternative to specifying index name) + +**Example** + +.. code-block:: bash + + rvl index delete -i my_index + +rvl index destroy +^^^^^^^^^^^^^^^^^ + +Remove an index and permanently delete all associated data from Redis. This operation cannot be undone. + +**Syntax** + +.. code-block:: bash + + rvl index destroy (-i | -s ) [CONNECTION_OPTIONS] + +**Options** + +.. 
list-table:: + :widths: 25 75 + :header-rows: 1 + + * - Option + - Description + * - ``-i``, ``--index`` + - Name of the index to destroy + * - ``-s``, ``--schema`` + - Path to the schema file (alternative to specifying index name) + +**Example** + +.. code-block:: bash + + rvl index destroy -i my_index + +.. warning:: + + This command permanently deletes both the index and all documents stored with the index prefix. Ensure you have backups before running this command. + +rvl stats +--------- + +Display statistics about an existing index, including document counts, memory usage, and indexing performance metrics. + +**Syntax** + +.. code-block:: bash + + rvl stats (-i | -s ) [OPTIONS] + +**Options** + +.. list-table:: + :widths: 25 75 + :header-rows: 1 + + * - Option + - Description + * - ``-i``, ``--index`` + - Name of the index to query + * - ``-s``, ``--schema`` + - Path to the schema file (alternative to specifying index name) + +**Example** + +.. code-block:: bash + + rvl stats -i my_index + +**Statistics Reference** + +The command returns the following metrics: + +.. 
list-table:: + :widths: 35 65 + :header-rows: 1 + + * - Metric + - Description + * - ``num_docs`` + - Total number of indexed documents + * - ``num_terms`` + - Number of distinct terms in text fields + * - ``max_doc_id`` + - Highest internal document ID + * - ``num_records`` + - Total number of index records + * - ``percent_indexed`` + - Percentage of documents fully indexed + * - ``hash_indexing_failures`` + - Number of documents that failed to index + * - ``number_of_uses`` + - Number of times the index has been queried + * - ``bytes_per_record_avg`` + - Average bytes per index record + * - ``doc_table_size_mb`` + - Document table size in megabytes + * - ``inverted_sz_mb`` + - Inverted index size in megabytes + * - ``key_table_size_mb`` + - Key table size in megabytes + * - ``offset_bits_per_record_avg`` + - Average offset bits per record + * - ``offset_vectors_sz_mb`` + - Offset vectors size in megabytes + * - ``offsets_per_term_avg`` + - Average offsets per term + * - ``records_per_doc_avg`` + - Average records per document + * - ``sortable_values_size_mb`` + - Sortable values size in megabytes + * - ``total_indexing_time`` + - Total time spent indexing in milliseconds + * - ``total_inverted_index_blocks`` + - Number of inverted index blocks + * - ``vector_index_sz_mb`` + - Vector index size in megabytes + +rvl migrate +----------- + +Manage document-preserving index migrations. This command group provides subcommands for planning, executing, and validating schema migrations that preserve existing data. + +**Syntax** + +.. code-block:: bash + + rvl migrate [OPTIONS] + +**Subcommands** + +.. 
list-table:: + :widths: 20 80 + :header-rows: 1 + + * - Subcommand + - Description + * - ``helper`` + - Show migration guidance and supported capabilities + * - ``list`` + - List all available indexes + * - ``plan`` + - Generate a migration plan from a schema patch or target schema + * - ``wizard`` + - Interactively build a migration plan and schema patch + * - ``apply`` + - Execute a reviewed drop/recreate migration plan + * - ``estimate`` + - Estimate disk space required for a migration (dry-run) + * - ``validate`` + - Validate a completed migration against the live index + * - ``batch-plan`` + - Generate a batch migration plan for multiple indexes + * - ``batch-apply`` + - Execute a batch migration plan with checkpointing + * - ``batch-resume`` + - Resume an interrupted batch migration + * - ``batch-status`` + - Show status of an in-progress or completed batch migration + +rvl migrate plan +^^^^^^^^^^^^^^^^ + +Generate a migration plan for a document-preserving drop/recreate migration. + +**Syntax** + +.. code-block:: bash + + rvl migrate plan --index (--patch | --target-schema ) [OPTIONS] + +**Required Options** + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Option + - Description + * - ``--index``, ``-i`` + - Name of the source index to migrate + * - ``--patch`` + - Path to a YAML schema patch file (mutually exclusive with ``--target-schema``) + * - ``--target-schema`` + - Path to a full target schema YAML file (mutually exclusive with ``--patch``) + +**Optional Options** + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Option + - Description + * - ``--output``, ``-o`` + - Output path for the migration plan YAML (default: ``migration_plan.yaml``) + +**Example** + +.. code-block:: bash + + rvl migrate plan -i my_index --patch changes.yaml -o plan.yaml + +rvl migrate apply +^^^^^^^^^^^^^^^^^ + +Execute a reviewed drop/recreate migration plan. Use ``--async`` for large migrations involving vector quantization. + +**Syntax** + +.. 
code-block:: bash + + rvl migrate apply --plan [OPTIONS] + +**Required Options** + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Option + - Description + * - ``--plan`` + - Path to the migration plan YAML file + +**Optional Options** + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Option + - Description + * - ``--async`` + - Run migration asynchronously (recommended for large quantization jobs) + * - ``--query-check`` + - Path to a YAML file with post-migration query checks + * - ``--resume`` + - Path to a checkpoint file for crash-safe recovery + +**Example** + +.. code-block:: bash + + rvl migrate apply --plan plan.yaml + rvl migrate apply --plan plan.yaml --async --resume checkpoint.yaml + +rvl migrate wizard +^^^^^^^^^^^^^^^^^^ + +Interactively build a schema patch and migration plan through a guided wizard. + +**Syntax** + +.. code-block:: bash + + rvl migrate wizard [--index ] [OPTIONS] + +**Example** + +.. code-block:: bash + + rvl migrate wizard -i my_index -o plan.yaml + +Exit Codes +========== + +The CLI returns the following exit codes: + +.. 
list-table:: + :widths: 15 85 + :header-rows: 1 + + * - Code + - Description + * - ``0`` + - Command completed successfully + * - ``1`` + - Command failed due to missing required arguments or invalid input + +Related Resources +================= + +- :doc:`/user_guide/cli` for a tutorial-style walkthrough +- :doc:`schema` for YAML schema format details +- :doc:`searchindex` for the Python ``SearchIndex`` API + diff --git a/docs/concepts/field-attributes.md b/docs/concepts/field-attributes.md index c7764a4a7..96060d2fb 100644 --- a/docs/concepts/field-attributes.md +++ b/docs/concepts/field-attributes.md @@ -267,7 +267,7 @@ Key vector attributes: - `dims`: Vector dimensionality (required) - `algorithm`: `flat`, `hnsw`, or `svs-vamana` - `distance_metric`: `COSINE`, `L2`, or `IP` -- `datatype`: `float16`, `float32`, `float64`, or `bfloat16` +- `datatype`: Vector precision (see table below) - `index_missing`: Allow searching for documents without vectors ```yaml @@ -281,6 +281,48 @@ Key vector attributes: index_missing: true # Handle documents without embeddings ``` +### Vector Datatypes + +The `datatype` attribute controls how vector components are stored. Smaller datatypes reduce memory usage but may affect precision. + +| Datatype | Bits | Memory (768 dims) | Use Case | +|----------|------|-------------------|----------| +| `float32` | 32 | 3 KB | Default. Best precision for most applications. | +| `float16` | 16 | 1.5 KB | Good balance of memory and precision. Recommended for large-scale deployments. | +| `bfloat16` | 16 | 1.5 KB | Better dynamic range than float16. Useful when embeddings have large value ranges. | +| `float64` | 64 | 6 KB | Maximum precision. Rarely needed. | +| `int8` | 8 | 768 B | Integer quantization. Significant memory savings with some precision loss. | +| `uint8` | 8 | 768 B | Unsigned integer quantization. For embeddings with non-negative values. 
| + +**Algorithm Compatibility:** + +| Datatype | FLAT | HNSW | SVS-VAMANA | +|----------|------|------|------------| +| `float32` | Yes | Yes | Yes | +| `float16` | Yes | Yes | Yes | +| `bfloat16` | Yes | Yes | No | +| `float64` | Yes | Yes | No | +| `int8` | Yes | Yes | No | +| `uint8` | Yes | Yes | No | + +**Choosing a Datatype:** + +- **Start with `float32`** unless you have memory constraints +- **Use `float16`** for production systems with millions of vectors (50% memory savings, minimal precision loss) +- **Use `int8`/`uint8`** only after benchmarking recall on your specific dataset +- **SVS-VAMANA users**: Must use `float16` or `float32` + +**Quantization with the Migrator:** + +You can change vector datatypes on existing indexes using the migration wizard: + +```bash +rvl migrate wizard --index my_index --url redis://localhost:6379 +# Select "Update field" > choose vector field > change datatype +``` + +The migrator automatically re-encodes stored vectors to the new precision. See {doc}`/user_guide/how_to_guides/migrate-indexes` for details. + ## Redis-Specific Subtleties ### Modifier Ordering @@ -304,6 +346,53 @@ Not all attributes work with all field types: | `unf` | ✓ | ✗ | ✓ | ✗ | ✗ | | `withsuffixtrie` | ✓ | ✓ | ✗ | ✗ | ✗ | +### Migration Support + +The migration wizard (`rvl migrate wizard`) supports updating field attributes on existing indexes. The table below shows which attributes can be updated via the wizard vs requiring manual schema patch editing. 
 + +**Wizard Prompts:** + +| Attribute | Text | Tag | Numeric | Geo | Vector | +|-----------|------|-----|---------|-----|--------| +| `sortable` | Wizard | Wizard | Wizard | Wizard | N/A | +| `index_missing` | Wizard | Wizard | Wizard | Wizard | N/A | +| `index_empty` | Wizard | Wizard | N/A | N/A | N/A | +| `no_index` | Wizard | Wizard | Wizard | Wizard | N/A | +| `unf` | Wizard* | N/A | Wizard* | N/A | N/A | +| `separator` | N/A | Wizard | N/A | N/A | N/A | +| `case_sensitive` | N/A | Wizard | N/A | N/A | N/A | +| `no_stem` | Wizard | N/A | N/A | N/A | N/A | +| `weight` | Wizard | N/A | N/A | N/A | N/A | +| `algorithm` | N/A | N/A | N/A | N/A | Wizard | +| `datatype` | N/A | N/A | N/A | N/A | Wizard | +| `distance_metric` | N/A | N/A | N/A | N/A | Wizard | +| `m`, `ef_construction` | N/A | N/A | N/A | N/A | Wizard | + +*\* `unf` is only prompted when `sortable` is enabled.* + +**Manual Schema Patch Required:** + +| Attribute | Notes | +|-----------|-------| +| `phonetic_matcher` | Enable phonetic search | +| `withsuffixtrie` | Suffix/contains search optimization | + +**Example manual patch** for adding `index_missing` to a field: + +```yaml +# schema_patch.yaml +version: 1 +changes: + update_fields: + - name: category + attrs: + index_missing: true +``` + +```bash +rvl migrate plan --index my_index --patch schema_patch.yaml +``` + ### JSON Path for Nested Fields When using JSON storage, use the `path` attribute to index nested fields: diff --git a/docs/concepts/index-migrations.md b/docs/concepts/index-migrations.md new file mode 100644 index 000000000..065522d98 --- /dev/null +++ b/docs/concepts/index-migrations.md @@ -0,0 +1,255 @@ +--- +myst: + html_meta: + "description lang=en": | + Learn how RedisVL index migrations work and which schema changes are supported. +--- + +# Index Migrations + +Redis Search indexes are immutable. To change an index schema, you must drop the existing index and create a new one.
RedisVL provides a migration workflow that automates this process while preserving your data. + +This page explains how migrations work and which changes are supported. For step by step instructions, see the [migration guide](../user_guide/how_to_guides/migrate-indexes.md). + +## Supported and blocked changes + +The migrator classifies schema changes into two categories: + +| Change | Status | +|--------|--------| +| Add or remove a field | Supported | +| Rename a field | Supported | +| Change field options (sortable, separator) | Supported | +| Change key prefix | Supported | +| Rename the index | Supported | +| Change vector algorithm (FLAT, HNSW, SVS-VAMANA) | Supported | +| Change distance metric (COSINE, L2, IP) | Supported | +| Tune algorithm parameters (M, EF_CONSTRUCTION) | Supported | +| Quantize vectors (float32 to float16/bfloat16/int8/uint8) | Supported | +| Change vector dimensions | Blocked | +| Change storage type (hash to JSON) | Blocked | +| Add a new vector field | Blocked | + +**Note:** INT8 and UINT8 vector datatypes require Redis 8.0+. SVS-VAMANA algorithm requires Redis 8.2+ and Intel AVX-512 hardware. + +**Supported** changes can be applied automatically using `rvl migrate`. The migrator handles the index rebuild and any necessary data transformations. + +**Blocked** changes require manual intervention because they involve incompatible data formats or missing data. The migrator will reject these changes and explain why. + +## How the migrator works + +The migrator uses a plan first workflow: + +1. **Plan**: Capture the current schema, classify your changes, and generate a migration plan +2. **Review**: Inspect the plan before making any changes +3. **Apply**: Drop the index, transform data if needed, and recreate with the new schema +4. **Validate**: Verify the result matches expectations + +This separation ensures you always know what will happen before any changes are made. 
+ +## Migration mode: drop_recreate + +The `drop_recreate` mode rebuilds the index in place while preserving your documents. + +The process: + +1. Drop only the index structure (documents remain in Redis) +2. For datatype changes, re-encode vectors to the target precision +3. Recreate the index with the new schema +4. Wait for Redis to re-index the existing documents +5. Validate the result + +**Tradeoff**: The index is unavailable during the rebuild. Review the migration plan carefully before applying. + +## Index only vs document dependent changes + +Schema changes fall into two categories based on whether they require modifying stored data. + +**Index only changes** affect how Redis Search indexes data, not the data itself: + +- Algorithm changes: The stored vector bytes are identical. Only the index structure differs. +- Distance metric changes: Same vectors, different similarity calculation. +- Adding or removing fields: The documents already contain the data. The index just starts or stops indexing it. + +These changes complete quickly because they only require rebuilding the index. + +**Document dependent changes** require modifying the stored data: + +- Datatype changes (float32 to float16): Stored vector bytes must be re-encoded. +- Field renames: Stored field names must be updated in every document. +- Dimension changes: Vectors must be re-embedded with a different model. + +The migrator handles datatype changes automatically. Other document dependent changes are blocked because they require application level logic or external services. + +## Vector quantization + +Changing vector precision from float32 to float16 reduces memory usage at the cost of slight precision loss. The migrator handles this automatically by: + +1. Reading all vectors from Redis +2. Converting to the target precision +3. Writing updated vectors back +4. 
Recreating the index with the new schema + +Typical reductions: + +| Metric | Value | +|--------|-------| +| Index size reduction | ~50% | +| Memory reduction | ~35% | + +Quantization time is proportional to document count. Plan for downtime accordingly. + +## Why some changes are blocked + +### Vector dimension changes + +Vector dimensions are determined by your embedding model. A 384 dimensional vector from one model is mathematically incompatible with a 768 dimensional index expecting vectors from a different model. There is no way to resize an embedding. + +**Resolution**: Re-embed your documents using the new model and load them into a new index. + +### Storage type changes + +Hash and JSON have different data layouts. Hash stores flat key value pairs. JSON stores nested structures. Converting between them requires understanding your schema and restructuring each document. + +**Resolution**: Export your data, transform it to the new format, and reload into a new index. + +### Adding a vector field + +Adding a vector field means all existing documents need vectors for that field. The migrator cannot generate these vectors because it does not know which embedding model to use or what content to embed. + +**Resolution**: Add vectors to your documents using your application, then run the migration. + +## Downtime considerations + +With `drop_recreate`, your index is unavailable between the drop and when re-indexing completes. + +**CRITICAL**: Downtime requires both reads AND writes to be paused: + +| Requirement | Reason | +|-------------|--------| +| **Pause reads** | Index is unavailable during migration | +| **Pause writes** | Redis updates indexes synchronously. 
Writes during migration may conflict with vector re-encoding or be missed | + +Plan for: + +- Search unavailability during the migration window +- Partial results while indexing is in progress +- Resource usage from the re-indexing process +- Quantization time if changing vector datatypes + +The duration depends on document count, field count, and vector dimensions. For large indexes, consider running migrations during low traffic periods. + +## Sync vs async execution + +The migrator provides both synchronous and asynchronous execution modes. + +### What becomes async and what stays sync + +The migration workflow has distinct phases. Here is what each mode affects: + +| Phase | Sync mode | Async mode | Notes | +|-------|-----------|------------|-------| +| **Plan generation** | `MigrationPlanner.create_plan()` | `AsyncMigrationPlanner.create_plan()` | Reads index metadata from Redis | +| **Schema snapshot** | Sync Redis calls | Async Redis calls | Single `FT.INFO` command | +| **Enumeration** | FT.AGGREGATE (or SCAN fallback) | FT.AGGREGATE (or SCAN fallback) | Before drop, only if quantization needed | +| **Drop index** | `index.delete()` | `await index.delete()` | Single `FT.DROPINDEX` command | +| **Quantization** | Sequential HGET + HSET | Sequential HGET + batched HSET | Uses pre-enumerated keys | +| **Create index** | `index.create()` | `await index.create()` | Single `FT.CREATE` command | +| **Readiness polling** | `time.sleep()` loop | `asyncio.sleep()` loop | Polls `FT.INFO` until indexed | +| **Validation** | Sync Redis calls | Async Redis calls | Schema and doc count checks | +| **CLI interaction** | Always sync | Always sync | User prompts, file I/O | +| **YAML read/write** | Always sync | Always sync | Local filesystem only | + +### When to use sync (default) + +Sync execution is simpler and sufficient for most migrations: + +- Small to medium indexes (under 100K documents) +- Index-only changes (algorithm, distance metric, field options) +- 
Interactive CLI usage where blocking is acceptable + +For migrations without quantization, the Redis operations are fast single commands. Sync mode adds no meaningful overhead. + +### When to use async + +Async execution (`--async` flag) provides benefits in specific scenarios: + +**Large quantization jobs (1M+ vectors)** + +Converting float32 to float16 requires reading every vector, converting it, and writing it back. The async executor: + +- Enumerates documents using `FT.AGGREGATE WITHCURSOR` for index-specific enumeration (falls back to `SCAN` only if indexing failures exist) +- Pipelines `HSET` operations in batches (100-1000 operations per pipeline is optimal for Redis) +- Yields to the event loop between batches so other tasks can proceed + +**Large keyspaces (40M+ keys)** + +When your Redis instance has many keys and the index has indexing failures (requiring SCAN fallback), async mode yields between batches. + +**Async application integration** + +If your application uses asyncio, you can integrate migration directly: + +```python +import asyncio +from redisvl.migration import AsyncMigrationPlanner, AsyncMigrationExecutor + +async def migrate(): + planner = AsyncMigrationPlanner() + plan = await planner.create_plan("myindex", redis_url="redis://localhost:6379") + + executor = AsyncMigrationExecutor() + report = await executor.apply(plan, redis_url="redis://localhost:6379") + +asyncio.run(migrate()) +``` + +### Why async helps with quantization + +The migrator uses an optimized enumeration strategy: + +1. **Index-based enumeration**: Uses `FT.AGGREGATE WITHCURSOR` to enumerate only indexed documents (not the entire keyspace) +2. **Fallback for safety**: If the index has indexing failures (`hash_indexing_failures > 0`), falls back to `SCAN` to ensure completeness +3. 
**Enumerate before drop**: Captures the document list while the index still exists, then drops and quantizes + +This optimization provides 10-1000x speedup for sparse indexes (where only a small fraction of prefix-matching keys are indexed). + +**Sync quantization:** +``` +enumerate keys (FT.AGGREGATE or SCAN) -> store list +for each batch of 500 keys: + for each key: + HGET field (blocks) + convert array + pipeline.HSET(field, new_bytes) + pipeline.execute() (blocks) +``` + +**Async quantization:** +``` +enumerate keys (FT.AGGREGATE or SCAN) -> store list +for each batch of 500 keys: + for each key: + await HGET field (yields) + convert array + pipeline.HSET(field, new_bytes) + await pipeline.execute() (yields) +``` + +Each `await` is a yield point where other coroutines can run. For millions of vectors, this prevents your application from freezing. + +### What async does NOT improve + +Async execution does not reduce: + +- **Total migration time**: Same work, different scheduling +- **Redis server load**: Same commands execute on the server +- **Downtime window**: Index remains unavailable during rebuild +- **Network round trips**: Same number of Redis calls + +The benefit is application responsiveness, not faster migration. + +## Learn more + +- [Migration guide](../user_guide/how_to_guides/migrate-indexes.md): Step by step instructions +- [Search and indexing](search-and-indexing.md): How Redis Search indexes work diff --git a/docs/concepts/index.md b/docs/concepts/index.md index 0e522b1a2..02f4d8b01 100644 --- a/docs/concepts/index.md +++ b/docs/concepts/index.md @@ -26,6 +26,13 @@ How RedisVL components connect: schemas, indexes, queries, and extensions. Schemas, fields, documents, storage types, and query patterns. ::: +:::{grid-item-card} 🔄 Index Migrations +:link: index-migrations +:link-type: doc + +How RedisVL handles migration planning, rebuilds, and future shadow migration. 
+::: + :::{grid-item-card} 🏷️ Field Attributes :link: field-attributes :link-type: doc @@ -62,6 +69,7 @@ Pre-built patterns: caching, message history, and semantic routing. architecture search-and-indexing +index-migrations field-attributes queries utilities diff --git a/docs/concepts/search-and-indexing.md b/docs/concepts/search-and-indexing.md index b4fe69569..5312d7dfb 100644 --- a/docs/concepts/search-and-indexing.md +++ b/docs/concepts/search-and-indexing.md @@ -106,9 +106,14 @@ To change a schema, you create a new index with the updated configuration, reind Planning your schema carefully upfront reduces the need for migrations, but the capability exists when requirements evolve. ---- +RedisVL now includes a dedicated migration workflow for this lifecycle: + +- `drop_recreate` for document-preserving rebuilds, including vector quantization (`float32` → `float16`) -**Related concepts:** {doc}`field-attributes` explains how to configure field options like `sortable` and `index_missing`. {doc}`queries` covers the different query types available. +That means schema evolution is no longer only a manual operational pattern. It is also a product surface in RedisVL with a planner, CLI, and validation artifacts. + +--- -**Learn more:** {doc}`/user_guide/01_getting_started` walks through building your first index. {doc}`/user_guide/05_hash_vs_json` compares storage options in depth. {doc}`/user_guide/02_complex_filtering` covers query composition. +**Related concepts:** {doc}`field-attributes` explains how to configure field options like `sortable` and `index_missing`. {doc}`queries` covers the different query types available. {doc}`index-migrations` explains migration modes, supported changes, and architecture. +**Learn more:** {doc}`/user_guide/01_getting_started` walks through building your first index. {doc}`/user_guide/05_hash_vs_json` compares storage options in depth. {doc}`/user_guide/02_complex_filtering` covers query composition. 
{doc}`/user_guide/how_to_guides/migrate-indexes` shows how to use the migration CLI in practice. diff --git a/docs/user_guide/13_sql_query_exercises.ipynb b/docs/user_guide/13_sql_query_exercises.ipynb new file mode 100644 index 000000000..c09af709f --- /dev/null +++ b/docs/user_guide/13_sql_query_exercises.ipynb @@ -0,0 +1,1239 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SQL-to-Redis Query Translation: Hands-On Exercises\n", + "\n", + "This notebook provides hands-on exercises for learning the new **SQLQuery** feature in RedisVL, which allows you to write familiar SQL syntax that automatically translates to Redis Search commands.\n", + "\n", + "## What You'll Learn\n", + "\n", + "1. How to use the `SQLQuery` class to write SQL-like queries\n", + "2. Three equivalent approaches for the same queries:\n", + " - **RedisVL Python API** - Using native query classes (`FilterQuery`, `VectorQuery`, etc.)\n", + " - **RedisVL SQL** - Using the new `SQLQuery` class with SQL syntax\n", + " - **Raw Redis FT.SEARCH** - The equivalent Redis Search command\n", + "3. 
Various query types: filtering, numeric ranges, text search, aggregations, and vector similarity\n", + "\n", + "## Prerequisites\n", + "\n", + "- Redis Stack running locally (or Redis Cloud)\n", + "- RedisVL with SQL support: `pip install redisvl[sql-redis]`\n", + "\n", + "## Documentation References\n", + "\n", + "- [RedisVL Documentation](https://docs.redisvl.com)\n", + "- [Redis Search Query Syntax](https://redis.io/docs/latest/develop/ai/search-and-query/query/)\n", + "- [Redis Aggregations](https://redis.io/docs/latest/develop/ai/search-and-query/advanced-concepts/aggregations/)\n", + "- [sql-redis Package](https://pypi.org/project/sql-redis/)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup: Create Sample Dataset and Index\n", + "\n", + "We'll create a realistic e-commerce products dataset with multiple field types to demonstrate various query capabilities." + ] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T22:52:37.820727Z", + "start_time": "2026-02-05T22:52:37.431065Z" + } + }, + "source": [ + "import numpy as np\n", + "from redis import Redis\n", + "from redisvl.index import SearchIndex\n", + "from redisvl.query import FilterQuery, VectorQuery, CountQuery, SQLQuery\n", + "from redisvl.query.filter import Tag, Num, Text\n", + "\n", + "# Redis connection\n", + "REDIS_URL = \"redis://localhost:6379\"\n", + "client = Redis.from_url(REDIS_URL)\n", + "\n", + "# Define schema with multiple field types\n", + "schema = {\n", + " \"index\": {\n", + " \"name\": \"products_exercise\",\n", + " \"prefix\": \"product_exercise\",\n", + " \"storage_type\": \"hash\",\n", + " },\n", + " \"fields\": [\n", + " {\"name\": \"name\", \"type\": \"text\", \"attrs\": {\"sortable\": True}},\n", + " {\"name\": \"description\", \"type\": \"text\"},\n", + " {\"name\": \"category\", \"type\": \"tag\", \"attrs\": {\"sortable\": True}},\n", + " {\"name\": \"brand\", \"type\": \"tag\"},\n", 
+ " {\"name\": \"price\", \"type\": \"numeric\", \"attrs\": {\"sortable\": True}},\n", + " {\"name\": \"stock\", \"type\": \"numeric\", \"attrs\": {\"sortable\": True}},\n", + " {\"name\": \"rating\", \"type\": \"numeric\", \"attrs\": {\"sortable\": True}},\n", + " {\n", + " \"name\": \"embedding\",\n", + " \"type\": \"vector\",\n", + " \"attrs\": {\n", + " \"dims\": 4,\n", + " \"distance_metric\": \"cosine\",\n", + " \"algorithm\": \"flat\",\n", + " \"datatype\": \"float32\",\n", + " },\n", + " },\n", + " ],\n", + "}\n", + "\n", + "# Create the index\n", + "index = SearchIndex.from_dict(schema, redis_client=client)\n", + "index.create(overwrite=True, drop=True)\n", + "print(f\"Created index: {index.name}\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created index: products_exercise\n" + ] + } + ], + "execution_count": 1 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T22:52:57.150152Z", + "start_time": "2026-02-05T22:52:57.105851Z" + } + }, + "source": [ + "# Sample product data with embeddings\n", + "products = [\n", + " {\"name\": \"Gaming Laptop Pro\", \"description\": \"High-performance laptop for gaming\", \"category\": \"electronics\", \"brand\": \"TechBrand\", \"price\": 1299, \"stock\": 15, \"rating\": 4.7, \"embedding\": np.array([0.9, 0.1, 0.2, 0.3], dtype=np.float32).tobytes()},\n", + " {\"name\": \"Budget Laptop Basic\", \"description\": \"Affordable laptop for everyday tasks\", \"category\": \"electronics\", \"brand\": \"ValueTech\", \"price\": 499, \"stock\": 50, \"rating\": 4.0, \"embedding\": np.array([0.8, 0.2, 0.3, 0.4], dtype=np.float32).tobytes()},\n", + " {\"name\": \"Wireless Mouse\", \"description\": \"Ergonomic wireless mouse\", \"category\": \"electronics\", \"brand\": \"TechBrand\", \"price\": 35, \"stock\": 200, \"rating\": 4.3, \"embedding\": np.array([0.7, 0.3, 0.4, 0.5], dtype=np.float32).tobytes()},\n", + " {\"name\": \"Python Programming Guide\", 
\"description\": \"Comprehensive Python programming guide\", \"category\": \"books\", \"brand\": \"TechBooks\", \"price\": 45, \"stock\": 100, \"rating\": 4.8, \"embedding\": np.array([0.2, 0.8, 0.1, 0.3], dtype=np.float32).tobytes()},\n", + " {\"name\": \"Redis in Action\", \"description\": \"Learn Redis with practical examples\", \"category\": \"books\", \"brand\": \"TechBooks\", \"price\": 55, \"stock\": 75, \"rating\": 4.6, \"embedding\": np.array([0.3, 0.7, 0.2, 0.4], dtype=np.float32).tobytes()},\n", + " {\"name\": \"Data Science Handbook\", \"description\": \"Essential data science handbook\", \"category\": \"books\", \"brand\": \"DataPress\", \"price\": 65, \"stock\": 40, \"rating\": 4.5, \"embedding\": np.array([0.25, 0.75, 0.15, 0.35], dtype=np.float32).tobytes()},\n", + " {\"name\": \"Mechanical Keyboard\", \"description\": \"Premium mechanical keyboard with RGB\", \"category\": \"electronics\", \"brand\": \"KeyMaster\", \"price\": 149, \"stock\": 80, \"rating\": 4.6, \"embedding\": np.array([0.6, 0.4, 0.5, 0.6], dtype=np.float32).tobytes()},\n", + " {\"name\": \"USB-C Hub\", \"description\": \"Multi-port USB-C hub\", \"category\": \"electronics\", \"brand\": \"TechBrand\", \"price\": 49, \"stock\": 150, \"rating\": 4.2, \"embedding\": np.array([0.65, 0.35, 0.45, 0.55], dtype=np.float32).tobytes()},\n", + " {\"name\": \"Desk Lamp LED\", \"description\": \"Adjustable LED desk lamp\", \"category\": \"accessories\", \"brand\": \"LightCo\", \"price\": 39, \"stock\": 120, \"rating\": 4.1, \"embedding\": np.array([0.4, 0.5, 0.6, 0.7], dtype=np.float32).tobytes()},\n", + " {\"name\": \"Monitor Stand\", \"description\": \"Ergonomic monitor stand\", \"category\": \"accessories\", \"brand\": \"DeskPro\", \"price\": 79, \"stock\": 60, \"rating\": 4.4, \"embedding\": np.array([0.45, 0.55, 0.65, 0.75], dtype=np.float32).tobytes()},\n", + "]\n", + "\n", + "# Load data into Redis\n", + "keys = index.load(products)\n", + "print(f\"Loaded {len(keys)} products into 
Redis\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 10 products into Redis\n" + ] + } + ], + "execution_count": 2 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Exercise 1: Simple Tag Filtering\n", + "\n", + "**Goal:** Find all products in the \"electronics\" category.\n", + "\n", + "### Do It Yourself\n", + "\n", + "**Documentation:**\n", + "- [RedisVL FilterQuery](https://docs.redisvl.com/en/latest/api/query.html#filterquery)\n", + "- [Redis Tag Queries](https://redis.io/docs/latest/develop/ai/search-and-query/query/exact-match/)" + ] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T22:56:41.131491Z", + "start_time": "2026-02-05T22:56:41.113689Z" + } + }, + "source": [ + "# YOUR CODE HERE - Method 1: RedisVL Python API\n", + "# Hint: Use Tag(\"category\") == \"electronics\" with FilterQuery\n", + "q= FilterQuery(\n", + " filter_expression=Tag(\"category\") == \"electronics\",\n", + " return_fields=[\"name\", \"category\", \"price\"],\n", + " num_results=10\n", + ")\n", + "\n", + "q = index.query(q)\n", + "q" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GK',\n", + " 'name': 'Wireless Mouse',\n", + " 'category': 'electronics',\n", + " 'price': '35'},\n", + " {'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GQ',\n", + " 'name': 'Mechanical Keyboard',\n", + " 'category': 'electronics',\n", + " 'price': '149'},\n", + " {'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GH',\n", + " 'name': 'Gaming Laptop Pro',\n", + " 'category': 'electronics',\n", + " 'price': '1299'},\n", + " {'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GJ',\n", + " 'name': 'Budget Laptop Basic',\n", + " 'category': 'electronics',\n", + " 'price': '499'},\n", + " {'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GR',\n", + " 'name': 'USB-C Hub',\n", + " 'category': 'electronics',\n", + " 'price': '49'}]" + 
] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 5 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T22:58:38.451826Z", + "start_time": "2026-02-05T22:58:38.269871Z" + } + }, + "source": [ + "# YOUR CODE HERE - Method 2: SQLQuery\n", + "# Hint: SELECT ... FROM products_exercise WHERE category = 'electronics'\n", + "sql_query = SQLQuery(f\"\"\"\n", + " SELECT name, category, price\n", + " FROM {index.name}\n", + " WHERE category = 'electronics'\n", + "\"\"\")\n", + "index.query(sql_query)" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': 'Wireless Mouse', 'category': 'electronics', 'price': '35'},\n", + " {'name': 'Mechanical Keyboard', 'category': 'electronics', 'price': '149'},\n", + " {'name': 'Gaming Laptop Pro', 'category': 'electronics', 'price': '1299'},\n", + " {'name': 'Budget Laptop Basic', 'category': 'electronics', 'price': '499'},\n", + " {'name': 'USB-C Hub', 'category': 'electronics', 'price': '49'}]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T22:59:22.518910Z", + "start_time": "2026-02-05T22:59:22.495076Z" + } + }, + "source": [ + "# YOUR CODE HERE - Method 3: Raw FT.SEARCH\n", + "# Hint: client.execute_command(\"FT.SEARCH\", index_name, \"@category:{electronics}\", ...)\n", + "client.execute_command(\"FT.Search\", index.name, \"@category:{electronics}\")" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[5,\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GK',\n", + " [b'name',\n", + " b'Wireless Mouse',\n", + " b'description',\n", + " b'Ergonomic wireless mouse',\n", + " b'category',\n", + " b'electronics',\n", + " b'brand',\n", + " b'TechBrand',\n", + " b'price',\n", + " b'35',\n", + " b'stock',\n", + " b'200',\n", + " b'rating',\n", + " b'4.3',\n", + 
" b'embedding',\n", + " b'333?\\x9a\\x99\\x99>\\xcd\\xcc\\xcc>\\x00\\x00\\x00?'],\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GQ',\n", + " [b'name',\n", + " b'Mechanical Keyboard',\n", + " b'description',\n", + " b'Premium mechanical keyboard with RGB',\n", + " b'category',\n", + " b'electronics',\n", + " b'brand',\n", + " b'KeyMaster',\n", + " b'price',\n", + " b'149',\n", + " b'stock',\n", + " b'80',\n", + " b'rating',\n", + " b'4.6',\n", + " b'embedding',\n", + " b'\\x9a\\x99\\x19?\\xcd\\xcc\\xcc>\\x00\\x00\\x00?\\x9a\\x99\\x19?'],\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GH',\n", + " [b'name',\n", + " b'Gaming Laptop Pro',\n", + " b'description',\n", + " b'High-performance laptop for gaming',\n", + " b'category',\n", + " b'electronics',\n", + " b'brand',\n", + " b'TechBrand',\n", + " b'price',\n", + " b'1299',\n", + " b'stock',\n", + " b'15',\n", + " b'rating',\n", + " b'4.7',\n", + " b'embedding',\n", + " b'fff?\\xcd\\xcc\\xcc=\\xcd\\xccL>\\x9a\\x99\\x99>'],\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GJ',\n", + " [b'name',\n", + " b'Budget Laptop Basic',\n", + " b'description',\n", + " b'Affordable laptop for everyday tasks',\n", + " b'category',\n", + " b'electronics',\n", + " b'brand',\n", + " b'ValueTech',\n", + " b'price',\n", + " b'499',\n", + " b'stock',\n", + " b'50',\n", + " b'rating',\n", + " b'4.0',\n", + " b'embedding',\n", + " b'\\xcd\\xccL?\\xcd\\xccL>\\x9a\\x99\\x99>\\xcd\\xcc\\xcc>'],\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GR',\n", + " [b'name',\n", + " b'USB-C Hub',\n", + " b'description',\n", + " b'Multi-port USB-C hub',\n", + " b'category',\n", + " b'electronics',\n", + " b'brand',\n", + " b'TechBrand',\n", + " b'price',\n", + " b'49',\n", + " b'stock',\n", + " b'150',\n", + " b'rating',\n", + " b'4.2',\n", + " b'embedding',\n", + " b'ff&?33\\xb3>ff\\xe6>\\xcd\\xcc\\x0c?']]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 9 + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Solution: Exercise 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 1: RedisVL Python API\n", + "filter_expr = Tag(\"category\") == \"electronics\"\n", + "query = FilterQuery(filter_expression=filter_expr, return_fields=[\"name\", \"category\", \"price\"], num_results=10)\n", + "results_api = index.query(query)\n", + "print(\"=== Method 1: RedisVL Python API ===\")\n", + "for r in results_api:\n", + " print(f\" {r['name']} - ${r['price']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 2: RedisVL SQL\n", + "sql_query = SQLQuery(f\"\"\"\n", + " SELECT name, category, price\n", + " FROM {index.name}\n", + " WHERE category = 'electronics'\n", + "\"\"\")\n", + "results_sql = index.query(sql_query)\n", + "print(\"=== Method 2: RedisVL SQL ===\")\n", + "for r in results_sql:\n", + " print(f\" {r['name']} - ${r['price']}\")\n", + "\n", + "# Show the translated Redis command\n", + "redis_cmd = sql_query.redis_query_string(redis_client=client)\n", + "print(f\"\\nTranslated Redis command: {redis_cmd}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 3: Raw Redis FT.SEARCH\n", + "raw_results = client.execute_command(\"FT.SEARCH\", index.name, \"@category:{electronics}\", \"RETURN\", \"3\", \"name\", \"category\", \"price\", \"LIMIT\", \"0\", \"10\")\n", + "print(\"=== Method 3: Raw FT.SEARCH ===\")\n", + "total = raw_results[0]\n", + "print(f\"Total matches: {total}\")\n", + "for i in range(1, len(raw_results), 2):\n", + " if i + 1 < len(raw_results):\n", + " fields = raw_results[i + 1]\n", + " field_dict = {fields[j].decode(): fields[j+1].decode() for j in range(0, len(fields), 2)}\n", + " print(f\" {field_dict.get('name', 'N/A')} - ${field_dict.get('price', 'N/A')}\")\n" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Exercise 2: Numeric Range Queries\n", + "\n", + "**Goal:** Find all products with price between $40 and $100.\n", + "\n", + "### Do It Yourself\n", + "\n", + "**Documentation:**\n", + "- [RedisVL Numeric Filters](https://docs.redisvl.com/en/latest/api/query.html#redisvl.query.filter.Num)\n", + "- [Redis Numeric Range Queries](https://redis.io/docs/latest/develop/ai/search-and-query/query/range/)" + ] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T23:01:53.873244Z", + "start_time": "2026-02-05T23:01:53.847120Z" + } + }, + "source": [ + "# YOUR CODE HERE - Method 1: RedisVL Python API\n", + "# Hint: Use Num(\"price\").between(40, 100) with FilterQuery\n", + "q = FilterQuery(\n", + " filter_expression = Num(\"price\").between(40,100),\n", + " return_fields=[\"name\", \"price\"],\n", + " num_results=10\n", + ")\n", + "index.query(q)" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[{'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GM',\n", + " 'name': 'Python Programming Guide',\n", + " 'price': '45'},\n", + " {'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GN',\n", + " 'name': 'Redis in Action',\n", + " 'price': '55'},\n", + " {'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GP',\n", + " 'name': 'Data Science Handbook',\n", + " 'price': '65'},\n", + " {'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GR',\n", + " 'name': 'USB-C Hub',\n", + " 'price': '49'},\n", + " {'id': 'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GT',\n", + " 'name': 'Monitor Stand',\n", + " 'price': '79'}]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 14 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T23:02:52.791662Z", + "start_time": "2026-02-05T23:02:52.770651Z" + } + }, + "source": [ + "# YOUR CODE HERE - Method 2: SQLQuery\n", + "# Hint: SELECT ... 
WHERE price BETWEEN 40 AND 100\n", + "sql_query = SQLQuery(f\"\"\"\n", + " SELECT name, price from {index.name} where price between 40 and 100\n", + "\"\"\")\n", + "\n", + "index.query(sql_query)\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': 'Python Programming Guide', 'price': '45'},\n", + " {'name': 'Redis in Action', 'price': '55'},\n", + " {'name': 'Data Science Handbook', 'price': '65'},\n", + " {'name': 'USB-C Hub', 'price': '49'},\n", + " {'name': 'Monitor Stand', 'price': '79'}]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 17 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T23:03:50.055289Z", + "start_time": "2026-02-05T23:03:50.029265Z" + } + }, + "source": [ + "# YOUR CODE HERE - Method 3: Raw FT.SEARCH\n", + "# Hint: @price:[40 100]\n", + "client.execute_command(\"FT.SEARCH\", index.name, \"@price:[40 100]\", \"RETURN\", \"2\", \"name\", \"price\", \"LIMIT\", \"0\", \"10\")" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "[5,\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GM',\n", + " [b'name', b'Python Programming Guide', b'price', b'45'],\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GN',\n", + " [b'name', b'Redis in Action', b'price', b'55'],\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GP',\n", + " [b'name', b'Data Science Handbook', b'price', b'65'],\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GR',\n", + " [b'name', b'USB-C Hub', b'price', b'49'],\n", + " b'product_exercise:01KGR04DCSDX9D0KSD7GAYC1GT',\n", + " [b'name', b'Monitor Stand', b'price', b'79']]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 19 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Solution: Exercise 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"# Method 1: RedisVL Python API\n", + "filter_expr = Num(\"price\").between(40, 100)\n", + "query = FilterQuery(filter_expression=filter_expr, return_fields=[\"name\", \"price\"], num_results=10)\n", + "results_api = index.query(query)\n", + "print(\"=== Method 1: RedisVL Python API ===\")\n", + "for r in results_api:\n", + " print(f\" {r['name']} - ${r['price']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 2: RedisVL SQL\n", + "sql_query = SQLQuery(f\"\"\"\n", + " SELECT name, price\n", + " FROM {index.name}\n", + " WHERE price BETWEEN 40 AND 100\n", + "\"\"\")\n", + "results_sql = index.query(sql_query)\n", + "print(\"=== Method 2: RedisVL SQL ===\")\n", + "for r in results_sql:\n", + " print(f\" {r['name']} - ${r['price']}\")\n", + "\n", + "redis_cmd = sql_query.redis_query_string(redis_client=client)\n", + "print(f\"\\nTranslated Redis command: {redis_cmd}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 3: Raw Redis FT.SEARCH\n", + "raw_results = client.execute_command(\"FT.SEARCH\", index.name, \"@price:[40 100]\", \"RETURN\", \"2\", \"name\", \"price\", \"LIMIT\", \"0\", \"10\")\n", + "print(\"=== Method 3: Raw FT.SEARCH ===\")\n", + "for i in range(1, len(raw_results), 2):\n", + " if i + 1 < len(raw_results):\n", + " fields = raw_results[i + 1]\n", + " field_dict = {fields[j].decode(): fields[j+1].decode() for j in range(0, len(fields), 2)}\n", + " print(f\" {field_dict.get('name', 'N/A')} - ${field_dict.get('price', 'N/A')}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Exercise 3: Combined Filters (AND/OR)\n", + "\n", + "**Goal:** Find electronics products under $100.\n", + "\n", + "### Do It Yourself\n", + "\n", + "**Documentation:**\n", + "- [RedisVL Filter Expressions](https://docs.redisvl.com/en/latest/api/query.html#filter-expressions)\n", + "- 
[Redis Combined Queries](https://redis.io/docs/latest/develop/ai/search-and-query/query/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 1: RedisVL Python API\n", + "# Hint: Combine filters with & operator: (Tag(\"category\") == \"electronics\") & (Num(\"price\") < 100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 2: SQLQuery\n", + "# Hint: WHERE category = 'electronics' AND price < 100\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 3: Raw FT.SEARCH\n", + "# Hint: (@category:{electronics} @price:[-inf 100])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Solution: Exercise 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 1: RedisVL Python API\n", + "filter_expr = (Tag(\"category\") == \"electronics\") & (Num(\"price\") < 100)\n", + "query = FilterQuery(filter_expression=filter_expr, return_fields=[\"name\", \"category\", \"price\"], num_results=10)\n", + "results_api = index.query(query)\n", + "print(\"=== Method 1: RedisVL Python API ===\")\n", + "for r in results_api:\n", + " print(f\" {r['name']} ({r['category']}) - ${r['price']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 2: RedisVL SQL\n", + "sql_query = SQLQuery(f\"\"\"\n", + " SELECT name, category, price\n", + " FROM {index.name}\n", + " WHERE category = 'electronics' AND price < 100\n", + "\"\"\")\n", + "results_sql = index.query(sql_query)\n", + "print(\"=== Method 2: RedisVL SQL ===\")\n", + "for r in results_sql:\n", + " print(f\" {r['name']} ({r['category']}) - ${r['price']}\")\n", + "\n", + "redis_cmd = 
sql_query.redis_query_string(redis_client=client)\n", + "print(f\"\\nTranslated Redis command: {redis_cmd}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 3: Raw Redis FT.SEARCH\n", + "raw_results = client.execute_command(\"FT.SEARCH\", index.name, \"(@category:{electronics} @price:[-inf (100])\", \"RETURN\", \"3\", \"name\", \"category\", \"price\", \"LIMIT\", \"0\", \"10\")\n", + "print(\"=== Method 3: Raw FT.SEARCH ===\")\n", + "for i in range(1, len(raw_results), 2):\n", + " if i + 1 < len(raw_results):\n", + " fields = raw_results[i + 1]\n", + " field_dict = {fields[j].decode(): fields[j+1].decode() for j in range(0, len(fields), 2)}\n", + " print(f\" {field_dict.get('name', 'N/A')} ({field_dict.get('category', 'N/A')}) - ${field_dict.get('price', 'N/A')}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Exercise 4: Text Search\n", + "\n", + "**Goal:** Find products with \"laptop\" in the name.\n", + "\n", + "### Do It Yourself\n", + "\n", + "**Documentation:**\n", + "- [RedisVL Text Filters](https://docs.redisvl.com/en/latest/api/query.html#redisvl.query.filter.Text)\n", + "- [Redis Full-Text Search](https://redis.io/docs/latest/develop/ai/search-and-query/query/full-text/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 1: RedisVL Python API\n", + "# Hint: Use Text(\"name\") % \"laptop\" with FilterQuery\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 2: SQLQuery\n", + "# Hint: WHERE name = 'laptop'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 3: Raw FT.SEARCH\n", + "# Hint: @name:laptop\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "### Solution: Exercise 4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 1: RedisVL Python API\n", + "filter_expr = Text(\"name\") % \"laptop\"\n", + "query = FilterQuery(filter_expression=filter_expr, return_fields=[\"name\", \"description\", \"price\"], num_results=10)\n", + "results_api = index.query(query)\n", + "print(\"=== Method 1: RedisVL Python API ===\")\n", + "for r in results_api:\n", + " print(f\" {r['name']} - ${r['price']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 2: RedisVL SQL\n", + "sql_query = SQLQuery(f\"\"\"\n", + " SELECT name, description, price\n", + " FROM {index.name}\n", + " WHERE name = 'laptop'\n", + "\"\"\")\n", + "results_sql = index.query(sql_query)\n", + "print(\"=== Method 2: RedisVL SQL ===\")\n", + "for r in results_sql:\n", + " print(f\" {r['name']} - ${r['price']}\")\n", + "\n", + "redis_cmd = sql_query.redis_query_string(redis_client=client)\n", + "print(f\"\\nTranslated Redis command: {redis_cmd}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 3: Raw Redis FT.SEARCH\n", + "raw_results = client.execute_command(\"FT.SEARCH\", index.name, \"@name:laptop\", \"RETURN\", \"3\", \"name\", \"description\", \"price\", \"LIMIT\", \"0\", \"10\")\n", + "print(\"=== Method 3: Raw FT.SEARCH ===\")\n", + "for i in range(1, len(raw_results), 2):\n", + " if i + 1 < len(raw_results):\n", + " fields = raw_results[i + 1]\n", + " field_dict = {fields[j].decode(): fields[j+1].decode() for j in range(0, len(fields), 2)}\n", + " print(f\" {field_dict.get('name', 'N/A')} - ${field_dict.get('price', 'N/A')}\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Exercise 5: Vector Similarity Search\n", + "\n", + "**Goal:** Find products most similar to 
a query vector (simulating a semantic search).\n", + "\n", + "### Do It Yourself\n", + "\n", + "**Documentation:**\n", + "- [RedisVL VectorQuery](https://docs.redisvl.com/en/latest/api/query.html#vectorquery)\n", + "- [Redis Vector Search](https://redis.io/docs/latest/develop/ai/search-and-query/vectors/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 1: RedisVL Python API\n", + "# Hint: Use VectorQuery with a query vector\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 2: SQLQuery\n", + "# Hint: SELECT ... ORDER BY cosine_distance(embedding, ) LIMIT k\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 3: Raw FT.SEARCH\n", + "# Hint: FT.SEARCH with KNN and BLOB parameter\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Solution: Exercise 5" + ] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T23:05:53.684001Z", + "start_time": "2026-02-05T23:05:53.656720Z" + } + }, + "source": [ + "# Query vector (similar to electronics products)\n", + "query_vector = np.array([0.85, 0.15, 0.25, 0.35], dtype=np.float32)\n", + "\n", + "# Method 1: RedisVL Python API\n", + "vector_query = VectorQuery(\n", + " vector=query_vector,\n", + " vector_field_name=\"embedding\",\n", + " return_fields=[\"name\", \"category\", \"price\"],\n", + " num_results=3\n", + ")\n", + "results_api = index.query(vector_query)\n", + "print(\"=== Method 1: RedisVL Python API ===\")\n", + "for r in results_api:\n", + " print(f\" {r['name']} ({r['category']}) - distance: {r.get('vector_distance', 'N/A')}\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Method 1: RedisVL Python API ===\n", + " Gaming Laptop Pro 
(electronics) - distance: 0.00526285171509\n", + " Budget Laptop Basic (electronics) - distance: 0.00537633895874\n", + " Wireless Mouse (electronics) - distance: 0.0464093089104\n" + ] + } + ], + "execution_count": 21 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T23:07:23.911432Z", + "start_time": "2026-02-05T23:07:23.889830Z" + } + }, + "source": [ + "# Method 2: RedisVL SQL\n", + "# Note: sql-redis uses cosine_distance() function for vector search\n", + "vector_bytes = query_vector.tobytes()\n", + "sql_query = SQLQuery(f\"\"\"\n", + " SELECT name, category, price\n", + " FROM {index.name}\n", + " ORDER BY cosine_distance(embedding, :vector)\n", + " LIMIT 3\n", + "\"\"\",\n", + "params={\"vector\": vector_bytes}\n", + " )\n", + "\n", + "\n", + "results_sql = index.query(sql_query)\n", + "print(\"=== Method 2: RedisVL SQL ===\")\n", + "for r in results_sql:\n", + " print(f\" {r['name']} ({r['category']})\")\n", + "\n", + "redis_cmd = sql_query.redis_query_string(redis_client=client)\n", + "print(f\"\\nTranslated Redis command: {redis_cmd[:100]}...\")\n", + "\n" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Method 2: RedisVL SQL ===\n", + " Wireless Mouse (electronics)\n", + " Monitor Stand (accessories)\n", + " Redis in Action (books)\n", + "\n", + "Translated Redis command: FT.SEARCH products_exercise \"*\" RETURN 3 name category price LIMIT 0 3...\n" + ] + } + ], + "execution_count": 26 + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 3: Raw Redis FT.SEARCH with KNN\n", + "import struct\n", + "vector_blob = struct.pack(f'{len(query_vector)}f', *query_vector)\n", + "raw_results = client.execute_command(\n", + " \"FT.SEARCH\", index.name,\n", + " \"*=>[KNN 3 @embedding $vec AS vector_distance]\",\n", + " \"PARAMS\", \"2\", \"vec\", vector_blob,\n", + " \"RETURN\", \"4\", \"name\", \"category\", \"price\", 
\"vector_distance\",\n", + " \"SORTBY\", \"vector_distance\", \"ASC\",\n", + " \"DIALECT\", \"2\"\n", + ")\n", + "print(\"=== Method 3: Raw FT.SEARCH ===\")\n", + "for i in range(1, len(raw_results), 2):\n", + " if i + 1 < len(raw_results):\n", + " fields = raw_results[i + 1]\n", + " field_dict = {fields[j].decode() if isinstance(fields[j], bytes) else fields[j]:\n", + " fields[j+1].decode() if isinstance(fields[j+1], bytes) else fields[j+1]\n", + " for j in range(0, len(fields), 2)}\n", + " print(f\" {field_dict.get('name', 'N/A')} ({field_dict.get('category', 'N/A')}) - distance: {field_dict.get('vector_distance', 'N/A')}\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Exercise 6: Aggregations (COUNT, GROUP BY, AVG)\n", + "\n", + "**Goal:** Count products by category and calculate average prices.\n", + "\n", + "### Do It Yourself\n", + "\n", + "**Documentation:**\n", + "- [RedisVL CountQuery](https://docs.redisvl.com/en/latest/api/query.html#countquery)\n", + "- [Redis Aggregations](https://redis.io/docs/latest/develop/ai/search-and-query/advanced-concepts/aggregations/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 1: RedisVL Python API\n", + "# Hint: Use CountQuery for counting\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 2: SQLQuery\n", + "# Hint: SELECT category, COUNT(*), AVG(price) FROM ... 
GROUP BY category\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YOUR CODE HERE - Method 3: Raw FT.AGGREGATE\n", + "# Hint: FT.AGGREGATE with GROUPBY and REDUCE\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Solution: Exercise 6" + ] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T23:08:26.684434Z", + "start_time": "2026-02-05T23:08:26.651689Z" + } + }, + "source": [ + "# Method 1: RedisVL Python API - Count total products\n", + "count_query = CountQuery(filter_expression=Tag(\"category\") == \"electronics\")\n", + "count_result = index.query(count_query)\n", + "print(\"=== Method 1: RedisVL Python API ===\")\n", + "print(f\" Electronics products count: {count_result}\")\n", + "\n", + "# Count for each category\n", + "for cat in [\"electronics\", \"books\", \"accessories\"]:\n", + " count_query = CountQuery(filter_expression=Tag(\"category\") == cat)\n", + " count = index.query(count_query)\n", + " print(f\" {cat}: {count} products\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Method 1: RedisVL Python API ===\n", + " Electronics products count: 5\n", + " electronics: 5 products\n", + " books: 3 products\n", + " accessories: 2 products\n" + ] + } + ], + "execution_count": 27 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T23:08:26.968246Z", + "start_time": "2026-02-05T23:08:26.948261Z" + } + }, + "source": [ + "# Method 2: RedisVL SQL - Group by with aggregations\n", + "sql_query = SQLQuery(f\"\"\"\n", + " SELECT category, COUNT(*) as count, AVG(price) as avg_price\n", + " FROM {index.name}\n", + " GROUP BY category\n", + "\"\"\")\n", + "results_sql = index.query(sql_query)\n", + "print(\"=== Method 2: RedisVL SQL ===\")\n", + "for r in results_sql:\n", + " print(f\" {r['category']}: {r['count']} products, avg price: 
${float(r['avg_price']):.2f}\")\n", + "\n", + "redis_cmd = sql_query.redis_query_string(redis_client=client)\n", + "print(f\"\\nTranslated Redis command: {redis_cmd}\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Method 2: RedisVL SQL ===\n", + " books: 3 products, avg price: $55.00\n", + " accessories: 2 products, avg price: $59.00\n", + " electronics: 5 products, avg price: $406.20\n", + "\n", + "Translated Redis command: FT.AGGREGATE products_exercise \"*\" LOAD 2 category price GROUPBY 1 @category REDUCE COUNT 0 AS count REDUCE AVG 1 @price AS avg_price\n" + ] + } + ], + "execution_count": 28 + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-05T23:09:08.232955Z", + "start_time": "2026-02-05T23:09:08.206373Z" + } + }, + "source": [ + "# Method 3: Raw Redis FT.AGGREGATE\n", + "raw_results = client.execute_command(\n", + " \"FT.AGGREGATE\", index.name, \"*\",\n", + " \"GROUPBY\", \"1\", \"@category\",\n", + " \"REDUCE\", \"COUNT\", \"0\", \"AS\", \"count\",\n", + " \"REDUCE\", \"AVG\", \"1\", \"@price\", \"AS\", \"avg_price\"\n", + ")\n", + "print(\"=== Method 3: Raw FT.AGGREGATE ===\")\n", + "for i in range(1, len(raw_results)):\n", + " row = raw_results[i]\n", + " row_dict = {row[j].decode() if isinstance(row[j], bytes) else row[j]:\n", + " row[j+1].decode() if isinstance(row[j+1], bytes) else row[j+1]\n", + " for j in range(0, len(row), 2)}\n", + " cat = row_dict.get('category', 'N/A')\n", + " count = row_dict.get('count', 'N/A')\n", + " avg_price = float(row_dict.get('avg_price', 0))\n", + " print(f\" {cat}: {count} products, avg price: ${avg_price:.2f}\")\n", + "\n" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Method 3: Raw FT.AGGREGATE ===\n", + " books: 3 products, avg price: $55.00\n", + " accessories: 2 products, avg price: $59.00\n", + " electronics: 5 products, avg price: $406.20\n" + ] + } + ], + "execution_count": 29 + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Comparison Summary\n", + "\n", + "| Approach | Pros | Cons | Best For |\n", + "|----------|------|------|----------|\n", + "| **RedisVL Python API** | Type-safe, IDE autocomplete, Pythonic | Learning curve for filter expressions | Production applications, complex queries |\n", + "| **RedisVL SQL** | Familiar SQL syntax, easy migration | Limited to SQL capabilities | SQL developers, quick prototyping |\n", + "| **Raw FT.SEARCH** | Full control, all Redis features | Verbose, error-prone | Advanced use cases, debugging |\n", + "\n", + "### Key Takeaways\n", + "\n", + "1. **SQLQuery** is great for developers familiar with SQL who want to quickly query Redis\n", + "2. **RedisVL Python API** provides the best developer experience with type safety\n", + "3. **Raw FT.SEARCH** gives you full control but requires deep Redis knowledge\n", + "4. All three methods can achieve the same results - choose based on your team's expertise\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Delete the index and clean up\n", + "index.delete(drop=True)\n", + "print(f\"Deleted index: {index.name}\")\n", + "print(\"Cleanup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Next Steps\n", + "\n", + "- Explore more complex queries in the [RedisVL documentation](https://docs.redisvl.com)\n", + "- Try the [sql-redis package](https://pypi.org/project/sql-redis/) for more SQL features\n", + "- Check out other RedisVL features like SemanticCache and SemanticRouter" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": 
".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/user_guide/cli.ipynb b/docs/user_guide/cli.ipynb index ba9d645a3..02bc68b13 100644 --- a/docs/user_guide/cli.ipynb +++ b/docs/user_guide/cli.ipynb @@ -6,7 +6,7 @@ "source": [ "# The RedisVL CLI\n", "\n", - "RedisVL is a Python library with a dedicated CLI to help load and create vector search indices within Redis.\n", + "RedisVL is a Python library with a dedicated CLI to help load, inspect, migrate, and create vector search indices within Redis.\n", "\n", "This notebook will walk through how to use the Redis Vector Library CLI (``rvl``).\n", "\n", @@ -50,7 +50,16 @@ "| `rvl index` | `delete --index` or `-i ` | remove the specified index, leaving the data still in Redis|\n", "| `rvl index` | `destroy --index` or `-i `| remove the specified index, as well as the associated data|\n", "| `rvl stats` | `--index` or `-i ` | display the index statistics, including number of docs, average bytes per record, indexing time, etc|\n", - "| `rvl stats` | `--schema` or `-s ` | display the index statistics of a schema defined in . The index must have already been created within Redis|" + "| `rvl stats` | `--schema` or `-s ` | display the index statistics of a schema defined in . 
The index must have already been created within Redis|\n", + "| `rvl migrate` | `helper` or `list` | show migration guidance and list indexes available for migration|\n", + "| `rvl migrate` | `wizard` | interactively build a migration plan and schema patch|\n", + "| `rvl migrate` | `plan` | generate `migration_plan.yaml` from a patch or target schema|\n", + "| `rvl migrate` | `apply` | execute a reviewed `drop_recreate` migration|\n", + "| `rvl migrate` | `validate` | validate a completed migration and emit report artifacts|\n", + "| `rvl migrate` | `batch-plan` | create a batch migration plan for multiple indexes|\n", + "| `rvl migrate` | `batch-apply` | execute a batch migration|\n", + "| `rvl migrate` | `batch-resume` | resume an interrupted batch migration|\n", + "| `rvl migrate` | `batch-status` | check batch migration progress|" ] }, { diff --git a/docs/user_guide/how_to_guides/index.md b/docs/user_guide/how_to_guides/index.md index c03d705da..f6511d54c 100644 --- a/docs/user_guide/how_to_guides/index.md +++ b/docs/user_guide/how_to_guides/index.md @@ -34,6 +34,7 @@ How-to guides are **task-oriented** recipes that help you accomplish specific go :::{grid-item-card} 💾 Storage - [Choose a Storage Type](../05_hash_vs_json.ipynb) -- Hash vs JSON formats and nested data +- [Migrate an Index](migrate-indexes.md) -- use the migrator helper, wizard, plan, apply, and validate workflow ::: :::{grid-item-card} 💻 CLI Operations @@ -59,6 +60,7 @@ How-to guides are **task-oriented** recipes that help you accomplish specific go | Optimize index performance | [Optimize Indexes with SVS-VAMANA](../09_svs_vamana.ipynb) | | Decide on storage format | [Choose a Storage Type](../05_hash_vs_json.ipynb) | | Manage indices from terminal | [Manage Indices with the CLI](../cli.ipynb) | +| Plan and run a supported index migration | [Migrate an Index](migrate-indexes.md) | ```{toctree} :hidden: @@ -74,4 +76,5 @@ Optimize Indexes with SVS-VAMANA <../09_svs_vamana> Cache Embeddings 
<../10_embeddings_cache>
Use Advanced Query Types <../11_advanced_queries>
Write SQL Queries for Redis <../12_sql_to_redis_queries>
+Migrate an Index <migrate-indexes>
```
diff --git a/docs/user_guide/how_to_guides/migrate-indexes.md b/docs/user_guide/how_to_guides/migrate-indexes.md
new file mode 100644
index 000000000..505c70dff
--- /dev/null
+++ b/docs/user_guide/how_to_guides/migrate-indexes.md
@@ -0,0 +1,933 @@
+---
+myst:
+  html_meta:
+    "description lang=en": |
+      How to migrate a RedisVL index schema without losing data.
+---
+
+# Migrate an Index
+
+This guide shows how to safely change your index schema using the RedisVL migrator.
+
+## Quick Start
+
+Add a field to your index in 4 commands:
+
+```bash
+# 1. See what indexes exist
+rvl migrate list --url redis://localhost:6379
+
+# 2. Use the wizard to build a migration plan
+rvl migrate wizard --index myindex --url redis://localhost:6379
+
+# 3. Apply the migration
+rvl migrate apply --plan migration_plan.yaml --url redis://localhost:6379
+
+# 4. Verify the result
+rvl migrate validate --plan migration_plan.yaml --url redis://localhost:6379
+```
+
+## Prerequisites
+
+- Redis with the Search module (Redis Stack, Redis Cloud, or Redis Enterprise)
+- An existing index to migrate
+- `redisvl` installed (`pip install redisvl`)
+
+```bash
+# Local development with Redis 8.0+ (recommended for full feature support)
+docker run -d --name redis -p 6379:6379 redis:8.0
+```
+
+**Note:** Redis 8.0+ is required for INT8/UINT8 vector datatypes. SVS-VAMANA algorithm requires Redis 8.2+ and Intel AVX-512 hardware.
+
+## Step 1: Discover Available Indexes
+
+```bash
+rvl migrate list --url redis://localhost:6379
+```
+
+**Example output:**
+```
+Available indexes:
+  1. products_idx
+  2. users_idx
+  3. orders_idx
+```
+
+## Step 2: Build Your Schema Change
+
+Choose one of these approaches:
+
+### Option A: Use the Wizard (Recommended)
+
+The wizard guides you through building a migration interactively. 
Run: + +```bash +rvl migrate wizard --index myindex --url redis://localhost:6379 +``` + +**Example wizard session (adding a field):** + +```text +Building a migration plan for index 'myindex' +Current schema: +- Index name: myindex +- Storage type: hash + - title (text) + - embedding (vector) + +Choose an action: +1. Add field (text, tag, numeric, geo) +2. Update field (sortable, weight, separator) +3. Remove field +4. Preview patch (show pending changes as YAML) +5. Finish +Enter a number: 1 + +Field name: category +Field type options: text, tag, numeric, geo +Field type: tag + Sortable: enables sorting and aggregation on this field +Sortable [y/n]: n + Separator: character that splits multiple values (default: comma) +Separator [leave blank to keep existing/default]: | + +Choose an action: +1. Add field (text, tag, numeric, geo) +2. Update field (sortable, weight, separator) +3. Remove field +4. Preview patch (show pending changes as YAML) +5. Finish +Enter a number: 5 + +Migration plan written to /path/to/migration_plan.yaml +Mode: drop_recreate +Supported: True +Warnings: +- Index downtime is required +``` + +**Example wizard session (quantizing vectors):** + +```text +Choose an action: +1. Add field (text, tag, numeric, geo) +2. Update field (sortable, weight, separator) +3. Remove field +4. Preview patch (show pending changes as YAML) +5. Finish +Enter a number: 2 + +Updatable fields: +1. title (text) +2. embedding (vector) +Select a field to update by number or name: 2 + +Current vector config for 'embedding': + algorithm: HNSW + datatype: float32 + distance_metric: cosine + dims: 384 (cannot be changed) + m: 16 + ef_construction: 200 + +Leave blank to keep current value. 
+ Algorithm: vector search method (FLAT=brute force, HNSW=graph, SVS-VAMANA=compressed graph) +Algorithm [current: HNSW]: + Datatype: float16, float32, bfloat16, float64, int8, uint8 + (float16 reduces memory ~50%, int8/uint8 reduce ~75%) +Datatype [current: float32]: float16 + Distance metric: how similarity is measured (cosine, l2, ip) +Distance metric [current: cosine]: + M: number of connections per node (higher=better recall, more memory) +M [current: 16]: + EF_CONSTRUCTION: build-time search depth (higher=better recall, slower build) +EF_CONSTRUCTION [current: 200]: + +Choose an action: +... +5. Finish +Enter a number: 5 + +Migration plan written to /path/to/migration_plan.yaml +Mode: drop_recreate +Supported: True +``` + +### Option B: Write a Schema Patch (YAML) + +Create `schema_patch.yaml` manually: + +```yaml +version: 1 +changes: + add_fields: + - name: category + type: tag + path: $.category + attrs: + separator: "|" + remove_fields: + - legacy_field + update_fields: + - name: title + attrs: + sortable: true + - name: embedding + attrs: + datatype: float16 # quantize vectors + algorithm: HNSW + distance_metric: cosine +``` + +Then generate the plan: + +```bash +rvl migrate plan \ + --index myindex \ + --schema-patch schema_patch.yaml \ + --url redis://localhost:6379 \ + --plan-out migration_plan.yaml +``` + +### Option C: Provide a Target Schema + +If you have the complete target schema, use it directly: + +```bash +rvl migrate plan \ + --index myindex \ + --target-schema target_schema.yaml \ + --url redis://localhost:6379 \ + --plan-out migration_plan.yaml +``` + +## Step 3: Review the Migration Plan + +Before applying, review `migration_plan.yaml`: + +```yaml +# migration_plan.yaml (example) +version: 1 +mode: drop_recreate + +source: + schema_snapshot: + index: + name: myindex + prefix: "doc:" + storage_type: json + fields: + - name: title + type: text + - name: embedding + type: vector + attrs: + dims: 384 + algorithm: hnsw + datatype: float32 + 
stats_snapshot: + num_docs: 10000 + keyspace: + prefixes: ["doc:"] + key_sample: ["doc:1", "doc:2", "doc:3"] + +requested_changes: + add_fields: + - name: category + type: tag + +diff_classification: + supported: true + mode: drop_recreate + warnings: + - "Index will be unavailable during migration" + blocked_reasons: [] + +rename_operations: + rename_index: null + change_prefix: null + rename_fields: [] + +merged_target_schema: + index: + name: myindex + prefix: "doc:" + storage_type: json + fields: + - name: title + type: text + - name: category + type: tag + - name: embedding + type: vector + attrs: + dims: 384 + algorithm: hnsw + datatype: float32 + +warnings: [] +``` + +**Key fields to check:** +- `diff_classification.supported` - Must be `true` to proceed +- `diff_classification.blocked_reasons` - Must be empty +- `merged_target_schema` - The final schema after migration + +## Understanding Downtime Requirements + +**CRITICAL**: During a `drop_recreate` migration, your application must: + +| Requirement | Description | +|-------------|-------------| +| **Pause reads** | Index is unavailable during migration | +| **Pause writes** | Writes during migration may be missed or cause conflicts | + +### Why Both Reads AND Writes Must Be Paused + +- **Reads**: The index definition is dropped and recreated. Any queries during this window will fail. +- **Writes**: Redis updates indexes synchronously on every write. If your app writes documents while the index is dropped, those writes are not indexed. Additionally, if you're quantizing vectors (float32 → float16), concurrent writes may conflict with the migration's re-encoding process. + +### What "Downtime" Means + +| Downtime Type | Reads | Writes | Safe? 
| +|---------------|-------|--------|-------| +| Full quiesce (recommended) | Stopped | Stopped | **YES** | +| Read-only pause | Stopped | Continuing | **NO** | +| Active | Active | Active | **NO** | + +### Recovery from Interrupted Migration + +| Interruption Point | Documents | Index | Recovery | +|--------------------|-----------|-------|----------| +| After drop, before quantize | Unchanged | **None** | Re-run apply (or `--resume` if checkpoint exists) | +| During quantization | Partially quantized | **None** | Re-run with `--resume` to continue from checkpoint | +| After quantization, before create | Quantized | **None** | Re-run apply (will recreate index) | +| After create | Correct | Rebuilding | Wait for index ready | + +The underlying documents are **never deleted** by `drop_recreate` mode. For large quantization jobs, use `--resume` to enable checkpoint-based recovery. See [Crash-safe resume for quantization](#crash-safe-resume-for-quantization) below. + +## Step 4: Apply the Migration + +The `apply` command executes the migration. The index will be temporarily unavailable during the drop-recreate process. 
+ +```bash +rvl migrate apply \ + --plan migration_plan.yaml \ + --url redis://localhost:6379 \ + --report-out migration_report.yaml \ + --benchmark-out benchmark_report.yaml +``` + +### What `apply` does + +The migration executor follows this sequence: + +**STEP 1: Enumerate keys** (before any modifications) +- Discovers all document keys belonging to the source index +- Uses `FT.AGGREGATE WITHCURSOR` for efficient enumeration +- Falls back to `SCAN` if the index has indexing failures +- Keys are stored in memory for quantization or rename operations + +**STEP 2: Drop source index** +- Issues `FT.DROPINDEX` to remove the index structure +- **The underlying documents remain in Redis** - only the index metadata is deleted +- After this point, the index is unavailable until step 6 completes + +**STEP 3: Quantize vectors** (if changing vector datatype) +- For each document in the enumerated key list: + - Reads the document (including the old vector) + - Converts the vector to the new datatype (e.g., float32 → float16) + - Writes back the converted vector to the same document +- Processes documents in batches of 500 using Redis pipelines +- Skipped for JSON storage (vectors are re-indexed automatically on recreate) +- **Checkpoint support**: For large datasets, use `--resume` to enable crash-safe recovery + +**STEP 4: Key renames** (if changing key prefix) +- If the migration changes the key prefix, renames each key from old prefix to new prefix +- Skipped if no prefix change + +**STEP 5: Create target index** +- Issues `FT.CREATE` with the merged target schema +- Redis begins background indexing of existing documents + +**STEP 6: Wait for re-indexing** +- Polls `FT.INFO` until indexing completes +- The index becomes available for queries when this completes + +**Summary**: The migration preserves all documents, drops only the index structure, performs any document-level transformations (quantization, renames), then recreates the index with the new schema. 
+ +### Async execution for large migrations + +For large migrations (especially those involving vector quantization), use the `--async` flag: + +```bash +rvl migrate apply \ + --plan migration_plan.yaml \ + --async \ + --url redis://localhost:6379 +``` + +**What becomes async:** + +- Document enumeration during quantization (uses `FT.AGGREGATE WITHCURSOR` for index-specific enumeration, falling back to SCAN only if indexing failures exist) +- Vector read/write operations (sequential async HGET, batched HSET via pipeline) +- Index readiness polling (uses `asyncio.sleep()` instead of blocking) +- Validation checks + +**What stays sync:** + +- CLI prompts and user interaction +- YAML file reading/writing +- Progress display + +**When to use async:** + +- Quantizing millions of vectors (float32 to float16) +- Integrating into an async application + +For most migrations (index-only changes, small datasets), sync mode is sufficient and simpler. + +See {doc}`/concepts/index-migrations` for detailed async vs sync guidance. + +### Crash-safe resume for quantization + +When migrating large datasets with vector quantization (e.g. float32 to float16), the re-encoding step can take minutes or hours. If the process is interrupted (crash, network drop, OOM kill), you don't want to start over. The `--resume` flag enables checkpoint-based recovery. + +#### How it works + +1. **Pre-flight estimate** -- before any mutations, `apply` prints a disk space estimate showing RDB snapshot cost, AOF growth (if enabled), and post-migration memory savings. +2. **BGSAVE safety snapshot** -- the migrator triggers a Redis `BGSAVE` and waits for it to complete before modifying any data. This gives you a point-in-time snapshot to fall back on. +3. **Checkpoint file** -- when `--resume` is provided, the migrator writes a YAML checkpoint after every batch of 500 documents. The checkpoint records how many keys have been processed and the last batch of keys written. +4. 
**Batch undo buffer** -- if a single batch fails mid-write, original vector values are rolled back via pipeline before the error propagates. Only the current batch is held in memory. +5. **Idempotent skip** -- on resume, vectors that were already converted are detected by byte-width inspection and skipped automatically. + +#### Step-by-step: using crash-safe resume + +**1. Estimate disk space (dry-run, no mutations):** + +```bash +rvl migrate estimate --plan migration_plan.yaml +``` + +Example output: + +```text +Pre-migration disk space estimate: + Index: products_idx (1,000,000 documents) + Vector field 'embedding': 768 dims, float32 -> float16 + + RDB snapshot (BGSAVE): ~2.87 GB + AOF growth: not estimated (pass aof_enabled=True if AOF is on) + Total new disk required: ~2.87 GB + + Post-migration memory savings: ~1.43 GB (50% reduction) +``` + +If AOF is enabled: + +```bash +rvl migrate estimate --plan migration_plan.yaml --aof-enabled +``` + +**2. Apply with checkpoint enabled:** + +```bash +rvl migrate apply \ + --plan migration_plan.yaml \ + --resume quantize_checkpoint.yaml \ + --url redis://localhost:6379 \ + --report-out migration_report.yaml +``` + +The `--resume` flag takes a path to a checkpoint file. If the file does not exist, a new checkpoint is created. If it already exists (from a previous interrupted run), the migrator resumes from where it left off. + +**3. If the process crashes or is interrupted:** + +The checkpoint file (`quantize_checkpoint.yaml`) will contain the progress: + +```yaml +index_name: products_idx +total_keys: 1000000 +completed_keys: 450000 +completed_batches: 900 +last_batch_keys: + - 'products:449501' + - 'products:449502' + # ... +status: in_progress +checkpoint_path: quantize_checkpoint.yaml +``` + +**4. 
Resume the migration:** + +Re-run the exact same command: + +```bash +rvl migrate apply \ + --plan migration_plan.yaml \ + --resume quantize_checkpoint.yaml \ + --url redis://localhost:6379 \ + --report-out migration_report.yaml +``` + +The migrator will: +- Detect the existing checkpoint and skip already-processed keys +- Re-enumerate documents via SCAN (the index was already dropped before the crash) +- Continue quantizing from where it left off +- Print progress like `[4/6] Quantize vectors: 450,000/1,000,000 docs` + +**5. On successful completion:** + +The checkpoint status is set to `completed`. You can safely delete the checkpoint file. + +#### What gets rolled back on batch failure + +If a batch of 500 documents fails mid-write (e.g. Redis returns an error), the migrator: +1. Restores original vector bytes for all documents in that batch using the undo buffer +2. Saves the checkpoint (so progress up to the last successful batch is preserved) +3. Raises the error + +This means you never end up with partially-written vectors in a single batch. + +#### Limitations + +- **Same-width conversions** (float16 to bfloat16, or int8 to uint8) are **not supported** with `--resume`. These conversions cannot be detected by byte-width inspection, so idempotent skip is impossible. The migrator will refuse to proceed and suggest running without `--resume`. +- **JSON storage** does not need vector re-encoding (Redis re-indexes JSON vectors on `FT.CREATE`). The checkpoint is still created for consistency but no batched writes occur. +- The checkpoint file must match the migration plan. If you change the plan, delete the old checkpoint and start fresh. 
+ +#### Python API with checkpoints + +```python +from redisvl.migration import MigrationExecutor + +executor = MigrationExecutor() +report = executor.apply( + plan, + redis_url="redis://localhost:6379", + checkpoint_path="quantize_checkpoint.yaml", +) +``` + +For async: + +```python +from redisvl.migration import AsyncMigrationExecutor + +executor = AsyncMigrationExecutor() +report = await executor.apply( + plan, + redis_url="redis://localhost:6379", + checkpoint_path="quantize_checkpoint.yaml", +) +``` + +## Step 5: Validate the Result + +Validation happens automatically during `apply`, but you can run it separately: + +```bash +rvl migrate validate \ + --plan migration_plan.yaml \ + --url redis://localhost:6379 \ + --report-out migration_report.yaml +``` + +**Validation checks:** +- Live schema matches `merged_target_schema` +- Document count matches the source snapshot +- Sampled keys still exist +- No increase in indexing failures + +## What's Supported + +| Change | Supported | Notes | +|--------|-----------|-------| +| Add text/tag/numeric/geo field | ✅ | | +| Remove a field | ✅ | | +| Rename a field | ✅ | Renames field in all documents | +| Change key prefix | ✅ | Renames keys via RENAME command | +| Rename the index | ✅ | Index-only | +| Make a field sortable | ✅ | | +| Change field options (separator, stemming) | ✅ | | +| Change vector algorithm (FLAT ↔ HNSW ↔ SVS-VAMANA) | ✅ | Index-only | +| Change distance metric (COSINE ↔ L2 ↔ IP) | ✅ | Index-only | +| Tune HNSW parameters (M, EF_CONSTRUCTION) | ✅ | Index-only | +| Quantize vectors (float32 → float16/bfloat16/int8/uint8) | ✅ | Auto re-encode | + +## What's Blocked + +| Change | Why | Workaround | +|--------|-----|------------| +| Change vector dimensions | Requires re-embedding | Re-embed with new model, reload data | +| Change storage type (hash ↔ JSON) | Different data format | Export, transform, reload | +| Add a new vector field | Requires vectors for all docs | Add vectors first, then migrate | + 
+## CLI Reference + +### Single-Index Commands + +| Command | Description | +|---------|-------------| +| `rvl migrate list` | List all indexes | +| `rvl migrate wizard` | Build a migration interactively | +| `rvl migrate plan` | Generate a migration plan | +| `rvl migrate apply` | Execute a migration | +| `rvl migrate estimate` | Estimate disk space for a migration (dry-run) | +| `rvl migrate validate` | Verify a migration result | + +### Batch Commands + +| Command | Description | +|---------|-------------| +| `rvl migrate batch-plan` | Create a batch migration plan | +| `rvl migrate batch-apply` | Execute a batch migration | +| `rvl migrate batch-resume` | Resume an interrupted batch | +| `rvl migrate batch-status` | Check batch progress | + +**Common flags:** +- `--url` : Redis connection URL +- `--index` : Index name to migrate +- `--plan` / `--plan-out` : Path to migration plan +- `--async` : Use async executor for large migrations (apply only) +- `--resume` : Path to checkpoint file for crash-safe quantization resume (apply only) +- `--report-out` : Path for validation report +- `--benchmark-out` : Path for performance metrics + +**Batch-specific flags:** +- `--pattern` : Glob pattern to match index names (e.g., `*_idx`) +- `--indexes` : Explicit list of index names +- `--indexes-file` : File containing index names (one per line) +- `--schema-patch` : Path to shared schema patch YAML +- `--state` : Path to checkpoint state file +- `--failure-policy` : `fail_fast` or `continue_on_error` +- `--accept-data-loss` : Required for quantization (lossy changes) +- `--retry-failed` : Retry previously failed indexes on resume + +## Troubleshooting + +### Migration blocked: "unsupported change" + +The planner detected a change that requires data transformation. Check `diff_classification.blocked_reasons` in the plan for details. + +### Apply failed: "source schema mismatch" + +The live index schema changed since the plan was generated. 
Re-run `rvl migrate plan` to create a fresh plan. + +### Apply failed: "timeout waiting for index ready" + +The index is taking longer to rebuild than expected. This can happen with large datasets. Check Redis logs and consider increasing the timeout or running during lower traffic periods. + +### Validation failed: "document count mismatch" + +Documents were added or removed between plan and apply. This is expected if your application is actively writing. Re-run `plan` and `apply` during a quieter period when the document count is stable, or verify the mismatch is due only to normal application traffic. + +### How to recover from a failed migration + +If `apply` fails mid-migration: + +1. **Check if the index exists:** `rvl index info --index myindex` +2. **If the index exists but is wrong:** Re-run `apply` with the same plan +3. **If the index was dropped:** Recreate it from the plan's `merged_target_schema` + +The underlying documents are never deleted by `drop_recreate`. + +## Python API + +For programmatic migrations, use the migration classes directly: + +### Sync API + +```python +from redisvl.migration import MigrationPlanner, MigrationExecutor + +planner = MigrationPlanner() +plan = planner.create_plan( + "myindex", + redis_url="redis://localhost:6379", + schema_patch_path="schema_patch.yaml", +) + +executor = MigrationExecutor() +report = executor.apply(plan, redis_url="redis://localhost:6379") +print(f"Migration result: {report.result}") +``` + +### Async API + +```python +import asyncio +from redisvl.migration import AsyncMigrationPlanner, AsyncMigrationExecutor + +async def migrate(): + planner = AsyncMigrationPlanner() + plan = await planner.create_plan( + "myindex", + redis_url="redis://localhost:6379", + schema_patch_path="schema_patch.yaml", + ) + + executor = AsyncMigrationExecutor() + report = await executor.apply(plan, redis_url="redis://localhost:6379") + print(f"Migration result: {report.result}") + +asyncio.run(migrate()) +``` + +## Batch 
Migration + +When you need to apply the same schema change to multiple indexes, use batch migration. This is common for: + +- Quantizing all indexes from float32 → float16 +- Standardizing vector algorithms across indexes +- Coordinated migrations during maintenance windows + +### Quick Start: Batch Migration + +```bash +# 1. Create a shared patch (applies to any index with an 'embedding' field) +cat > quantize_patch.yaml << 'EOF' +version: 1 +changes: + update_fields: + - name: embedding + attrs: + datatype: float16 +EOF + +# 2. Create a batch plan for all indexes matching a pattern +rvl migrate batch-plan \ + --pattern "*_idx" \ + --schema-patch quantize_patch.yaml \ + --plan-out batch_plan.yaml \ + --url redis://localhost:6379 + +# 3. Apply the batch plan +rvl migrate batch-apply \ + --plan batch_plan.yaml \ + --accept-data-loss \ + --url redis://localhost:6379 + +# 4. Check status +rvl migrate batch-status --state batch_state.yaml +``` + +### Batch Plan Options + +**Select indexes by pattern:** +```bash +rvl migrate batch-plan \ + --pattern "*_idx" \ + --schema-patch quantize_patch.yaml \ + --plan-out batch_plan.yaml \ + --url redis://localhost:6379 +``` + +**Select indexes by explicit list:** +```bash +rvl migrate batch-plan \ + --indexes "products_idx,users_idx,orders_idx" \ + --schema-patch quantize_patch.yaml \ + --plan-out batch_plan.yaml \ + --url redis://localhost:6379 +``` + +**Select indexes from a file (for 100+ indexes):** +```bash +# Create index list file +echo -e "products_idx\nusers_idx\norders_idx" > indexes.txt + +rvl migrate batch-plan \ + --indexes-file indexes.txt \ + --schema-patch quantize_patch.yaml \ + --plan-out batch_plan.yaml \ + --url redis://localhost:6379 +``` + +### Batch Plan Review + +The generated `batch_plan.yaml` shows which indexes will be migrated: + +```yaml +version: 1 +batch_id: "batch_20260320_100000" +mode: drop_recreate +failure_policy: fail_fast +requires_quantization: true + +shared_patch: + version: 1 + changes: + 
update_fields: + - name: embedding + attrs: + datatype: float16 + +indexes: + - name: products_idx + applicable: true + skip_reason: null + - name: users_idx + applicable: true + skip_reason: null + - name: legacy_idx + applicable: false + skip_reason: "Field 'embedding' not found" + +created_at: "2026-03-20T10:00:00Z" +``` + +**Key fields:** +- `applicable: true` means the patch applies to this index +- `skip_reason` explains why an index will be skipped + +### Applying a Batch Plan + +```bash +# Apply with fail-fast (default: stop on first error) +rvl migrate batch-apply \ + --plan batch_plan.yaml \ + --accept-data-loss \ + --url redis://localhost:6379 + +# Apply with continue-on-error (set at batch-plan time) +# Note: failure_policy is set during batch-plan, not batch-apply +rvl migrate batch-plan \ + --pattern "*_idx" \ + --schema-patch quantize_patch.yaml \ + --failure-policy continue_on_error \ + --plan-out batch_plan.yaml \ + --url redis://localhost:6379 + +rvl migrate batch-apply \ + --plan batch_plan.yaml \ + --accept-data-loss \ + --url redis://localhost:6379 +``` + +**Flags for batch-apply:** +- `--accept-data-loss` : Required when quantizing vectors (float32 → float16 is lossy) +- `--state` : Path to checkpoint file (default: `batch_state.yaml`) +- `--report-dir` : Directory for per-index reports (default: `./reports/`) + +**Note:** `--failure-policy` is set during `batch-plan`, not `batch-apply`. The policy is stored in the batch plan file. + +### Resume After Failure + +Batch migration automatically checkpoints progress. 
If interrupted: + +```bash +# Resume from where it left off +rvl migrate batch-resume \ + --state batch_state.yaml \ + --url redis://localhost:6379 + +# Retry previously failed indexes +rvl migrate batch-resume \ + --state batch_state.yaml \ + --retry-failed \ + --url redis://localhost:6379 +``` + +### Checking Batch Status + +```bash +rvl migrate batch-status --state batch_state.yaml +``` + +**Example output:** +``` +Batch Migration Status +====================== +Batch ID: batch_20260320_100000 +Started: 2026-03-20T10:00:00Z +Updated: 2026-03-20T10:25:00Z + +Completed: 2 + - products_idx: succeeded (10:02:30) + - users_idx: failed - Redis connection timeout (10:05:45) + +In Progress: inventory_idx +Remaining: 1 (analytics_idx) +``` + +### Batch Report + +After completion, a `batch_report.yaml` is generated: + +```yaml +version: 1 +batch_id: "batch_20260320_100000" +status: completed # or partial_failure, failed +summary: + total_indexes: 3 + successful: 3 + failed: 0 + skipped: 0 + total_duration_seconds: 127.5 +indexes: + - name: products_idx + status: succeeded + duration_seconds: 45.2 + docs_migrated: 15000 + report_path: ./reports/products_idx_report.yaml + - name: users_idx + status: succeeded + duration_seconds: 38.1 + docs_migrated: 8500 + - name: orders_idx + status: succeeded + duration_seconds: 44.2 + docs_migrated: 22000 +completed_at: "2026-03-20T10:02:07Z" +``` + +### Python API for Batch Migration + +```python +from redisvl.migration import BatchMigrationPlanner, BatchMigrationExecutor + +# Create batch plan +planner = BatchMigrationPlanner() +batch_plan = planner.create_batch_plan( + redis_url="redis://localhost:6379", + pattern="*_idx", + schema_patch_path="quantize_patch.yaml", +) + +# Review applicability +for idx in batch_plan.indexes: + if idx.applicable: + print(f"Will migrate: {idx.name}") + else: + print(f"Skipping {idx.name}: {idx.skip_reason}") + +# Execute batch +executor = BatchMigrationExecutor() +report = executor.apply( + batch_plan, 
+ redis_url="redis://localhost:6379", + state_path="batch_state.yaml", + report_dir="./reports/", + progress_callback=lambda name, pos, total, status: print(f"[{pos}/{total}] {name}: {status}"), +) + +print(f"Batch status: {report.status}") +print(f"Successful: {report.summary.successful}/{report.summary.total_indexes}") +``` + +### Batch Migration Tips + +1. **Test on a single index first**: Run a single-index migration to verify the patch works before applying to a batch. + +2. **Use `continue_on_error` for large batches**: This ensures one failure doesn't block all remaining indexes. + +3. **Schedule during low-traffic periods**: Each index has downtime during migration. + +4. **Review skipped indexes**: The `skip_reason` often indicates schema differences that need attention. + +5. **Keep checkpoint files**: The `batch_state.yaml` is essential for resume. Don't delete it until the batch completes successfully. + +## Learn more + +- {doc}`/concepts/index-migrations`: How migrations work and which changes are supported diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md index 5d2cf6dfd..d85177e73 100644 --- a/docs/user_guide/index.md +++ b/docs/user_guide/index.md @@ -39,7 +39,7 @@ Schema → Index → Load → Query **Solve specific problems.** Task-oriented recipes for LLM extensions, querying, embeddings, optimization, and storage. +++ -LLM Caching • Filtering • Vectorizers • Reranking +LLM Caching • Filtering • Vectorizers • Reranking • Migrations ::: :::{grid-item-card} 💻 CLI Reference @@ -49,7 +49,7 @@ LLM Caching • Filtering • Vectorizers • Reranking **Command-line tools.** Manage indices, inspect stats, and work with schemas using the `rvl` CLI. 
+++ -rvl index • rvl stats • Schema YAML +rvl index • rvl stats • rvl migrate • Schema YAML ::: :::{grid-item-card} 💡 Use Cases diff --git a/redisvl/cli/main.py b/redisvl/cli/main.py index 6b3287535..c5e82fd74 100644 --- a/redisvl/cli/main.py +++ b/redisvl/cli/main.py @@ -54,3 +54,4 @@ def stats(self): def migrate(self): Migrate() + exit(0) diff --git a/redisvl/cli/migrate.py b/redisvl/cli/migrate.py index d111a93a8..9b948e298 100644 --- a/redisvl/cli/migrate.py +++ b/redisvl/cli/migrate.py @@ -1,10 +1,18 @@ import argparse +import asyncio import sys from pathlib import Path from typing import Optional from redisvl.cli.utils import add_redis_connection_options, create_redis_url -from redisvl.migration import MigrationExecutor, MigrationPlanner, MigrationValidator +from redisvl.migration import ( + AsyncMigrationExecutor, + BatchMigrationExecutor, + BatchMigrationPlanner, + MigrationExecutor, + MigrationPlanner, + MigrationValidator, +) from redisvl.migration.utils import ( detect_aof_enabled, estimate_disk_space, @@ -14,6 +22,7 @@ write_benchmark_report, write_migration_report, ) +from redisvl.migration.wizard import MigrationWizard from redisvl.redis.connection import RedisConnectionFactory from redisvl.utils.log import get_logger @@ -28,9 +37,16 @@ class Migrate: "\thelper Show migration guidance and supported capabilities", "\tlist List all available indexes", "\tplan Generate a migration plan for a document-preserving drop/recreate migration", - "\tapply Execute a reviewed drop/recreate migration plan", + "\twizard Interactively build a migration plan and schema patch", + "\tapply Execute a reviewed drop/recreate migration plan (use --async for large migrations)", "\testimate Estimate disk space required for a migration plan (dry-run, no mutations)", "\tvalidate Validate a completed migration plan against the live index", + "", + "Batch Commands:", + "\tbatch-plan Generate a batch migration plan for multiple indexes", + "\tbatch-apply Execute a batch migration 
plan with checkpointing", + "\tbatch-resume Resume an interrupted batch migration", + "\tbatch-status Show status of an in-progress or completed batch migration", "\n", ] ) @@ -92,6 +108,7 @@ def helper(self): Commands: rvl migrate list List all indexes + rvl migrate wizard --index Guided migration builder rvl migrate plan --index --schema-patch rvl migrate apply --plan rvl migrate validate --plan """ @@ -144,14 +161,79 @@ def plan(self): planner.write_plan(plan, args.plan_out) self._print_plan_summary(args.plan_out, plan) + def wizard(self): + parser = argparse.ArgumentParser( + usage=( + "rvl migrate wizard [--index ] " + "[--patch ] " + "[--plan-out ] [--patch-out ]" + ) + ) + parser.add_argument("-i", "--index", help="Source index name", required=False) + parser.add_argument( + "--patch", + help="Load an existing schema patch to continue editing", + default=None, + ) + parser.add_argument( + "--plan-out", + help="Path to write migration_plan.yaml", + default="migration_plan.yaml", + ) + parser.add_argument( + "--patch-out", + help="Path to write schema_patch.yaml (for later editing)", + default="schema_patch.yaml", + ) + parser.add_argument( + "--target-schema-out", + help="Optional path to write the merged target schema", + default=None, + ) + parser.add_argument( + "--key-sample-limit", + help="Maximum number of keys to sample from the index keyspace", + type=int, + default=10, + ) + parser = add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + + redis_url = create_redis_url(args) + wizard = MigrationWizard( + planner=MigrationPlanner(key_sample_limit=args.key_sample_limit) + ) + plan = wizard.run( + index_name=args.index, + redis_url=redis_url, + existing_patch_path=args.patch, + plan_out=args.plan_out, + patch_out=args.patch_out, + target_schema_out=args.target_schema_out, + ) + self._print_plan_summary(args.plan_out, plan) + def apply(self): parser = argparse.ArgumentParser( usage=( "rvl migrate apply --plan " + "[--async] 
[--resume ] " "[--report-out ]" ) ) parser.add_argument("--plan", help="Path to migration_plan.yaml", required=True) + parser.add_argument( + "--async", + dest="use_async", + help="Use async executor (recommended for large migrations with quantization)", + action="store_true", + ) + parser.add_argument( + "--resume", + dest="checkpoint_path", + help="Path to quantization checkpoint file for crash-safe resume", + default=None, + ) parser.add_argument( "--report-out", help="Path to write migration_report.yaml", @@ -173,7 +255,32 @@ def apply(self): redis_url = create_redis_url(args) plan = load_migration_plan(args.plan) - report = self._apply_sync(plan, redis_url, args.query_check_file) + # Print disk space estimate for quantization migrations + aof_enabled = False + try: + client = RedisConnectionFactory.get_redis_connection(redis_url=redis_url) + try: + aof_enabled = detect_aof_enabled(client) + finally: + client.close() + except Exception as exc: + logger.debug("Could not detect AOF for CLI preflight estimate: %s", exc) + + disk_estimate = estimate_disk_space(plan, aof_enabled=aof_enabled) + if disk_estimate.has_quantization: + print(f"\n{disk_estimate.summary()}\n") + + checkpoint_path = args.checkpoint_path + if args.use_async: + report = asyncio.run( + self._apply_async( + plan, redis_url, args.query_check_file, checkpoint_path + ) + ) + else: + report = self._apply_sync( + plan, redis_url, args.query_check_file, checkpoint_path + ) write_migration_report(report, args.report_out) if args.benchmark_out: @@ -199,7 +306,7 @@ def estimate(self): @staticmethod def _make_progress_callback(): - """Create a progress callback for migration apply.""" + """Create a progress callback for migration apply (shared by sync and async).""" step_labels = { "enumerate": "[1/8] Enumerate keys", "bgsave": "[2/8] BGSAVE snapshot", @@ -226,6 +333,7 @@ def _apply_sync( plan, redis_url: str, query_check_file: Optional[str], + checkpoint_path: Optional[str] = None, ): """Execute 
migration synchronously.""" executor = MigrationExecutor() @@ -237,6 +345,30 @@ def _apply_sync( redis_url=redis_url, query_check_file=query_check_file, progress_callback=self._make_progress_callback(), + checkpoint_path=checkpoint_path, + ) + + self._print_apply_result(report) + return report + + async def _apply_async( + self, + plan, + redis_url: str, + query_check_file: Optional[str], + checkpoint_path: Optional[str] = None, + ): + """Execute migration asynchronously (non-blocking for large quantization jobs).""" + executor = AsyncMigrationExecutor() + + print(f"\nApplying migration to '{plan.source.index_name}' (async mode)...") + + report = await executor.apply( + plan, + redis_url=redis_url, + query_check_file=query_check_file, + progress_callback=self._make_progress_callback(), + checkpoint_path=checkpoint_path, ) self._print_apply_result(report) @@ -336,11 +468,9 @@ def _print_plan_summary(self, plan_out: str, plan) -> None: import os abs_path = os.path.abspath(plan_out) - print( - f"""Migration plan written to {abs_path} -Mode: {plan.mode} -Supported: {plan.diff_classification.supported}""" - ) + print(f"Migration plan written to {abs_path}") + print(f"Mode: {plan.mode}") + print(f"Supported: {plan.diff_classification.supported}") if plan.warnings: print("Warnings:") for warning in plan.warnings: @@ -350,13 +480,17 @@ def _print_plan_summary(self, plan_out: str, plan) -> None: for reason in plan.diff_classification.blocked_reasons: print(f"- {reason}") + print("\nNext steps:") + print(f" Review the plan: cat {plan_out}") + print(f" Apply the migration: rvl migrate apply --plan {plan_out}") + print(f" Validate the result: rvl migrate validate --plan {plan_out}") print( - f"""\nNext steps: - Review the plan: cat {plan_out} - Apply the migration: rvl migrate apply --plan {plan_out} - Validate the result: rvl migrate validate --plan {plan_out} - To cancel: rm {plan_out}""" + f"\nTo add more changes: rvl migrate wizard --index {plan.source.index_name} --patch 
schema_patch.yaml" ) + print( + f"To start over: rvl migrate wizard --index {plan.source.index_name}" + ) + print(f"To cancel: rm {plan_out}") def _print_report_summary( self, @@ -364,14 +498,12 @@ def _print_report_summary( report, benchmark_out: Optional[str], ) -> None: - print( - f"""Migration report written to {report_out} -Result: {report.result} -Schema match: {report.validation.schema_match} -Doc count match: {report.validation.doc_count_match} -Key sample exists: {report.validation.key_sample_exists} -Indexing failures delta: {report.validation.indexing_failures_delta}""" - ) + print(f"Migration report written to {report_out}") + print(f"Result: {report.result}") + print(f"Schema match: {report.validation.schema_match}") + print(f"Doc count match: {report.validation.doc_count_match}") + print(f"Key sample exists: {report.validation.key_sample_exists}") + print(f"Indexing failures delta: {report.validation.indexing_failures_delta}") if report.validation.errors: print("Errors:") for error in report.validation.errors: @@ -382,3 +514,268 @@ def _print_report_summary( print(f"- {action}") if benchmark_out: print(f"Benchmark report written to {benchmark_out}") + + # ------------------------------------------------------------------------- + # Batch migration commands + # ------------------------------------------------------------------------- + + def batch_plan(self): + """Generate a batch migration plan for multiple indexes.""" + parser = argparse.ArgumentParser( + usage=( + "rvl migrate batch-plan --schema-patch " + "(--pattern | --indexes | --indexes-file )" + ) + ) + parser.add_argument( + "--schema-patch", help="Path to shared schema patch file", required=True + ) + parser.add_argument( + "--pattern", help="Glob pattern to match index names (e.g., '*_idx')" + ) + parser.add_argument("--indexes", help="Comma-separated list of index names") + parser.add_argument( + "--indexes-file", help="File with index names (one per line)" + ) + parser.add_argument( + 
"--failure-policy", + help="How to handle failures: fail_fast or continue_on_error", + choices=["fail_fast", "continue_on_error"], + default="fail_fast", + ) + parser.add_argument( + "--plan-out", + help="Path to write batch_plan.yaml", + default="batch_plan.yaml", + ) + parser = add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + + redis_url = create_redis_url(args) + indexes = ( + [idx.strip() for idx in args.indexes.split(",") if idx.strip()] + if args.indexes + else None + ) + + planner = BatchMigrationPlanner() + batch_plan = planner.create_batch_plan( + indexes=indexes, + pattern=args.pattern, + indexes_file=args.indexes_file, + schema_patch_path=args.schema_patch, + redis_url=redis_url, + failure_policy=args.failure_policy, + ) + + planner.write_batch_plan(batch_plan, args.plan_out) + self._print_batch_plan_summary(args.plan_out, batch_plan) + + def batch_apply(self): + """Execute a batch migration plan with checkpointing.""" + parser = argparse.ArgumentParser( + usage=( + "rvl migrate batch-apply --plan " + "[--state ] [--report-dir <./reports>]" + ) + ) + parser.add_argument("--plan", help="Path to batch_plan.yaml", required=True) + parser.add_argument( + "--accept-data-loss", + help="Acknowledge that quantization is lossy and cannot be reverted", + action="store_true", + ) + parser.add_argument( + "--state", + help="Path to checkpoint state file", + default="batch_state.yaml", + ) + parser.add_argument( + "--report-dir", + help="Directory for per-index migration reports", + default="./reports", + ) + parser = add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + + # Local import to avoid circular dependency with migration module + from redisvl.migration.models import BatchPlan + + plan_data = load_yaml(args.plan) + batch_plan = BatchPlan.model_validate(plan_data) + + # Check for quantization warning + if batch_plan.requires_quantization and not args.accept_data_loss: + print( + """WARNING: This batch 
migration includes quantization (e.g., float32 -> float16). + Vector data will be modified. Original precision cannot be recovered. + To proceed, add --accept-data-loss flag. + + If you need to preserve original vectors, backup your data first: + redis-cli BGSAVE""" + ) + exit(1) + + redis_url = create_redis_url(args) + executor = BatchMigrationExecutor() + + def progress_callback( + index_name: str, position: int, total: int, status: str + ) -> None: + print(f"[{position}/{total}] {index_name}: {status}") + + report = executor.apply( + batch_plan, + batch_plan_path=args.plan, + state_path=args.state, + report_dir=args.report_dir, + redis_url=redis_url, + progress_callback=progress_callback, + ) + + self._print_batch_report_summary(report) + + def batch_resume(self): + """Resume an interrupted batch migration.""" + parser = argparse.ArgumentParser( + usage=( + "rvl migrate batch-resume --state " + "[--plan ] [--retry-failed]" + ) + ) + parser.add_argument( + "--state", help="Path to checkpoint state file", required=True + ) + parser.add_argument( + "--plan", help="Path to batch_plan.yaml (optional, uses state.plan_path)" + ) + parser.add_argument( + "--retry-failed", + help="Retry previously failed indexes", + action="store_true", + ) + parser.add_argument( + "--report-dir", + help="Directory for per-index migration reports", + default="./reports", + ) + parser = add_redis_connection_options(parser) + args = parser.parse_args(sys.argv[3:]) + + redis_url = create_redis_url(args) + executor = BatchMigrationExecutor() + + def progress_callback( + index_name: str, position: int, total: int, status: str + ) -> None: + print(f"[{position}/{total}] {index_name}: {status}") + + report = executor.resume( + args.state, + batch_plan_path=args.plan, + retry_failed=args.retry_failed, + report_dir=args.report_dir, + redis_url=redis_url, + progress_callback=progress_callback, + ) + + self._print_batch_report_summary(report) + + def batch_status(self): + """Show status of an 
in-progress or completed batch migration.""" + parser = argparse.ArgumentParser( + usage="rvl migrate batch-status --state " + ) + parser.add_argument( + "--state", help="Path to checkpoint state file", required=True + ) + args = parser.parse_args(sys.argv[3:]) + + state_path = Path(args.state).resolve() + if not state_path.exists(): + print(f"State file not found: {args.state}") + exit(1) + + # Local import to avoid circular dependency with migration module + from redisvl.migration.models import BatchState + + state_data = load_yaml(args.state) + state = BatchState.model_validate(state_data) + + print( + f"""Batch ID: {state.batch_id} +Started at: {state.started_at} +Updated at: {state.updated_at} +Current index: {state.current_index or '(none)'} +Remaining: {len(state.remaining)} +Completed: {len(state.completed)} + - Succeeded: {state.success_count} + - Failed: {state.failed_count} + - Skipped: {state.skipped_count}""" + ) + + if state.completed: + print("\nCompleted indexes:") + for idx in state.completed: + if idx.status == "success": + status_icon = "[OK]" + elif idx.status == "skipped": + status_icon = "[SKIP]" + else: + status_icon = "[FAIL]" + print(f" {status_icon} {idx.name}") + if idx.error: + print(f" Error: {idx.error}") + + if state.remaining: + print(f"\nRemaining indexes ({len(state.remaining)}):") + for name in state.remaining[:10]: + print(f" - {name}") + if len(state.remaining) > 10: + print(f" ... 
and {len(state.remaining) - 10} more") + + def _print_batch_plan_summary(self, plan_out: str, batch_plan) -> None: + """Print summary after generating batch plan.""" + import os + + abs_path = os.path.abspath(plan_out) + print(f"Batch plan written to {abs_path}") + print(f"Batch ID: {batch_plan.batch_id}") + print(f"Mode: {batch_plan.mode}") + print(f"Failure policy: {batch_plan.failure_policy}") + print(f"Requires quantization: {batch_plan.requires_quantization}") + print(f"Total indexes: {len(batch_plan.indexes)}") + print(f" - Applicable: {batch_plan.applicable_count}") + print(f" - Skipped: {batch_plan.skipped_count}") + + if batch_plan.skipped_count > 0: + print("\nSkipped indexes:") + for idx in batch_plan.indexes: + if not idx.applicable: + print(f" - {idx.name}: {idx.skip_reason}") + + print( + f""" +Next steps: + Review the plan: cat {plan_out} + Apply the migration: rvl migrate batch-apply --plan {plan_out}""" + ) + + if batch_plan.requires_quantization: + print(" (add --accept-data-loss for quantization)") + + def _print_batch_report_summary(self, report) -> None: + """Print summary after batch migration completes.""" + print(f"\nBatch migration {report.status}") + print(f"Batch ID: {report.batch_id}") + print(f"Duration: {report.summary.total_duration_seconds}s") + print(f"Total: {report.summary.total_indexes}") + print(f" - Succeeded: {report.summary.successful}") + print(f" - Failed: {report.summary.failed}") + print(f" - Skipped: {report.summary.skipped}") + + if report.summary.failed > 0: + print("\nFailed indexes:") + for idx in report.indexes: + if idx.status == "failed": + print(f" - {idx.name}: {idx.error}") diff --git a/redisvl/migration/batch_executor.py b/redisvl/migration/batch_executor.py index 42c63c35c..145dbbb9d 100644 --- a/redisvl/migration/batch_executor.py +++ b/redisvl/migration/batch_executor.py @@ -332,7 +332,7 @@ def _build_batch_report( error=idx_state.error, ) ) - if idx_state.status in ("succeeded", "success"): + if 
idx_state.status == "success": succeeded += 1 elif idx_state.status == "failed": failed += 1