From 0700702d2c005978fad01adaf0c208c84b39abcb Mon Sep 17 00:00:00 2001 From: Dina Berry Date: Thu, 26 Feb 2026 08:11:23 -0800 Subject: [PATCH] TypeScript topic 2 - create index --- ai/create-index-typescript/README.md | 271 +++++++++++ ai/create-index-typescript/article.md | 576 ++++++++++++++++++++++ ai/create-index-typescript/index.js | 606 ++++++++++++++++++++++++ ai/create-index-typescript/package.json | 29 ++ 4 files changed, 1482 insertions(+) create mode 100644 ai/create-index-typescript/README.md create mode 100644 ai/create-index-typescript/article.md create mode 100644 ai/create-index-typescript/index.js create mode 100644 ai/create-index-typescript/package.json diff --git a/ai/create-index-typescript/README.md b/ai/create-index-typescript/README.md new file mode 100644 index 0000000..6699fd6 --- /dev/null +++ b/ai/create-index-typescript/README.md @@ -0,0 +1,271 @@ +# Azure DocumentDB (MongoDB vCore) - Indexing for Embeddings + +This sample demonstrates **how to create and verify vector search indexes** in Azure DocumentDB, focusing on index lifecycle, configuration, and health verification. 
+ +## What You'll Learn + +This sample answers the key questions: +- **How do I create a vector index in my database?** +- **Why does index creation take time and what does "building" mean?** +- **How do I know if my index is working correctly?** +- **What dimension requirements must I follow?** + +You'll learn to: +- Define vector indexes via `cosmosSearchOptions` on BSON fields +- Verify dimension compatibility between index and embeddings +- Observe index build timing and understand the BUILDING → READY lifecycle +- Check index status/health via `listSearchIndexes()` output +- Confirm the index is active by testing queries + +## Focus: Index Creation & Verification (Not Algorithm Comparison) + +**This is Topic 2**: Index lifecycle and configuration +**Not Topic 3**: Algorithm comparison and parameter tuning + +We demonstrate: +✅ Index creation syntax and configuration +✅ Dimension compatibility verification +✅ Index build monitoring (BUILDING → READY) +✅ Health checks via listSearchIndexes() +✅ Confirming index works with test queries + +We do NOT cover (that's Topic 3): +❌ Comparing IVF vs HNSW vs DiskANN algorithms +❌ Tuning parameters like nprobe, ef, m +❌ Recall vs latency trade-offs + +## Prerequisites + +- Node.js 18.x or later +- Azure subscription +- Azure DocumentDB account (MongoDB vCore) +- Azure OpenAI resource with embeddings deployment + +## Setup + +1. Install dependencies: +```bash +npm install +``` + +2. Configure environment: +```bash +cp .env.example .env +# Edit .env with your Azure credentials +``` + +3. 
Update `.env` with your: + - DocumentDB connection string + - Database and collection names + - Azure OpenAI endpoint and API key + - Embedding deployment name and dimensions + +## Run the Sample + +```bash +npm start +``` + +## Sample Flow + +The sample demonstrates the complete index lifecycle: + +### Step 1: Verify Embedding Dimensions +``` +✓ Embedding generated successfully + Actual dimensions: 1536 + Expected dimensions: 1536 +✓ Dimensions match - safe to proceed +``` + +**Why this matters**: Index dimensions MUST exactly match embedding model output. + +### Step 2: Create Vector Index +``` +Index Configuration: + Name: vectorSearchIndex + Type: vector-ivf + Field Path: embedding + Dimensions: 1536 + Similarity: COS (cosine distance) + +✓ Index creation initiated + Status will be BUILDING initially + Will transition to READY when complete +``` + +**What happens**: DocumentDB initiates asynchronous index build process. + +### Step 3: Monitor Index Build Status +``` +Waiting for index "vectorSearchIndex" to be READY... + +Check 1 [5s]: Status = BUILDING + Still building... (this is normal) +Check 2 [10s]: Status = BUILDING + Still building... (this is normal) +Check 3 [15s]: Status = READY + +✓ Index is READY after 15 seconds +``` + +**Why it takes time**: DocumentDB reads all documents, calculates index structures, and stores index data. + +### Step 4: Validate Index Configuration +``` +Index Health Checklist: +✓ Index exists: PASS +✓ Status is READY: PASS +✓ Dimensions match: PASS +✓ Correct type: PASS +✓ Correct path: PASS + +✓ Index is HEALTHY and ready to use +``` + +**Health checks**: Verify all configuration is correct before using. + +### Step 5: Insert Documents +``` +Inserting 5 test documents... + ✓ Inserted: Understanding Vector Indexes (1536 dims) + ✓ Inserted: Index Build Process (1536 dims) + ... + +✓ Successfully inserted 5/5 documents +``` + +**Dimension validation**: Each insert verifies dimensions match index. 
+ +### Step 6: Confirm Index Works +``` +Test query: "How do I check if my vector index is healthy?" + +1. Generating query embedding... + ✓ Generated (1536 dimensions) + +2. Executing vector search query... + ✓ Query completed in 45ms + ✓ Found 3 results + +3. Top results: + 1. Monitoring Index Health + 2. Understanding Vector Indexes + 3. Index Build Process + +✓ INDEX IS WORKING CORRECTLY +``` + +**Confirmation**: Successfully executes vector similarity search. + +## Key Concepts + +### Index Lifecycle + +``` +1. DEFINITION → You define index via createSearchIndex() +2. BUILDING → DocumentDB builds index structure +3. READY → Index is active and queries use it +4. (FAILED) → Build failed (check logs) +``` + +### Dimension Requirements + +The **numDimensions** must EXACTLY match your embedding model: + +| Embedding Model | Dimensions | +|-----------------|------------| +| text-embedding-ada-002 | 1536 | +| text-embedding-3-small | 1536 | +| text-embedding-3-large | 3072 | + +**Mismatch = insertion errors!** + +### Index Build Timing + +| Dataset Size | Typical Build Time | +|--------------|-------------------| +| < 1,000 docs | Seconds to 1 minute | +| 1K - 10K docs | 1-5 minutes | +| 10K - 100K docs | 5-30 minutes | +| > 100K docs | 30+ minutes | + +### cosmosSearchOptions Syntax + +```javascript +const indexDefinition = { + name: "vectorSearchIndex", // Unique identifier + type: "vector-ivf", // Algorithm (IVF or HNSW) + definition: { + fields: [ + { + path: "embedding", // Field with BSON array + type: "vector", // Must be "vector" + numDimensions: 1536, // Match embedding model + similarity: "COS" // COS, IP, or L2 + } + ] + } +}; +``` + +### Checking Index Status + +```javascript +const indexes = await collection.listSearchIndexes().toArray(); +const index = indexes.find(idx => idx.name === "vectorSearchIndex"); + +console.log(index.status); // BUILDING, READY, or FAILED +``` + +## Troubleshooting + +### Issue: Dimension mismatch error on insert +**Cause**: 
Index dimensions don't match embedding dimensions +**Fix**: Drop index and recreate with correct dimensions + +### Issue: Index status stuck on BUILDING +**Cause**: Large dataset or resource constraints +**Fix**: Wait longer; large datasets take more time + +### Issue: Index status is FAILED +**Cause**: Configuration error +**Fix**: Check logs; verify field path exists; recreate index + +### Issue: Queries still slow after index created +**Cause**: Index not READY yet +**Fix**: Verify status is READY (not BUILDING) + +## Index Health Checklist + +Use this to verify your index is healthy: + +✅ Index exists (listSearchIndexes returns it) +✅ Status is READY (not BUILDING or FAILED) +✅ Dimensions match embedding model +✅ Path matches document field name +✅ Type is "vector" +✅ Similarity is set (COS, IP, or L2) +✅ Queries execute without errors +✅ Results are returned +✅ Performance is acceptable + +## Next Steps + +Now that you know how to create and verify indexes: + +1. **Topic 3: Vector Index Algorithms & Query Behavior** + - Compare IVF vs HNSW vs DiskANN + - Understand recall vs latency trade-offs + - Learn parameter tuning (nprobe, ef, m) + +2. **Topic 4: Vector Store Semantic Search** + - Use verified indexes for production search + - Implement query patterns + - Optimize for your use case + +## Resources + +- [Azure DocumentDB Vector Search](https://learn.microsoft.com/azure/documentdb/vector-search) +- [MongoDB Vector Search](https://www.mongodb.com/docs/atlas/atlas-vector-search/) +- [cosmosSearchOptions reference](https://learn.microsoft.com/azure/documentdb/mongodb-feature-support) diff --git a/ai/create-index-typescript/article.md b/ai/create-index-typescript/article.md new file mode 100644 index 0000000..0d9ad62 --- /dev/null +++ b/ai/create-index-typescript/article.md @@ -0,0 +1,576 @@ +# Indexing for Embeddings in Azure DocumentDB (MongoDB) + +**Purpose:** Learn how to **create and verify** vector search indexes in Azure DocumentDB. 
This article focuses on **index lifecycle and configuration**: defining indexes via `cosmosSearchOptions`, monitoring build status, validating dimension compatibility, and confirming indexes are active and working correctly. + +## Prerequisites + +- An Azure account with an active subscription +- Azure DocumentDB account (MongoDB vCore) +- Node.js 18.x or later +- Azure OpenAI resource with an embeddings model deployed +- Familiarity with the [DocumentDB vector search quickstart](https://learn.microsoft.com/azure/documentdb/quickstart-nodejs-vector-search) + +## What You'll Learn + +In this article, you'll answer the key questions: +- **How do I create a vector index in my database?** +- **Why does index creation take time and what does "building" mean?** +- **How do I know if my index is working correctly?** +- **What dimension requirements must I follow?** + +You'll learn to: +- Define vector indexes via `cosmosSearchOptions` on BSON fields +- Verify dimension compatibility between index and embeddings +- Observe index build timing and understand resource impact +- Check index status and health via `listSearchIndexes()` output +- Confirm the index is active by comparing query performance before/after + +## Understanding Vector Indexes in DocumentDB + +### What is a Vector Index? + +A vector index is a specialized data structure that enables **fast similarity search** on high-dimensional embedding vectors. Without an index, DocumentDB would need to scan every document and calculate distances—slow and expensive. With an index, similarity searches become orders of magnitude faster. + +### Index Lifecycle + +Vector indexes in DocumentDB go through distinct phases: + +``` +1. DEFINITION → You define index via createSearchIndex() +2. BUILDING → DocumentDB builds index structure in background +3. READY → Index is active and queries use it automatically +4. 
(FAILED) → Build failed (check logs for errors) +``` + +**Why "BUILDING" takes time:** +- DocumentDB must read all existing documents +- Calculate index structures (clusters, graphs, etc.) +- Store index data separately from documents +- The more documents, the longer this takes + +### cosmosSearchOptions + +DocumentDB uses `cosmosSearchOptions` to configure vector search indexes. `cosmosSearchOptions` is an Azure DocumentDB extension to the MongoDB index specification, not part of MongoDB's own (Atlas) vector search API, so confirm the exact syntax against the Azure DocumentDB vector search documentation. + +## Index Creation Syntax + +### Basic Vector Index Definition + +Here's the core syntax for creating a vector search index: + +```javascript +const indexDefinition = { + name: "vectorSearchIndex", // Index name (must be unique) + type: "vector-ivf", // Algorithm type (covered in Topic 3) + definition: { + fields: [ + { + path: "embedding", // Field containing vector array + type: "vector", // Must be "vector" for vector search + numDimensions: 1536, // MUST match your embedding model + similarity: "COS" // Distance metric (COS, IP, L2) + } + ] + } +}; + +await collection.createSearchIndex(indexDefinition); +``` + +### Configuration Options Explained + +| Option | Description | Required | Notes | +|--------|-------------|----------|-------| +| **name** | Index identifier | Yes | Must be unique per collection | +| **type** | Algorithm type | Yes | `vector-ivf` or `vector-hnsw` (algorithms in Topic 3) | +| **path** | Field with embeddings | Yes | Must contain BSON array of numbers | +| **type** (field) | Field type | Yes | Must be `"vector"` for vector search | +| **numDimensions** | Vector size | Yes | **MUST match embedding model exactly** | +| **similarity** | Distance function | Yes | `COS` (cosine), `IP` (inner product), `L2` (euclidean) | + +## Dimension Requirements + +### Critical Rule: Dimensions Must Match + +The **numDimensions** in your index definition **MUST exactly match** your embedding model's output: + +| Embedding Model | Dimensions | +|-----------------|------------| +| text-embedding-ada-002 | 1536 
| +| text-embedding-3-small | 1536 | +| text-embedding-3-large | 3072 | + +**What happens if dimensions don't match:** + +```javascript +// Index defined with 1536 dimensions +numDimensions: 1536 + +// But embedding has 768 dimensions +embedding: [0.1, 0.2, ...] // only 768 values + +// Result: INSERT FAILS with dimension mismatch error +``` + +### Verifying Dimension Compatibility + +Before inserting documents, verify your embedding dimensions: + +```javascript +// Generate a test embedding +const testEmbedding = await generateEmbedding("test"); +console.log(`Embedding dimensions: ${testEmbedding.length}`); + +// Compare to index definition +const indexes = await collection.listSearchIndexes().toArray(); +const vectorIndex = indexes.find(idx => idx.name === "vectorSearchIndex"); +const indexDimensions = vectorIndex.definition.fields[0].numDimensions; + +if (testEmbedding.length !== indexDimensions) { + throw new Error( + `Dimension mismatch! Embedding: ${testEmbedding.length}, Index: ${indexDimensions}` + ); +} + +console.log("✓ Dimensions match - safe to insert documents"); +``` + +## Index Build Process + +### Understanding Build Time + +Index builds are **asynchronous** and take time based on: + +| Factor | Impact on Build Time | +|--------|---------------------| +| **Document count** | More documents = longer build | +| **Vector dimensions** | Higher dimensions = more computation | +| **Index algorithm** | Different algorithms have different build costs | +| **Resource allocation** | DocumentDB cluster resources affect speed | + +**Typical build times:** +- Small (< 1,000 docs): Seconds to 1 minute +- Medium (1,000 - 10,000 docs): 1-5 minutes +- Large (10,000 - 100,000 docs): 5-30 minutes +- Very large (> 100,000 docs): 30+ minutes + +### What Happens During "BUILDING" + +While the index status is "BUILDING": + +1. **DocumentDB reads all documents** with the specified field path +2. **Extracts embedding vectors** from BSON arrays +3. 
**Builds index structures** (clusters, graphs, etc. based on algorithm) +4. **Stores index data** separately from documents +5. **Updates status to READY** when complete + +**Can you query during BUILDING?** +- Yes, queries will execute +- But they may **not use the index** (slower performance) +- Wait for READY status for optimal performance + +### Resource Impact + +Index builds consume: +- **CPU**: For index structure computation +- **Memory**: For holding index data structures +- **Storage**: Indexes are stored separately (adds to total storage) +- **I/O**: Reading documents and writing index + +**Best practice:** For large datasets, create indexes during off-peak hours or on a new collection before switching traffic. + +## Checking Index Status and Health + +### Using listSearchIndexes() + +The primary way to check index status: + +```javascript +const indexes = await collection.listSearchIndexes().toArray(); + +indexes.forEach(index => { + console.log(`Index: ${index.name}`); + console.log(` Status: ${index.status}`); // BUILDING, READY, FAILED + console.log(` Type: ${index.type}`); // vector-ivf, vector-hnsw + + if (index.definition && index.definition.fields) { + index.definition.fields.forEach(field => { + console.log(` Field: ${field.path}`); + console.log(` Dimensions: ${field.numDimensions}`); + console.log(` Similarity: ${field.similarity}`); + }); + } +}); +``` + +### Index Status Values + +| Status | Meaning | Action | +|--------|---------|--------| +| **BUILDING** | Index is being created | Queries run but may not use the index; wait for READY for optimal performance | +| **READY** | Index is active and queryable | Safe to run vector searches | +| **FAILED** | Index build failed | Check logs; verify configuration; recreate | +| **(not present)** | Index doesn't exist | Create index first | + +### Monitoring Index Build Progress + +```javascript +async function waitForIndexReady(collection, indexName, maxWaitMs = 300000) { + const startTime = Date.now(); + const checkIntervalMs = 5000; + + 
console.log(`Waiting for index "${indexName}" to be READY...`); + + while (Date.now() - startTime < maxWaitMs) { + const indexes = await collection.listSearchIndexes().toArray(); + const index = indexes.find(idx => idx.name === indexName); + + if (!index) { + console.log("Index not found"); + await new Promise(resolve => setTimeout(resolve, checkIntervalMs)); + continue; + } + + const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(0); + console.log(`[${elapsedSec}s] Status: ${index.status}`); + + if (index.status === "READY") { + console.log(`✓ Index ready after ${elapsedSec} seconds`); + return true; + } + + if (index.status === "FAILED") { + throw new Error("Index build failed"); + } + + await new Promise(resolve => setTimeout(resolve, checkIntervalMs)); + } + + throw new Error("Index build timeout"); +} +``` + +## Confirming Index is Working + +### Test 1: Index Exists and is READY + +```javascript +async function verifyIndexActive(collection, indexName) { + const indexes = await collection.listSearchIndexes().toArray(); + const index = indexes.find(idx => idx.name === indexName); + + if (!index) { + return { active: false, reason: "Index not found" }; + } + + if (index.status !== "READY") { + return { active: false, reason: `Status is ${index.status}, not READY` }; + } + + return { active: true, index: index }; +} +``` + +### Test 2: Query Executes Successfully + +```javascript +async function testVectorQuery(collection, embedding) { + try { + const results = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: embedding, + path: "embedding", + k: 5 + }, + returnStoredSource: true + } + }, + { + $project: { + _id: 1, + title: 1, + score: { $meta: "searchScore" } + } + } + ]).toArray(); + + return { success: true, resultCount: results.length }; + } catch (error) { + return { success: false, error: error.message }; + } +} +``` + +### Test 3: Performance Comparison (Before/After) + +The most definitive test: compare query performance 
with and without the index. + +**Approach:** +1. Insert documents WITHOUT an index +2. Measure query time (will be slow or fail) +3. Create the index and wait for READY +4. Measure query time again (should be much faster) + +```javascript +async function comparePerformanceBeforeAfter(collection) { + console.log("=== Performance Comparison ==="); + + // Generate test documents + const testDocs = await generateTestDocuments(100); + + // Insert WITHOUT index + console.log("\n1. Inserting documents without index..."); + await collection.insertMany(testDocs); + + // Try querying without index + console.log("\n2. Querying WITHOUT index..."); + try { + const queryEmbedding = await generateEmbedding("test query"); + const startTime = Date.now(); + + const results = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryEmbedding, + path: "embedding", + k: 5 + }, + returnStoredSource: true + } + } + ]).toArray(); + + const withoutIndexTime = Date.now() - startTime; + console.log(` Query time: ${withoutIndexTime}ms (or may fail without index)`); + } catch (error) { + console.log(` Query failed: ${error.message}`); + console.log(" (This is expected - vector search requires an index)"); + } + + // Create index + console.log("\n3. Creating vector index..."); + await createVectorSearchIndex(collection, "vector-ivf"); + + // Wait for index to be ready + console.log("\n4. Waiting for index to be READY..."); + await waitForIndexReady(collection, "vectorSearchIndex"); + + // Query WITH index + console.log("\n5. 
Querying WITH index..."); + const queryEmbedding = await generateEmbedding("test query"); + const startTime = Date.now(); + + const results = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryEmbedding, + path: "embedding", + k: 5 + }, + returnStoredSource: true + } + } + ]).toArray(); + + const withIndexTime = Date.now() - startTime; + console.log(` Query time: ${withIndexTime}ms`); + console.log(` Results: ${results.length} documents`); + + console.log("\n6. Summary:"); + console.log(` ✓ Index is working correctly`); + console.log(` ✓ Query completed in ${withIndexTime}ms`); + console.log(` ✓ Returned ${results.length} results`); +} +``` + +## Index Health Checklist + +Use this checklist to verify your index is healthy: + +✅ **Index exists**: `listSearchIndexes()` returns your index +✅ **Status is READY**: `index.status === "READY"` +✅ **Dimensions match**: Index dimensions = embedding dimensions +✅ **Path is correct**: `field.path` matches your document field name +✅ **Type is vector**: `field.type === "vector"` +✅ **Similarity is set**: `field.similarity` is COS, IP, or L2 +✅ **Queries execute**: Vector searches complete without errors +✅ **Results are returned**: Queries return expected documents +✅ **Performance is good**: Query latency is acceptable (< 100ms typical) + +## Complete Working Sample + +### Full Index Creation and Verification Flow + +```javascript +const { MongoClient } = require("mongodb"); +const { OpenAIClient, AzureKeyCredential } = require("@azure/openai"); +require("dotenv").config(); + +async function main() { + // Connect to DocumentDB + const client = new MongoClient(process.env.DOCUMENTDB_CONNECTION_STRING); + await client.connect(); + + const database = client.db(process.env.DOCUMENTDB_DATABASE_NAME); + const collection = database.collection(process.env.DOCUMENTDB_COLLECTION_NAME); + + try { + // Step 1: Verify embedding dimensions + console.log("Step 1: Verifying embedding dimensions..."); + const 
testEmbedding = await generateEmbedding("test"); + console.log(`✓ Embedding dimensions: ${testEmbedding.length}`); + + // Step 2: Create index with matching dimensions + console.log("\nStep 2: Creating vector index..."); + const indexDefinition = { + name: "vectorSearchIndex", + type: "vector-ivf", + definition: { + fields: [ + { + path: "embedding", + type: "vector", + numDimensions: testEmbedding.length, // Match embedding size + similarity: "COS" + } + ] + } + }; + + await collection.createSearchIndex(indexDefinition); + console.log("✓ Index creation initiated"); + + // Step 3: Monitor build status + console.log("\nStep 3: Monitoring index build..."); + await waitForIndexReady(collection, "vectorSearchIndex"); + + // Step 4: Verify index configuration + console.log("\nStep 4: Verifying index configuration..."); + const indexes = await collection.listSearchIndexes().toArray(); + const index = indexes.find(idx => idx.name === "vectorSearchIndex"); + + console.log("Index Configuration:"); + console.log(` Name: ${index.name}`); + console.log(` Status: ${index.status}`); + console.log(` Type: ${index.type}`); + console.log(` Dimensions: ${index.definition.fields[0].numDimensions}`); + console.log(` Similarity: ${index.definition.fields[0].similarity}`); + + // Step 5: Insert test documents + console.log("\nStep 5: Inserting test documents..."); + const docs = await generateTestDocuments(10); + await collection.insertMany(docs); + console.log(`✓ Inserted ${docs.length} documents`); + + // Step 6: Confirm index works with query + console.log("\nStep 6: Testing vector query..."); + const queryResult = await testVectorQuery(collection, testEmbedding); + + if (queryResult.success) { + console.log(`✓ Index is working correctly`); + console.log(`✓ Query returned ${queryResult.resultCount} results`); + } else { + console.log(`✗ Index test failed: ${queryResult.error}`); + } + + } finally { + await client.close(); + } +} +``` + +## Troubleshooting + +### Issue: Index status 
stuck on BUILDING +**Cause**: Large dataset or resource constraints +**Solution**: +- Wait longer (check every 5 minutes for large datasets) +- Monitor DocumentDB cluster metrics +- Consider creating index during off-peak hours + +### Issue: Dimension mismatch error on insert +**Cause**: Index dimensions don't match embedding dimensions +**Solution**: +```javascript +// Drop the index +await collection.dropSearchIndex("vectorSearchIndex"); + +// Recreate with correct dimensions +const correctDimensions = yourEmbedding.length; +// Create index with correctDimensions +``` + +### Issue: Index status is FAILED +**Cause**: Configuration error or resource issue +**Solution**: +- Check DocumentDB logs for specific error +- Verify field path exists in documents +- Ensure field contains BSON arrays of numbers +- Drop and recreate index with fixed configuration + +### Issue: Queries don't use index (still slow) +**Cause**: Index not READY or path mismatch +**Solution**: +- Verify status is READY (not BUILDING) +- Check index path matches query path exactly +- Ensure query uses `cosmosSearch` syntax correctly + +### Issue: "Index not found" even after creation +**Cause**: Creation command didn't complete or network issue +**Solution**: +- Check for errors in `createSearchIndex()` response +- Verify collection name is correct +- List all indexes to see what exists: `listSearchIndexes().toArray()` + +## Best Practices + +### Index Creation +✅ Create indexes BEFORE inserting large datasets (faster than retrofitting) +✅ Verify dimensions match before creating index +✅ Use descriptive index names (e.g., "contentEmbedding_ivf") +✅ Monitor build status for large collections +✅ Document your index configuration for team reference + +### Dimension Management +✅ Store embedding model name in documents for tracking +✅ Validate dimensions before bulk inserts +✅ Use consistent embedding model across all documents +✅ Test with sample documents before production deployment + +### Index 
Health Monitoring +✅ Check index status before querying (ensure READY) +✅ Monitor query performance metrics +✅ Set up alerts for index build failures +✅ Regularly verify index exists and is active +✅ Test queries after index creation to confirm functionality + +### Resource Management +✅ Create indexes during off-peak hours for large datasets +✅ Monitor cluster resources during index builds +✅ Consider scaling up temporarily for large index builds +✅ Account for index storage in capacity planning + +## Next Steps + +Now that you understand how to create and verify vector indexes: + +1. **Topic 3: Vector Index Algorithms & Query Behavior** + - Learn when to choose IVF vs. HNSW vs. DiskANN + - Understand recall vs. latency trade-offs + - Tune algorithm parameters for your workload + +2. **Topic 4: Vector Store Semantic Search** + - Use your verified indexes for production semantic search + - Implement query patterns and result handling + - Optimize for your specific use case + +## Additional Resources + +- [Azure DocumentDB Vector Search documentation](https://learn.microsoft.com/azure/documentdb/vector-search) +- [MongoDB Vector Search documentation](https://www.mongodb.com/docs/atlas/atlas-vector-search/) +- [cosmosSearchOptions reference](https://learn.microsoft.com/azure/documentdb/mongodb-feature-support) +- [BSON array format documentation](https://www.mongodb.com/docs/manual/reference/bson-types/) diff --git a/ai/create-index-typescript/index.js b/ai/create-index-typescript/index.js new file mode 100644 index 0000000..7e133e4 --- /dev/null +++ b/ai/create-index-typescript/index.js @@ -0,0 +1,606 @@ +/** + * Azure DocumentDB (MongoDB vCore) - Indexing for Embeddings Sample + * + * This sample demonstrates INDEX LIFECYCLE AND VERIFICATION: + * - Define vector indexes via cosmosSearchOptions on BSON fields + * - Verify dimension compatibility between index and embeddings + * - Observe index build timing and resource impact + * - Check index status/health via 
listSearchIndexes() output + * - Confirm the index is active by testing queries + * + * This is Topic 2: Focus on "How do I create and verify an index?" + * NOT Topic 3: Algorithm comparison and parameter tuning + */ + +const { MongoClient } = require("mongodb"); +const { OpenAIClient, AzureKeyCredential } = require("@azure/openai"); +require("dotenv").config(); + +// Configuration +const config = { + documentdb: { + connectionString: process.env.DOCUMENTDB_CONNECTION_STRING, + databaseName: process.env.DOCUMENTDB_DATABASE_NAME || "vectordb", + collectionName: process.env.DOCUMENTDB_COLLECTION_NAME || "embeddings" + }, + openai: { + endpoint: process.env.AZURE_OPENAI_ENDPOINT, + key: process.env.AZURE_OPENAI_API_KEY, + embeddingDeployment: process.env.AZURE_OPENAI_EMBEDDING_DEPLOYMENT || "text-embedding-ada-002", + dimensions: parseInt(process.env.AZURE_OPENAI_EMBEDDING_DIMENSIONS || "1536") + } +}; + +// Initialize OpenAI client +const openaiClient = new OpenAIClient( + config.openai.endpoint, + new AzureKeyCredential(config.openai.key) +); + +/** + * Generate embedding for text using Azure OpenAI + */ +async function generateEmbedding(text) { + try { + const embeddings = await openaiClient.getEmbeddings( + config.openai.embeddingDeployment, + [text] + ); + return embeddings.data[0].embedding; + } catch (error) { + console.error("Error generating embedding:", error.message); + throw error; + } +} + +/** + * Connect to DocumentDB + */ +async function connectToDocumentDB() { + const client = new MongoClient(config.documentdb.connectionString); + await client.connect(); + console.log("✓ Connected to DocumentDB"); + return client; +} + +/** + * STEP 1: Verify embedding dimensions + * Critical: Index dimensions MUST match embedding model output + */ +async function verifyEmbeddingDimensions() { + console.log("\n=== STEP 1: Verifying Embedding Dimensions ==="); + console.log("Why this matters: Index dimensions MUST exactly match embedding model output"); + + // Generate 
a test embedding + const testText = "This is a test to verify embedding dimensions"; + console.log(`\nGenerating test embedding for: "${testText}"`); + + const embedding = await generateEmbedding(testText); + const actualDimensions = embedding.length; + + console.log(`✓ Embedding generated successfully`); + console.log(` Actual dimensions: ${actualDimensions}`); + console.log(` Expected dimensions (from config): ${config.openai.dimensions}`); + + if (actualDimensions !== config.openai.dimensions) { + console.log(`⚠ WARNING: Dimension mismatch detected!`); + console.log(` Please update AZURE_OPENAI_EMBEDDING_DIMENSIONS in .env to ${actualDimensions}`); + + // Update config for this session + config.openai.dimensions = actualDimensions; + console.log(` Auto-corrected for this session`); + } else { + console.log(`✓ Dimensions match - safe to proceed`); + } + + return actualDimensions; +} + +/** + * STEP 2: Create vector search index + * Demonstrates: cosmosSearchOptions syntax and configuration + */ +async function createVectorSearchIndex(collection, dimensions) { + console.log("\n=== STEP 2: Creating Vector Search Index ==="); + console.log("This initiates the index build process (asynchronous)"); + + // Check if index already exists + const existingIndexes = await collection.listSearchIndexes().toArray(); + const existingIndex = existingIndexes.find(idx => idx.name === "vectorSearchIndex"); + + if (existingIndex) { + console.log(`\n⚠ Index "vectorSearchIndex" already exists`); + console.log(` Current status: ${existingIndex.status}`); + console.log(` Skipping creation (will monitor existing index)`); + return existingIndex; + } + + // Define index using cosmosSearchOptions + const indexDefinition = { + name: "vectorSearchIndex", + type: "vector-ivf", // Using IVF for this demonstration + definition: { + fields: [ + { + path: "embedding", // Field containing BSON array + type: "vector", // Must be "vector" for vector search + numDimensions: dimensions, // MUST match 
embedding model + similarity: "COS" // Cosine distance (most common) + } + ] + } + }; + + console.log("\nIndex Configuration:"); + console.log(` Name: ${indexDefinition.name}`); + console.log(` Type: ${indexDefinition.type}`); + console.log(` Field Path: embedding`); + console.log(` Dimensions: ${dimensions}`); + console.log(` Similarity: COS (cosine distance)`); + + console.log("\nCreating index..."); + const startTime = Date.now(); + + try { + await collection.createSearchIndex(indexDefinition); + const creationTime = Date.now() - startTime; + + console.log(`✓ Index creation initiated (${creationTime}ms)`); + console.log(` Status will be BUILDING initially`); + console.log(` Will transition to READY when complete`); + + return indexDefinition; + } catch (error) { + console.error(`✗ Index creation failed: ${error.message}`); + throw error; + } +} + +/** + * STEP 3: Monitor index build status + * Demonstrates: Index lifecycle (BUILDING → READY) and timing + */ +async function monitorIndexBuildStatus(collection, indexName, maxWaitMs = 300000) { + console.log("\n=== STEP 3: Monitoring Index Build Status ==="); + console.log("Why indexes take time to build:"); + console.log(" • DocumentDB must read all existing documents"); + console.log(" • Calculate index structures (clusters, graphs, etc.)"); + console.log(" • Store index data separately from documents"); + console.log(" • More documents = longer build time"); + + const startTime = Date.now(); + const checkIntervalMs = 5000; // Check every 5 seconds + let checkCount = 0; + + console.log(`\nWaiting for index "${indexName}" to be READY...`); + console.log(`(Will check every ${checkIntervalMs / 1000} seconds, max ${maxWaitMs / 1000 / 60} minutes)\n`); + + while (Date.now() - startTime < maxWaitMs) { + checkCount++; + + try { + const indexes = await collection.listSearchIndexes().toArray(); + const index = indexes.find(idx => idx.name === indexName); + + if (!index) { + console.log(`Check ${checkCount}: Index not found 
(may be creating...)`); + await new Promise(resolve => setTimeout(resolve, checkIntervalMs)); + continue; + } + + const elapsedSec = ((Date.now() - startTime) / 1000).toFixed(0); + const status = index.status || "UNKNOWN"; + + console.log(`Check ${checkCount} [${elapsedSec}s]: Status = ${status}`); + + if (status === "READY") { + console.log(`\n✓ Index is READY after ${elapsedSec} seconds`); + console.log(` Total checks: ${checkCount}`); + console.log(` The index is now active and will be used for queries`); + return true; + } else if (status === "FAILED") { + console.log(`\n✗ Index build FAILED`); + console.log(` Check DocumentDB logs for details`); + throw new Error("Index build failed"); + } else if (status === "BUILDING") { + console.log(` Still building... (this is normal)`); + } + + } catch (error) { + console.log(`Check ${checkCount}: Error checking status - ${error.message}`); + } + + await new Promise(resolve => setTimeout(resolve, checkIntervalMs)); + } + + console.log(`\n⚠ Index build did not complete within ${maxWaitMs / 1000 / 60} minutes`); + console.log(` This may be normal for very large datasets`); + console.log(` Check back later or increase timeout`); + return false; +} + +/** + * STEP 4: Validate index configuration + * Demonstrates: How to check index health and verify settings + */ +async function validateIndexConfiguration(collection, expectedDimensions) { + console.log("\n=== STEP 4: Validating Index Configuration ==="); + console.log("Index Health Checklist:"); + + const checks = { + exists: false, + ready: false, + dimensionsMatch: false, + correctType: false, + correctPath: false + }; + + try { + // Get all indexes + const indexes = await collection.listSearchIndexes().toArray(); + console.log(`\nTotal search indexes found: ${indexes.length}`); + + if (indexes.length === 0) { + console.log("✗ No indexes found"); + return { healthy: false, checks }; + } + + // Find our vector index + const vectorIndex = indexes.find(idx => idx.name === 
"vectorSearchIndex"); + + if (!vectorIndex) { + console.log("✗ Vector index not found"); + return { healthy: false, checks }; + } + + checks.exists = true; + console.log(`✓ Index exists: ${vectorIndex.name}`); + + // Check status + console.log(`\nStatus: ${vectorIndex.status || "UNKNOWN"}`); + if (vectorIndex.status === "READY") { + checks.ready = true; + console.log("✓ Status is READY"); + } else { + console.log(`✗ Status is not READY (current: ${vectorIndex.status})`); + } + + // Check configuration + console.log("\nConfiguration Details:"); + console.log(` Type: ${vectorIndex.type}`); + checks.correctType = vectorIndex.type === "vector-ivf" || vectorIndex.type === "vector-hnsw"; + + if (vectorIndex.definition && vectorIndex.definition.fields) { + vectorIndex.definition.fields.forEach((field, i) => { + console.log(`\n Field ${i + 1}:`); + console.log(` Path: ${field.path}`); + console.log(` Type: ${field.type}`); + console.log(` Dimensions: ${field.numDimensions}`); + console.log(` Similarity: ${field.similarity}`); + + checks.correctPath = field.path === "embedding"; + checks.dimensionsMatch = field.numDimensions === expectedDimensions; + + if (!checks.correctPath) { + console.log(` ⚠ Path mismatch: expected "embedding"`); + } + + if (!checks.dimensionsMatch) { + console.log(` ⚠ Dimension mismatch: expected ${expectedDimensions}`); + } + }); + } + + // Summary + console.log("\n--- Health Check Summary ---"); + console.log(`✓ Index exists: ${checks.exists ? "PASS" : "FAIL"}`); + console.log(`✓ Status is READY: ${checks.ready ? "PASS" : "FAIL"}`); + console.log(`✓ Dimensions match: ${checks.dimensionsMatch ? "PASS" : "FAIL"}`); + console.log(`✓ Correct type: ${checks.correctType ? "PASS" : "FAIL"}`); + console.log(`✓ Correct path: ${checks.correctPath ? 
"PASS" : "FAIL"}`); + + const healthy = Object.values(checks).every(check => check === true); + + if (healthy) { + console.log("\n✓ Index is HEALTHY and ready to use"); + } else { + console.log("\n⚠ Index has issues - review checks above"); + } + + return { healthy, checks, index: vectorIndex }; + + } catch (error) { + console.error("Error validating index:", error.message); + return { healthy: false, checks, error: error.message }; + } +} + +/** + * STEP 5: Insert documents with embeddings + */ +async function insertDocumentsWithEmbeddings(collection) { + console.log("\n=== STEP 5: Inserting Documents with Embeddings ==="); + + const documents = [ + { + _id: "1", + title: "Understanding Vector Indexes", + content: "Vector indexes enable fast similarity search on high-dimensional embeddings by organizing data for efficient retrieval.", + category: "tutorial" + }, + { + _id: "2", + title: "Index Build Process", + content: "When you create an index, DocumentDB reads documents, extracts embeddings, and builds specialized data structures for fast queries.", + category: "concepts" + }, + { + _id: "3", + title: "Dimension Requirements", + content: "The dimension count in your index definition must exactly match your embedding model output to avoid insertion errors.", + category: "best-practices" + }, + { + _id: "4", + title: "Monitoring Index Health", + content: "Use listSearchIndexes to check index status, verify configuration, and ensure your index is READY before querying.", + category: "operations" + }, + { + _id: "5", + title: "BSON Array Format", + content: "DocumentDB stores embeddings as native BSON arrays, which provides efficient storage and query performance for vector data.", + category: "technical" + } + ]; + + console.log(`Inserting ${documents.length} test documents...`); + let successCount = 0; + + for (const doc of documents) { + try { + // Generate embedding + const embedding = await generateEmbedding(doc.content); + + // Verify dimensions before insert 
+ if (embedding.length !== config.openai.dimensions) { + console.log(`⚠ Skipping ${doc._id}: dimension mismatch (${embedding.length} vs ${config.openai.dimensions})`); + continue; + } + + // Add embedding to document (BSON array format) + const docWithEmbedding = { + ...doc, + embedding: embedding, // Stored as native BSON array + embeddingModel: config.openai.embeddingDeployment, + embeddingDimensions: embedding.length, + createdAt: new Date() + }; + + // Insert + await collection.insertOne(docWithEmbedding); + successCount++; + console.log(` ✓ Inserted: ${doc.title} (${embedding.length} dims)`); + + } catch (error) { + console.error(` ✗ Error inserting ${doc._id}: ${error.message}`); + } + } + + console.log(`\n✓ Successfully inserted ${successCount}/${documents.length} documents`); + return successCount; +} + +/** + * STEP 6: Confirm index is working by testing queries + */ +async function confirmIndexWorking(collection) { + console.log("\n=== STEP 6: Confirming Index Works ==="); + console.log("Testing vector similarity search..."); + + const testQuery = "How do I check if my vector index is healthy?"; + console.log(`\nTest query: "${testQuery}"`); + + try { + // Generate query embedding + console.log("\n1. Generating query embedding..."); + const queryEmbedding = await generateEmbedding(testQuery); + console.log(` ✓ Generated (${queryEmbedding.length} dimensions)`); + + // Execute vector search + console.log("\n2. 
Executing vector search query..."); + const startTime = Date.now(); + + const results = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryEmbedding, + path: "embedding", + k: 3 // Top 3 results + }, + returnStoredSource: true + } + }, + { + $project: { + _id: 1, + title: 1, + category: 1, + score: { $meta: "searchScore" } + } + } + ]).toArray(); + + const queryTime = Date.now() - startTime; + + // Display results + console.log(` ✓ Query completed in ${queryTime}ms`); + console.log(` ✓ Found ${results.length} results`); + + if (results.length > 0) { + console.log("\n3. Top results:"); + results.forEach((result, i) => { + console.log(` ${i + 1}. ${result.title}`); + console.log(` Category: ${result.category}`); + console.log(` Score: ${result.score.toFixed(4)}`); + }); + + console.log("\n✓ INDEX IS WORKING CORRECTLY"); + console.log(` • Query executed successfully`); + console.log(` • Results returned in ${queryTime}ms`); + console.log(` • Semantic matches found`); + + return { working: true, queryTime, resultCount: results.length }; + } else { + console.log("\n⚠ Query executed but returned no results"); + console.log(" This may mean:"); + console.log(" • No documents match the query"); + console.log(" • Index may not be fully active yet"); + + return { working: false, queryTime, resultCount: 0 }; + } + + } catch (error) { + console.error(`\n✗ Query failed: ${error.message}`); + console.log("\nPossible causes:"); + console.log(" • Index is not READY yet"); + console.log(" • Index path doesn't match document field"); + console.log(" • Dimension mismatch"); + + return { working: false, error: error.message }; + } +} + +/** + * Display index lifecycle summary + */ +function displayIndexLifecycleSummary() { + console.log("\n" + "=".repeat(80)); + console.log("INDEX LIFECYCLE SUMMARY"); + console.log("=".repeat(80)); + + console.log("\n📋 What We Demonstrated:"); + console.log(" 1. 
✓ Verified embedding dimensions match index requirements"); + console.log(" 2. ✓ Created vector index via cosmosSearchOptions"); + console.log(" 3. ✓ Monitored index build status (BUILDING → READY)"); + console.log(" 4. ✓ Validated index configuration and health"); + console.log(" 5. ✓ Inserted documents with proper dimension validation"); + console.log(" 6. ✓ Confirmed index works by executing queries"); + + console.log("\n🔑 Key Takeaways:"); + console.log(" • Index dimensions MUST exactly match embedding model"); + console.log(" • Index builds are asynchronous (BUILDING → READY)"); + console.log(" • Always verify index status before querying"); + console.log(" • Use listSearchIndexes() to check health"); + console.log(" • Test queries to confirm index is working"); + + console.log("\n📊 Index Build Timing:"); + console.log(" • Small datasets (< 1K docs): Seconds to 1 minute"); + console.log(" • Medium (1K-10K docs): 1-5 minutes"); + console.log(" • Large (10K-100K docs): 5-30 minutes"); + console.log(" • Very large (> 100K docs): 30+ minutes"); + + console.log("\n🎯 Next Steps:"); + console.log(" → Topic 3: Learn about algorithm choices (IVF vs HNSW vs DiskANN)"); + console.log(" → Topic 4: Implement semantic search patterns"); + console.log(" → Production: Monitor index health and query performance"); +} + +/** + * Main execution flow + */ +async function main() { + console.log("=".repeat(80)); + console.log("Azure DocumentDB - Vector Indexing Lifecycle & Verification"); + console.log("=".repeat(80)); + console.log("\nThis sample demonstrates:"); + console.log(" • How to create vector indexes via cosmosSearchOptions"); + console.log(" • Index build process and timing"); + console.log(" • How to verify index health and configuration"); + console.log(" • Dimension compatibility requirements"); + console.log(" • Confirming indexes work correctly"); + + let client; + + try { + // Connect + client = await connectToDocumentDB(); + const database = 
client.db(config.documentdb.databaseName); + const collection = database.collection(config.documentdb.collectionName); + + // STEP 1: Verify embedding dimensions + const dimensions = await verifyEmbeddingDimensions(); + + // STEP 2: Create index + await createVectorSearchIndex(collection, dimensions); + + // STEP 3: Monitor build status + const isReady = await monitorIndexBuildStatus(collection, "vectorSearchIndex"); + + if (!isReady) { + console.log("\n⚠ Index not ready yet - skipping remaining steps"); + console.log(" You can re-run this script later to complete validation"); + return; + } + + // STEP 4: Validate configuration + const validation = await validateIndexConfiguration(collection, dimensions); + + if (!validation.healthy) { + console.log("\n⚠ Index validation failed - review issues above"); + return; + } + + // STEP 5: Insert documents + const insertedCount = await insertDocumentsWithEmbeddings(collection); + + if (insertedCount === 0) { + console.log("\n⚠ No documents inserted - skipping query test"); + return; + } + + // STEP 6: Confirm index works + const queryTest = await confirmIndexWorking(collection); + + if (!queryTest.working) { + console.log("\n⚠ Index may not be working correctly - review errors above"); + return; + } + + // Summary + displayIndexLifecycleSummary(); + + console.log("\n" + "=".repeat(80)); + console.log("✓ Sample completed successfully"); + console.log("=".repeat(80)); + + } catch (error) { + console.error("\n" + "=".repeat(80)); + console.error("✗ Error:", error.message); + console.error("=".repeat(80)); + console.error(error); + process.exit(1); + } finally { + if (client) { + await client.close(); + console.log("\n✓ Connection closed"); + } + } +} + +// Run the sample +if (require.main === module) { + main().catch(console.error); +} + +module.exports = { + generateEmbedding, + connectToDocumentDB, + verifyEmbeddingDimensions, + createVectorSearchIndex, + monitorIndexBuildStatus, + validateIndexConfiguration, + 
insertDocumentsWithEmbeddings, + confirmIndexWorking +}; diff --git a/ai/create-index-typescript/package.json b/ai/create-index-typescript/package.json new file mode 100644 index 0000000..ef0360a --- /dev/null +++ b/ai/create-index-typescript/package.json @@ -0,0 +1,29 @@ +{ + "name": "documentdb-vector-indexing", + "version": "1.0.0", + "description": "Azure DocumentDB (MongoDB) Vector Indexing for Embeddings Sample", + "main": "index.js", + "scripts": { + "start": "node index.js", + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [ + "azure", + "documentdb", + "mongodb", + "vector-search", + "embeddings", + "indexing", + "ai" + ], + "author": "", + "license": "MIT", + "dependencies": { + "mongodb": "^6.3.0", + "@azure/openai": "^1.0.0-beta.12", + "dotenv": "^16.4.5" + }, + "engines": { + "node": ">=18.0.0" + } +}