From 5e0eec4ae5c31cb2d5e16558b3fcdc46868b6958 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Mon, 11 May 2026 14:45:08 -0700 Subject: [PATCH 01/11] Add select-algorithm samples for all languages with CI Add DocumentDB vector index algorithm selection samples demonstrating HNSW, IVF, and DiskANN index types across TypeScript, Python, Go, Java, and .NET. Each sample creates indexes with documented defaults, performs vector searches, and compares results. CI updated to validate all new samples in the existing workflow matrix. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/validate-samples.yml | 38 +- .../.devcontainer/devcontainer.json | 48 ++ ai/select-algorithm-dotnet/.gitignore | 7 + ai/select-algorithm-dotnet/CompareAll.cs | 259 ++++++ .../Models/Configuration.cs | 41 + .../Models/HotelData.cs | 19 + ai/select-algorithm-dotnet/Program.cs | 40 + ai/select-algorithm-dotnet/README.md | 137 ++++ .../SelectAlgorithm.csproj | 23 + .../Utilities/AzureIdentityTokenHandler.cs | 32 + ai/select-algorithm-dotnet/Utils.cs | 190 +++++ ai/select-algorithm-dotnet/appsettings.json | 24 + ai/select-algorithm-dotnet/data/README.md | 5 + .../output/compare_all.txt | 47 ++ ai/select-algorithm-dotnet/quickstart.md | 542 +++++++++++++ ai/select-algorithm-go/.gitignore | 7 + ai/select-algorithm-go/README.md | 199 +++++ ai/select-algorithm-go/data/README.md | 5 + ai/select-algorithm-go/go.mod | 35 + ai/select-algorithm-go/go.sum | 95 +++ ai/select-algorithm-go/output/compare_all.txt | 39 + ai/select-algorithm-go/quickstart.md | 495 ++++++++++++ ai/select-algorithm-go/src/compare_all.go | 325 ++++++++ ai/select-algorithm-go/src/main.go | 28 + ai/select-algorithm-go/src/utils.go | 385 +++++++++ ai/select-algorithm-java/.gitignore | 7 + ai/select-algorithm-java/README.md | 128 +++ ai/select-algorithm-java/data/README.md | 5 + .../output/compare_all.txt | 50 ++ ai/select-algorithm-java/pom.xml | 79 ++ 
ai/select-algorithm-java/quickstart.md | 528 +++++++++++++ .../selectalgorithm/CompareAll.java | 223 ++++++ .../documentdb/selectalgorithm/Main.java | 17 + .../documentdb/selectalgorithm/Utils.java | 190 +++++ ai/select-algorithm-python/.gitignore | 8 + ai/select-algorithm-python/README.md | 96 +++ ai/select-algorithm-python/data/README.md | 5 + .../output/compare_all.txt | 47 ++ ai/select-algorithm-python/quickstart.md | 457 +++++++++++ ai/select-algorithm-python/requirements.txt | 11 + ai/select-algorithm-python/src/compare_all.py | 206 +++++ ai/select-algorithm-python/src/utils.py | 171 ++++ ai/select-algorithm-typescript/.gitignore | 8 + ai/select-algorithm-typescript/README.md | 116 +++ ai/select-algorithm-typescript/data/README.md | 5 + .../output/compare_all.txt | 42 + .../package-lock.json | 735 ++++++++++++++++++ ai/select-algorithm-typescript/package.json | 20 + ai/select-algorithm-typescript/quickstart.md | 441 +++++++++++ .../src/compare-all.ts | 232 ++++++ .../src/select-algorithm.ts | 287 +++++++ ai/select-algorithm-typescript/src/utils.ts | 205 +++++ ai/select-algorithm-typescript/tsconfig.json | 18 + 53 files changed, 7393 insertions(+), 9 deletions(-) create mode 100644 ai/select-algorithm-dotnet/.devcontainer/devcontainer.json create mode 100644 ai/select-algorithm-dotnet/.gitignore create mode 100644 ai/select-algorithm-dotnet/CompareAll.cs create mode 100644 ai/select-algorithm-dotnet/Models/Configuration.cs create mode 100644 ai/select-algorithm-dotnet/Models/HotelData.cs create mode 100644 ai/select-algorithm-dotnet/Program.cs create mode 100644 ai/select-algorithm-dotnet/README.md create mode 100644 ai/select-algorithm-dotnet/SelectAlgorithm.csproj create mode 100644 ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs create mode 100644 ai/select-algorithm-dotnet/Utils.cs create mode 100644 ai/select-algorithm-dotnet/appsettings.json create mode 100644 ai/select-algorithm-dotnet/data/README.md create mode 100644 
ai/select-algorithm-dotnet/output/compare_all.txt create mode 100644 ai/select-algorithm-dotnet/quickstart.md create mode 100644 ai/select-algorithm-go/.gitignore create mode 100644 ai/select-algorithm-go/README.md create mode 100644 ai/select-algorithm-go/data/README.md create mode 100644 ai/select-algorithm-go/go.mod create mode 100644 ai/select-algorithm-go/go.sum create mode 100644 ai/select-algorithm-go/output/compare_all.txt create mode 100644 ai/select-algorithm-go/quickstart.md create mode 100644 ai/select-algorithm-go/src/compare_all.go create mode 100644 ai/select-algorithm-go/src/main.go create mode 100644 ai/select-algorithm-go/src/utils.go create mode 100644 ai/select-algorithm-java/.gitignore create mode 100644 ai/select-algorithm-java/README.md create mode 100644 ai/select-algorithm-java/data/README.md create mode 100644 ai/select-algorithm-java/output/compare_all.txt create mode 100644 ai/select-algorithm-java/pom.xml create mode 100644 ai/select-algorithm-java/quickstart.md create mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java create mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java create mode 100644 ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java create mode 100644 ai/select-algorithm-python/.gitignore create mode 100644 ai/select-algorithm-python/README.md create mode 100644 ai/select-algorithm-python/data/README.md create mode 100644 ai/select-algorithm-python/output/compare_all.txt create mode 100644 ai/select-algorithm-python/quickstart.md create mode 100644 ai/select-algorithm-python/requirements.txt create mode 100644 ai/select-algorithm-python/src/compare_all.py create mode 100644 ai/select-algorithm-python/src/utils.py create mode 100644 ai/select-algorithm-typescript/.gitignore create mode 100644 ai/select-algorithm-typescript/README.md create mode 100644 
ai/select-algorithm-typescript/data/README.md create mode 100644 ai/select-algorithm-typescript/output/compare_all.txt create mode 100644 ai/select-algorithm-typescript/package-lock.json create mode 100644 ai/select-algorithm-typescript/package.json create mode 100644 ai/select-algorithm-typescript/quickstart.md create mode 100644 ai/select-algorithm-typescript/src/compare-all.ts create mode 100644 ai/select-algorithm-typescript/src/select-algorithm.ts create mode 100644 ai/select-algorithm-typescript/src/utils.ts create mode 100644 ai/select-algorithm-typescript/tsconfig.json diff --git a/.github/workflows/validate-samples.yml b/.github/workflows/validate-samples.yml index 7bd29ec..06f494c 100644 --- a/.github/workflows/validate-samples.yml +++ b/.github/workflows/validate-samples.yml @@ -31,6 +31,7 @@ jobs: sample: - vector-search-typescript - vector-search-agent-typescript + - select-algorithm-typescript steps: - name: Checkout code @@ -52,10 +53,16 @@ jobs: run: npm run build validate-dotnet: - name: .NET + name: .NET - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 continue-on-error: false + strategy: + fail-fast: false + matrix: + sample: + - documentdb-samples.sln + - ai/select-algorithm-dotnet/SelectAlgorithm.csproj steps: - name: Checkout code @@ -66,8 +73,8 @@ jobs: with: dotnet-version: '8.0.x' - - name: Build solution - run: dotnet build documentdb-samples.sln + - name: Build + run: dotnet build ${{ matrix.sample }} validate-go: name: Go - ${{ matrix.sample }} @@ -80,6 +87,7 @@ jobs: sample: - vector-search-go - vector-search-agent-go + - select-algorithm-go steps: - name: Checkout code @@ -102,14 +110,20 @@ jobs: go build -o /dev/null "$f" utils.go done else - go build ./... + go build -o /dev/null ./... 
fi validate-python: - name: Python + name: Python - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 continue-on-error: false + strategy: + fail-fast: false + matrix: + sample: + - vector-search-python + - select-algorithm-python steps: - name: Checkout code @@ -121,19 +135,25 @@ jobs: python-version: '3.11' - name: Install dependencies - working-directory: ai/vector-search-python + working-directory: ai/${{ matrix.sample }} run: pip install -r requirements.txt - name: Validate Python syntax - working-directory: ai/vector-search-python + working-directory: ai/${{ matrix.sample }} run: | find . -name "*.py" -exec python -m py_compile {} + validate-java: - name: Java + name: Java - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 continue-on-error: false + strategy: + fail-fast: false + matrix: + sample: + - vector-search-java + - select-algorithm-java steps: - name: Checkout code @@ -147,5 +167,5 @@ jobs: cache: 'maven' - name: Compile Java - working-directory: ai/vector-search-java + working-directory: ai/${{ matrix.sample }} run: mvn compile -DskipTests diff --git a/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json new file mode 100644 index 0000000..aafd623 --- /dev/null +++ b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json @@ -0,0 +1,48 @@ +{ + "name": "Azure DocumentDB Select Algorithm - .NET 8", + "image": "mcr.microsoft.com/devcontainers/dotnet:1-8.0-bookworm", + + "features": { + "ghcr.io/devcontainers/features/azure-cli:1": {}, + "ghcr.io/devcontainers/features/github-cli:1": {}, + "ghcr.io/devcontainers/features/common-utils:2": { + "installZsh": true, + "configureZshAsDefaultShell": true, + "installOhMyZsh": true + } + }, + + "customizations": { + "vscode": { + "extensions": [ + "ms-dotnettools.csdevkit", + "ms-dotnettools.vscodeintellicode-csharp", + "ms-azuretools.vscode-azureresourcegroups", + "ms-azuretools.vscode-cosmosdb", + 
"mongodb.mongodb-vscode" + ], + "settings": { + "dotnet.completion.showCompletionItemsFromUnimportedNamespaces": true, + "files.exclude": { + "**/bin": true, + "**/obj": true + } + } + } + }, + + "postCreateCommand": "dotnet restore && dotnet build", + "remoteUser": "vscode", + + "containerEnv": { + "DOTNET_CLI_TELEMETRY_OPTOUT": "1", + "DOTNET_NOLOGO": "1" + }, + + "mounts": [ + "source=${localEnv:HOME}${localEnv:USERPROFILE}/.azure,target=/home/vscode/.azure,type=bind,consistency=cached" + ], + + "capAdd": ["SYS_PTRACE"], + "securityOpt": ["seccomp:unconfined"] +} diff --git a/ai/select-algorithm-dotnet/.gitignore b/ai/select-algorithm-dotnet/.gitignore new file mode 100644 index 0000000..de285c3 --- /dev/null +++ b/ai/select-algorithm-dotnet/.gitignore @@ -0,0 +1,7 @@ +bin/ +obj/ +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-dotnet/CompareAll.cs b/ai/select-algorithm-dotnet/CompareAll.cs new file mode 100644 index 0000000..8a4dec3 --- /dev/null +++ b/ai/select-algorithm-dotnet/CompareAll.cs @@ -0,0 +1,259 @@ +/// Unified comparison runner for all 9 combinations (3 algorithms × 3 similarity metrics). +/// Executes vector searches sequentially for fair timing and prints a formatted comparison table. 
+ +namespace SelectAlgorithm; + +using MongoDB.Driver; +using MongoDB.Bson; +using OpenAI.Embeddings; +using SelectAlgorithm.Models; + +public static class CompareAll +{ + private record IndexConfig(string Name, string Kind, string Similarity, BsonDocument ExtraParams); + + private record SearchResult(string Algorithm, string Metric, string Top1Name, double Top1Score, string Top2Name, double Top2Score); + + private static string GetAlgoDisplay(string kind) => kind switch + { + "vector-ivf" => "IVF", + "vector-hnsw" => "HNSW", + "vector-diskann" => "DiskANN", + _ => kind + }; + + public static void Run(AppConfiguration appConfig) + { + Console.WriteLine(new string('=', 60)); + Console.WriteLine(" Compare All Algorithms × Metrics"); + Console.WriteLine(" 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP"); + Console.WriteLine(new string('=', 60)); + + // Settings come from appConfig; the QUERY_TEXT and TOP_K env vars override the VectorSearch section when set (an invalid TOP_K falls back to the configured value) + var databaseName = appConfig.MongoDB.DatabaseName; + var dataFile = appConfig.DataFiles.WithVectors; + var vectorField = appConfig.Embedding.EmbeddedField; + var dimensions = appConfig.Embedding.Dimensions; + var batchSize = appConfig.MongoDB.LoadBatchSize; + var queryText = Environment.GetEnvironmentVariable("QUERY_TEXT") ?? appConfig.VectorSearch.Query; + var topK = int.TryParse(Environment.GetEnvironmentVariable("TOP_K"), out var envTopK) && envTopK > 0 ? envTopK : appConfig.VectorSearch.TopK;
+ + var mongoClient = Utils.GetMongoClientPasswordless(appConfig); + var embeddingClient = Utils.GetEmbeddingClient(appConfig); + + try + { + var database = mongoClient.GetDatabase(databaseName); + + // Drop collection for a clean comparison + database.DropCollection("hotels"); + Console.WriteLine("Dropped existing 'hotels' collection (if any)"); + + var collection = database.GetCollection("hotels"); + + // Load data once into single collection + var data = Utils.ReadJsonFile(dataFile); + var documents = data.Where(d => d.Contains(vectorField)).ToList(); + Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings"); + Utils.InsertData(collection, documents, batchSize); + + // Generate ONE embedding for the query (reused for all 9 searches) + Console.WriteLine($"\nQuery: \"{queryText}\""); + Console.WriteLine($"Top K: {topK}"); + var embeddingResult = embeddingClient.GenerateEmbedding(queryText); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + Console.WriteLine("Embedding generated (reused for all searches)\n"); + + // Define 9 index configurations + var configs = BuildIndexConfigs(dimensions); + + // Run each config sequentially: drop→create→wait→search + // DocumentDB doesn't allow multiple vector indexes of the same kind on the same field + Console.WriteLine("Running 9 algorithm × metric combinations...\n"); + var results = new List(); + foreach (var config in configs) + { + // 1. Drop all existing vector indexes + DropVectorIndexes(collection, vectorField); + + // 2. Create this specific index, sized by the configured Embedding:Dimensions + CreateIndex(collection, vectorField, config, dimensions); + Console.WriteLine($" ✓ {config.Name} created"); + + // 3. Wait for index to build + Thread.Sleep(5000); + + // 4. Search + var searchResults = RunVectorSearch(collection, queryVector, vectorField, config.Name, topK); + + // 5. 
Extract top 2 results and record + var algoDisplay = GetAlgoDisplay(config.Kind); + var top1Name = "-"; var top1Score = 0.0; + var top2Name = "-"; var top2Score = 0.0; + if (searchResults.Count > 0) + { + var doc1 = searchResults[0]; + top1Name = doc1.Contains("HotelName") ? doc1["HotelName"].AsString : "Unknown"; + top1Score = doc1.Contains("score") ? doc1["score"].ToDouble() : 0.0; + } + if (searchResults.Count > 1) + { + var doc2 = searchResults[1]; + top2Name = doc2.Contains("HotelName") ? doc2["HotelName"].AsString : "Unknown"; + top2Score = doc2.Contains("score") ? doc2["score"].ToDouble() : 0.0; + } + results.Add(new SearchResult(algoDisplay, config.Similarity, top1Name, top1Score, top2Name, top2Score)); + } + + // Print comparison table + PrintComparisonTable(results); + } + finally + { + // Cleanup: drop the comparison collection + try + { + var database = mongoClient.GetDatabase(databaseName); + database.DropCollection("hotels"); + Console.WriteLine("\nCleanup: dropped collection 'hotels'"); + } + catch (Exception ex) + { + Console.WriteLine($"Cleanup warning: {ex.Message}"); + } + mongoClient.Cluster.Dispose(); + } + } + + private static List BuildIndexConfigs(int dimensions) + { + string[] metrics = ["COS", "L2", "IP"]; + var configs = new List(); + + // IVF + foreach (var metric in metrics) + configs.Add(new IndexConfig($"vector_ivf_{metric.ToLower()}", "vector-ivf", metric, new BsonDocument { { "numLists", 1 } })); + + // HNSW + foreach (var metric in metrics) + configs.Add(new IndexConfig($"vector_hnsw_{metric.ToLower()}", "vector-hnsw", metric, new BsonDocument { { "m", 16 }, { "efConstruction", 64 } })); + + // DiskANN + foreach (var metric in metrics) + configs.Add(new IndexConfig($"vector_diskann_{metric.ToLower()}", "vector-diskann", metric, new BsonDocument { { "maxDegree", 32 }, { "lBuild", 50 } })); + + return configs; + } + + private static void DropVectorIndexes(IMongoCollection collection, string vectorField) + { + try + { + using var 
cursor = collection.Indexes.List(); + foreach (var idx in cursor.ToList()) + { + var name = idx.GetValue("name", "").AsString; + var key = idx.GetValue("key", new BsonDocument()).AsBsonDocument; + if (key.Contains(vectorField) && key[vectorField].AsString == "cosmosSearch") + { + try { collection.Indexes.DropOne(name); } catch { } + } + } + } + catch { } + } + + private static void CreateIndex(IMongoCollection collection, string vectorField, IndexConfig config, int dimensions) + { + // Creates the cosmosSearch index described by config; 'dimensions' comes from Embedding:Dimensions so the index matches the configured vectors. First drop any index with the same name. + try + { + collection.Indexes.DropOne(config.Name); + } + catch (MongoCommandException) + { + // Index doesn't exist, that's fine + } + + var cosmosSearchOptions = new BsonDocument + { + { "kind", config.Kind }, + { "dimensions", dimensions }, + { "similarity", config.Similarity } + }; + + foreach (var param in config.ExtraParams) + { + cosmosSearchOptions.Add(param); + } + + var command = new BsonDocument + { + { "createIndexes", collection.CollectionNamespace.CollectionName }, + { "indexes", new BsonArray + { + new BsonDocument + { + { "name", config.Name }, + { "key", new BsonDocument(vectorField, "cosmosSearch") }, + { "cosmosSearchOptions", cosmosSearchOptions } + } + } + } + }; + + try + { + collection.Database.RunCommand(command); + } + catch (MongoCommandException ex) when (ex.Message.Contains("already exists")) + { + // Index already exists with same config — idempotent + } + } + + private static List RunVectorSearch( + IMongoCollection collection, + float[] queryVector, + string vectorField, + string indexName, + int topK) + { + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + })), + new BsonDocument("$project", new BsonDocument + { + 
{ "HotelName", 1 }, + { "score", new BsonDocument("$meta", "searchScore") 
} + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + private static void PrintComparisonTable(List results) + { + Console.WriteLine(); + Console.WriteLine("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐"); + Console.WriteLine($"│ {"Algorithm",-9}│ {"Metric",-7}│ {"Top 1 Result",-27}│ {"Score",-7}│ {"Top 2 Result",-27}│ {"Score",-7}│ {"Diff",-6}│"); + Console.WriteLine("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + + for (var i = 0; i < results.Count; i++) + { + var r = results[i]; + var diff = Math.Abs(r.Top1Score - r.Top2Score); + var top1Display = r.Top1Name.Length > 27 ? r.Top1Name[..24] + "..." : r.Top1Name; + var top2Display = r.Top2Name.Length > 27 ? r.Top2Name[..24] + "..." : r.Top2Name; + Console.WriteLine($"│ {r.Algorithm,-9}│ {r.Metric,-7}│ {top1Display,-27}│ {r.Top1Score,-7:F4}│ {top2Display,-27}│ {r.Top2Score,-7:F4}│ {diff,-6:F4}│"); + if (i < results.Count - 1) + Console.WriteLine("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + } + Console.WriteLine("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘"); + } +} diff --git a/ai/select-algorithm-dotnet/Models/Configuration.cs b/ai/select-algorithm-dotnet/Models/Configuration.cs new file mode 100644 index 0000000..cbca25b --- /dev/null +++ b/ai/select-algorithm-dotnet/Models/Configuration.cs @@ -0,0 +1,41 @@ +namespace SelectAlgorithm.Models; + +public class AppConfiguration +{ + public AzureOpenAIConfiguration AzureOpenAI { get; set; } = new(); + public MongoDBConfiguration MongoDB { get; set; } = new(); + public EmbeddingConfiguration Embedding { get; set; } = new(); + public VectorSearchConfiguration VectorSearch { get; set; } = new(); + public DataFilesConfiguration DataFiles { get; set; } = new(); +} + +public class AzureOpenAIConfiguration +{ 
+ public string Endpoint { get; set; } = string.Empty; + public string EmbeddingModel { get; set; } = "text-embedding-3-small"; +} + +public class MongoDBConfiguration +{ + public string ClusterName { get; set; } = string.Empty; + public string DatabaseName { get; set; } = "Hotels"; + public int LoadBatchSize { get; set; } = 100; +} + +public class EmbeddingConfiguration +{ + public string EmbeddedField { get; set; } = "DescriptionVector"; + public int Dimensions { get; set; } = 1536; +} + +public class VectorSearchConfiguration +{ + public string Query { get; set; } = "luxury hotel near the beach"; + public string Similarity { get; set; } = ""; + public int TopK { get; set; } = 5; +} + +public class DataFilesConfiguration +{ + public string WithVectors { get; set; } = "data/Hotels_Vector.json"; +} diff --git a/ai/select-algorithm-dotnet/Models/HotelData.cs b/ai/select-algorithm-dotnet/Models/HotelData.cs new file mode 100644 index 0000000..4821ee3 --- /dev/null +++ b/ai/select-algorithm-dotnet/Models/HotelData.cs @@ -0,0 +1,19 @@ +using MongoDB.Bson; +using MongoDB.Bson.Serialization.Attributes; + +namespace SelectAlgorithm.Models; + +public class HotelData +{ + [BsonId] + [BsonRepresentation(BsonType.ObjectId)] + public string? Id { get; set; } + + public string HotelId { get; set; } = string.Empty; + public string HotelName { get; set; } = string.Empty; + public string Description { get; set; } = string.Empty; + public string Category { get; set; } = string.Empty; + + [BsonExtraElements] + public BsonDocument? 
ExtraElements { get; set; } +} diff --git a/ai/select-algorithm-dotnet/Program.cs b/ai/select-algorithm-dotnet/Program.cs new file mode 100644 index 0000000..37992ad --- /dev/null +++ b/ai/select-algorithm-dotnet/Program.cs @@ -0,0 +1,45 @@ +using Microsoft.Extensions.Configuration; +using SelectAlgorithm.Models; + +namespace SelectAlgorithm; + +// Console entry point: loads configuration and dispatches to the requested sample command. +class Program +{ + // args[0] (optional, case-insensitive) selects the command; "compare-all" is the default. + static void Main(string[] args) + { + Console.WriteLine(); + Console.WriteLine("Select Algorithm Demo - Azure DocumentDB Vector Search (.NET)"); + Console.WriteLine(new string('-', 60)); + Console.WriteLine(); + + // One-shot run: no file watcher is needed, so reloadOnChange is off. Environment variables are added last and override appsettings.json. + var configuration = new ConfigurationBuilder() + .SetBasePath(Directory.GetCurrentDirectory()) + .AddJsonFile("appsettings.json", optional: false, reloadOnChange: false) + .AddEnvironmentVariables() + .Build(); + + var appConfig = new AppConfiguration(); + configuration.Bind(appConfig); + + var command = args.Length > 0 ? args[0].ToLowerInvariant() : "compare-all"; + + switch (command) + { + case "compare-all": + CompareAll.Run(appConfig); + break; + default: + Console.WriteLine($"Unknown command: {command}"); + Console.WriteLine("Usage: dotnet run -- compare-all"); + // Non-zero exit code so shells and CI can detect the bad invocation. + Environment.ExitCode = 1; + return; + } + + Console.WriteLine(); + Console.WriteLine("Done!"); + } +} diff --git a/ai/select-algorithm-dotnet/README.md b/ai/select-algorithm-dotnet/README.md new file mode 100644 index 0000000..2621f77 --- /dev/null +++ b/ai/select-algorithm-dotnet/README.md @@ -0,0 +1,137 @@ +# Select Algorithm - .NET (C#) + +Demonstrates three vector index algorithms available in Azure DocumentDB: + +| Algorithm | Best For | Cluster Tier | Key Parameters | +|-----------|----------|--------------|----------------| +| **IVF** | < 10,000 documents | M10+ | `numLists` | +| **HNSW** | 10,000–50,000 documents | M30+ | `m`, `efConstruction` | +| **DiskANN** | 50,000+ documents | M40+ | `maxDegree`, `lBuild` | + +## Prerequisites + +- [.NET 8 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) +- Azure DocumentDB cluster +- Azure OpenAI resource with an 
embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. **Configure environment:** + + The .NET sample uses `appsettings.json` for configuration. After deploying with `azd up`, you can export values: + + ```bash + azd env get-values + ``` + + Then update `appsettings.json` with your Azure resource values. + +2. Edit `appsettings.json` with your configuration: + + ```json + { + "AzureOpenAI": { + "EmbeddingModel": "text-embedding-3-small", + "Endpoint": "https://<your-openai-resource>.openai.azure.com" + }, + "MongoDB": { + "ClusterName": "<your-cluster-name>", + "DatabaseName": "Hotels", + "LoadBatchSize": 100 + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536, + "EmbeddingSizeBatch": 16 + }, + "DataFiles": { + "WithVectors": "../data/Hotels_Vector.json" + } + } + ``` + +3. (Optional) Copy the shared data file into the local `data/` folder and set `DataFiles:WithVectors` to `data/Hotels_Vector.json`; with the `../data/Hotels_Vector.json` path shown above, the shared file is read in place: + + ```bash + cp ../data/Hotels_Vector.json data/ + ``` + +4. Restore packages: + + ```bash + dotnet restore + ``` + +## Usage + +Run all 9 combinations (default): + +```bash +dotnet run +``` + +## Configuration + +| Setting (appsettings.json) | Default | Description | +|---------------------------|---------|-------------| +| `MongoDB:ClusterName` | (required) | DocumentDB cluster name | +| `AzureOpenAI:Endpoint` | (required) | Azure OpenAI endpoint | +| `AzureOpenAI:EmbeddingModel` | `text-embedding-3-small` | Embedding model deployment name | +| `DataFiles:WithVectors` | `../data/Hotels_Vector.json` | Path to vectors JSON file | +| `Embedding:EmbeddedField` | `DescriptionVector` | Field name containing embeddings | +| `Embedding:Dimensions` | `1536` | Vector dimensions | +| `MongoDB:DatabaseName` | `Hotels` | Target database name | +| `MongoDB:LoadBatchSize` | `100` | Batch size for data loading | +| `Embedding:EmbeddingSizeBatch` | `16` | Batch size for embedding requests | + +**Additional environment variables for compare mode:** + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | `luxury hotel 
near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Show detailed per-result output (currently not read by the .NET compare runner) | + +## How It Works + +1. **Connect** to DocumentDB using Microsoft Entra ID (OIDC) passwordless authentication +2. **Load** hotel documents with pre-computed embeddings from `Hotels_Vector.json` +3. For each of the 9 algorithm/metric combinations, in sequence: drops any existing vector index → creates the index → waits 5 seconds for it to build → searches +4. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially +5. Prints a formatted comparison table with the top two results, their scores, and the score difference for each combination + +## Index Parameters + +| Algorithm | Kind | Parameters | +|-----------|------|------------| +| IVF | `vector-ivf` | numLists=1 | +| HNSW | `vector-hnsw` | m=16, efConstruction=64 | +| DiskANN | `vector-diskann` | maxDegree=32, lBuild=50 | + +## Authentication + +This sample uses `DefaultAzureCredential` for both: +- **DocumentDB**: OIDC-based MongoDB authentication +- **Azure OpenAI**: Token-based authentication with `https://cognitiveservices.azure.com/.default` scope + +Ensure you are logged in with `az login` and have appropriate RBAC roles assigned. 
+ +## Project Structure + +``` +select-algorithm-dotnet/ +├── .devcontainer/ +│ └── devcontainer.json # Dev container configuration +├── Models/ +│ ├── Configuration.cs # App configuration model +│ └── HotelData.cs # Hotel document model +├── Utilities/ +│ └── AzureIdentityTokenHandler.cs # OIDC token handler +├── appsettings.json # Configuration file +├── CompareAll.cs # Unified 9-combination comparison runner +├── Program.cs # Entry point +├── README.md # This file +├── SelectAlgorithm.csproj # Project file +└── Utils.cs # Shared helpers (connection, embedding, search) +``` diff --git a/ai/select-algorithm-dotnet/SelectAlgorithm.csproj b/ai/select-algorithm-dotnet/SelectAlgorithm.csproj new file mode 100644 index 0000000..331e522 --- /dev/null +++ b/ai/select-algorithm-dotnet/SelectAlgorithm.csproj @@ -0,0 +1,23 @@ + + + Exe + net8.0 + enable + enable + SelectAlgorithm + + + + + + + + + + + + + PreserveNewest + + + diff --git a/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs b/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs new file mode 100644 index 0000000..eca94fd --- /dev/null +++ b/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs @@ -0,0 +1,32 @@ +using Azure.Core; +using MongoDB.Driver.Authentication.Oidc; + +namespace SelectAlgorithm.Utilities; + +internal sealed class AzureIdentityTokenHandler( + TokenCredential credential, + string? 
tenantId
+) : IOidcCallback
+{
+    private readonly string[] scopes = ["https://ossrdbms-aad.database.windows.net/.default"];
+
+    public OidcAccessToken GetOidcAccessToken(OidcCallbackParameters parameters, CancellationToken cancellationToken)
+    {
+        AccessToken token = credential.GetToken(
+            new TokenRequestContext(scopes, tenantId: tenantId),
+            cancellationToken
+        );
+
+        return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow);
+    }
+
+    public async Task GetOidcAccessTokenAsync(OidcCallbackParameters parameters, CancellationToken cancellationToken)
+    {
+        AccessToken token = await credential.GetTokenAsync(
+            new TokenRequestContext(scopes, parentRequestId: null, tenantId: tenantId),
+            cancellationToken
+        );
+
+        return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow);
+    }
+}
diff --git a/ai/select-algorithm-dotnet/Utils.cs b/ai/select-algorithm-dotnet/Utils.cs
new file mode 100644
index 0000000..62590ad
--- /dev/null
+++ b/ai/select-algorithm-dotnet/Utils.cs
@@ -0,0 +1,240 @@
+using MongoDB.Driver;
+using MongoDB.Driver.Authentication.Oidc;
+using MongoDB.Bson;
+using MongoDB.Bson.Serialization;
+using Azure.Identity;
+using Azure.Core;
+using Azure.AI.OpenAI;
+using OpenAI.Embeddings;
+using SelectAlgorithm.Models;
+
+namespace SelectAlgorithm;
+
+/// <summary>
+/// OIDC callback that hands the MongoDB driver access tokens obtained from
+/// <see cref="DefaultAzureCredential"/> for the scope listed in <c>Scopes</c>.
+/// </summary>
+public class AzureOidcCallback : IOidcCallback
+{
+    private readonly DefaultAzureCredential _credential;
+    private static readonly string[] Scopes = { "https://ossrdbms-aad.database.windows.net/.default" };
+
+    public AzureOidcCallback(DefaultAzureCredential credential) => _credential = credential;
+
+    /// <summary>Acquires a token synchronously and passes along the time left until it expires.</summary>
+    public OidcAccessToken GetOidcAccessToken(OidcCallbackParameters parameters, CancellationToken cancellationToken)
+    {
+        var token = _credential.GetToken(new TokenRequestContext(Scopes), cancellationToken);
+        return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow);
+    }
+
+    /// <summary>Asynchronous counterpart of <see cref="GetOidcAccessToken"/>.</summary>
+    public async Task<OidcAccessToken> GetOidcAccessTokenAsync(OidcCallbackParameters parameters, CancellationToken cancellationToken)
+    {
+        var token = await _credential.GetTokenAsync(new TokenRequestContext(Scopes), cancellationToken);
+        return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow);
+    }
+}
+
+/// <summary>
+/// Helpers shared by the sample: client construction, data loading, vector-index
+/// cleanup, vector search, and result printing.
+/// </summary>
+public static class Utils
+{
+    /// <summary>
+    /// Creates a MongoDB client for <c>{ClusterName}.global.mongocluster.cosmos.azure.com</c>
+    /// that authenticates through <see cref="AzureOidcCallback"/>.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The cluster name is missing from configuration.</exception>
+    public static IMongoClient GetMongoClientPasswordless(AppConfiguration config)
+    {
+        var clusterName = config.MongoDB.ClusterName;
+        if (string.IsNullOrEmpty(clusterName))
+            throw new InvalidOperationException("MongoDB:ClusterName is required in appsettings.json");
+
+        var credential = new DefaultAzureCredential();
+
+        var connectionString = $"mongodb+srv://{clusterName}.global.mongocluster.cosmos.azure.com/";
+        var settings = MongoClientSettings.FromConnectionString(connectionString);
+        settings.ConnectTimeout = TimeSpan.FromSeconds(120);
+        settings.UseTls = true;
+        settings.RetryWrites = false;
+
+        // Custom OIDC callback using DefaultAzureCredential
+        // Chains through CLI, managed identity, etc.
+        var oidcCallback = new AzureOidcCallback(credential);
+        settings.Credential = MongoCredential.CreateOidcCredential(oidcCallback, null);
+
+        return new MongoClient(settings);
+    }
+
+    /// <summary>
+    /// Creates an embedding client for the configured Azure OpenAI endpoint and model,
+    /// authenticating with <see cref="DefaultAzureCredential"/>.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The endpoint is missing from configuration.</exception>
+    public static EmbeddingClient GetEmbeddingClient(AppConfiguration config)
+    {
+        var endpoint = config.AzureOpenAI.Endpoint;
+        if (string.IsNullOrEmpty(endpoint))
+            throw new InvalidOperationException("AzureOpenAI:Endpoint is required in appsettings.json");
+
+        var model = config.AzureOpenAI.EmbeddingModel;
+
+        var credential = new DefaultAzureCredential();
+        var azureClient = new AzureOpenAIClient(new Uri(endpoint), credential);
+        return azureClient.GetEmbeddingClient(model);
+    }
+
+    /// <summary>Reads the file at <paramref name="path"/> and deserializes its JSON text with <see cref="BsonSerializer"/>.</summary>
+    /// <exception cref="FileNotFoundException">No file exists at <paramref name="path"/>.</exception>
+    public static List ReadJsonFile(string path)
+    {
+        if (!File.Exists(path))
+            throw new FileNotFoundException($"Data file not found: {path}");
+
+        var json = File.ReadAllText(path);
+        return BsonSerializer.Deserialize>(json);
+    }
+
+    /// <summary>
+    /// Loads <paramref name="data"/> into <paramref name="collection"/> in batches. Does nothing when
+    /// the collection already holds at least as many documents; otherwise any existing documents
+    /// are deleted first.
+    /// </summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is zero or negative.</exception>
+    public static void InsertData(IMongoCollection collection, List data, int batchSize)
+    {
+        var totalDocuments = data.Count;
+        var existingCount = collection.CountDocuments(new BsonDocument());
+
+        if (existingCount >= totalDocuments)
+        {
+            Console.WriteLine($"Collection already has {existingCount} documents, skipping insert");
+            return;
+        }
+
+        // Validate before deleting anything: a non-positive step would never advance the loop below.
+        if (batchSize <= 0)
+            throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be greater than zero");
+
+        if (existingCount > 0)
+        {
+            collection.DeleteMany(new BsonDocument());
+        }
+
+        var insertedCount = 0;
+        for (var i = 0; i < totalDocuments; i += batchSize)
+        {
+            var batch = data.Skip(i).Take(batchSize).ToList();
+            try
+            {
+                collection.InsertMany(batch, new InsertManyOptions { IsOrdered = false });
+                insertedCount += batch.Count;
+            }
+            catch (MongoBulkWriteException ex)
+            {
+                // The insert is unordered, so every document is attempted; count only the
+                // ones that did not come back with a write error.
+                var failedCount = Math.Min(ex.WriteErrors.Count, batch.Count);
+                insertedCount += batch.Count - failedCount;
+                Console.WriteLine($"Warning: {failedCount} document(s) failed to insert: {ex.Message}");
+            }
+            Thread.Sleep(100);
+        }
+
+        Console.WriteLine($"Inserted {insertedCount}/{totalDocuments} documents");
+    }
+
+    /// <summary>
+    /// Drops every index whose key maps <paramref name="vectorField"/> to <c>"cosmosSearch"</c>.
+    /// Failures are reported as a console warning rather than thrown.
+    /// </summary>
+    public static void DropVectorIndexes(IMongoCollection collection, string vectorField)
+    {
+        try
+        {
+            using var cursor = collection.Indexes.List();
+            var indexes = cursor.ToList();
+            foreach (var index in indexes)
+            {
+                if (index.Contains("key"))
+                {
+                    var key = index["key"].AsBsonDocument;
+                    // Other index types may store a non-string value here (for example 1 or -1), so
+                    // test the BSON type before reading it as a string.
+                    if (key.Contains(vectorField) && key[vectorField].IsString && key[vectorField].AsString == "cosmosSearch")
+                    {
+                        var indexName = index["name"].AsString;
+                        collection.Indexes.DropOne(indexName);
+                        Console.WriteLine($"Dropped existing vector index: {indexName}");
+                    }
+                }
+            }
+        }
+        catch (Exception ex)
+        {
+            Console.WriteLine($"Warning: Error dropping indexes: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// Embeds <paramref name="query"/> and runs a <c>cosmosSearch</c> aggregation against
+    /// <paramref name="vectorField"/>, asking for <paramref name="topK"/> results projected
+    /// as <c>{ document, score }</c>. The <paramref name="model"/> argument is not read here.
+    /// </summary>
+    public static List<BsonDocument> PerformVectorSearch(
+        IMongoCollection collection,
+        EmbeddingClient client,
+        string query,
+        string vectorField,
+        string model,
+        int topK = 5)
+    {
+        var embeddingResult = client.GenerateEmbedding(query);
+        var queryVector = embeddingResult.Value.ToFloats().ToArray();
+
+        var pipeline = new[]
+        {
+            new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument
+            {
+                { "vector", new BsonArray(queryVector.Select(f => (double)f)) },
+                { "path", vectorField },
+                { "k", topK }
+            })),
+            new BsonDocument("$project", new BsonDocument
+            {
+                { "document", "$$ROOT" },
+                { "score", new BsonDocument("$meta", "searchScore") }
+            })
+        };
+
+        return collection.Aggregate<BsonDocument>(pipeline).ToList();
+    }
+
+    /// <summary>
+    /// Prints a numbered list of hotel names and scores under a banner naming
+    /// <paramref name="algorithm"/>. Falls back to "Unknown" and 0.0 for missing fields.
+    /// </summary>
+    public static void PrintSearchResults(List<BsonDocument> results, string algorithm)
+    {
+        Console.WriteLine();
+        Console.WriteLine(new string('=', 60));
+        Console.WriteLine($" {algorithm} Search Results ({results.Count} found)");
+        Console.WriteLine(new string('=', 60));
+
+        for (var i = 0; i < results.Count; i++)
+        {
+            var result = results[i];
+            var doc = result.Contains("document") ? result["document"].AsBsonDocument : result;
+            var name = doc.Contains("HotelName") ? doc["HotelName"].AsString
+                : doc.Contains("name") ? doc["name"].AsString
+                : "Unknown";
+            var score = result.Contains("score") ? result["score"].ToDouble() : 0.0;
+            Console.WriteLine($" {i + 1}. {name} (score: {score:F4})");
+        }
+
+        Console.WriteLine();
+    }
+}
diff --git a/ai/select-algorithm-dotnet/appsettings.json b/ai/select-algorithm-dotnet/appsettings.json
new file mode 100644
index 0000000..5572a48
--- /dev/null
+++ b/ai/select-algorithm-dotnet/appsettings.json
@@ -0,0 +1,24 @@
+{
+  "AzureOpenAI": {
+    "Endpoint": "https://oaidctfqpct77ndi.openai.azure.com/",
+    "EmbeddingModel": "text-embedding-3-small"
+  },
+  "MongoDB": {
+    "ClusterName": "docdb-dctfqpct77ndi",
+    "DatabaseName": "Hotels",
+    "LoadBatchSize": 100
+  },
+  "Embedding": {
+    "EmbeddedField": "DescriptionVector",
+    "Dimensions": 1536,
+    "EmbeddingSizeBatch": 16
+  },
+  "VectorSearch": {
+    "Query": "quintessential lodging near running trails, eateries, retail",
+    "Similarity": "",
+    "TopK": 5
+  },
+  "DataFiles": {
+    "WithVectors": "data/Hotels_Vector.json"
+  }
+}
diff --git a/ai/select-algorithm-dotnet/data/README.md b/ai/select-algorithm-dotnet/data/README.md
new file mode 100644
index 0000000..c918009
--- /dev/null
+++ b/ai/select-algorithm-dotnet/data/README.md
@@ -0,0 +1,5 @@
+# Data Files
+
+Copy `Hotels_Vector.json` into this folder before running
the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-dotnet/output/compare_all.txt b/ai/select-algorithm-dotnet/output/compare_all.txt new file mode 100644 index 0000000..4f4d995 --- /dev/null +++ b/ai/select-algorithm-dotnet/output/compare_all.txt @@ -0,0 +1,47 @@ +============================================================ + Compare All Algorithms × Metrics + 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP +============================================================ +Dropped existing 'hotels' collection (if any) + +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Query: "luxury hotel near the beach" +Top K: 5 +Embedding generated (reused for all searches) + +Running 9 algorithm × metric combinations... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 
0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md new file mode 100644 index 0000000..d0b761b --- /dev/null +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -0,0 +1,542 @@ +--- +title: Quickstart - Vector index with .NET +description: Compare DiskANN, HNSW, and IVF vector search algorithms in Azure DocumentDB using the .NET client library with passwordless authentication. +ms.devlang: csharp +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with .NET in Azure DocumentDB + +This article shows you how to compare all three vector search algorithms (DiskANN, HNSW, and IVF) in Azure DocumentDB using the .NET client library. 
The sample demonstrates how each algorithm performs with different similarity functions (COS, L2, IP) and helps you choose the right configuration for your workload. This quickstart uses a sample hotel dataset in a JSON file with pre-calculated vectors from the `text-embedding-3-small` model. + + + +Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/main/ai/select-algorithm-dotnet) on GitHub. + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [.NET 9.0 SDK](https://dotnet.microsoft.com/download/dotnet/9.0) or later. .NET 9.0 is a Standard Term Support (STS) release. Use the latest available .NET SDK for long-term production workloads. + +## Create data file with vectors + +1. Create a new data directory for the hotels data file: + + ### [Bash](#tab/bash) + + ```bash + mkdir data + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name data + ``` + + --- + +2. Download the `Hotels_Vector.json` [raw data file with vectors](https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json) to your `data` directory: + + ### [Bash](#tab/bash) + + ```bash + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify the file downloaded successfully: + + ### [Bash](#tab/bash) + + ```bash + ls data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem data\Hotels_Vector.json + ``` + + --- + + You should see `Hotels_Vector.json` in the `data` directory. + +## Create a .NET project + +1. 
Create a new directory for your project and initialize the .NET console application: + + ### [Bash](#tab/bash) + + ```bash + mkdir select-algorithm-dotnet + cd select-algorithm-dotnet + dotnet new console --framework net9.0 + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name select-algorithm-dotnet + Set-Location select-algorithm-dotnet + dotnet new console --framework net9.0 + ``` + + --- + + Verify the project was created: + + ### [Bash](#tab/bash) + + ```bash + ls *.csproj + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem *.csproj + ``` + + --- + +2. Install the required NuGet packages: + + ```bash + dotnet add package Azure.AI.OpenAI --version 2.1.0 + dotnet add package Azure.Identity --version 1.17.1 + dotnet add package MongoDB.Driver --version 3.0.0 + dotnet add package Microsoft.Extensions.Configuration --version 9.0.0 + dotnet add package Microsoft.Extensions.Configuration.Binder --version 9.0.0 + dotnet add package Microsoft.Extensions.Configuration.EnvironmentVariables --version 9.0.0 + dotnet add package Microsoft.Extensions.Configuration.Json --version 9.0.0 + dotnet add package Microsoft.Extensions.DependencyInjection --version 9.0.0 + dotnet add package Microsoft.Extensions.Logging --version 9.0.0 + dotnet add package Microsoft.Extensions.Logging.Console --version 9.0.0 + ``` + + These packages provide: + - `Azure.AI.OpenAI`: Azure OpenAI client library to create vector embeddings + - `Azure.Identity`: Azure Identity library for passwordless authentication with DefaultAzureCredential + - `MongoDB.Driver`: MongoDB driver for .NET to interact with DocumentDB + - `Microsoft.Extensions.*`: Configuration, dependency injection, and logging infrastructure + + Verify installed packages: + + ```bash + dotnet list package + ``` + +3. Create environment variables for authentication. 
The sample uses DefaultAzureCredential for passwordless authentication: + + ### [Bash](#tab/bash) + + ```bash + export AZURE_OPENAI_EMBEDDING_ENDPOINT="https://<your-openai-resource>.openai.azure.com" + export AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-small" + export MONGO_CLUSTER_NAME="<your-cluster-name>" + export AZURE_TENANT_ID="<your-tenant-id>" + export DATA_FILE_WITH_VECTORS="../../data/Hotels_Vector.json" + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + $env:AZURE_OPENAI_EMBEDDING_ENDPOINT="https://<your-openai-resource>.openai.azure.com" + $env:AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-small" + $env:MONGO_CLUSTER_NAME="<your-cluster-name>" + $env:AZURE_TENANT_ID="<your-tenant-id>" + $env:DATA_FILE_WITH_VECTORS="../../data/Hotels_Vector.json" + ``` + + --- + + Replace the placeholder values with your own information: + - `<your-openai-resource>`: Your Azure OpenAI resource name + - `<your-cluster-name>`: Your Azure DocumentDB cluster name + - `<your-tenant-id>`: Your Microsoft Entra tenant ID + + You should always prefer passwordless authentication. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate .NET apps to Azure services by using the Azure SDK for .NET](/dotnet/azure/sdk/authentication). + +4. Sign in with Azure CLI for passwordless authentication: + + ```bash + az login + ``` + +5. Create an `appsettings.json` configuration file: + + ### [Bash](#tab/bash) + + ```bash + touch appsettings.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType File -Name appsettings.json + ``` + + --- + + Add this content to `appsettings.json`: + + ```json + { + "DatabaseName": "Hotels", + "EmbeddedField": "DescriptionVector", + "EmbeddingDimensions": 1536, + "LoadBatchSize": 100, + "SearchQuery": "quintessential lodging near running trails, eateries, retail", + "TopK": 5 + } + ``` + +## Create code files + +Continue the project by creating code files for vector search comparison.
When you are done, the project structure should look like this: + +``` +├── data/ +│ └── Hotels_Vector.json # Hotel data with vector embeddings +└── select-algorithm-dotnet/ + ├── Models/ + │ ├── Configuration.cs # Configuration model classes + │ └── HotelData.cs # Hotel data model + ├── Utilities/ + │ └── AzureIdentityTokenHandler.cs # OIDC token handler + ├── CompareAll.cs # Algorithm comparison code + ├── Program.cs # Main application entry point + ├── Utils.cs # Shared utility functions + ├── appsettings.json # Configuration settings + ├── global.json # .NET SDK version specification + └── SelectAlgorithm.csproj # Project file +``` + +1. Create the directory structure: + + ### [Bash](#tab/bash) + + ```bash + mkdir Models + mkdir Utilities + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name Models + New-Item -ItemType Directory -Name Utilities + ``` + + --- + +2. Create the code files: + + ### [Bash](#tab/bash) + + ```bash + touch CompareAll.cs + touch Utils.cs + touch Utilities/AzureIdentityTokenHandler.cs + touch Models/Configuration.cs + touch Models/HotelData.cs + touch global.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType File -Name CompareAll.cs + New-Item -ItemType File -Name Utils.cs + New-Item -ItemType File -Path Utilities\AzureIdentityTokenHandler.cs + New-Item -ItemType File -Path Models\Configuration.cs + New-Item -ItemType File -Path Models\HotelData.cs + New-Item -ItemType File -Name global.json + ``` + + --- + +## Create the algorithm comparison code + +### Program.cs + +Replace the contents of `Program.cs` with this code: + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Program.cs" ::: + +This main entry point: +- Loads configuration from appsettings.json and environment variables +- Sets up dependency injection with logging infrastructure +- Initializes Azure OpenAI and DocumentDB clients using passwordless authentication +- Calls the comparison code in `CompareAll.cs` to test all algorithms +- Runs the comparison and prints results in a table format + +### CompareAll.cs + +Add this code to `CompareAll.cs`: + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/CompareAll.cs" ::: + +This service: +- Manages the comparison workflow
for all algorithms +- Creates collections and indexes for each algorithm/similarity combination +- Inserts data and executes vector searches +- Measures and collects latency metrics +- Configures algorithm-specific parameters for index creation and search + +### Supporting files + +Create the following supporting files in the project: + +#### Utils.cs + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Utils.cs" ::: + +#### Utilities/AzureIdentityTokenHandler.cs + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs" ::: + +#### Models/Configuration.cs + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Models/Configuration.cs" ::: + +#### Models/HotelData.cs + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Models/HotelData.cs" ::: + +These supporting files provide: +- Passwordless authentication setup for Azure OpenAI and DocumentDB +- OIDC token handler for automatic token refresh +- JSON file reading and deserialization +- Batch data insertion with error handling +- Results formatting and display + +### global.json + +Add this code to `global.json`: + +```json +{ + "sdk": { + "version": "9.0.200", + "rollForward": "latestFeature" + } +} +``` + +This file specifies the .NET SDK version requirements for the project. + +## Run the code + +1. Build the project: + + ```bash + dotnet build + ``` + +2. Run the application to compare all algorithms with COS similarity (default): + + ```bash + dotnet run + ``` + + The application creates three collections (`hotels_diskann_cos`, `hotels_hnsw_cos`, `hotels_ivf_cos`), inserts data, creates vector indexes, and performs searches on each. + +3. 
To compare all algorithms with all similarity functions, set environment variables: + + ### [Bash](#tab/bash) + + ```bash + export ALGORITHM=all + export SIMILARITY=all + dotnet run + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + $env:ALGORITHM="all" + $env:SIMILARITY="all" + dotnet run + ``` + + --- + + This creates nine collections (3 algorithms x 3 similarity functions) and compares all combinations. + +4. To test a specific algorithm with a specific similarity function: + + ### [Bash](#tab/bash) + + ```bash + export ALGORITHM=diskann + export SIMILARITY=COS + dotnet run + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + $env:ALGORITHM="diskann" + $env:SIMILARITY="COS" + dotnet run + ``` + + --- + +### Expected output + +The application displays progress logs and a comparison table. Results vary based on data and server load: + +``` +Vector Algorithm Comparison + Database: Hotels + Algorithms: all + Similarity: COS + Collections to query: hotels_diskann_cos, hotels_hnsw_cos, hotels_ivf_cos + Search query: "quintessential lodging near running trails, eateries, retail" + +Generating query embedding... +Query embedding: 1536 dimensions + +--- DiskANN / COS --- +Collection: hotels_diskann_cos +Created collection: hotels_diskann_cos +Inserted: 50/50 +Created vector index: vectorIndex_diskann_cos +Executing vector search... +[OK] 5 results, 45ms + +--- HNSW / COS --- +Collection: hotels_hnsw_cos +Created collection: hotels_hnsw_cos +Inserted: 50/50 +Created vector index: vectorIndex_hnsw_cos +Executing vector search... +[OK] 5 results, 38ms + +--- IVF / COS --- +Collection: hotels_ivf_cos +Created collection: hotels_ivf_cos +Inserted: 50/50 +Created vector index: vectorIndex_ivf_cos +Executing vector search... 
+[OK] 5 results, 52ms + +========================================================================================== + Vector Algorithm Comparison Results +========================================================================================== +Algorithm Similarity Top Result Score Latency(ms) +------------------------------------------------------------------------------------------ +DiskANN COS Historic Downtown Inn 0.8342 45 +HNSW COS Historic Downtown Inn 0.8342 38 +IVF COS Historic Downtown Inn 0.8342 52 +========================================================================================== + +--- DiskANN / COS (hotels_diskann_cos) --- + 1. Historic Downtown Inn, Score: 0.8342 + 2. Mountain Trail Lodge, Score: 0.7891 + 3. Riverside Retreat, Score: 0.7654 + 4. Urban Fitness Suites, Score: 0.7210 + 5. Lakeside Wellness Resort, Score: 0.7045 + Latency: 45ms + +--- HNSW / COS (hotels_hnsw_cos) --- + 1. Historic Downtown Inn, Score: 0.8342 + 2. Mountain Trail Lodge, Score: 0.7891 + 3. Riverside Retreat, Score: 0.7654 + 4. Urban Fitness Suites, Score: 0.7210 + 5. Lakeside Wellness Resort, Score: 0.7045 + Latency: 38ms + +--- IVF / COS (hotels_ivf_cos) --- + 1. Historic Downtown Inn, Score: 0.8342 + 2. Mountain Trail Lodge, Score: 0.7891 + 3. Riverside Retreat, Score: 0.7654 + 4. Urban Fitness Suites, Score: 0.7210 + 5. 
Lakeside Wellness Resort, Score: 0.7045 + Latency: 52ms +``` + +## Understanding the results + +Use this guidance to choose the right vector search algorithm for your workload: + +| Algorithm | Best for | Index creation | Search speed | Memory usage | Accuracy | +|-----------|----------|---------------|--------------|--------------|----------| +| **DiskANN** | Large datasets, disk-based storage | Slow | Fast | Low (disk-based) | High | +| **HNSW** | Real-time search, high throughput | Medium | Fastest | High (memory-intensive) | Very high | +| **IVF** | Cost-sensitive, approximate search | Fast | Medium | Low | Medium | + +### Similarity functions + +| Function | Formula | Best for | +|----------|---------|----------| +| **COS** (Cosine) | Angle between vectors | Text embeddings, normalized vectors | +| **L2** (Euclidean) | Distance between points | Image embeddings, coordinate data | +| **IP** (Inner Product) | Dot product | Recommendation systems, unnormalized data | + +### Tuning parameters + +Each algorithm has tuning parameters that control the accuracy/performance tradeoff: + +**DiskANN:** +- `maxDegree`: Higher values (20-64) improve accuracy but increase memory +- `lBuild`: Higher values (10-100) improve index quality but slow build time +- `lSearch`: Higher values (100-200) improve search accuracy but slow queries + +**HNSW:** +- `m`: Higher values (16-48) improve accuracy but increase memory +- `efConstruction`: Higher values (64-200) improve index quality but slow build time +- `efSearch`: Higher values (80-200) improve search accuracy but slow queries + +**IVF:** +- `numLists`: More lists improve speed but may reduce accuracy +- `nProbes`: Higher values (1-10) improve accuracy but slow queries + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `TimeoutException` during connection | Verify your connection string and environment variables. Ensure your IP is in the DocumentDB firewall rules. 
| +| `AuthenticationException` | Check that `DefaultAzureCredential` can acquire a token. Run `az login` to refresh your credentials. | +| Build errors with .NET version | Ensure you have .NET 9.0 or later installed. Run `dotnet --version` to check. | +| `BsonSerializationException` | Ensure your model classes match the document structure in the collection. | +| Empty search results | The vector index might not be ready yet. The sample includes retry logic, but if you still see empty results, wait a few seconds and retry. | +| `IndexOptionsConflict` (code 85) | DocumentDB doesn't allow multiple vector indexes of the same kind on the same field. Drop the existing index before creating a new one. | + +## Clean up resources + +When you're done, you can remove the database using mongosh or the Azure portal. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "<your-connection-string>" +use Hotels +db.dropDatabase() +``` + +### [Azure portal](#tab/portal) + +1. Navigate to your DocumentDB resource in the Azure portal. +2. Select **Data Explorer**. +3. Right-click the **Hotels** database and select **Delete Database**.
+ +--- + +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-go/.gitignore b/ai/select-algorithm-go/.gitignore new file mode 100644 index 0000000..76985d9 --- /dev/null +++ b/ai/select-algorithm-go/.gitignore @@ -0,0 +1,7 @@ +*.exe +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md +Hotels_Vector.json diff --git a/ai/select-algorithm-go/README.md b/ai/select-algorithm-go/README.md new file mode 100644 index 0000000..a76ab08 --- /dev/null +++ b/ai/select-algorithm-go/README.md @@ -0,0 +1,199 @@ +# DocumentDB Vector Search - Go Algorithm Comparison Sample + +This sample demonstrates how to compare different vector search algorithms (IVF, HNSW, DiskANN) and similarity metrics (Cosine, L2, Inner Product) with Azure DocumentDB. + +## Prerequisites + +- [Go 1.24+](https://golang.org/dl/) +- [Azure DocumentDB cluster](https://learn.microsoft.com/azure/documentdb/) (M40+ tier for DiskANN) +- [Azure OpenAI resource](https://learn.microsoft.com/azure/ai-services/openai/) with an embedding model deployed +- [Azure CLI](https://learn.microsoft.com/cli/azure/) (for passwordless authentication) +- Pre-generated embeddings file (`Hotels_Vector.json`) — see the `vector-search-go` sample + +## Setup + +1. **Clone the repository** and navigate to this directory: + + ```bash + cd ai/select-algorithm-go + ``` + +2.
**Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + + Required variables: + ```env + MONGO_CLUSTER_NAME=your-cluster-name + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + AZURE_DOCUMENTDB_DATABASENAME=Hotels + DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + EMBEDDED_FIELD=DescriptionVector + EMBEDDING_DIMENSIONS=1536 + ``` + +3. **Copy the shared data file** into this directory: + + ```bash + cp ../data/Hotels_Vector.json . + ``` + + The `DATA_FILE_WITH_VECTORS` env var defaults to `../data/Hotels_Vector.json`. + +4. **Install dependencies**: + + ```bash + go mod download + ``` + +5. **Sign in to Azure** (for passwordless authentication): + + ```bash + az login + ``` + +## Usage + +### Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single execution: + +```bash +go run ./src/... +``` + +This creates indexes sequentially (create/search/drop per combo — DocumentDB allows one vector index per kind per field) and prints a comparison table showing scores and top results. + +**Output:** +``` +====================================================================== + COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations) +====================================================================== + ... 
+==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +ALGORITHM SIMILARITY #1 RESULT #1 SCORE #2 RESULT #2 SCORE DIFF +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. +==================================================================================================== +``` + +### On Windows (PowerShell) + +```powershell +go run ./src/... 
+``` + +## Environment Variables + +| Variable | Default | Description | +|--------------|----------------------------------|---------------------------------| +| `MONGO_CLUSTER_NAME` | *(required)* | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | *(required)* | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | `text-embedding-3-small` | Embedding model name | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Database name | +| `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to data file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Embedding vector dimensions | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data insertion | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query | +| `VERBOSE` | `false` | Show full results | + +## How It Works + +### Comparison Mode (`compare_all.go`) + +1. **Data Loading:** Loads hotel data with pre-generated embeddings +2. **Index Creation:** Creates vector indexes sequentially (one at a time): + - For each algorithm (IVF, HNSW, DiskANN) × each metric (COS, L2, IP): + - Create the index → wait for readiness → search → drop the index + - DocumentDB only allows one vector index per kind per field +3. **Query Execution:** Generates embedding once, reuses for all 9 searches +4. 
**Result Comparison:** Prints formatted table with #1/#2 results, scores, and diff + +## Index Parameters + +| Algorithm | Kind | Key Parameters | Values Used | +|-----------|-----------------|-----------------------------|-----------------------------| +| IVF | `vector-ivf` | `numLists` | 1 (optimized for small datasets) | +| HNSW | `vector-hnsw` | `m`, `efConstruction` | 16, 64 | +| DiskANN | `vector-diskann`| `maxDegree`, `lBuild` | 32, 50 | + +## Project Structure + +``` +select-algorithm-go/ +├── .env.example # Environment variable template +├── go.mod # Go module dependencies +├── go.sum # Go module checksums +├── output/ # Sample output files +├── README.md # This file +└── src/ + ├── main.go # Entry point + ├── utils.go # Shared config, auth, data, and search helpers + └── compare_all.go # Unified 9-combination comparison runner (create/search/drop) +``` + +## Authentication + +This sample uses **passwordless (OIDC) authentication** with `DefaultAzureCredential`. Ensure your Azure identity has: + +- **DocumentDB**: Appropriate RBAC role on the cluster +- **Azure OpenAI**: `Cognitive Services OpenAI User` role on the OpenAI resource + +The MongoDB OIDC auth uses the `https://ossrdbms-aad.database.windows.net/.default` scope, and the OpenAI client uses Azure token credentials. 
+ +## Important Notes + +- **COS/IP scores:** Higher = more similar (0–1 range) +- **L2 scores:** Lower = more similar (distance metric) +- **Sequential indexing:** DocumentDB requires create/search/drop per combo (one vector index per kind per field) +- **Cleanup:** The sample automatically drops collections on exit +- **bson.D ordering:** All MongoDB commands use `bson.D` (ordered) instead of `bson.M` (unordered) to avoid "multi-key map" errors + +## Troubleshooting + +**"OIDC authentication failed"** +- Run `az login` and ensure you're authenticated +- Verify your Azure identity has RBAC permissions on the DocumentDB cluster +- Check that `MONGO_CLUSTER_NAME` matches your cluster name + +**"DiskANN indexes require a higher cluster tier"** +- DiskANN requires M40+ cluster tier +- Try IVF or HNSW instead, or upgrade your cluster + +**"No documents found with embeddings"** +- Ensure `DATA_FILE_WITH_VECTORS` points to the correct file +- Verify the file contains the field specified in `EMBEDDED_FIELD` +- Check that embeddings were generated with the correct dimensions + +## Learn More + +- [Azure DocumentDB Documentation](https://learn.microsoft.com/azure/documentdb/) +- [Vector Search in DocumentDB](https://learn.microsoft.com/azure/documentdb/vector-search) +- [Choosing a Vector Index Algorithm](https://learn.microsoft.com/azure/documentdb/vector-search-algorithms) +- [Go MongoDB driver](https://pkg.go.dev/go.mongodb.org/mongo-driver) diff --git a/ai/select-algorithm-go/data/README.md b/ai/select-algorithm-go/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-go/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`.
diff --git a/ai/select-algorithm-go/go.mod b/ai/select-algorithm-go/go.mod new file mode 100644 index 0000000..f669ace --- /dev/null +++ b/ai/select-algorithm-go/go.mod @@ -0,0 +1,35 @@ +module documentdb-select-algorithm + +go 1.24.0 + +require ( + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 + github.com/openai/openai-go/v3 v3.12.0 + go.mongodb.org/mongo-driver v1.17.6 +) + +require ( + github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect + github.com/golang-jwt/jwt/v5 v5.3.0 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect + github.com/montanaflynn/stats v0.7.1 // indirect + github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect + github.com/tidwall/gjson v1.18.0 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.1 // indirect + github.com/tidwall/sjson v1.2.5 // indirect + github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.1.2 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect +) diff --git a/ai/select-algorithm-go/go.sum b/ai/select-algorithm-go/go.sum new file mode 100644 index 0000000..6263657 --- /dev/null +++ b/ai/select-algorithm-go/go.sum @@ -0,0 +1,95 @@ +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= 
+github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= +github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= +github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= +github.com/openai/openai-go/v3 v3.12.0 h1:NkrImaglFQeDycc/n/fEmpFV8kKr8snl9/8X2x4eHOg= +github.com/openai/openai-go/v3 v3.12.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod 
h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.mongodb.org/mongo-driver v1.17.6 h1:87JUG1wZfWsr6rIz3ZmpH90rL5tea7O3IHuSwHUpsss= +go.mongodb.org/mongo-driver v1.17.6/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 
+golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/ai/select-algorithm-go/output/compare_all.txt b/ai/select-algorithm-go/output/compare_all.txt new file mode 100644 index 0000000..0eeb9a3 --- /dev/null +++ b/ai/select-algorithm-go/output/compare_all.txt @@ -0,0 +1,39 @@ +====================================================================== + COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations) +====================================================================== +Query: "luxury hotel near the beach" +Top-K: 5 + +Loading data from data/Hotels_Vector.json... +Loaded 50 documents with embeddings +Insertion completed: 50 inserted, 0 failed + +Generating embedding for query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running 9 vector index comparisons (create→search→drop)... 
+ ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-go/quickstart.md b/ai/select-algorithm-go/quickstart.md new file mode 100644 index 0000000..c9db665 --- /dev/null +++ b/ai/select-algorithm-go/quickstart.md @@ -0,0 +1,495 @@ +--- +title: Quickstart - Vector index with Go +description: Compare DiskANN, HNSW, and IVF vector index algorithms using Go to select and tune the optimal index for your workload +ms.devlang: golang +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: 
ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with Go in Azure DocumentDB + +This quickstart walks you through building a Go application that compares all three vector index algorithms (DiskANN, HNSW, and IVF) side by side with different similarity functions to help you choose the best configuration for your workload. The sample uses a hotels dataset with pre-calculated embeddings from the `text-embedding-3-small` model. + + + +Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/main/ai/select-algorithm-go) on GitHub. + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [Go](https://go.dev/doc/install) 1.22 or greater + +## Create data file with vectors + +1. Create a new data directory for the hotels data file: + + ### [Bash](#tab/bash) + + ```bash + mkdir data + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name data + ``` + + --- + +2. Download the `Hotels_Vector.json` [raw data file with vectors](https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json) to your `data` directory: + + ### [Bash](#tab/bash) + + ```bash + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify the file was downloaded: + + ### [Bash](#tab/bash) + + ```bash + ls data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem data\Hotels_Vector.json + ``` + + --- + + You should see `Hotels_Vector.json` in the `data` directory. + +## Create a Go project + +1. 
Create a new directory for your project and open it in Visual Studio Code: + + ### [Bash](#tab/bash) + + ```bash + mkdir select-algorithm-go + cd select-algorithm-go + code . + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name select-algorithm-go + Set-Location select-algorithm-go + code . + ``` + + --- + +2. Initialize a new Go module: + + ```bash + go mod init documentdb-vector-samples + ``` + + Verify the module was initialized: + + ### [Bash](#tab/bash) + + ```bash + cat go.mod + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Content go.mod + ``` + + --- + +3. Install the required packages: + + ```bash + go get github.com/Azure/azure-sdk-for-go/sdk/azcore@v1.20.0 + go get github.com/Azure/azure-sdk-for-go/sdk/azidentity@v1.13.1 + go get github.com/openai/openai-go/v3@v3.12.0 + go get go.mongodb.org/mongo-driver@v1.17.6 + go mod tidy + ``` + + - `azcore`: Core Azure SDK functionality for Go + - `azidentity`: Azure Identity library for passwordless authentication with DefaultAzureCredential + - `openai-go/v3`: OpenAI client library with Azure support to generate embeddings + - `mongo-driver`: Official MongoDB driver for Go to work with DocumentDB + + Verify the packages are installed: + + ### [Bash](#tab/bash) + + ```bash + go list -m all | grep mongo + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + go list -m all | Select-String mongo + ``` + + --- + +4. 
Create a `.env` file for environment variables in `select-algorithm-go`: + + ```bash + # Azure OpenAI Embedding Configuration + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-openai-resource.openai.azure.com/ + + # Data File Configuration + DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + EMBEDDED_FIELD=DescriptionVector + EMBEDDING_DIMENSIONS=1536 + LOAD_SIZE_BATCH=100 + + # DocumentDB Configuration + MONGO_CLUSTER_NAME=your-cluster-name + + # Algorithm Selection + # ALGORITHM: "all" | "diskann" | "hnsw" | "ivf" + ALGORITHM=all + + # SIMILARITY: "all" | "COS" | "L2" | "IP" + SIMILARITY=COS + + # Database name + AZURE_DOCUMENTDB_DATABASENAME=Hotels + ``` + + For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: + + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + - `MONGO_CLUSTER_NAME`: Your Azure DocumentDB cluster name (not the full connection string, just the name) + + Verify the `.env` file was created: + + ### [Bash](#tab/bash) + + ```bash + cat .env + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Content .env + ``` + + --- + + You should always prefer passwordless authentication. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate Go apps to Azure services by using the Azure SDK for Go](/azure/developer/go/azure-sdk-authentication). 
+ +## Create code files + +Create a `src` directory and add the source files: + +### [Bash](#tab/bash) + +```bash +mkdir src +touch src/main.go src/compare_all.go src/utils.go +``` + +### [PowerShell](#tab/powershell) + +```powershell +New-Item -ItemType Directory -Name src +New-Item -ItemType File -Path src/main.go, src/compare_all.go, src/utils.go +``` + +--- + +When you're done, the project structure should look like this: + +``` +├── data/ +│ ├── Hotels.json # Source hotel data (without vectors) +│ └── Hotels_Vector.json # Hotel data with vector embeddings +└── select-algorithm-go/ + ├── src/ + │ ├── main.go # Entry point + │ ├── compare_all.go # Comparison runner for all algorithm/similarity combinations + │ └── utils.go # Shared config, auth, data, and search helpers + ├── go.mod # Go module dependencies + ├── go.sum # Dependency checksums + └── .env # Environment configuration +``` + +## Create the algorithm comparison code + +Create the following source files in the `src` directory. + +### src/main.go + +:::code language="go" source="~/../documentdb-samples/ai/select-algorithm-go/src/main.go" ::: + +### src/compare_all.go + +:::code language="go" source="~/../documentdb-samples/ai/select-algorithm-go/src/compare_all.go" ::: + +### src/utils.go + +:::code language="go" source="~/../documentdb-samples/ai/select-algorithm-go/src/utils.go" ::: + +This code provides a complete vector algorithm comparison application with these key features: + +- **Passwordless authentication**: Uses `DefaultAzureCredential` for both Azure OpenAI and DocumentDB via OIDC +- **Three vector algorithms**: Implements DiskANN, HNSW, and IVF with algorithm-specific tuning parameters +- **Three similarity functions**: Supports COS (cosine), L2 (Euclidean), and IP (inner product) +- **Flexible configuration**: Use environment variables to compare all algorithms or test specific combinations +- **Performance measurement**: Tracks query latency for each algorithm/similarity pair +- **Comparison output**: Generates a formatted table showing results side by side +- **Production-ready patterns**: Includes batched insertion, error handling, and connection pooling +
+## Run the code + +Before running the code, source your `.env` file to load environment variables into your shell session. + +### [Bash](#tab/bash) + +```bash +export $(grep -v '^#' .env | xargs) +``` + +### [PowerShell](#tab/powershell) + +```powershell +Get-Content .env | ForEach-Object { + if ($_ -match '^\s*([^#][^=]+)=(.*)') { + [System.Environment]::SetEnvironmentVariable($Matches[1].Trim(), $Matches[2].Trim()) + } +} +``` + +--- + +After sourcing the environment variables, run the application: + +```bash +go run ./src +``` + +The application will: + +1. Connect to Azure DocumentDB and Azure OpenAI using passwordless authentication +2. Create separate collections for each algorithm/similarity combination +3. Insert the hotel data into each collection +4. Create a vector index on each collection with algorithm-specific parameters +5. Generate an embedding for the search query +6. Execute vector searches across all collections +7. Display a comparison table with results and latencies + +Expected output: + +``` +Vector Algorithm Comparison + Database: Hotels + Algorithms: all + Similarity: COS + Collections to query: hotels_diskann_cos, hotels_hnsw_cos, hotels_ivf_cos + Search query: "quintessential lodging near running trails, eateries, retail" + +Initializing MongoDB and Azure OpenAI clients... +Loading data from ../data/Hotels_Vector.json... +Loaded 50 documents +Generating query embedding... +Query embedding: 1536 dimensions + +━━━ DiskANN / COS ━━━ +Collection: hotels_diskann_cos +Created collection: hotels_diskann_cos +Inserted: 50/50 +Created vector index: vectorIndex_diskann_cos +Executing vector search... +[OK] 5 results, 42ms + +━━━ HNSW / COS ━━━ +Collection: hotels_hnsw_cos +Created collection: hotels_hnsw_cos +Inserted: 50/50 +Created vector index: vectorIndex_hnsw_cos +Executing vector search...
+[OK] 5 results, 38ms + +━━━ IVF / COS ━━━ +Collection: hotels_ivf_cos +Created collection: hotels_ivf_cos +Inserted: 50/50 +Created vector index: vectorIndex_ivf_cos +Executing vector search... +[OK] 5 results, 35ms + +╔══════════════════════════════════════════════════════════════════════════════════╗ +║ Vector Algorithm Comparison Results ║ +╠══════════════════════════════════════════════════════════════════════════════════╣ +║ Algorithm Similarity Top Result Score Latency(ms) ║ +╠══════════════════════════════════════════════════════════════════════════════════╣ +║ DiskANN COS Secret Point Motel 0.8562 42 ║ +║ HNSW COS Secret Point Motel 0.8562 38 ║ +║ IVF COS Secret Point Motel 0.8562 35 ║ +╚══════════════════════════════════════════════════════════════════════════════════╝ + +--- DiskANN / COS (hotels_diskann_cos) --- + 1. Secret Point Motel, Score: 0.8562 + 2. Countryside Hotel, Score: 0.8457 + 3. Downtown Modern Hotel, Score: 0.8398 + 4. Old Century Hotel, Score: 0.8321 + 5. Save-the-Light Deluxe Inn, Score: 0.8298 + Latency: 42ms + +--- HNSW / COS (hotels_hnsw_cos) --- + 1. Secret Point Motel, Score: 0.8562 + 2. Countryside Hotel, Score: 0.8457 + 3. Downtown Modern Hotel, Score: 0.8398 + 4. Old Century Hotel, Score: 0.8321 + 5. Save-the-Light Deluxe Inn, Score: 0.8298 + Latency: 38ms + +--- IVF / COS (hotels_ivf_cos) --- + 1. Secret Point Motel, Score: 0.8562 + 2. Countryside Hotel, Score: 0.8457 + 3. Downtown Modern Hotel, Score: 0.8398 + 4. Old Century Hotel, Score: 0.8321 + 5. Save-the-Light Deluxe Inn, Score: 0.8298 + Latency: 35ms + +Done. 
+``` + +## Understanding the results + +The comparison table shows how different algorithms perform on the same dataset with the same query: + +- **Algorithm**: DiskANN, HNSW, or IVF +- **Similarity**: The distance metric (COS, L2, or IP) +- **Top Result**: The highest-scoring hotel from the search +- **Score**: Similarity score (higher is better for COS and IP, lower is better for L2) +- **Latency**: Query execution time in milliseconds + +### Choosing the right algorithm + +Use this comparison to select the best algorithm for your workload: + +**DiskANN** (disk-based approximate nearest neighbor): +- Best for: Large datasets that don't fit in memory +- Pros: Memory efficient, good recall with high dimensions +- Cons: Requires disk I/O, slower build time +- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall + +**HNSW** (hierarchical navigable small world): +- Best for: High-speed queries with excellent recall +- Pros: Fastest queries, excellent recall, stable performance +- Cons: Higher memory usage than DiskANN +- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall + +**IVF** (inverted file index): +- Best for: Large datasets with good clustering properties +- Pros: Fast queries, low memory overhead +- Cons: Recall depends on `numLists` and `nProbes` tuning +- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall + +### Choosing the right similarity function + +The similarity function should match your embedding model and use case: + +- **COS (Cosine similarity)**: Best for text embeddings and most OpenAI models. Measures angle between vectors (range: -1 to 1, higher is more similar) +- **L2 (Euclidean distance)**: Measures straight-line distance between vectors (lower is more similar). Good for spatial data +- **IP (Inner product)**: Measures alignment between vectors. 
Good when vector magnitudes are meaningful + +For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. + +## Experiment with different configurations + +You can compare different combinations by setting environment variables: + +**Compare all algorithms with cosine similarity (default):** + +```bash +# .env file +ALGORITHM=all +SIMILARITY=COS +``` + +**Compare all algorithms with all similarity functions (9 collections):** + +```bash +# .env file +ALGORITHM=all +SIMILARITY=all +``` + +**Test only DiskANN with all similarity functions:** + +```bash +# .env file +ALGORITHM=diskann +SIMILARITY=all +``` + +**Test only cosine similarity across all algorithms:** + +```bash +# .env file +ALGORITHM=all +SIMILARITY=COS +``` + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `server selection error` | Verify `MONGO_CLUSTER_NAME` in `.env` matches your cluster name. Ensure your IP is in the DocumentDB firewall rules. | +| `authentication failed` | Verify your Azure identity has an RBAC role on the DocumentDB cluster. Ensure `DefaultAzureCredential` is configured (run `az login`). | +| `go: module not found` | Run `go mod tidy` to resolve dependencies. | +| Build errors | Ensure Go 1.22+ is installed. Run `go version` to check. | +| Empty search results | The vector index may not be ready yet. The code includes retry logic, but larger datasets may need more time. | + +## Clean up resources + +When you're done, you can remove the database using mongosh or the Azure portal. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "<your-connection-string>" +use Hotels +db.dropDatabase() +``` + +### [Azure portal](#tab/portal) + +1. Navigate to your DocumentDB resource in the Azure portal +2. Select **Data Explorer** +3.
Right-click the **Hotels** database and select **Delete Database** + +--- + +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go new file mode 100644 index 0000000..81cb7ef --- /dev/null +++ b/ai/select-algorithm-go/src/compare_all.go @@ -0,0 +1,325 @@ +package main + +import ( + "context" + "fmt" + "math" + "strconv" + "strings" + "time" + + "github.com/openai/openai-go/v3" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" +) + +// CompareResult holds the result of a single algorithm+metric search +type CompareResult struct { + Algorithm string + Metric string + Results []SearchResult + Top1Name string + Top1Score float64 + Top2Name string + Top2Score float64 + Error error +} + +// indexSpec defines one of the 9 combinations +type indexSpec struct { + Algorithm string + Kind string + Metric string + IndexName string + Options bson.D +} + +// RunCompareAll executes all 9 algorithm×metric combinations on a single collection +func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { + queryText := getEnvOrDefault("QUERY_TEXT", "luxury hotel near the beach") + topK, _ := strconv.Atoi(getEnvOrDefault("TOP_K", "5")) + + fmt.Println("\n" + strings.Repeat("=", 70)) + fmt.Println(" COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations)") + fmt.Println(strings.Repeat("=", 70)) + fmt.Printf("Query: %q\n", queryText) + fmt.Printf("Top-K: %d\n", topK) + + // 1. 
Drop collection for clean comparison, then load data + database := dbClient.Database(config.DatabaseName) + collection := database.Collection("hotels") + + // Drop existing collection for a clean comparison + if err := collection.Drop(ctx); err != nil { + fmt.Printf("Note: could not drop collection (may not exist): %v\n", err) + } else { + fmt.Println("Dropped existing 'hotels' collection") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("\nCleanup: dropping comparison collection...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels'") + } + }() + + fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + return fmt.Errorf("failed to load data: %v", err) + } + + documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) + if len(documentsWithEmbeddings) == 0 { + return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField) + } + fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) + + stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + return err + } + fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) + + // 2. Generate ONE embedding for the query (reused for all 9 searches) + fmt.Printf("\nGenerating embedding for query: %q\n", queryText) + queryEmbedding, err := GenerateEmbedding(ctx, aiClient, queryText, config.ModelName) + if err != nil { + return fmt.Errorf("failed to generate query embedding: %v", err) + } + fmt.Printf("Embedding generated (%d dimensions)\n", len(queryEmbedding)) + + // 3. Define all 9 index specs + metrics := []string{"COS", "L2", "IP"} + specs := buildIndexSpecs(config.VectorField, config.Dimensions, metrics) + + // 4. 
Create→search→drop each index sequentially (DocumentDB only allows one vector index per field) + fmt.Printf("\nRunning %d vector index comparisons (create→search→drop)...\n", len(specs)) + var results []CompareResult + + for _, spec := range specs { + // Drop all existing vector indexes on this field + DropVectorIndexes(ctx, collection, config.VectorField) + + // Create this specific index with retry (drop may still be in progress) + var createErr error + for attempt := 0; attempt < 3; attempt++ { + if attempt > 0 { + time.Sleep(3 * time.Second) + } + createErr = createNamedVectorIndex(ctx, collection, config.VectorField, spec) + if createErr == nil { + break + } + } + if createErr != nil { + results = append(results, CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + Error: createErr, + }) + fmt.Printf(" ⚠ %s: %v\n", spec.IndexName, createErr) + continue + } + fmt.Printf(" ✓ %s created\n", spec.IndexName) + + // Wait for index to become ready + time.Sleep(10 * time.Second) + + // Search using simple cosmosSearch (with retry for index readiness) + var searchResults []SearchResult + var searchErr error + for searchAttempt := 0; searchAttempt < 3; searchAttempt++ { + if searchAttempt > 0 { + time.Sleep(5 * time.Second) + } + searchResults, searchErr = vectorSearchSimple(ctx, collection, queryEmbedding, config.VectorField, topK) + if searchErr == nil && len(searchResults) > 0 { + break + } + } + + top1Name, top1Score := extractResult(searchResults, 0) + top2Name, top2Score := extractResult(searchResults, 1) + + cr := CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + Results: searchResults, + Top1Name: top1Name, + Top1Score: top1Score, + Top2Name: top2Name, + Top2Score: top2Score, + Error: searchErr, + } + results = append(results, cr) + } + + // 6. 
Print comparison table + fmt.Println() + printComparisonTable(results) + + return nil +} + +// buildIndexSpecs creates the 9 index specifications +func buildIndexSpecs(vectorField string, dimensions int, metrics []string) []indexSpec { + var specs []indexSpec + + type algoConfig struct { + name string + kind string + options bson.D + } + + algos := []algoConfig{ + {"IVF", "vector-ivf", bson.D{{"numLists", 1}}}, + {"HNSW", "vector-hnsw", bson.D{{"m", 16}, {"efConstruction", 64}}}, + {"DiskANN", "vector-diskann", bson.D{{"maxDegree", 32}, {"lBuild", 50}}}, + } + + for _, algo := range algos { + for _, metric := range metrics { + metricLower := strings.ToLower(metric) + opts := bson.D{ + {"kind", algo.kind}, + {"dimensions", dimensions}, + {"similarity", metric}, + } + for _, o := range algo.options { + opts = append(opts, o) + } + + specs = append(specs, indexSpec{ + Algorithm: algo.name, + Kind: algo.kind, + Metric: metric, + IndexName: fmt.Sprintf("vector_%s_%s", strings.ToLower(algo.name), metricLower), + Options: opts, + }) + } + } + + return specs +} + +// createNamedVectorIndex creates a single named vector index +func createNamedVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, spec indexSpec) error { + indexCommand := bson.D{ + {"createIndexes", collection.Name()}, + {"indexes", []bson.D{ + { + {"name", spec.IndexName}, + {"key", bson.D{ + {vectorField, "cosmosSearch"}, + }}, + {"cosmosSearchOptions", spec.Options}, + }, + }}, + } + + var result bson.M + err := collection.Database().RunCommand(ctx, indexCommand).Decode(&result) + if err != nil { + if strings.Contains(err.Error(), "already exists") || strings.Contains(err.Error(), "IndexAlreadyExists") { + return nil + } + return err + } + return nil +} + +// vectorSearchSimple performs a vector search using the active vector index +func vectorSearchSimple(ctx context.Context, collection *mongo.Collection, embedding []float64, vectorField string, topK int) ([]SearchResult, 
error) { + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": embedding, + "path": vectorField, + "k": topK, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, err + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, err + } + + return results, nil +} + +// extractResult returns the name and score of the result at the given index +func extractResult(results []SearchResult, idx int) (string, float64) { + if idx >= len(results) { + return "(no results)", 0 + } + doc := results[idx].Document.(bson.D) + var name string + for _, elem := range doc { + if elem.Key == "HotelName" { + name = fmt.Sprintf("%v", elem.Value) + break + } + } + if name == "" { + name = "Unknown" + } + return name, results[idx].Score +} + +// printComparisonTable outputs a formatted table of results +func printComparisonTable(results []CompareResult) { + fmt.Println("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐") + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %-6s │ %-26s │ %-6s │ %-5s │\n", + "Algorithm", "Metric", "Top 1 Result", "Score", "Top 2 Result", "Score", "Diff") + fmt.Println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤") + + for _, r := range results { + if r.Error != nil { + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %-6s │ %-26s │ %-6s │ %-5s │\n", + r.Algorithm, r.Metric, "ERROR", "-", "-", "-", "-") + continue + } + + top1 := r.Top1Name + if len(top1) > 26 { + top1 = top1[:26] + } + top2 := r.Top2Name + if len(top2) > 26 { + top2 = top2[:26] + } + diff := 
math.Abs(r.Top1Score - r.Top2Score) + + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %6.4f │ %-26s │ %6.4f │%6.4f │\n", + r.Algorithm, r.Metric, top1, r.Top1Score, top2, r.Top2Score, diff) + } + + fmt.Println("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘") +} diff --git a/ai/select-algorithm-go/src/main.go b/ai/select-algorithm-go/src/main.go new file mode 100644 index 0000000..85e7e6e --- /dev/null +++ b/ai/select-algorithm-go/src/main.go @@ -0,0 +1,28 @@ +package main + +import ( + "context" + "fmt" + "log" +) + +func main() { + fmt.Println("Starting vector algorithm comparison...") + + ctx := context.Background() + config := LoadConfig() + + fmt.Println("\nInitializing clients with passwordless authentication...") + mongoClient, azureOpenAIClient, err := GetClientsPasswordless(ctx, config) + if err != nil { + log.Fatalf("Failed to initialize clients: %v", err) + } + defer mongoClient.Disconnect(ctx) + + err = RunCompareAll(ctx, config, mongoClient, azureOpenAIClient) + if err != nil { + log.Fatalf("Compare all failed: %v", err) + } + + fmt.Println("\nComparison completed successfully!") +} diff --git a/ai/select-algorithm-go/src/utils.go b/ai/select-algorithm-go/src/utils.go new file mode 100644 index 0000000..8b415db --- /dev/null +++ b/ai/select-algorithm-go/src/utils.go @@ -0,0 +1,385 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "os" + "strconv" + "strings" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/openai/openai-go/v3" + "github.com/openai/openai-go/v3/azure" + "github.com/openai/openai-go/v3/option" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" + "go.mongodb.org/mongo-driver/mongo/options" +) + +// Config holds the application configuration +type Config struct { + ClusterName string + DatabaseName string + DataFile string + VectorField string + ModelName string + 
Dimensions int + BatchSize int + Similarity string + Algorithm string +} + +// SearchResult represents a search result document +type SearchResult struct { + Document interface{} `bson:"document"` + Score float64 `bson:"score"` +} + +// InsertStats holds statistics about data insertion +type InsertStats struct { + Total int `json:"total"` + Inserted int `json:"inserted"` + Failed int `json:"failed"` +} + +// LoadConfig loads configuration from environment variables +func LoadConfig() *Config { + dimensions, _ := strconv.Atoi(getEnvOrDefault("EMBEDDING_DIMENSIONS", "1536")) + batchSize, _ := strconv.Atoi(getEnvOrDefault("LOAD_SIZE_BATCH", "100")) + + return &Config{ + ClusterName: getEnvOrDefault("MONGO_CLUSTER_NAME", ""), + DatabaseName: getEnvOrDefault("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"), + DataFile: getEnvOrDefault("DATA_FILE_WITH_VECTORS", "data/Hotels_Vector.json"), + VectorField: getEnvOrDefault("EMBEDDED_FIELD", "DescriptionVector"), + ModelName: getEnvOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"), + Dimensions: dimensions, + BatchSize: batchSize, + Similarity: getEnvOrDefault("SIMILARITY", ""), + Algorithm: strings.ToLower(getEnvOrDefault("ALGORITHM", "")), + } +} + +// getEnvOrDefault returns environment variable value or default if not set +func getEnvOrDefault(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} + +// GetClientsPasswordless creates MongoDB and Azure OpenAI clients with passwordless authentication +func GetClientsPasswordless(ctx context.Context, config *Config) (*mongo.Client, openai.Client, error) { + if config.ClusterName == "" { + return nil, openai.Client{}, fmt.Errorf("MONGO_CLUSTER_NAME environment variable is required") + } + + // Create Azure credential + credential, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return nil, openai.Client{}, fmt.Errorf("failed to create Azure credential: %v", err) + } + + // 
Connect to DocumentDB with OIDC authentication + mongoURI := fmt.Sprintf("mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", config.ClusterName) + + fmt.Println("Attempting OIDC authentication...") + mongoClient, err := connectWithOIDC(ctx, mongoURI, credential) + if err != nil { + return nil, openai.Client{}, fmt.Errorf("OIDC authentication failed: %v", err) + } + fmt.Println("OIDC authentication successful!") + + // Get Azure OpenAI endpoint + azureOpenAIEndpoint := os.Getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if azureOpenAIEndpoint == "" { + return nil, openai.Client{}, fmt.Errorf("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required") + } + + // Create Azure OpenAI client with credential-based authentication + openAIClient := openai.NewClient( + option.WithBaseURL(fmt.Sprintf("%s/openai/v1", azureOpenAIEndpoint)), + azure.WithTokenCredential(credential)) + + return mongoClient, openAIClient, nil +} + +// connectWithOIDC attempts to connect using OIDC authentication +func connectWithOIDC(ctx context.Context, mongoURI string, credential *azidentity.DefaultAzureCredential) (*mongo.Client, error) { + oidcCallback := func(ctx context.Context, args *options.OIDCArgs) (*options.OIDCCredential, error) { + scope := "https://ossrdbms-aad.database.windows.net/.default" + fmt.Printf("Getting token with scope: %s\n", scope) + token, err := credential.GetToken(ctx, policy.TokenRequestOptions{ + Scopes: []string{scope}, + }) + if err != nil { + return nil, fmt.Errorf("failed to get token with scope %s: %v", scope, err) + } + + fmt.Printf("Successfully obtained token\n") + + return &options.OIDCCredential{ + AccessToken: token.Token, + }, nil + } + + clientOptions := options.Client(). + ApplyURI(mongoURI). + SetConnectTimeout(30 * time.Second). + SetServerSelectionTimeout(30 * time.Second). + SetRetryWrites(false). 
+ SetAuth(options.Credential{ + AuthMechanism: "MONGODB-OIDC", + AuthMechanismProperties: map[string]string{ + "TOKEN_RESOURCE": "https://ossrdbms-aad.database.windows.net", + }, + OIDCMachineCallback: oidcCallback, + }) + + mongoClient, err := mongo.Connect(ctx, clientOptions) + if err != nil { + return nil, err + } + + return mongoClient, nil +} + +// ReadFileReturnJSON reads a JSON file and returns the data as a slice of maps +func ReadFileReturnJSON(filePath string) ([]map[string]interface{}, error) { + file, err := os.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("error reading file '%s': %v", filePath, err) + } + + var data []map[string]interface{} + err = json.Unmarshal(file, &data) + if err != nil { + return nil, fmt.Errorf("error parsing JSON in file '%s': %v", filePath, err) + } + + return data, nil +} + +// InsertData inserts data into a MongoDB collection in batches +func InsertData(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) { + totalDocuments := len(data) + insertedCount := 0 + failedCount := 0 + + fmt.Printf("Starting batch insertion of %d documents...\n", totalDocuments) + + for i := 0; i < totalDocuments; i += batchSize { + end := i + batchSize + if end > totalDocuments { + end = totalDocuments + } + + batch := data[i:end] + batchNum := (i / batchSize) + 1 + + documents := make([]interface{}, len(batch)) + for j, doc := range batch { + documents[j] = doc + } + + result, err := collection.InsertMany(ctx, documents, options.InsertMany().SetOrdered(false)) + if err != nil { + if bulkErr, ok := err.(mongo.BulkWriteException); ok { + errorCount := len(bulkErr.WriteErrors) + insertedCount += len(batch) - errorCount + failedCount += errorCount + fmt.Printf("Batch %d had errors: %d inserted, %d failed\n", batchNum, len(batch)-errorCount, errorCount) + for _, writeErr := range bulkErr.WriteErrors { + fmt.Printf(" Error: %s\n", writeErr.Message) + } + } else { + 
failedCount += len(batch) + fmt.Printf("Batch %d failed completely: %v\n", batchNum, err) + } + } else { + insertedCount += len(result.InsertedIDs) + fmt.Printf("Batch %d completed: %d documents inserted\n", batchNum, len(result.InsertedIDs)) + } + + time.Sleep(100 * time.Millisecond) + } + + return &InsertStats{ + Total: totalDocuments, + Inserted: insertedCount, + Failed: failedCount, + }, nil +} + +// DropVectorIndexes drops existing vector indexes on the specified field +func DropVectorIndexes(ctx context.Context, collection *mongo.Collection, vectorField string) error { + cursor, err := collection.Indexes().List(ctx) + if err != nil { + return fmt.Errorf("could not list indexes: %v", err) + } + defer cursor.Close(ctx) + + var vectorIndexes []string + for cursor.Next(ctx) { + var index bson.M + if err := cursor.Decode(&index); err != nil { + continue + } + + if key, ok := index["key"].(bson.M); ok { + if indexType, exists := key[vectorField]; exists && indexType == "cosmosSearch" { + if name, ok := index["name"].(string); ok { + vectorIndexes = append(vectorIndexes, name) + } + } + } + } + + for _, indexName := range vectorIndexes { + fmt.Printf("Dropping existing vector index: %s\n", indexName) + _, err := collection.Indexes().DropOne(ctx, indexName) + if err != nil { + fmt.Printf("Warning: Could not drop index %s: %v\n", indexName, err) + } + } + + if len(vectorIndexes) > 0 { + fmt.Printf("Dropped %d existing vector index(es)\n", len(vectorIndexes)) + } else { + fmt.Println("No existing vector indexes found to drop") + } + + return nil +} + +// PerformVectorSearch performs a vector search using the cosmosSearch aggregation pipeline +func PerformVectorSearch(ctx context.Context, collection *mongo.Collection, client openai.Client, query, vectorField, model string, topK int) ([]SearchResult, error) { + fmt.Printf("Performing vector search for: '%s'\n", query) + + queryEmbedding, err := GenerateEmbedding(ctx, client, query, model) + if err != nil { + return nil, 
fmt.Errorf("error generating embedding: %v", err) + } + + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": queryEmbedding, + "path": vectorField, + "k": topK, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, fmt.Errorf("error performing vector search: %v", err) + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + fmt.Printf("Warning: Could not decode result: %v\n", err) + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, fmt.Errorf("cursor error: %v", err) + } + + return results, nil +} + +// GenerateEmbedding generates an embedding for the given text using Azure OpenAI +func GenerateEmbedding(ctx context.Context, client openai.Client, text, modelName string) ([]float64, error) { + resp, err := client.Embeddings.New(ctx, openai.EmbeddingNewParams{ + Input: openai.EmbeddingNewParamsInputUnion{ + OfString: openai.String(text), + }, + Model: modelName, + }) + if err != nil { + return nil, fmt.Errorf("failed to generate embedding: %v", err) + } + + if len(resp.Data) == 0 { + return nil, fmt.Errorf("no embedding data received") + } + + embedding := make([]float64, len(resp.Data[0].Embedding)) + for i, v := range resp.Data[0].Embedding { + embedding[i] = float64(v) + } + + return embedding, nil +} + +// PrintSearchResults prints search results in a formatted way +func PrintSearchResults(results []SearchResult, algorithm string) { + if len(results) == 0 { + fmt.Println("No search results found.") + return + } + + fmt.Printf("\n%s Search Results (top %d):\n", strings.ToUpper(algorithm), len(results)) + fmt.Println(strings.Repeat("=", 80)) + + for i, result := range results { + doc := result.Document.(bson.D) + var 
// FilterDocumentsWithEmbeddings returns, in their original order, only the
// documents that actually carry an embedding in vectorField.
//
// A document is skipped when the field is absent, when its value is nil
// (JSON null), or when it is an empty JSON array: none of these is a usable
// vector, and counting them would overstate the "documents with embeddings"
// total reported by the caller. Values of any other type are kept unchanged.
// The result is nil when no document qualifies.
func FilterDocumentsWithEmbeddings(data []map[string]interface{}, vectorField string) []map[string]interface{} {
	var filtered []map[string]interface{}
	for _, doc := range data {
		value, exists := doc[vectorField]
		if !exists || value == nil {
			continue
		}
		// encoding/json decodes arrays into []interface{}.
		if items, isArray := value.([]interface{}); isArray && len(items) == 0 {
			continue
		}
		filtered = append(filtered, doc)
	}
	return filtered
}
DocumentDB using the MongoDB Java driver. + +## Prerequisites + +- Java 17 or later +- Maven 3.8+ +- Azure DocumentDB cluster with vector search enabled +- Azure OpenAI resource with an embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. ### Configure environment variables + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + +2. Update `.env` with your Azure resource details (if not using `azd`): + - `MONGO_CLUSTER_NAME` — your DocumentDB cluster name + - `AZURE_OPENAI_EMBEDDING_ENDPOINT` — your Azure OpenAI endpoint + - `AZURE_OPENAI_EMBEDDING_MODEL` — deployment name (e.g., `text-embedding-3-small`) + - `DATA_FILE_WITH_VECTORS` — path to the pre-computed vectors JSON file + +3. Copy the shared data file: + + ```bash + cp ../data/Hotels_Vector.json . 
+ ``` + +## Build + +```bash +mvn clean compile +``` + +## Run + +Compare all 9 algorithm × similarity combinations: + +```bash +mvn exec:java -Pcompare +``` + +Or via the `ALGORITHM` environment variable: + +```bash +ALGORITHM=compare mvn exec:java +``` + +On Windows (PowerShell): + +```powershell +$env:ALGORITHM="compare"; mvn exec:java +``` + +## Algorithms + +| Algorithm | Description | Best For | +|-----------|-------------|----------| +| **IVF** | Inverted File index — partitions vectors into clusters | Large datasets with batch queries | +| **HNSW** | Hierarchical Navigable Small World graph | Low-latency, high-recall searches | +| **DiskANN** | Disk-based Approximate Nearest Neighbor | Very large datasets that exceed memory | + +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `MONGO_CLUSTER_NAME` | (required) | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | (required) | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | (required) | Embedding model deployment name | +| `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to vectors JSON file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field name containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Vector dimensions | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Target database name | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data loading | +| `EMBEDDING_SIZE_BATCH` | `16` | Batch size for embedding requests | +| `ALGORITHM` | (empty = all) | Which algorithm to run | +| `SIMILARITY` | (empty = all) | Similarity metric: `COS`, `L2`, `IP` | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Print detailed per-index results | + +## Authentication + +This sample uses **passwordless authentication** via `DefaultAzureCredential`: + +- **DocumentDB**: OIDC mechanism with Azure identity +- **Azure OpenAI**: Entra ID 
token-based auth + +Ensure your identity has the appropriate RBAC roles assigned on both resources. + +### What It Does + +1. Connects to DocumentDB and loads hotel data into a single `hotels` collection +2. Generates one embedding for the query text (reused for all searches) +3. For each of the 9 algorithm/metric combinations: creates the index → searches → drops the index +4. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially +5. Prints a formatted comparison table with scores, top results, and key insights + +### Index Parameters + +| Algorithm | Kind | Parameters | +|-----------|------|------------| +| IVF | `vector-ivf` | numLists=1 | +| HNSW | `vector-hnsw` | m=16, efConstruction=64 | +| DiskANN | `vector-diskann` | maxDegree=32, lBuild=50 | + +## Project Structure + +``` +src/main/java/com/azure/documentdb/selectalgorithm/ +├── Main.java — Entry point, runs CompareAll +├── Utils.java — Shared helpers (connection, embedding, data loading) +└── CompareAll.java — Unified comparison runner (all 9 combinations) +``` diff --git a/ai/select-algorithm-java/data/README.md b/ai/select-algorithm-java/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-java/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. 
diff --git a/ai/select-algorithm-java/output/compare_all.txt b/ai/select-algorithm-java/output/compare_all.txt new file mode 100644 index 0000000..7794fd5 --- /dev/null +++ b/ai/select-algorithm-java/output/compare_all.txt @@ -0,0 +1,50 @@ +============================================== + Azure DocumentDB - Compare All Algorithms +============================================== + Query: "luxury hotel near the beach" + Top K: 5 + Metrics: COS, L2, IP + Algos: IVF, HNSW, DiskANN + + Loading data from: data/Hotels_Vector.json + Loaded 50 documents + Collection reset. + + Generating embedding for: "luxury hotel near the beach" + Embedding generated (1536 dimensions) + + Running 9 algorithm × metric combinations... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + + Cleanup: dropping comparison collection... 
+ Cleanup: dropped collection 'hotels' + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ 
+└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ diff --git a/ai/select-algorithm-java/pom.xml b/ai/select-algorithm-java/pom.xml new file mode 100644 index 0000000..99c57e9 --- /dev/null +++ b/ai/select-algorithm-java/pom.xml @@ -0,0 +1,79 @@ + + + 4.0.0 + + com.azure.documentdb + select-algorithm-java + 1.0.0 + jar + + DocumentDB Select Algorithm - Java + Demonstrates IVF, HNSW, and DiskANN vector search indexes with Azure DocumentDB + + + 17 + 17 + UTF-8 + + + + + org.mongodb + mongodb-driver-sync + 5.4.0 + + + com.azure + azure-identity + 1.16.0 + + + com.azure + azure-ai-openai + 1.0.0-beta.16 + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 17 + 17 + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.Main + + + + + + + + compare + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.CompareAll + + + + + + + diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md new file mode 100644 index 0000000..f90bd8b --- /dev/null +++ b/ai/select-algorithm-java/quickstart.md @@ -0,0 +1,528 @@ +--- +title: Quickstart - Vector index with Java +description: Test and compare DiskANN, HNSW, and IVF vector indexes in Azure DocumentDB using Java to select the best algorithm for your vector search workload. +ms.devlang: java +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with Java in Azure DocumentDB + +This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure DocumentDB using Java to help you select the best configuration for your vector search workload. 
The sample uses the same hotel dataset with pre-calculated vectors as the other quickstarts to demonstrate performance differences across algorithms and similarity functions. + + + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [Java 21 or higher](/java/openjdk/download) + +- [Maven 3.6 or higher](https://maven.apache.org/download.cgi) + +## Create data file with vectors + +1. Create a new data directory for the hotels data file: + + ### [Bash](#tab/bash) + + ```bash + mkdir data + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name data + ``` + + --- + +2. Download the `Hotels_Vector.json` data file with vectors to your `data` directory: + + ### [Bash](#tab/bash) + + ```bash + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify: Confirm the file exists and is valid JSON: + + ### [Bash](#tab/bash) + + ```bash + ls -lh data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Item data\Hotels_Vector.json + ``` + + --- + +## Create a Java project + +1. Create a new directory for your project and open it in Visual Studio Code: + + ### [Bash](#tab/bash) + + ```bash + mkdir select-algorithm-quickstart + cd select-algorithm-quickstart + code . + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name select-algorithm-quickstart + Set-Location select-algorithm-quickstart + code . + ``` + + --- + +2. 
Create a standard Maven project structure: + + ### [Bash](#tab/bash) + + ```bash + mkdir -p src/main/java/com/azure/documentdb/selectalgorithm + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Path "src\main\java\com\azure\documentdb\selectalgorithm" -Force + ``` + + --- + +3. Create a `pom.xml` file in the root directory with the following content: + + ```xml + + 4.0.0 + + com.azure.documentdb.samples + select-algorithm-java + 1.0-SNAPSHOT + Azure DocumentDB Vector Algorithm Comparison + + + 21 + 21 + 21 + UTF-8 + + + + + + com.azure + azure-sdk-bom + 1.2.29 + pom + import + + + + + + + org.mongodb + mongodb-driver-sync + 5.6.2 + + + com.azure + azure-identity + + + com.azure + azure-ai-openai + + + com.fasterxml.jackson.core + jackson-databind + 2.18.2 + + + io.github.cdimascio + dotenv-java + 3.0.2 + + + org.slf4j + slf4j-simple + 2.0.17 + runtime + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 21 + + + + org.codehaus.mojo + exec-maven-plugin + 3.1.0 + + com.azure.documentdb.selectalgorithm.SelectAlgorithm + + + + + + ``` + + Verify: Run `mvn dependency:resolve` to confirm all dependencies resolve without errors. + +4. 
Create a `.env` file in the project root for environment variables: + + ```bash + # Azure DocumentDB cluster name for passwordless authentication + MONGO_CLUSTER_NAME= + + # Azure managed identity principal ID for authentication + AZURE_MANAGED_IDENTITY_PRINCIPAL_ID= + + # Azure OpenAI endpoint and model configuration + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-openai-resource.openai.azure.com/ + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + + # Data file path (relative to project root) + DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + + # Vector configuration + EMBEDDED_FIELD=DescriptionVector + EMBEDDING_DIMENSIONS=1536 + LOAD_SIZE_BATCH=100 + + # Algorithm selection: all, diskann, hnsw, ivf + ALGORITHM=all + + # Similarity function: COS, L2, IP, all + SIMILARITY=COS + ``` + + Replace the placeholder values with your Azure resource information: + + - `MONGO_CLUSTER_NAME`: Your Azure DocumentDB cluster name + - `AZURE_MANAGED_IDENTITY_PRINCIPAL_ID`: Your managed identity principal ID + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + + Verify the `.env` file was created: + + ### [Bash](#tab/bash) + + ```bash + cat .env + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Content .env + ``` + + --- + + You should see your configuration values including the Azure OpenAI endpoint and cluster name. + + This sample uses passwordless authentication with `DefaultAzureCredential`, which requires your identity to have proper RBAC roles assigned. For more information on authentication options, see [Authenticate Java apps to Azure services by using the Azure SDK for Java](/azure/developer/java/sdk/authentication/overview).
+ +## Create code files + +When you are done, the project structure should look like this: + +```text +select-algorithm-quickstart/ +├── data/ +│ └── Hotels_Vector.json # Hotel data with vector embeddings +├── src/ +│ └── main/ +│ └── java/ +│ └── com/ +│ └── azure/ +│ └── documentdb/ +│ └── selectalgorithm/ +│ ├── CompareAll.java # Main comparison logic +│ ├── Main.java # Program entry point +│ └── Utils.java # Shared utility functions +├── pom.xml # Maven dependencies +└── .env # Environment variables +``` + +## Create the algorithm comparison code + +### Create utility functions + +Create `src/main/java/com/azure/documentdb/selectalgorithm/Utils.java` and paste the following code: + +:::code language="java" source="~/../documentdb-samples/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java" ::: + +This utility class provides: + +- **Environment variable management**: Reads configuration from environment variables, with defaults for optional settings +- **Passwordless authentication**: Uses `DefaultAzureCredential` for both MongoDB and Azure OpenAI +- **MongoDB client creation**: Configures OIDC authentication for DocumentDB +- **Azure OpenAI client creation**: Sets up the OpenAI client for embedding generation +- **Data loading**: Reads hotel data from JSON file and inserts it in batches +- **Embedding generation**: Creates vector embeddings for text queries +- **Index cleanup**: Drops existing vector indexes from a collection +- **Vector search**: Runs a `cosmosSearch` aggregation pipeline for a text query +- **Results formatting**: Prints search results with their scores + +### Create main comparison logic + +Create the following source files in `src/main/java/com/azure/documentdb/selectalgorithm/`: + +#### CompareAll.java + +:::code language="java" source="~/../documentdb-samples/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java" ::: + +#### Main.java + +:::code language="java" 
source="~/../documentdb-samples/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java" ::: + + +This main comparison logic provides: + +- **Algorithm comparison logic**: Tests all combinations of algorithms and similarity functions +- **Collection management**: Loads data once into a single `hotels` collection and drops it when the run finishes +- **Data loading**: Inserts hotel data in batches +- **Index creation**: Creates one vector index per combination, dropping the previous vector index first +- **Result capture**: Records the top two matches and their scores for each combination +- **Results display**: Outputs comparison table + +## Run the code + +1. Compile the project: + + ```bash + mvn clean compile + ``` + + Verify: The build output ends with `BUILD SUCCESS`. + +2. Run the comparison for all algorithms with cosine similarity (default): + + ```bash + mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + ``` + +3. Run the comparison for a specific algorithm: + + ### [Bash](#tab/bash) + + ```bash + # Test only DiskANN + ALGORITHM=diskann mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + + # Test only HNSW + ALGORITHM=hnsw mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + + # Test only IVF + ALGORITHM=ivf mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + # Test only DiskANN + $env:ALGORITHM="diskann" + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" + + # Test only HNSW + $env:ALGORITHM="hnsw" + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" + + # Test only IVF + $env:ALGORITHM="ivf" + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" + ``` + + --- + +4. 
Run the comparison for all similarity functions: + + ### [Bash](#tab/bash) + + ```bash + # Test all algorithms with all similarity functions + ALGORITHM=all SIMILARITY=all mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + + # Test DiskANN with all similarity functions + ALGORITHM=diskann SIMILARITY=all mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + # Test all algorithms with all similarity functions + $env:ALGORITHM="all" + $env:SIMILARITY="all" + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" + + # Test DiskANN with all similarity functions + $env:ALGORITHM="diskann" + $env:SIMILARITY="all" + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" + ``` + + --- + +5. Run the comparison for a specific similarity function: + + ### [Bash](#tab/bash) + + ```bash + # Test all algorithms with L2 (Euclidean) distance + SIMILARITY=L2 mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + + # Test all algorithms with IP (inner product) + SIMILARITY=IP mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + # Test all algorithms with L2 (Euclidean) distance + $env:SIMILARITY="L2" + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" + + # Test all algorithms with IP (inner product) + $env:SIMILARITY="IP" + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" + ``` + + --- + +The program displays a comparison table showing average latency for each algorithm and similarity function combination: + +```text +================================================================================ +Vector Index Algorithm Comparison Results 
+================================================================================ +Algorithm Similarity Avg Latency (ms) +-------------------------------------------------------------------------------- +DISKANN COS 42.30 +DISKANN IP 38.70 +DISKANN L2 45.10 +HNSW COS 31.50 +HNSW IP 29.80 +HNSW L2 34.20 +IVF COS 55.60 +IVF IP 52.10 +IVF L2 58.90 +================================================================================ +``` + +> [!NOTE] +> The latency values shown above are illustrative. Actual results depend on your DocumentDB cluster configuration, region, network latency, and dataset size. + +## Understanding the results + +### Algorithm characteristics + +**DiskANN** - Disk-based approximate nearest neighbor search +- Good balance of speed and accuracy +- Suitable for large datasets that don't fit in memory +- Parameters: `maxDegree=32` (graph connectivity), `lBuild=50` (build quality), `lSearch=100` (query accuracy) + +**HNSW** - Hierarchical Navigable Small World +- Memory-based hierarchical graph +- Excellent for real-time applications requiring low latency +- Parameters: `m=16` (connections per layer), `efConstruction=64` (build quality), `efSearch=80` (query accuracy) + +**IVF** - Inverted File Index +- Cluster-based partitioning approach +- Fast search via centroid comparison +- Parameters: `numLists=1` (number of clusters), `nProbes=1` (clusters to search) + +### Similarity functions + +**COS (Cosine)** - Measures angle between vectors +- Best for text embeddings (like those from OpenAI models) +- Scale-invariant (ignores vector magnitude) +- Range: -1 to 1 (1 = identical direction) + +**L2 (Euclidean)** - Measures straight-line distance +- Sensitive to vector magnitude +- Good for embeddings where scale matters +- Range: 0 to infinity (0 = identical) + +**IP (Inner Product)** - Dot product of vectors +- Fast to compute +- Can be used with normalized vectors +- Range: -infinity to infinity + +### Choosing the right configuration + +Use the 
comparison results to guide your selection: + +1. **For real-time applications**: Choose HNSW if latency is critical +2. **For large datasets**: Choose DiskANN if your data exceeds available memory +3. **For fast batch processing**: Choose IVF if you can tolerate slightly lower accuracy +4. **For text embeddings**: Use COS similarity function (most common with OpenAI embeddings) + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `MongoTimeoutException` | Verify `MONGO_CLUSTER_NAME` in `.env`. Ensure your IP is in the DocumentDB firewall rules. | +| `MongoSecurityException` | Run `az login` and confirm that your identity has the required RBAC role on the cluster. | +| Maven build failures | Run `mvn dependency:resolve` to check for missing dependencies. Ensure Java 21+ is installed. | +| `No plugin found for prefix 'exec'` | Add `exec-maven-plugin` to your `pom.xml` as shown in this article. | + +## Clean up resources + +When you're done, you can remove the database using mongosh or the Azure portal. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "<your-connection-string>" +use Hotels +db.dropDatabase() +``` + +### [Azure portal](#tab/portal) + +1. Navigate to your DocumentDB resource in the Azure portal +2. Select **Data Explorer** +3. 
Right-click the **Hotels** database and select **Delete Database** + +--- + +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java new file mode 100644 index 0000000..f632350 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -0,0 +1,223 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.ArrayList; +import java.util.List; + +/** + * Unified comparison runner that executes all 9 combinations + * (3 algorithms x 3 similarity metrics) and prints a formatted table. 
+ */ +public class CompareAll { + + private static final String COLLECTION_NAME = "hotels"; + private static final String[] ALGORITHMS = {"ivf", "hnsw", "diskann"}; + private static final String[] METRICS = {"COS", "L2", "IP"}; + + public static void main(String[] args) { + run(); + } + + public static void run() { + String queryText = Utils.getEnv("QUERY_TEXT", "luxury hotel near the beach"); + int topK = Integer.parseInt(Utils.getEnv("TOP_K", "5")); + + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "DescriptionVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.printf(" Query: \"%s\"%n", queryText); + System.out.printf(" Top K: %d%n", topK); + System.out.printf(" Metrics: COS, L2, IP%n"); + System.out.printf(" Algos: IVF, HNSW, DiskANN%n"); + System.out.println(); + + List results = new ArrayList<>(); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + // Load data ONCE into the single collection + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + collection.drop(); + System.out.println(" Collection reset."); + Utils.insertData(collection, data, 100); + + // Generate ONE embedding for the query (reused for all 9 searches) + OpenAIClient aiClient = Utils.getOpenAIClient(); + 
System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); + List queryVector = Utils.getEmbedding(aiClient, queryText, model); + System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); + + // Convert to doubles for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + // Run 9 algorithm × metric combinations sequentially (create→search→drop) + // DocumentDB does not allow multiple vector indexes of the same kind + // on the same field path simultaneously. + System.out.println(" Running 9 algorithm × metric combinations...\n"); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + // 1. Drop all existing vector indexes + dropVectorIndexes(collection, vectorField); + + // 2. Create this specific index + createIndex(database, collection, vectorField, dimensions, algo, metric); + System.out.printf(" ✓ %s created%n", indexName); + + // 3. Wait for index to build + try { Thread.sleep(5000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } + + // 4. Search + List searchResults = performSearch( + collection, vectorAsDoubles, vectorField, topK); + + // 5. Extract top 2 results + String top1Name = "-"; double top1Score = 0.0; + String top2Name = "-"; double top2Score = 0.0; + if (!searchResults.isEmpty()) { + Document top1 = searchResults.get(0); + top1Name = top1.getString("HotelName") != null ? top1.getString("HotelName") : "-"; + top1Score = top1.getDouble("score") != null ? top1.getDouble("score") : 0.0; + } + if (searchResults.size() > 1) { + Document top2 = searchResults.get(1); + top2Name = top2.getString("HotelName") != null ? top2.getString("HotelName") : "-"; + top2Score = top2.getDouble("score") != null ? 
top2.getDouble("score") : 0.0; + } + results.add(new SearchResult(algo.toUpperCase(), metric, top1Name, top1Score, top2Name, top2Score)); + } + } + + // Cleanup: drop the comparison collection + System.out.println("\n Cleanup: dropping comparison collection..."); + collection.drop(); + System.out.println(" Cleanup: dropped collection 'hotels'"); + } + + // Print comparison table + printComparisonTable(results); + } + + private static void dropVectorIndexes(MongoCollection collection, String vectorField) { + for (Document idx : collection.listIndexes()) { + String name = idx.getString("name"); + Document key = idx.get("key", Document.class); + if (key != null && "cosmosSearch".equals(key.getString(vectorField))) { + try { + collection.dropIndex(name); + } catch (Exception e) { + // Ignore if index doesn't exist + } + } + } + } + + private static void createIndex(MongoDatabase database, MongoCollection collection, + String vectorField, int dimensions, + String algo, String metric) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + Document cosmosSearchOptions = new Document() + .append("dimensions", dimensions) + .append("similarity", metric); + + switch (algo) { + case "ivf" -> cosmosSearchOptions + .append("kind", "vector-ivf") + .append("numLists", 1); + case "hnsw" -> cosmosSearchOptions + .append("kind", "vector-hnsw") + .append("m", 16) + .append("efConstruction", 64); + case "diskann" -> cosmosSearchOptions + .append("kind", "vector-diskann") + .append("maxDegree", 32) + .append("lBuild", 50); + } + + Document indexDefinition = new Document() + .append("name", indexName) + .append("key", new Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", cosmosSearchOptions); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + try { + database.runCommand(command); + } catch (Exception e) { + // Idempotent: ignore if 
index already exists + if (!e.getMessage().contains("already exists")) { + throw e; + } + } + } + + private static List performSearch(MongoCollection collection, + List vectorAsDoubles, + String vectorField, int topK) { + Document searchStage = new Document("$search", new Document("cosmosSearch", new Document() + .append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + return results; + } + + private static void printComparisonTable(List results) { + System.out.println("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐"); + System.out.printf("│ %-9s│ %-7s│ %-27s│ %-7s│ %-27s│ %-7s│ %-6s│%n", + "Algorithm", "Metric", "Top 1 Result", "Score", "Top 2 Result", "Score", "Diff"); + System.out.println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + + for (int i = 0; i < results.size(); i++) { + SearchResult r = results.get(i); + double diff = Math.abs(r.top1Score() - r.top2Score()); + String top1Display = r.top1Name().length() > 27 ? r.top1Name().substring(0, 24) + "..." : r.top1Name(); + String top2Display = r.top2Name().length() > 27 ? r.top2Name().substring(0, 24) + "..." 
: r.top2Name(); + System.out.printf("│ %-9s│ %-7s│ %-27s│ %-7.4f│ %-27s│ %-7.4f│ %-6.4f│%n", + r.algorithm(), r.metric(), top1Display, r.top1Score(), top2Display, r.top2Score(), diff); + if (i < results.size() - 1) { + System.out.println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + } + } + System.out.println("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘"); + } + + private record SearchResult( + String algorithm, + String metric, + String top1Name, + double top1Score, + String top2Name, + double top2Score) { + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java new file mode 100644 index 0000000..5a9d54c --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java @@ -0,0 +1,17 @@ +package com.azure.documentdb.selectalgorithm; + +public class Main { + + public static void main(String[] args) { + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.println(); + + CompareAll.run(); + + System.out.println("=============================================="); + System.out.println(" Comparison complete."); + System.out.println("=============================================="); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java new file mode 100644 index 0000000..8ed19d0 --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java @@ -0,0 +1,190 @@ +package com.azure.documentdb.selectalgorithm; + +import 
com.azure.ai.openai.OpenAIClient; +import com.azure.ai.openai.OpenAIClientBuilder; +import com.azure.ai.openai.models.EmbeddingItem; +import com.azure.ai.openai.models.EmbeddingsOptions; +import com.azure.core.credential.AccessToken; +import com.azure.identity.DefaultAzureCredential; +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.mongodb.ConnectionString; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.model.InsertManyOptions; +import org.bson.Document; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class Utils { + + public static String getEnv(String key, String defaultValue) { + String value = System.getenv(key); + return (value != null && !value.isBlank()) ? value : defaultValue; + } + + public static String getEnv(String key) { + return getEnv(key, null); + } + + public static MongoClient getMongoClient() { + String clusterName = getEnv("MONGO_CLUSTER_NAME"); + if (clusterName == null) { + throw new IllegalStateException("MONGO_CLUSTER_NAME environment variable is required"); + } + + String connectionUri = String.format( + "mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", clusterName); + + // Use custom OIDC callback with DefaultAzureCredential + // This chains through CLI, managed identity, etc. 
+ DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build(); + String tokenResource = "https://ossrdbms-aad.database.windows.net/.default"; + + MongoCredential mongoCredential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", (MongoCredential.OidcCallback) context -> { + AccessToken token = credential.getToken( + new com.azure.core.credential.TokenRequestContext() + .addScopes(tokenResource)).block(); + return new MongoCredential.OidcCallbackResult(token.getToken()); + }); + + MongoClientSettings settings = MongoClientSettings.builder() + .applyConnectionString(new ConnectionString(connectionUri)) + .credential(mongoCredential) + .retryWrites(false) + .build(); + + return MongoClients.create(settings); + } + + public static OpenAIClient getOpenAIClient() { + String endpoint = getEnv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + if (endpoint == null) { + throw new IllegalStateException("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required"); + } + + DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build(); + + return new OpenAIClientBuilder() + .endpoint(endpoint) + .credential(credential) + .buildClient(); + } + + public static List readJsonFile(String path) { + try { + String content = Files.readString(Path.of(path)); + // Parse JSON array of documents + @SuppressWarnings("unchecked") + List docs = Document.parse("{\"data\":" + content + "}").getList("data", Document.class); + return docs; + } catch (IOException e) { + throw new RuntimeException("Failed to read data file: " + path, e); + } + } + + public static void insertData(MongoCollection collection, List data, int batchSize) { + System.out.printf(" Inserting %d documents in batches of %d...%n", data.size(), batchSize); + InsertManyOptions options = new InsertManyOptions().ordered(false); + + for (int i = 0; i < data.size(); i += batchSize) { + List batch = data.subList(i, Math.min(i + batchSize, data.size())); + // Remove 
_id to avoid duplicate key errors on re-run + List cleaned = new ArrayList<>(); + for (Document doc : batch) { + Document copy = new Document(doc); + copy.remove("_id"); + cleaned.add(copy); + } + try { + collection.insertMany(cleaned, options); + } catch (Exception e) { + // Ignore duplicate key errors on re-insert + if (!e.getMessage().contains("duplicate key")) { + throw e; + } + } + System.out.printf(" Inserted batch %d-%d%n", i + 1, Math.min(i + batchSize, data.size())); + } + System.out.println(" Data insertion complete."); + } + + public static void dropVectorIndexes(MongoCollection collection, String vectorField) { + try { + for (Document idx : collection.listIndexes()) { + String name = idx.getString("name"); + if (name != null && name.contains(vectorField) && !name.equals("_id_")) { + System.out.printf(" Dropping existing index: %s%n", name); + collection.dropIndex(name); + } + } + } catch (Exception e) { + // Ignore errors when indexes don't exist + System.out.println(" No existing vector indexes to drop."); + } + } + + public static List getEmbedding(OpenAIClient client, String text, String model) { + EmbeddingsOptions options = new EmbeddingsOptions(List.of(text)); + List embeddings = client.getEmbeddings(model, options).getData(); + if (embeddings.isEmpty()) { + throw new RuntimeException("No embedding returned for query text"); + } + return embeddings.get(0).getEmbedding(); + } + + public static List performVectorSearch( + MongoCollection collection, + OpenAIClient aiClient, + String query, + String vectorField, + String model, + int topK) { + + System.out.printf(" Generating embedding for query: \"%s\"%n", query); + List queryVector = getEmbedding(aiClient, query, model); + System.out.printf(" Embedding generated (%d dimensions)%n", queryVector.size()); + + // Convert List to List for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + Document searchStage = new Document("$search", new 
Document("cosmosSearch", new Document() + .append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + + return results; + } + + public static void printResults(List results) { + System.out.println("\n === Search Results ==="); + for (int i = 0; i < results.size(); i++) { + Document doc = results.get(i); + System.out.printf(" %d. %s (score: %.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + System.out.printf(" %s%n", doc.getString("Description")); + } + System.out.println(); + } +} diff --git a/ai/select-algorithm-python/.gitignore b/ai/select-algorithm-python/.gitignore new file mode 100644 index 0000000..87965ce --- /dev/null +++ b/ai/select-algorithm-python/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.pyc +.env +.venv/ + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md new file mode 100644 index 0000000..3393ce5 --- /dev/null +++ b/ai/select-algorithm-python/README.md @@ -0,0 +1,96 @@ + +# Select Vector Algorithm (Python) + +Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB. Each algorithm is optimized for different dataset sizes and performance requirements. 
+ +## Algorithm Selection Guide + +| Algorithm | Dataset Size | Cluster Tier | Key Parameters | +|-----------|-------------|--------------|----------------| +| IVF | < 10K docs | M10+ | numLists | +| HNSW | 10K-50K | M30+ | m, efConstruction | +| DiskANN | 50K+ | M40+ | maxDegree, lBuild | + +## Prerequisites + +- Azure subscription +- Azure DocumentDB cluster (M40+ for all algorithms, M10+ for IVF only) +- Azure OpenAI resource with `text-embedding-3-small` deployed +- Python 3.10+ +- Azure CLI (`az login` for passwordless auth) + +## Setup + +1. ### Configure environment variables + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + +2. Install dependencies: + ```bash + cd src + pip install -r ../requirements.txt + ``` + +3. Copy the shared data file: + + ```bash + cp ../data/Hotels_Vector.json . + ``` + +4. Ensure you're logged in to Azure: + ```bash + az login + ``` + +## Run + +Compare all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation: + +```bash +cd src +python compare_all.py +``` + +The script creates a single `hotels` collection, loads data once, then for each of the 9 algorithm/metric combinations: creates the index → searches → drops the index. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially. 
+ +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `MONGO_CLUSTER_NAME` | (required) | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | (required) | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | (required) | Embedding model deployment name | +| `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to vectors JSON file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field name containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Vector dimensions | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Target database name | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data loading | +| `EMBEDDING_SIZE_BATCH` | `16` | Batch size for embedding requests | +| `ALGORITHM` | (empty = all) | Which algorithm to run | +| `SIMILARITY` | (empty = all) | Similarity metric: `COS`, `L2`, `IP` | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Show all k results per combo | diff --git a/ai/select-algorithm-python/data/README.md b/ai/select-algorithm-python/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-python/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. 
diff --git a/ai/select-algorithm-python/output/compare_all.txt b/ai/select-algorithm-python/output/compare_all.txt new file mode 100644 index 0000000..aa96c4f --- /dev/null +++ b/ai/select-algorithm-python/output/compare_all.txt @@ -0,0 +1,47 @@ +====================================================================== + Compare All Algorithms — 9 Combinations + (3 Algorithms × 3 Similarity Metrics) +====================================================================== + + Query: "luxury hotel near the beach" + Top K: 5 + +Dropped existing 'hotels' collection (if any) +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Generating embedding for query... +Running 9 vector searches... + + Created index 'vector_ivf_cos' + Created index 'vector_ivf_l2' + Created index 'vector_ivf_ip' + Created index 'vector_hnsw_cos' + Created index 'vector_hnsw_l2' + Created index 'vector_hnsw_ip' + Created index 'vector_diskann_cos' + Created index 'vector_diskann_l2' + Created index 'vector_diskann_ip' ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| Algorithm | Metric | Top 1 Result | Score | Top 2 Result | Score | Diff | ++=============+==========+==========================+=========+===================+=========+========+ +| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | 
++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md new file mode 100644 index 0000000..fba490e --- /dev/null +++ b/ai/select-algorithm-python/quickstart.md @@ -0,0 +1,457 @@ +--- +title: Quickstart - Vector index with Python +description: Compare vector index algorithms and similarity functions using the Python SDK in Azure DocumentDB to optimize search performance for your workload. 
+ms.devlang: python +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with Python in Azure DocumentDB + +In this quickstart, you compare three vector index algorithms (DiskANN, HNSW, and IVF) and three similarity functions (cosine, L2, and inner product) to find the optimal configuration for your search workload. This quickstart uses a sample hotel dataset with pre-calculated embeddings from the `text-embedding-3-small` model. + + + +Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/main/ai/select-algorithm-python) on GitHub. + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [Python](https://www.python.org/downloads/) 3.10 or greater + +## Create data file with vectors + +1. Create a new data directory and download the hotels data file with vectors: + + ### [Bash](#tab/bash) + + ```bash + mkdir -p data + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/main/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Force -Path data + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/main/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify the file was downloaded: + + ### [Bash](#tab/bash) + + ```bash + ls data/ + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem data/ + ``` + + --- + + You should see `Hotels_Vector.json` in the `data` directory. + +## Create a Python project + +1. Create a new directory for your project and open it in Visual Studio Code: + + ### [Bash](#tab/bash) + + ```bash + mkdir -p select-algorithm + cd select-algorithm + code . 
+ ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Force -Path select-algorithm + Set-Location select-algorithm + code . + ``` + + --- + +2. In the terminal, create and activate a virtual environment: + + For Windows: + + ```powershell + python -m venv venv + venv\Scripts\activate + ``` + + For macOS/Linux: + + ```bash + python -m venv venv + source venv/bin/activate + ``` + +3. Install the required packages: + + ```bash + pip install "pymongo>=4.7" openai==1.55.3 azure-identity==1.15.0 python-dotenv==1.0.0 "tabulate>=0.9.0" + ``` + + - `pymongo`: MongoDB driver for Python (≥4.7 required for OIDC authentication) + - `openai`: OpenAI client library to create vectors + - `azure-identity`: Azure Identity library for passwordless authentication + - `python-dotenv`: Environment variable management from .env files + - `tabulate`: Formatted table output for the comparison results + + Verify the packages are installed: + + ### [Bash](#tab/bash) + + ```bash + pip list | grep pymongo + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + pip list | Select-String pymongo + ``` + + --- + + You should see `pymongo` with a version of 4.7 or greater. + +4.
Create a `.env` file for environment variables in the project root: + + ```bash + # Azure OpenAI Embedding Settings + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + AZURE_OPENAI_EMBEDDING_API_VERSION=2024-10-21 + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://<your-resource-name>.openai.azure.com + + # Data File Paths and Vector Configuration + DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + EMBEDDED_FIELD=DescriptionVector + EMBEDDING_DIMENSIONS=1536 + LOAD_SIZE_BATCH=100 + + # Azure DocumentDB Connection Settings + MONGO_CLUSTER_NAME=<your-cluster-name> + + # Azure DocumentDB Database Name + AZURE_DOCUMENTDB_DATABASENAME=Hotels + + # Algorithm Selection (used by select_algorithm.py) + # ALGORITHM: "all" | "diskann" | "hnsw" | "ivf" + ALGORITHM=all + + # SIMILARITY: "all" | "COS" | "L2" | "IP" + SIMILARITY=COS + ``` + + For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: + + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + - `MONGO_CLUSTER_NAME`: Your Azure DocumentDB cluster name + + You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate Python apps to Azure services by using the Azure SDK for Python](/azure/developer/python/sdk/authentication/overview). + + Verify the `.env` file was created: + + ### [Bash](#tab/bash) + + ```bash + cat .env + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Content .env + ``` + + --- + + You should see your cluster name and Azure OpenAI endpoint values.
+ +## Create code files + +Create the following project structure: + +``` +├── data/ +│ └── Hotels_Vector.json # Hotel data with vector embeddings +└── select-algorithm/ + ├── src/ + │ ├── compare_all.py # Main comparison script + │ └── utils.py # Shared utility functions + └── .env # Environment variables +``` + +Create the `src` directory: + +### [Bash](#tab/bash) + +```bash +mkdir -p src +``` + +### [PowerShell](#tab/powershell) + +```powershell +New-Item -ItemType Directory -Force -Path src +``` + +--- + +## Create the algorithm comparison code + +Create the `src/compare_all.py` file with the following code: + +:::code language="python" source="~/../documentdb-samples/ai/select-algorithm-python/src/compare_all.py" ::: + +This script orchestrates the algorithm comparison by: + +- Loading configuration from environment variables +- Initializing MongoDB and Azure OpenAI clients with passwordless authentication +- Loading hotel data with pre-calculated embeddings +- Inserting the data once into a single collection, then testing each algorithm/similarity combination by creating its index and executing a search +- Comparing the top two results and their similarity scores across all configurations +- Displaying results in a comparison table + +## Create utility functions + +Create the `src/utils.py` file with the following code: + +:::code language="python" source="~/../documentdb-samples/ai/select-algorithm-python/src/utils.py" ::: + +The utilities provide essential functions for: + +- Passwordless authentication to DocumentDB and Azure OpenAI using DefaultAzureCredential +- Reading JSON data files with error handling +- Batch insertion of documents with DocumentDB's 16 MB payload limit in mind +- Formatted display of search results with their similarity scores + +## Run the code + +Execute the comparison script to test every algorithm and similarity combination: + +```bash +python src/compare_all.py +``` + +The output shows the comparison across all three algorithms: + +``` +Vector
Algorithm Comparison + Database: Hotels + Algorithms: all + Similarity: COS + Collections to query: hotels_diskann_cos, hotels_hnsw_cos, hotels_ivf_cos + Search query: "quintessential lodging near running trails, eateries, retail" + +Initializing MongoDB and Azure OpenAI clients... + +Loading data from ../data/Hotels_Vector.json... +Loaded 50 documents +Generating query embedding... +Query embedding: 1536 dimensions + +--- DiskANN / COS --- +Collection: hotels_diskann_cos +Created collection: hotels_diskann_cos +Inserting 50 documents in batches of 100... +Batch 1 completed: 50 documents inserted +Inserted: 50/50 +Created vector index: vectorIndex_diskann_cos +Executing vector search... +Success: 5 results, 145ms + +--- HNSW / COS --- +Collection: hotels_hnsw_cos +Created collection: hotels_hnsw_cos +Inserting 50 documents in batches of 100... +Batch 1 completed: 50 documents inserted +Inserted: 50/50 +Created vector index: vectorIndex_hnsw_cos +Executing vector search... +Success: 5 results, 132ms + +--- IVF / COS --- +Collection: hotels_ivf_cos +Created collection: hotels_ivf_cos +Inserting 50 documents in batches of 100... +Batch 1 completed: 50 documents inserted +Inserted: 50/50 +Created vector index: vectorIndex_ivf_cos +Executing vector search... +Success: 5 results, 128ms + +========================================================================================== + Vector Algorithm Comparison Results +========================================================================================== +Algorithm Similarity Top Result Score Latency(ms) +------------------------------------------------------------------------------------------ +DiskANN COS Twin Dome Motel 0.8947 145 +HNSW COS Twin Dome Motel 0.8947 132 +IVF COS Twin Dome Motel 0.8947 128 +========================================================================================== + +--- DiskANN / COS (hotels_diskann_cos) --- + 1. Twin Dome Motel, Score: 0.8947 + 2. 
Triple Landscape Hotel, Score: 0.8898 + 3. Smile Hotel, Score: 0.8855 + 4. Gastronomic Landscape Hotel, Score: 0.8797 + 5. Twin Landscape Resort, Score: 0.8772 + Latency: 145ms + +--- HNSW / COS (hotels_hnsw_cos) --- + 1. Twin Dome Motel, Score: 0.8947 + 2. Triple Landscape Hotel, Score: 0.8898 + 3. Smile Hotel, Score: 0.8855 + 4. Gastronomic Landscape Hotel, Score: 0.8797 + 5. Twin Landscape Resort, Score: 0.8772 + Latency: 132ms + +--- IVF / COS (hotels_ivf_cos) --- + 1. Twin Dome Motel, Score: 0.8947 + 2. Triple Landscape Hotel, Score: 0.8898 + 3. Smile Hotel, Score: 0.8855 + 4. Gastronomic Landscape Hotel, Score: 0.8797 + 5. Twin Landscape Resort, Score: 0.8772 + Latency: 128ms + +Closing database connection... +Database connection closed +``` + +### Test specific combinations + +To override environment variables at the command line: + +### [Bash](#tab/bash) + +```bash +# Test only DiskANN across all similarity functions +ALGORITHM=diskann SIMILARITY=all python src/select_algorithm.py +``` + +```bash +# Test all algorithms with L2 distance +ALGORITHM=all SIMILARITY=L2 python src/select_algorithm.py +``` + +```bash +# Test HNSW with inner product +ALGORITHM=hnsw SIMILARITY=IP python src/select_algorithm.py +``` + +### [PowerShell](#tab/powershell) + +```powershell +# Test only DiskANN across all similarity functions +$env:ALGORITHM="diskann"; $env:SIMILARITY="all"; python src/select_algorithm.py +``` + +```powershell +# Test all algorithms with L2 distance +$env:ALGORITHM="all"; $env:SIMILARITY="L2"; python src/select_algorithm.py +``` + +```powershell +# Test HNSW with inner product +$env:ALGORITHM="hnsw"; $env:SIMILARITY="IP"; python src/select_algorithm.py +``` + +--- + +> [!NOTE] +> When using `SIMILARITY=all`, the script tests all three similarity functions (COS, L2, IP) for each selected algorithm. Combined with `ALGORITHM=all`, this runs all 9 combinations (3 algorithms × 3 similarity functions). 
Each combination creates a separate collection, so the full run takes longer. + +### Understanding the results + +The comparison table helps you choose the best configuration for your workload: + +- **Latency**: Query execution time in milliseconds. Lower is better for user-facing search. +- **Score**: Similarity score using the selected function. Higher scores indicate better matches. +- **Top Result**: The highest-scoring hotel for the query. Consistency across algorithms indicates stable results. + +Algorithm selection guidelines: + +- **DiskANN**: Best for large datasets where memory is limited. Stores index on disk while maintaining good performance. +- **HNSW**: Best for high-accuracy requirements and fast search. Requires more memory but provides excellent recall. +- **IVF**: Best for very large datasets where some recall can be traded for speed. Uses clustering for efficient search. + +Similarity function selection: + +- **COS (Cosine)**: Best for text embeddings. Normalizes vectors and measures angle between them. +- **L2 (Euclidean)**: Measures straight-line distance. Sensitive to vector magnitude. +- **IP (Inner Product)**: Dot product similarity. Useful when vector magnitude is meaningful. + +Tuning parameters: + +DiskANN tuning: +- `maxDegree`: Higher values improve accuracy but increase memory usage (default: 32) +- `lBuild`: Higher values improve index quality but slow down index creation (default: 50) +- `lSearch`: Higher values improve recall but slow down queries (default: 100) + +HNSW tuning: +- `m`: Number of connections per layer. Higher improves recall (default: 16) +- `efConstruction`: Candidates during build. Higher improves quality (default: 64) +- `efSearch`: Candidates during search. Higher improves recall (default: 80) + +IVF tuning: +- `numLists`: Number of clusters. Higher speeds up search but may reduce recall (default: 1) +- `nProbes`: Clusters searched at query time. 
Higher improves recall but slows queries (default: 1) + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `ServerSelectionTimeoutError` | Verify the `MONGO_CLUSTER_NAME` value in `.env`. Ensure your IP is in the DocumentDB firewall rules. | +| `AuthenticationFailed` | This quickstart uses passwordless authentication, so check that you're signed in to Azure and that your Microsoft Entra token is valid. | +| `pymongo.errors.OperationFailure` | Ensure the database and collection exist. Check that the vector index was created successfully. | +| `ModuleNotFoundError: No module named 'pymongo'` | Activate your virtual environment and run `pip install "pymongo>=4.7"`. | +| Empty search results | The vector index may not be ready yet. The script waits five seconds after creating each index and doesn't retry, so large datasets may require longer wait times. | + +## Clean up resources + +When you're done, you can remove the database using mongosh or the Azure portal. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "mongodb+srv://<your-cluster-name>.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript +use Hotels +db.dropDatabase() +``` + +### [Azure portal](#tab/portal) + +1. Navigate to your DocumentDB resource in the Azure portal. +2. Select **Data Explorer**. +3. Right-click the **Hotels** database and select **Delete Database**. + +--- + +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources.
+ +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-python/requirements.txt b/ai/select-algorithm-python/requirements.txt new file mode 100644 index 0000000..36e664e --- /dev/null +++ b/ai/select-algorithm-python/requirements.txt @@ -0,0 +1,11 @@ +# MongoDB driver for connecting to DocumentDB +pymongo>=4.7.0 + +# Azure OpenAI SDK for generating embeddings +openai>=1.0.0,<2.0.0 + +# Azure authentication library for passwordless connection +azure-identity>=1.15.0 + +# Formatted table output for compare_all.py +tabulate>=0.9.0 diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py new file mode 100644 index 0000000..b5c22d8 --- /dev/null +++ b/ai/select-algorithm-python/src/compare_all.py @@ -0,0 +1,206 @@ +""" +Compare All Algorithms — Unified comparison runner. + +Executes all 9 combinations (3 algorithms × 3 similarity metrics) in a single +invocation and prints a formatted comparison table. 
# Index definitions: (algo_label, kind, extra_params).
# extra_params are merged verbatim into cosmosSearchOptions at index creation.
ALGORITHMS = [
    ("IVF", "vector-ivf", {"numLists": 1}),
    ("HNSW", "vector-hnsw", {"m": 16, "efConstruction": 64}),
    ("DiskANN", "vector-diskann", {"maxDegree": 32, "lBuild": 50}),
]

METRICS = ["COS", "L2", "IP"]

# Name of the scratch collection that main() drops, recreates, and drops again.
COLLECTION_NAME = "hotels"

# Fixed pause between creating an index and querying it. There is no readiness
# polling or retry, so a search issued too early may come back empty.
INDEX_READY_WAIT_SECONDS = 5

# Placeholder shown in the table when a search returned too few hits for a cell.
NOT_AVAILABLE = "n/a"


def get_compare_config() -> Dict[str, Any]:
    """Load comparison-specific configuration from environment variables.

    Starts from the shared ``get_config()`` dictionary and adds:

    * ``query_text`` - from ``QUERY_TEXT`` (default "luxury hotel near the beach")
    * ``top_k`` - from ``TOP_K`` (default 5), converted with ``int()``

    Raises:
        ValueError: if ``TOP_K`` is set to something ``int()`` cannot parse.
    """
    config = get_config()
    config["query_text"] = os.getenv("QUERY_TEXT", "luxury hotel near the beach")
    config["top_k"] = int(os.getenv("TOP_K", "5"))
    return config


def index_name(algo: str, metric: str) -> str:
    """Generate canonical index name: vector_{algo}_{metric} (both lower-cased)."""
    return f"vector_{algo.lower()}_{metric.lower()}"


def get_existing_index_names(collection) -> List[str]:
    """Return names of existing indexes on the collection, in listing order."""
    return [idx["name"] for idx in collection.list_indexes()]


def drop_vector_indexes(collection, vector_field: str) -> None:
    """Drop all existing vector indexes on *vector_field*.

    An index counts as a vector index when its key maps *vector_field* to the
    string "cosmosSearch". Other indexes are left alone. Errors from
    ``drop_index`` propagate to the caller.
    """
    for idx in collection.list_indexes():
        name = idx.get("name", "")
        key = idx.get("key", {})
        if vector_field in key and key[vector_field] == "cosmosSearch":
            collection.drop_index(name)


def create_vector_index(collection, name: str, kind: str, vector_field: str,
                        dimensions: int, similarity: str,
                        extra_params: Dict[str, Any]) -> None:
    """Create a single vector index by sending a ``createIndexes`` command.

    Args:
        collection: Target collection; its ``name`` and ``database`` are used.
        name: Index name.
        kind: Index kind, e.g. "vector-ivf", "vector-hnsw", "vector-diskann".
        vector_field: Document field that holds the embedding.
        dimensions: Embedding length.
        similarity: Similarity metric, e.g. "COS", "L2", "IP".
        extra_params: Algorithm-specific options. They are unpacked last, so a
            key that collides with kind/dimensions/similarity overrides it.
    """
    cosmos_options = {
        "kind": kind,
        "dimensions": dimensions,
        "similarity": similarity,
        **extra_params,
    }

    index_command = {
        "createIndexes": collection.name,
        "indexes": [
            {
                "name": name,
                "key": {vector_field: "cosmosSearch"},
                "cosmosSearchOptions": cosmos_options,
            }
        ],
    }
    collection.database.command(index_command)


def generate_embedding(azure_openai_client, query_text: str,
                       model_name: str) -> List[float]:
    """Generate a single embedding for the query text.

    Sends *query_text* as a one-element batch and returns the embedding of the
    first (only) item in the response.
    """
    response = azure_openai_client.embeddings.create(
        input=[query_text],
        model=model_name
    )
    return response.data[0].embedding


def vector_search_with_index(collection, query_embedding: List[float],
                             vector_field: str,
                             top_k: int) -> List[Dict[str, Any]]:
    """Run a ``$search``/``cosmosSearch`` aggregation and return the hits.

    No index name is passed; the caller is expected to have exactly one vector
    index on *vector_field* when this runs. Each returned item has the matched
    document under "document" and its search score under "score".
    """
    pipeline = [
        {
            "$search": {
                "cosmosSearch": {
                    "vector": query_embedding,
                    "path": vector_field,
                    "k": top_k
                }
            }
        },
        {
            "$project": {
                "document": "$$ROOT",
                "score": {"$meta": "searchScore"}
            }
        }
    ]

    return list(collection.aggregate(pipeline))


def _hit_name_and_score(results: List[Dict[str, Any]], rank: int):
    """Return (hotel name, score) for the hit at *rank*, or ("(no results)", None)."""
    if rank >= len(results):
        return "(no results)", None
    hit = results[rank]
    # Fall back to the hit itself when the projection did not nest the document.
    name = hit.get("document", hit).get("HotelName", "Unknown")
    return name, hit.get("score", 0)


def summarize_results(algo_label: str, metric: str,
                      results: List[Dict[str, Any]]) -> List[str]:
    """Build one comparison-table row from a search result list.

    Returns ``[algorithm, metric, top-1 name, top-1 score, top-2 name,
    top-2 score, diff]`` with scores formatted to four decimals. When a hit is
    missing, its score and the diff are reported as "n/a" instead of being
    computed against a made-up 0.
    """
    top1_name, top1_score = _hit_name_and_score(results, 0)
    top2_name, top2_score = _hit_name_and_score(results, 1)

    def fmt(value) -> str:
        return NOT_AVAILABLE if value is None else f"{value:.4f}"

    if top1_score is None or top2_score is None:
        diff = None
    else:
        diff = abs(top1_score - top2_score)

    return [algo_label, metric, top1_name, fmt(top1_score),
            top2_name, fmt(top2_score), fmt(diff)]


def main():
    """Run every algorithm/metric combination once and print a comparison table.

    Loads the data once into a fresh collection, embeds the query once, then
    for each combination drops any vector index, creates the new one, waits,
    searches, and records the top two hits. The collection is dropped and the
    MongoDB client closed on the way out, even if a step fails.
    """
    print("=" * 70)
    print(" Compare All Algorithms — 9 Combinations")
    print(" (3 Algorithms × 3 Similarity Metrics)")
    print("=" * 70)

    config = get_compare_config()
    query_text = config["query_text"]
    top_k = config["top_k"]

    print(f"\n Query: \"{query_text}\"")
    print(f" Top K: {top_k}\n")

    mongo_client, azure_openai_client = get_clients_passwordless()

    try:
        database = mongo_client[config["database_name"]]

        # Drop collection for a clean comparison
        database.drop_collection(COLLECTION_NAME)
        print(f"Dropped existing '{COLLECTION_NAME}' collection (if any)")

        # Create fresh collection and load only documents that carry an embedding
        collection = database[COLLECTION_NAME]
        data = read_file_return_json(config["data_file"])
        documents = [doc for doc in data if config["vector_field"] in doc]
        print(f"Loaded {len(documents)} documents with embeddings")
        insert_data(collection, documents, config["batch_size"])

        # Generate ONE embedding for the query and reuse it for every search
        print("\nGenerating embedding for query...")
        query_embedding = generate_embedding(
            azure_openai_client, query_text, config["model_name"]
        )

        # Run all searches sequentially (drop -> create -> wait -> search)
        print("Running 9 vector searches...\n")
        table_rows = []

        for algo_label, kind, extra_params in ALGORITHMS:
            for metric in METRICS:
                name = index_name(algo_label, metric)
                # Only one vector index is kept on the field at a time
                drop_vector_indexes(collection, config["vector_field"])
                create_vector_index(
                    collection, name, kind, config["vector_field"],
                    config["dimensions"], metric, extra_params
                )
                print(f" Created index '{name}'")
                time.sleep(INDEX_READY_WAIT_SECONDS)
                results = vector_search_with_index(
                    collection, query_embedding, config["vector_field"], top_k
                )
                table_rows.append(summarize_results(algo_label, metric, results))

        # Print comparison table
        headers = ["Algorithm", "Metric", "Top 1 Result", "Score",
                   "Top 2 Result", "Score", "Diff"]
        print(tabulate(table_rows, headers=headers, tablefmt="grid"))

    finally:
        # Cleanup is best-effort: a failed drop is reported but must not
        # prevent the client from being closed.
        try:
            database = mongo_client[config["database_name"]]
            database.drop_collection(COLLECTION_NAME)
            print(f"\nCleanup: dropped collection '{COLLECTION_NAME}'")
        except Exception as e:
            print(f"Cleanup warning: {e}")
        mongo_client.close()


if __name__ == "__main__":
    main()
class AzureIdentityTokenCallback(OIDCCallback):
    """OIDC callback that hands PyMongo an access token from an Azure credential."""

    def __init__(self, credential):
        # Any object exposing get_token(scope) -> object with a .token attribute.
        self.credential = credential

    def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult:
        """Request a token for the fixed scope below and wrap it for the driver."""
        token = self.credential.get_token(
            "https://ossrdbms-aad.database.windows.net/.default").token
        return OIDCCallbackResult(access_token=token)


def get_clients_passwordless() -> Tuple[MongoClient, AzureOpenAI]:
    """Create MongoDB and Azure OpenAI clients using passwordless auth.

    Reads ``MONGO_CLUSTER_NAME`` and ``AZURE_OPENAI_EMBEDDING_ENDPOINT`` from
    the environment and shares one ``DefaultAzureCredential`` between both
    clients. ``AZURE_OPENAI_EMBEDDING_API_VERSION`` falls back to "2023-05-15".

    Returns:
        ``(mongo_client, azure_openai_client)``.

    Raises:
        ValueError: if either required environment variable is missing or empty.
    """
    cluster_name = os.getenv("MONGO_CLUSTER_NAME")
    if not cluster_name:
        raise ValueError("MONGO_CLUSTER_NAME environment variable is required")

    credential = DefaultAzureCredential()

    mongo_client = MongoClient(
        f"mongodb+srv://{cluster_name}.global.mongocluster.cosmos.azure.com/",
        connectTimeoutMS=120000,
        tls=True,
        retryWrites=False,
        authMechanism="MONGODB-OIDC",
        authMechanismProperties={"OIDC_CALLBACK": AzureIdentityTokenCallback(credential)}
    )

    azure_openai_endpoint = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT")
    if not azure_openai_endpoint:
        raise ValueError("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required")

    token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")

    azure_openai_client = AzureOpenAI(
        azure_endpoint=azure_openai_endpoint,
        azure_ad_token_provider=token_provider,
        api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION", "2023-05-15")
    )

    return mongo_client, azure_openai_client


def get_config() -> Dict[str, Any]:
    """Load configuration from environment variables.

    Every key has a default, so the function works with an empty environment.
    ``dimensions`` and ``batch_size`` are converted with ``int()`` and raise
    ``ValueError`` when the variable holds a non-integer string.
    """
    return {
        'database_name': os.getenv('AZURE_DOCUMENTDB_DATABASENAME', 'Hotels'),
        'data_file': os.getenv('DATA_FILE_WITH_VECTORS', 'data/Hotels_Vector.json'),
        'vector_field': os.getenv('EMBEDDED_FIELD', 'DescriptionVector'),
        'model_name': os.getenv('AZURE_OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'),
        'dimensions': int(os.getenv('EMBEDDING_DIMENSIONS', '1536')),
        'batch_size': int(os.getenv('LOAD_SIZE_BATCH', '100')),
        'similarity': os.getenv('SIMILARITY', ''),
    }


def read_file_return_json(file_path: str) -> List[Dict[str, Any]]:
    """Read a UTF-8 JSON file and return the parsed data.

    Raises:
        FileNotFoundError: re-raised after printing a message naming the path.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found")
        raise


def insert_data(collection: Collection, data: List[Dict[str, Any]],
                batch_size: int = 100) -> Dict[str, Any]:
    """Insert data into collection in batches, skipping if already populated.

    If the collection already holds at least ``len(data)`` documents nothing
    is written. If it holds fewer (but some), it is emptied first so the load
    starts from a clean state. Batches are written unordered, so one bad
    document does not stop the rest of its batch.

    Args:
        collection: Target collection.
        data: Documents to insert.
        batch_size: Documents per ``bulk_write`` call; must be positive.

    Returns:
        Dict with ``total``, ``inserted``, ``skipped`` and, when an insert was
        attempted, ``failed`` (number of reported write errors).

    Raises:
        ValueError: if *batch_size* is not positive. Checked before any write,
            so a bad value can no longer empty the collection and then fail.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total_documents = len(data)

    existing_count = collection.count_documents({})
    if existing_count >= total_documents:
        print(f"Collection already has {existing_count} documents, skipping insert")
        return {'total': total_documents, 'inserted': 0, 'skipped': True}

    if existing_count > 0:
        collection.delete_many({})

    inserted_count = 0
    failed_count = 0
    for i in range(0, total_documents, batch_size):
        batch = data[i:i + batch_size]
        try:
            operations = [InsertOne(doc) for doc in batch]
            result = collection.bulk_write(operations, ordered=False)
            inserted_count += result.inserted_count
        except BulkWriteError as e:
            # Keep going (best-effort load) but surface the failures instead of
            # hiding them behind a lower inserted count.
            inserted_count += e.details.get('nInserted', 0)
            write_errors = e.details.get('writeErrors', [])
            failed_count += len(write_errors)
            print(f"Warning: {len(write_errors)} document(s) failed to insert "
                  f"in batch starting at index {i}")
        # Short pause between batches.
        time.sleep(0.1)

    print(f"Inserted {inserted_count}/{total_documents} documents")
    return {'total': total_documents, 'inserted': inserted_count,
            'skipped': False, 'failed': failed_count}


def drop_vector_indexes(collection: Collection, vector_field: str) -> None:
    """Drop any existing vector indexes on the specified field.

    A vector index is one whose key maps *vector_field* to "cosmosSearch".
    This is deliberately best-effort: any exception is printed as a warning
    and not raised.
    """
    try:
        indexes = list(collection.list_indexes())
        for index in indexes:
            if 'key' in index and vector_field in index['key']:
                if index['key'][vector_field] == 'cosmosSearch':
                    collection.drop_index(index['name'])
                    print(f"Dropped existing vector index: {index['name']}")
    except Exception as e:
        print(f"Warning: Error dropping indexes: {e}")


def perform_vector_search(collection: Collection,
                          azure_openai_client: AzureOpenAI,
                          query_text: str,
                          vector_field: str,
                          model_name: str,
                          top_k: int = 5) -> List[Dict[str, Any]]:
    """Perform vector search using the $search aggregation stage.

    Embeds *query_text* with *model_name*, then runs a ``cosmosSearch`` query
    against *vector_field* for the *top_k* nearest documents. Each result has
    the matched document under "document" and its search score under "score".
    """
    embedding_response = azure_openai_client.embeddings.create(
        input=[query_text],
        model=model_name
    )
    query_embedding = embedding_response.data[0].embedding

    pipeline = [
        {
            "$search": {
                "cosmosSearch": {
                    "vector": query_embedding,
                    "path": vector_field,
                    "k": top_k
                }
            }
        },
        {
            "$project": {
                "document": "$$ROOT",
                "score": {"$meta": "searchScore"}
            }
        }
    ]

    return list(collection.aggregate(pipeline))


def print_search_results(results: List[Dict[str, Any]], algorithm: str) -> None:
    """Print formatted search results.

    Prints a banner with *algorithm* and the hit count, then one numbered line
    per hit. The name is taken from "HotelName", then "name", else "Unknown";
    a missing score prints as 0.
    """
    print(f"\n{'='*60}")
    print(f" {algorithm} Search Results ({len(results)} found)")
    print(f"{'='*60}")
    for i, result in enumerate(results, 1):
        # Results may be projected ({"document": ..., "score": ...}) or raw documents.
        doc = result.get('document', result)
        name = doc.get('HotelName', doc.get('name', 'Unknown'))
        score = result.get('score', 0)
        print(f" {i}. {name} (score: {score:.4f})")
    print()
+ + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + + | Variable | Description | + |---|---| + | `MONGO_CLUSTER_NAME` | Your DocumentDB cluster name | + | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | + | `AZURE_OPENAI_EMBEDDING_MODEL` | Embedding model deployment name | + | `AZURE_OPENAI_EMBEDDING_API_VERSION` | Azure OpenAI API version | + | `AZURE_DOCUMENTDB_DATABASENAME` | Database name (default: `Hotels`) | + | `DATA_FILE_WITH_VECTORS` | Path to JSON data file with vectors | + | `EMBEDDED_FIELD` | Field name containing the vector (default: `DescriptionVector`) | + | `EMBEDDING_DIMENSIONS` | Vector dimensions (default: `1536`) | + | `LOAD_SIZE_BATCH` | Batch size for data insertion | + | `SIMILARITY` | Similarity metric: `COS`, `L2`, or `IP` | + +5. **Copy the shared data file** into this directory: + + ```bash + cp ../data/Hotels_Vector.json . + ``` + + The `DATA_FILE_WITH_VECTORS` env var defaults to `../data/Hotels_Vector.json`. + +6. **Build the project:** + + ```bash + npm run build + ``` + +## Run + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and view a formatted comparison table: + +```bash +npm start +``` + +**Environment variables** (optional overrides): + +| Variable | Default | Description | +|---|---|---| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per combination | +| `VERBOSE` | `false` | When `true`, shows all k results per combo | + +The script creates a single `hotels` collection, loads data once, then for each of the 9 algorithm/metric combinations: creates the index → searches → drops the index. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially. 
+ +**Output:** +``` +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +Algorithm Similarity #1 Result #1 Score #2 Result #2 Score Diff +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. +==================================================================================================== +``` + +## Algorithm comparison + +| Algorithm | Index type | Best for | +|---|---|---| +| **IVF** | `vector-ivf` | Smaller datasets, lower memory usage | +| **HNSW** | `vector-hnsw` | Fast approximate search, balanced recall/speed | +| **DiskANN** | `vector-diskann` | Large-scale datasets, disk-based search | diff --git a/ai/select-algorithm-typescript/data/README.md b/ai/select-algorithm-typescript/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-typescript/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. 
+ +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-typescript/output/compare_all.txt b/ai/select-algorithm-typescript/output/compare_all.txt new file mode 100644 index 0000000..8e34340 --- /dev/null +++ b/ai/select-algorithm-typescript/output/compare_all.txt @@ -0,0 +1,42 @@ +Using Azure OpenAI Embedding Deployment/Model: text-embedding-3-small +Reading JSON file from data/Hotels_Vector.json +Loaded 50 documents +Processing in batches of 50... +Batch 1 complete: 50 inserted + +Query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running searches (top 5 results)... ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean 
Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection "hotels" +Database connection closed diff --git a/ai/select-algorithm-typescript/package-lock.json b/ai/select-algorithm-typescript/package-lock.json new file mode 100644 index 0000000..f0ceb74 --- /dev/null +++ b/ai/select-algorithm-typescript/package-lock.json @@ -0,0 +1,735 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "select-algorithm-typescript", + "version": "1.0.0", + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } + }, + "node_modules/@azure/abort-controller": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/@azure/abort-controller/-/abort-controller-2.1.2.tgz", + "integrity": "sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + 
}, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-auth": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-auth/-/core-auth-1.10.1.tgz", + "integrity": "sha512-ykRMW8PjVAn+RS6ww5cmK9U2CyH9p4Q88YJwvUslfuMmN98w/2rdGRLPqJYObapBCdzBVeDgYWdJnFPFb7qzpg==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-util": "^1.13.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-client": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-client/-/core-client-1.10.1.tgz", + "integrity": "sha512-Nh5PhEOeY6PrnxNPsEHRr9eimxLwgLlpmguQaHKBinFYA/RU9+kOYVOQqOrTsCL+KSxrLLl1gD8Dk5BFW/7l/w==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-rest-pipeline": "^1.22.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-rest-pipeline": { + "version": "1.23.0", + "resolved": "https://registry.npmjs.org/@azure/core-rest-pipeline/-/core-rest-pipeline-1.23.0.tgz", + "integrity": "sha512-Evs1INHo+jUjwHi1T6SG6Ua/LHOQBCLuKEEE6efIpt4ZOoNonaT1kP32GoOcdNDbfqsD2445CPri3MubBy5DEQ==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "@typespec/ts-http-runtime": "^0.3.4", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-tracing": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/@azure/core-tracing/-/core-tracing-1.3.1.tgz", + "integrity": "sha512-9MWKevR7Hz8kNzzPLfX4EAtGM2b8mr50HPDBvio96bURP/9C+HjdH3sBlLSNNrvRAr5/k/svoH457gB5IKpmwQ==", + "license": "MIT", + "dependencies": { + "tslib": 
"^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-util": { + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/@azure/core-util/-/core-util-1.13.1.tgz", + "integrity": "sha512-XPArKLzsvl0Hf0CaGyKHUyVgF7oDnhKoP85Xv6M4StF/1AhfORhZudHtOyf2s+FcbuQ9dPRAjB8J2KvRRMUK2A==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/identity": { + "version": "4.13.1", + "resolved": "https://registry.npmjs.org/@azure/identity/-/identity-4.13.1.tgz", + "integrity": "sha512-5C/2WD5Vb1lHnZS16dNQRPMjN6oV/Upba+C9nBIs15PmOi6A3ZGs4Lr2u60zw4S04gi+u3cEXiqTVP7M4Pz3kw==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "@azure/core-auth": "^1.9.0", + "@azure/core-client": "^1.9.2", + "@azure/core-rest-pipeline": "^1.17.0", + "@azure/core-tracing": "^1.0.0", + "@azure/core-util": "^1.11.0", + "@azure/logger": "^1.0.0", + "@azure/msal-browser": "^5.5.0", + "@azure/msal-node": "^5.1.0", + "open": "^10.1.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/logger": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@azure/logger/-/logger-1.3.0.tgz", + "integrity": "sha512-fCqPIfOcLE+CGqGPd66c8bZpwAji98tZ4JI9i/mlTNTlsIWslCfpg48s/ypyLxZTump5sypjrKn2/kY7q8oAbA==", + "license": "MIT", + "dependencies": { + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/msal-browser": { + "version": "5.9.0", + "resolved": "https://registry.npmjs.org/@azure/msal-browser/-/msal-browser-5.9.0.tgz", + "integrity": "sha512-CzE+4PefDSJWj26zU7G1bKchlGRRHMBFreG4tAlGuzyI8hAPiYGobaJvZBgZBf6L63iphX7VH+ityL8VgEQz9Q==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2" + }, + "engines": { + "node": ">=0.8.0" + } + }, + 
"node_modules/@azure/msal-common": { + "version": "16.5.2", + "resolved": "https://registry.npmjs.org/@azure/msal-common/-/msal-common-16.5.2.tgz", + "integrity": "sha512-GkDEL6TYo3HgT3UuqakdgE9PZfc1hMki6+Hwgy1uddb/EauvAKfu85vVhuofRSo22D1xTnWt8Ucwfg4vSCVwvA==", + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-node": { + "version": "5.1.5", + "resolved": "https://registry.npmjs.org/@azure/msal-node/-/msal-node-5.1.5.tgz", + "integrity": "sha512-ObTeMoNPmq19X3z40et9Xvs4ZoWVeJg43PZMRLG5iwVL+2nCtAerG3YTDItqPp1CfXNwmCXBbg8jn1DOx65c3g==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2", + "jsonwebtoken": "^9.0.0" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/@mongodb-js/saslprep": { + "version": "1.4.9", + "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.4.9.tgz", + "integrity": "sha512-RXSxsokhAF/4nWys8An8npsqOI33Ex1Hlzqjw2pZOO+GKtMAR2noGnUdsFiGwsaO/xXI+56mtjTmDA3JXJsvmA==", + "license": "MIT", + "dependencies": { + "sparse-bitfield": "^3.0.3" + } + }, + "node_modules/@types/node": { + "version": "24.12.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.2.tgz", + "integrity": "sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/@types/webidl-conversions": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", + "integrity": "sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA==", + "license": "MIT" + }, + "node_modules/@types/whatwg-url": { + "version": "11.0.5", + "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz", + "integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==", + "license": "MIT", + 
"dependencies": { + "@types/webidl-conversions": "*" + } + }, + "node_modules/@typespec/ts-http-runtime": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@typespec/ts-http-runtime/-/ts-http-runtime-0.3.5.tgz", + "integrity": "sha512-yURCknZhvywvQItHMMmFSo+fq5arCUIyz/CVk7jD89MSai7dkaX8ufjCWp3NttLojoTVbcE72ri+be/TnEbMHw==", + "license": "MIT", + "dependencies": { + "http-proxy-agent": "^7.0.0", + "https-proxy-agent": "^7.0.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/bson": { + "version": "6.10.4", + "resolved": "https://registry.npmjs.org/bson/-/bson-6.10.4.tgz", + "integrity": "sha512-WIsKqkSC0ABoBJuT1LEX+2HEvNmNKKgnTAyd0fL8qzK4SH2i9NXg+t08YtdZp/V9IZ33cxe3iV4yM0qg8lMQng==", + "license": "Apache-2.0", + "engines": { + "node": ">=16.20.1" + } + }, + "node_modules/buffer-equal-constant-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", + "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", + "license": "BSD-3-Clause" + }, + "node_modules/bundle-name": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", + "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", + "license": "MIT", + "dependencies": { + "run-applescript": "^7.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": 
"https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/default-browser": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.5.0.tgz", + "integrity": "sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==", + "license": "MIT", + "dependencies": { + "bundle-name": "^4.1.0", + "default-browser-id": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/default-browser-id": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.1.tgz", + "integrity": "sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/define-lazy-prop": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", + "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ecdsa-sig-formatter": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", + "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + } + }, + 
"node_modules/http-proxy-agent": { + "version": "7.0.2", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", + "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.0", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/is-docker": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", + "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", + "license": "MIT", + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-inside-container": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", + "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", + "license": "MIT", + "dependencies": { + "is-docker": "^3.0.0" + }, + "bin": { + "is-inside-container": "cli.js" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-wsl": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.1.tgz", + "integrity": "sha512-e6rvdUCiQCAuumZslxRJWR/Doq4VpPR82kqclvcS0efgt430SlGIk05vdCN58+VrzgtIcfNODjozVielycD4Sw==", + "license": "MIT", + 
"dependencies": { + "is-inside-container": "^1.0.0" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jsonwebtoken": { + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz", + "integrity": "sha512-MT/xP0CrubFRNLNKvxJ2BYfy53Zkm++5bX9dtuPbqAeQpTVe0MQTFhao8+Cp//EmJp244xt6Drw/GVEGCUj40g==", + "license": "MIT", + "dependencies": { + "jws": "^4.0.1", + "lodash.includes": "^4.3.0", + "lodash.isboolean": "^3.0.3", + "lodash.isinteger": "^4.0.4", + "lodash.isnumber": "^3.0.3", + "lodash.isplainobject": "^4.0.6", + "lodash.isstring": "^4.0.1", + "lodash.once": "^4.0.0", + "ms": "^2.1.1", + "semver": "^7.5.4" + }, + "engines": { + "node": ">=12", + "npm": ">=6" + } + }, + "node_modules/jwa": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", + "integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", + "license": "MIT", + "dependencies": { + "buffer-equal-constant-time": "^1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/jws": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", + "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", + "license": "MIT", + "dependencies": { + "jwa": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/lodash.includes": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/lodash.includes/-/lodash.includes-4.3.0.tgz", + "integrity": "sha512-W3Bx6mdkRTGtlJISOvVD/lbqjTlPPUDTMnlXZFnVwi9NKJ6tiAk6LVdlhZMm17VZisqhKcgzpO5Wz91PCt5b0w==", + "license": "MIT" + }, + "node_modules/lodash.isboolean": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isboolean/-/lodash.isboolean-3.0.3.tgz", + "integrity": 
"sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==", + "license": "MIT" + }, + "node_modules/lodash.isinteger": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/lodash.isinteger/-/lodash.isinteger-4.0.4.tgz", + "integrity": "sha512-DBwtEWN2caHQ9/imiNeEA5ys1JoRtRfY3d7V9wkqtbycnAmTvRRmbHKDV4a0EYc678/dia0jrte4tjYwVBaZUA==", + "license": "MIT" + }, + "node_modules/lodash.isnumber": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isnumber/-/lodash.isnumber-3.0.3.tgz", + "integrity": "sha512-QYqzpfwO3/CWf3XP+Z+tkQsfaLL/EnUlXWVkIk5FUPc4sBdTehEqZONuyRt2P67PXAk+NXmTBcc97zw9t1FQrw==", + "license": "MIT" + }, + "node_modules/lodash.isplainobject": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/lodash.isplainobject/-/lodash.isplainobject-4.0.6.tgz", + "integrity": "sha512-oSXzaWypCMHkPC3NvBEaPHf0KsA5mvPrOPgQWDsbg8n7orZ290M0BmC/jgRZ4vcJ6DTAhjrsSYgdsW/F+MFOBA==", + "license": "MIT" + }, + "node_modules/lodash.isstring": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.isstring/-/lodash.isstring-4.0.1.tgz", + "integrity": "sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==", + "license": "MIT" + }, + "node_modules/lodash.once": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/lodash.once/-/lodash.once-4.1.1.tgz", + "integrity": "sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==", + "license": "MIT" + }, + "node_modules/memory-pager": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz", + "integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==", + "license": "MIT" + }, + "node_modules/mongodb": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.21.0.tgz", + "integrity": 
"sha512-URyb/VXMjJ4da46OeSXg+puO39XH9DeQpWCslifrRn9JWugy0D+DvvBvkm2WxmHe61O/H19JM66p1z7RHVkZ6A==", + "license": "Apache-2.0", + "dependencies": { + "@mongodb-js/saslprep": "^1.3.0", + "bson": "^6.10.4", + "mongodb-connection-string-url": "^3.0.2" + }, + "engines": { + "node": ">=16.20.1" + }, + "peerDependencies": { + "@aws-sdk/credential-providers": "^3.188.0", + "@mongodb-js/zstd": "^1.1.0 || ^2.0.0", + "gcp-metadata": "^5.2.0", + "kerberos": "^2.0.1", + "mongodb-client-encryption": ">=6.0.0 <7", + "snappy": "^7.3.2", + "socks": "^2.7.1" + }, + "peerDependenciesMeta": { + "@aws-sdk/credential-providers": { + "optional": true + }, + "@mongodb-js/zstd": { + "optional": true + }, + "gcp-metadata": { + "optional": true + }, + "kerberos": { + "optional": true + }, + "mongodb-client-encryption": { + "optional": true + }, + "snappy": { + "optional": true + }, + "socks": { + "optional": true + } + } + }, + "node_modules/mongodb-connection-string-url": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.2.tgz", + "integrity": "sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA==", + "license": "Apache-2.0", + "dependencies": { + "@types/whatwg-url": "^11.0.2", + "whatwg-url": "^14.1.0 || ^13.0.0" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/open": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/open/-/open-10.2.0.tgz", + "integrity": "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==", + "license": "MIT", + "dependencies": { + "default-browser": "^5.2.1", + "define-lazy-prop": "^3.0.0", + "is-inside-container": "^1.0.0", + "wsl-utils": "^0.1.0" + }, + "engines": { + 
"node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/openai": { + "version": "5.23.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-5.23.2.tgz", + "integrity": "sha512-MQBzmTulj+MM5O8SKEk/gL8a7s5mktS9zUtAkU257WjvobGc9nKcBuVwjyEEcb9SI8a8Y2G/mzn3vm9n1Jlleg==", + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/run-applescript": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.1.0.tgz", + "integrity": "sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/semver": { + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": 
"sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/sparse-bitfield": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz", + "integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==", + "license": "MIT", + "dependencies": { + "memory-pager": "^1.0.2" + } + }, + "node_modules/tr46": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.1.1.tgz", + "integrity": "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==", + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "license": "0BSD" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "dev": true, + "license": "MIT" + }, + "node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": 
"sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/whatwg-url": { + "version": "14.2.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", + "integrity": "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==", + "license": "MIT", + "dependencies": { + "tr46": "^5.1.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/wsl-utils": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/wsl-utils/-/wsl-utils-0.1.0.tgz", + "integrity": "sha512-h3Fbisa2nKGPxCpm89Hk33lBLsnaGBvctQopaBSOW/uIs6FTe1ATyAnKFJrzVs9vpGdsTe73WF3V4lIsk4Gacw==", + "license": "MIT", + "dependencies": { + "is-wsl": "^3.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + } + } +} diff --git a/ai/select-algorithm-typescript/package.json b/ai/select-algorithm-typescript/package.json new file mode 100644 index 0000000..df5b82b --- /dev/null +++ b/ai/select-algorithm-typescript/package.json @@ -0,0 +1,20 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "description": "Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB", + "type": "module", + "scripts": { + "env:init": "azd env get-values > .env", + "build": "tsc", + "start": "node --env-file .env dist/compare-all.js" + }, + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } +} diff --git a/ai/select-algorithm-typescript/quickstart.md b/ai/select-algorithm-typescript/quickstart.md new file mode 100644 index 0000000..b5e9738 --- /dev/null +++ b/ai/select-algorithm-typescript/quickstart.md @@ -0,0 +1,441 @@ +--- +title: Quickstart - Vector index with TypeScript 
+description: Compare vector index algorithms and similarity functions using TypeScript in Azure DocumentDB to optimize search performance for your workload. +ms.devlang: typescript +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with TypeScript in Azure DocumentDB + +In this quickstart, you compare three vector index algorithms (DiskANN, HNSW, and IVF) and three similarity functions (cosine, L2, and inner product) to find the optimal configuration for your search workload. This quickstart uses a sample hotel dataset with pre-calculated embeddings from the `text-embedding-3-small` model. + + + +Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/main/ai/select-algorithm-typescript) on GitHub. + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [Node.js LTS](https://nodejs.org/download/) +- [TypeScript](https://www.typescriptlang.org/download) 5.x or greater + +## Create data file with vectors + +1. Create a new data directory for the hotels data file: + + ### [Bash](#tab/bash) + + ```bash + mkdir data + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name data + ``` + + --- + +2. 
Download the `Hotels_Vector.json` data file with vectors to your `data` directory: + + ### [Bash](#tab/bash) + + ```bash + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify the file was downloaded: + + ### [Bash](#tab/bash) + + ```bash + ls data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem data\Hotels_Vector.json + ``` + + --- + + You should see `Hotels_Vector.json` in the `data` directory. + +## Create a Node.js project + +1. Create a new directory for your project and open it in Visual Studio Code: + + ### [Bash](#tab/bash) + + ```bash + mkdir select-algorithm-typescript + cd select-algorithm-typescript + code . + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name select-algorithm-typescript + Set-Location select-algorithm-typescript + code . + ``` + + --- + +2. Initialize a TypeScript Node.js project: + + ```bash + npm init -y + ``` + + Verify the project was initialized: + + ### [Bash](#tab/bash) + + ```bash + ls package.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem package.json + ``` + + --- + +3. Install the required packages: + + ```bash + npm install mongodb openai @azure/identity + npm install --save-dev typescript @types/node + ``` + + - `mongodb`: MongoDB driver for Node.js + - `openai`: OpenAI client library to create vectors + - `@azure/identity`: Azure Identity library for passwordless authentication + - `typescript`: TypeScript compiler + + Verify: `npm list` shows all installed packages without errors. + +4. 
Create a `tsconfig.json` file in the project root: + + ```json + { + "compilerOptions": { + "target": "ES2022", + "module": "Node16", + "moduleResolution": "Node16", + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "strict": true, + "rootDir": "./src", + "outDir": "./dist" + }, + "include": ["src/**/*"], + "exclude": ["node_modules"] + } + ``` + +5. Update your `package.json` to include: + + ```json + { + "type": "module", + "scripts": { + "build": "tsc", + "start": "node --env-file .env dist/select-algorithm.js" + } + } + ``` + +6. Create a `.env` file for environment variables in the project root: + + ```bash + # Azure OpenAI Embedding Settings + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://<your-openai-resource-name>.openai.azure.com + + # Data File Paths and Vector Configuration + DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + EMBEDDED_FIELD=DescriptionVector + EMBEDDING_DIMENSIONS=1536 + LOAD_SIZE_BATCH=100 + + # Azure DocumentDB Connection Settings + MONGO_CLUSTER_NAME=<your-cluster-name> + + # Azure DocumentDB Database Name + AZURE_DOCUMENTDB_DATABASENAME=Hotels + + # Algorithm Selection (used by select-algorithm.ts) + # ALGORITHM: "all" | "diskann" | "hnsw" | "ivf" + ALGORITHM=all + + # SIMILARITY: "all" | "COS" | "L2" | "IP" + SIMILARITY=all + ``` + + Verify the `.env` file was created: + + ### [Bash](#tab/bash) + + ```bash + cat .env + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Content .env + ``` + + --- + + You should see your configuration values including the Azure OpenAI endpoint and cluster name. 
+ + For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: + + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + - `MONGO_CLUSTER_NAME`: Your Azure DocumentDB cluster name + + You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate JavaScript apps to Azure services using the Azure SDK for JavaScript](/azure/developer/javascript/sdk/authentication/overview). + +## Create code files + +Create the following project structure: + +``` +├── data/ +│ └── Hotels_Vector.json # Hotel data with vector embeddings +└── select-algorithm-typescript/ + ├── src/ + │ ├── select-algorithm.ts # Main comparison script + │ └── utils.ts # Shared utility functions + ├── tsconfig.json + ├── package.json + └── .env # Environment variables +``` + +Create the `src` directory: + +### [Bash](#tab/bash) + +```bash +mkdir src +``` + +### [PowerShell](#tab/powershell) + +```powershell +New-Item -ItemType Directory -Name src +``` + +--- + +## Create the algorithm comparison code + +Create the `src/select-algorithm.ts` file with the following code: + +:::code language="typescript" source="~/../documentdb-samples/ai/select-algorithm-typescript/src/select-algorithm.ts" ::: + +This script orchestrates the algorithm comparison by: + +- Loading configuration from environment variables +- Initializing MongoDB and Azure OpenAI clients with passwordless authentication +- Loading hotel data with pre-calculated embeddings +- Testing each algorithm/similarity combination by creating a collection, inserting data, creating an index, and executing a search +- Measuring and comparing search performance across all configurations +- Displaying results in a comparison table + +## Create utility functions + +Create the `src/utils.ts` file with the following 
code: + +:::code language="typescript" source="~/../documentdb-samples/ai/select-algorithm-typescript/src/utils.ts" ::: + +The utilities provide essential functions for: + +- Passwordless authentication to DocumentDB and Azure OpenAI using DefaultAzureCredential +- Reading JSON data files +- Batch insertion of documents with DocumentDB's 16 MB payload limit in mind +- Formatted display of comparison results showing algorithm performance + +## Run the code + +Execute the comparison script to test all 9 algorithm × similarity combinations: + +```bash +npm run build +npm start +``` + +The output shows the comparison across all algorithms and similarity metrics: + +``` +Vector Algorithm Comparison + Database: Hotels + Algorithms: all + Similarity: all + Collections to query: hotels_diskann_cos, hotels_diskann_l2, hotels_diskann_ip, hotels_hnsw_cos, ... + Search query: "quintessential lodging near running trails, eateries, retail" + +Generating query embedding... +Query embedding: 1536 dimensions + +--- DiskANN / COS --- +Collection: hotels_diskann_cos +Created collection: hotels_diskann_cos +Inserted: 50/50 +Created vector index: vectorIndex_diskann_cos +Executing vector search... +Success: 5 results, 142ms + +... 
+ +========================================================================================== + Vector Algorithm Comparison Results +========================================================================================== +Algorithm Similarity Top Result Score Latency(ms) +------------------------------------------------------------------------------------------ +DiskANN COS Ocean Water Resort & 0.6184 142 +DiskANN L2 Ocean Water Resort & 0.8736 128 +DiskANN IP Ocean Water Resort & 0.6184 135 +HNSW COS Ocean Water Resort & 0.6184 119 +HNSW L2 Ocean Water Resort & 0.8736 115 +HNSW IP Ocean Water Resort & 0.6184 121 +IVF COS Ocean Water Resort & 0.6184 108 +IVF L2 Ocean Water Resort & 0.8736 105 +IVF IP Ocean Water Resort & 0.6184 110 +========================================================================================== + +--- DiskANN / COS (hotels_diskann_cos) --- + 1. Ocean Water Resort & Spa, Score: 0.6184 + 2. Windy Ocean Motel, Score: 0.5056 + 3. Gastronomic Landscape Hotel, Score: 0.4892 + 4. Sublime Palace Hotel, Score: 0.4753 + 5. Luxury Lion Resort, Score: 0.4612 + Latency: 142ms +... + +Closing database connection... +Database connection closed +``` + +> [!NOTE] +> Latency values are approximate and vary by environment. Scores may differ slightly depending on your Azure OpenAI embedding deployment. + +### Test individual algorithms + +To test a specific algorithm, update the `ALGORITHM` and `SIMILARITY` values in your `.env` file: + +```bash +# Edit .env to set specific values, for example: +# ALGORITHM=ivf +# SIMILARITY=COS + +npm run build +npm start +``` + +### Understanding the results + +The comparison table demonstrates key behaviors of vector search in DocumentDB: + +- **All algorithms return identical results on small datasets.** With 50 documents, every algorithm finds the same matches because the dataset fits entirely in memory regardless of index structure. 
Algorithm selection becomes important at scale (millions of documents) where tradeoffs in latency, memory, and recall diverge. + +- **COS and IP produce identical scores** (0.6184 / 0.5056) because the `text-embedding-3-small` model outputs normalized (unit-length) vectors. For normalized vectors, cosine similarity equals inner product mathematically. + +- **L2 (Euclidean distance) scores are inverted.** An L2 score is a distance, so higher L2 scores mean *more* distance — the #1 result has the *lowest* score (0.8736 = closest to query). This explains the negative Diff value (-0.1208): the top result's score minus the second result's score is below zero when lower scores are better. + +- **Score separation (Diff column, the top score minus the second-best score)** shows confidence. For COS and IP, a larger positive diff means the search clearly distinguishes the best match from the second-best; for L2, a larger negative diff means the same thing. This metric helps evaluate result quality regardless of the absolute score values. + +Algorithm selection guidelines for production: + +| Algorithm | Best for | Tradeoff | +|-----------|----------|----------| +| **DiskANN** | Large datasets (millions+) | Stores index on disk, lower memory | +| **HNSW** | High-accuracy requirements | More memory, excellent recall | +| **IVF** | Very large datasets with limited memory | Faster build, possible recall reduction | + +Similarity function selection: + +| Function | Score meaning | Best for | +|----------|-------------|----------| +| **COS (Cosine)** | Higher = more similar (0–1) | Text embeddings (normalized vectors) | +| **L2 (Euclidean)** | Lower = more similar (distance) | When magnitude matters | +| **IP (Inner Product)** | Higher = more similar | Equivalent to COS for normalized vectors | + +Tuning parameters affect the recall/latency tradeoff at both index build and query time: + +| Algorithm | Build parameters | Search parameters | +|-----------|-----------------|-------------------| +| **DiskANN** | `maxDegree` (32), `lBuild` (50) | `lSearch` (100) | +| **HNSW** | `m` (16), `efConstruction` (64) | `efSearch` (80) | +| **IVF** | `numLists` (1) | `nProbes` (1) | + +Higher build values improve index quality but
slow creation. Higher search values improve recall but increase latency. + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `MongoServerSelectionError` | Verify the `MONGO_CLUSTER_NAME` value in `.env`. Ensure your IP is in the DocumentDB firewall rules. | +| `MongoServerError: Authentication failed` | This article uses passwordless authentication, so there are no credentials in `.env`. Verify you've run `az login` with an identity that has access to the cluster. | +| TypeScript compilation errors | Run `npx tsc --version` to verify TypeScript is installed. Check `tsconfig.json` settings match the values shown in this article. | +| `Cannot find module` errors | Run `npm install` to ensure all dependencies are installed. | +| `Embedding dimension mismatch` | Verify `AZURE_OPENAI_EMBEDDING_MODEL` in `.env` matches the model deployed in your Azure OpenAI resource, and that `EMBEDDING_DIMENSIONS` matches that model's output size. | +| Empty search results | The vector index may not be ready yet. `select-algorithm.ts` searches immediately after it creates the index, so run the script again or add a short wait before the search; if the dataset is large, increase the wait time. | + +## Clean up resources + +When you're done, you can remove the database using `mongosh` or the Azure portal. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "mongodb+srv://<your-cluster-name>.mongocluster.cosmos.azure.com/" --authenticationMechanism MONGODB-OIDC +``` + +```javascript +use Hotels +db.dropDatabase() +``` + +### [Azure portal](#tab/portal) + +1. Navigate to your DocumentDB resource in the Azure portal. +2. Select **Data Explorer**. +3. Right-click the **Hotels** database and select **Delete Database**. + +--- + +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group to remove all associated resources.
+ +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts new file mode 100644 index 0000000..9f0abaa --- /dev/null +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -0,0 +1,232 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, getConfig, insertData } from './utils.js'; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +interface AlgorithmConfig { + name: string; + kind: string; + options: Record<string, number>; +} + +interface SearchResult { + algorithm: string; + similarity: string; + top1Name: string; + top1Score: number; + top2Name: string; + top2Score: number; +} + +const ALGORITHMS: AlgorithmConfig[] = [ + { name: 'IVF', kind: 'vector-ivf', options: { numLists: 1 } }, + { name: 'HNSW', kind: 'vector-hnsw', options: { m: 16, efConstruction: 64 } }, + { name: 'DiskANN', kind: 'vector-diskann', options: { maxDegree: 32, lBuild: 50 } }, +]; + +const SIMILARITIES = ['COS', 'L2', 'IP']; + +async function main() { + const baseConfig = getConfig(); + const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; + const topK = parseInt(process.env.TOP_K || '5', 10); + const collectionName = 'hotels'; + + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) throw new Error('AI client is not configured.'); + if (!dbClient) throw new Error('Database client is not configured.'); + + await dbClient.connect(); + const db = dbClient.db(baseConfig.dbName); + + // Drop the collection (after dropping its non-_id indexes) if it exists, for a clean start + let collections = await db.listCollections({ name: collectionName }).toArray(); + if (collections.length > 0) { + try { + const col =
db.collection(collectionName); + const existingIndexes = await col.listIndexes().toArray(); + for (const idx of existingIndexes) { + if (idx.name !== '_id_') { + try { + await col.dropIndex(idx.name); + } catch {} + } + } + await new Promise(r => setTimeout(r, 2000)); + await db.dropCollection(collectionName); + console.log(`Dropped existing collection: ${collectionName}`); + } catch (e: any) { + console.log(`Cleanup note: ${e.message.split('\n')[0]}`); + } + await new Promise(r => setTimeout(r, 10000)); + } + + // Load data once for reuse + const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); + console.log(`Loaded ${data.length} documents`); + + // Insert data into collection + const collection = db.collection(collectionName); + await insertData(baseConfig, collection, data); + + // Generate one embedding for the query (reused by every search below) + console.log(`\nQuery: "${queryText}"`); + const embeddingResponse = await aiClient.embeddings.create({ + model: baseConfig.deployment, + input: [queryText] + }); + const queryVector = embeddingResponse.data[0].embedding; + console.log(`Embedding generated (${queryVector.length} dimensions)`); + + // Sequential create→search→drop for each algorithm+similarity combo + // DocumentDB does not allow multiple vector indexes of the same kind on the same field + console.log(`\nRunning searches (top ${topK} results)...\n`); + const results: SearchResult[] = []; + + for (const algo of ALGORITHMS) { + for (const sim of SIMILARITIES) { + const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + + // 1. Drop all existing vector indexes on the embedded field (best effort) + const indexes = await collection.listIndexes().toArray(); + let droppedAny = false; + for (const idx of indexes) { + if (idx.key && idx.key[baseConfig.embeddedField] === 'cosmosSearch') { + try { await collection.dropIndex(idx.name); droppedAny = true; } catch {} + } + } + if (droppedAny) { + await new Promise(r => setTimeout(r, 2000)); + } + + // 2.
Create this specific index + const indexOptions = { + createIndexes: collectionName, + indexes: [{ + name: indexName, + key: { [baseConfig.embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: { + kind: algo.kind, + ...algo.options, + similarity: sim, + dimensions: baseConfig.embeddingDimensions + } + }] + }; + await db.command(indexOptions); + console.log(` ✓ ${indexName} created`); + + // 3. Wait a fixed 5 seconds to give the index time to become ready + await new Promise(r => setTimeout(r, 5000)); + + // 4. Search up to 3 times, 5 seconds apart (index may need more time) + let searchResults: any[] = []; + for (let attempt = 0; attempt < 3; attempt++) { + if (attempt > 0) { + await new Promise(r => setTimeout(r, 5000)); + } + try { + searchResults = await collection.aggregate([ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: baseConfig.embeddedField, + k: topK + } + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]).toArray(); + if (searchResults.length > 0) break; + } catch (e) { + if (attempt === 2) throw e; + } + } + + // Record top 2 results; a missing result is recorded as '(none)' with score 0 + const top1 = searchResults[0] as any; + const top2 = searchResults[1] as any; + results.push({ + algorithm: algo.name, + similarity: sim, + top1Name: top1?.document?.HotelName ?? '(none)', + top1Score: top1?.score ?? 0, + top2Name: top2?.document?.HotelName ?? '(none)', + top2Score: top2?.score ??
0, + }); + } + } + + // Print comparison table + printComparisonTable(results); + + } catch (error) { + console.error('Compare-all failed:', error); + process.exitCode = 1; + } finally { + // Cleanup: drop the comparison collection, then always close the connection + if (dbClient) { + try { + const db = dbClient.db(baseConfig.dbName); + await db.dropCollection(collectionName); + console.log(`\nCleanup: dropped collection "${collectionName}"`); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } + } +} + +function printComparisonTable(results: SearchResult[]) { + const algoW = 10; + const simW = 8; + const name1W = 28; + const score1W = 8; + const name2W = 28; + const score2W = 8; + const diffW = 7; + + const pad = (s: string, w: number) => s.length >= w ? s.slice(0, w) : s + ' '.repeat(w - s.length); + + const cols = [algoW, simW, name1W, score1W, name2W, score2W, diffW]; + const topLine = `┌${cols.map(w => '─'.repeat(w)).join('┬')}┐`; + const headerSep = `├${cols.map(w => '─'.repeat(w)).join('┼')}┤`; + const rowSep = `├${cols.map(w => '─'.repeat(w)).join('┼')}┤`; + const bottomLine = `└${cols.map(w => '─'.repeat(w)).join('┴')}┘`; + + console.log(topLine); + console.log( + `│${pad(' Algorithm', algoW)}│${pad(' Metric', simW)}│${pad(' Top 1 Result', name1W)}│${pad(' Score', score1W)}│${pad(' Top 2 Result', name2W)}│${pad(' Score', score2W)}│${pad(' Diff', diffW)}│` + ); + console.log(headerSep); + + results.forEach((r, i) => { + const diff = Math.abs(r.top1Score - r.top2Score).toFixed(4); + console.log( + `│${pad(` ${r.algorithm}`, algoW)}│${pad(` ${r.similarity}`, simW)}│${pad(` ${r.top1Name}`, name1W)}│${pad(` ${r.top1Score.toFixed(4)}`, score1W)}│${pad(` ${r.top2Name}`, name2W)}│${pad(` ${r.top2Score.toFixed(4)}`, score2W)}│${pad(` ${diff}`, diffW)}│` + ); + if (i < results.length - 1) { + console.log(rowSep); + } + }); + + console.log(bottomLine); +} + +main().catch(error => { +
console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/select-algorithm.ts b/ai/select-algorithm-typescript/src/select-algorithm.ts new file mode 100644 index 0000000..e082d21 --- /dev/null +++ b/ai/select-algorithm-typescript/src/select-algorithm.ts @@ -0,0 +1,287 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, insertData, printComparisonTable } from './utils.js'; + +// ESM specific features - create __dirname equivalent +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +// Validate required environment variables at startup; exit with code 1 if any are missing +const requiredEnvVars = [ + 'MONGO_CLUSTER_NAME', + 'AZURE_OPENAI_EMBEDDING_ENDPOINT', + 'AZURE_OPENAI_EMBEDDING_MODEL', + 'DATA_FILE_WITH_VECTORS' +]; + +const missing = requiredEnvVars.filter(v => !process.env[v]); +if (missing.length > 0) { + console.error(`Missing required environment variables: ${missing.join(', ')}`); + console.error('See .env.example for required values.'); + process.exit(1); +} + +type Algorithm = 'diskann' | 'hnsw' | 'ivf'; +type Similarity = 'COS' | 'L2' | 'IP'; + +const ALGORITHMS: Algorithm[] = ['diskann', 'hnsw', 'ivf']; +const SIMILARITIES: Similarity[] = ['COS', 'L2', 'IP']; + +const ALGORITHM_LABELS: Record<Algorithm, string> = { + diskann: 'DiskANN', + hnsw: 'HNSW', + ivf: 'IVF', +}; + +// Build the createIndexes command for one vector index; build parameters (maxDegree/lBuild, m/efConstruction, numLists) are fixed per algorithm +function getIndexOptions( + collectionName: string, + indexName: string, + embeddedField: string, + dimensions: number, + algorithm: Algorithm, + similarity: Similarity +) { + const base = { + createIndexes: collectionName, + indexes: [ + { + name: indexName, + key: { [embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: {} as Record<string, unknown>, + }, + ], + }; + + switch (algorithm) { + case 'diskann': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-diskann', + dimensions, +
similarity, + maxDegree: 32, + lBuild: 50, + }; + break; + case 'hnsw': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-hnsw', + dimensions, + similarity, + m: 16, + efConstruction: 64, + }; + break; + case 'ivf': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-ivf', + dimensions, + similarity, + numLists: 1, + }; + break; + } + + return base; +} + +// Build the $search + $project aggregation pipeline, adding the search parameter that matches the algorithm +function getSearchPipeline( + queryEmbedding: number[], + embeddedField: string, + k: number, + algorithm: Algorithm +) { + const cosmosSearch: Record<string, unknown> = { + vector: queryEmbedding, + path: embeddedField, + k, + }; + + // Add algorithm-specific search params (lSearch, efSearch, or nProbes) + switch (algorithm) { + case 'diskann': + cosmosSearch.lSearch = 100; + break; + case 'hnsw': + cosmosSearch.efSearch = 80; + break; + case 'ivf': + cosmosSearch.nProbes = 1; + break; + } + + return [ + { $search: { cosmosSearch } }, + { $project: { score: { $meta: "searchScore" }, document: "$$ROOT" } }, + ]; +} + +/** + * Determine which collections to create/query based on ALGORITHM and SIMILARITY env vars. An empty value or "all" (any case) selects every algorithm/similarity; any other unrecognized value throws. + * Collection naming: hotels_{algorithm}_{similarity} + */ +function getTargetCollections( + algorithmEnv: string, + similarityEnv: string +): Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> { + const algorithms: Algorithm[] = + !algorithmEnv || algorithmEnv.toLowerCase() === 'all' ? ALGORITHMS : [algorithmEnv as Algorithm]; + const similarities: Similarity[] = + !similarityEnv || similarityEnv.toLowerCase() === 'all' ? SIMILARITIES : [similarityEnv as Similarity]; + + const targets: Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> = []; + + for (const alg of algorithms) { + if (!ALGORITHMS.includes(alg)) { + throw new Error(`Invalid ALGORITHM '${alg}'. Must be one of: ${ALGORITHMS.join(', ')}`); + } + for (const sim of similarities) { + if (!SIMILARITIES.includes(sim)) { + throw new Error(`Invalid SIMILARITY '${sim}'.
Must be one of: ${SIMILARITIES.join(', ')}`); + } + targets.push({ + collectionName: `hotels_${alg}_${sim.toLowerCase()}`, + algorithm: alg, + similarity: sim, + }); + } + } + + return targets; +} + +async function main() { + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) { + throw new Error('Azure OpenAI client is not configured. Please check your environment variables.'); + } + if (!dbClient) { + throw new Error('Database client is not configured. Please check your environment variables.'); + } + + const dbName = process.env.AZURE_DOCUMENTDB_DATABASENAME || 'Hotels'; + const embeddedField = process.env.EMBEDDED_FIELD || 'DescriptionVector'; + const embeddingDimensions = parseInt(process.env.EMBEDDING_DIMENSIONS || '1536', 10); + const dataFile = process.env.DATA_FILE_WITH_VECTORS || 'data/Hotels_Vector.json'; + const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const batchSize = parseInt(process.env.LOAD_SIZE_BATCH || '100', 10); + const algorithmEnv = (process.env.ALGORITHM || '').trim().toLowerCase(); + const similarityEnv = (process.env.SIMILARITY || '').trim().toUpperCase(); + const searchQuery = 'quintessential lodging near running trails, eateries, retail'; + + const targets = getTargetCollections(algorithmEnv, similarityEnv); + + console.log(`\n🔬 Vector Algorithm Comparison`); + console.log(` Database: ${dbName}`); + console.log(` Algorithms: ${algorithmEnv || 'all'}`); + console.log(` Similarity: ${similarityEnv || 'all'}`); + console.log(` Collections to query: ${targets.map(t => t.collectionName).join(', ')}`); + console.log(` Search query: "${searchQuery}"\n`); + + await dbClient.connect(); + const db = dbClient.db(dbName); + + // Load data once (shared across collections) + const data = await readFileReturnJson(path.join(__dirname, '..', dataFile)); + + // Generate query embedding once (reuse across collections) + console.log('Generating query embedding...'); + const embeddingResponse = await
aiClient.embeddings.create({ + model: deployment, + input: [searchQuery], + }); + const queryEmbedding = embeddingResponse.data[0].embedding; + if (queryEmbedding.length !== embeddingDimensions) { + throw new Error( + `Embedding dimension mismatch: expected ${embeddingDimensions}, got ${queryEmbedding.length}. ` + + `Verify AZURE_OPENAI_EMBEDDING_MODEL matches the configured EMBEDDING_DIMENSIONS.` + ); + } + console.log(`Query embedding: ${queryEmbedding.length} dimensions\n`); + + const config = { batchSize }; + + const comparisonResults: Array<{ + collectionName: string; + algorithm: string; + similarity: string; + searchResults: any[]; + latencyMs: number; + }> = []; + + for (const target of targets) { + console.log(`\n━━━ ${ALGORITHM_LABELS[target.algorithm]} / ${target.similarity} ━━━`); + console.log(`Collection: ${target.collectionName}`); + + try { + // Create collection (drops existing to ensure clean state) + try { + await db.dropCollection(target.collectionName); + } catch { + // Collection may not exist yet + } + const collection = await db.createCollection(target.collectionName); + console.log('Created collection:', target.collectionName); + + // Insert data + const insertSummary = await insertData(config, collection, data); + console.log(`Inserted: ${insertSummary.inserted}/${insertSummary.total}`); + + // Create vector index + const indexName = `vectorIndex_${target.algorithm}_${target.similarity.toLowerCase()}`; + const indexOptions = getIndexOptions( + target.collectionName, + indexName, + embeddedField, + embeddingDimensions, + target.algorithm, + target.similarity + ); + await db.command(indexOptions); + console.log('Created vector index:', indexName); + + // Run vector search (top 5) and time it; latency covers the aggregate call and fetching the results + console.log('Executing vector search...'); + const startTime = Date.now(); + + const pipeline = getSearchPipeline(queryEmbedding, embeddedField, 5, target.algorithm); + const searchResults = await collection.aggregate(pipeline).toArray(); + + const latencyMs = Date.now()
- startTime; + + comparisonResults.push({ + collectionName: target.collectionName, + algorithm: ALGORITHM_LABELS[target.algorithm], + similarity: target.similarity, + searchResults, + latencyMs, + }); + + console.log(`✓ ${searchResults.length} results, ${latencyMs}ms`); + } catch (error) { + console.error(`✗ Error with ${target.collectionName}:`, (error as Error).message); + } + } + + // Print comparison table (only for the combinations that succeeded) + if (comparisonResults.length > 0) { + printComparisonTable(comparisonResults); + } + } catch (error) { + console.error('App failed:', error); + process.exitCode = 1; + } finally { + console.log('\nClosing database connection...'); + if (dbClient) await dbClient.close(); + console.log('Database connection closed'); + } +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/utils.ts b/ai/select-algorithm-typescript/src/utils.ts new file mode 100644 index 0000000..f10ea77 --- /dev/null +++ b/ai/select-algorithm-typescript/src/utils.ts @@ -0,0 +1,205 @@ +import { Collection, Document, MongoClient, OIDCResponse, OIDCCallbackParams } from 'mongodb'; +import { AzureOpenAI } from 'openai'; +import { promises as fs } from "fs"; +import { AccessToken, DefaultAzureCredential, TokenCredential, getBearerTokenProvider } from '@azure/identity'; + +// Define a type for JSON data +export type JsonData = Record<string, any>; + +export function getConfig() { + return { + dbName: process.env.AZURE_DOCUMENTDB_DATABASENAME || 'Hotels', + dataFile: process.env.DATA_FILE_WITH_VECTORS || 'data/Hotels_Vector.json', + embeddedField: process.env.EMBEDDED_FIELD || 'DescriptionVector', + similarity: process.env.SIMILARITY || '', + embeddingDimensions: parseInt(process.env.EMBEDDING_DIMENSIONS || '1536', 10), + deployment: process.env.AZURE_OPENAI_EMBEDDING_MODEL || 'text-embedding-3-small', + batchSize: parseInt(process.env.LOAD_SIZE_BATCH || '100', 10) + }; +} + +export const AzureIdentityTokenCallback =
async (params: OIDCCallbackParams, credential: TokenCredential): Promise<OIDCResponse> => { + const tokenResponse: AccessToken | null = await credential.getToken(['https://ossrdbms-aad.database.windows.net/.default']); + return { + accessToken: tokenResponse?.token || '', + expiresInSeconds: Math.floor(((tokenResponse?.expiresOnTimestamp || 0) - Date.now()) / 1000) + }; +}; + +export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClient: MongoClient | null } { + let aiClient: AzureOpenAI | null = null; + let dbClient: MongoClient | null = null; + + // Validate all required environment variables upfront + const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!; + const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const clusterName = process.env.MONGO_CLUSTER_NAME!; + + if (!endpoint || !deployment || !clusterName) { + throw new Error('Missing required environment variables: AZURE_OPENAI_EMBEDDING_ENDPOINT, AZURE_OPENAI_EMBEDDING_MODEL, MONGO_CLUSTER_NAME'); + } + + console.log(`Using Azure OpenAI Embedding Deployment/Model: ${deployment}`); + + const credential = new DefaultAzureCredential(); + + // For Azure OpenAI with DefaultAzureCredential + { + const scope = "https://cognitiveservices.azure.com/.default"; + const azureADTokenProvider = getBearerTokenProvider(credential, scope); + aiClient = new AzureOpenAI({ + apiVersion: process.env.AZURE_OPENAI_EMBEDDING_API_VERSION || "2023-05-15", + endpoint, + deployment, + azureADTokenProvider, + timeout: 30000, + maxRetries: 3, + }); + } + + // For DocumentDB with DefaultAzureCredential (uses signed-in user) + { + dbClient = new MongoClient( + `mongodb+srv://${clusterName}.global.mongocluster.cosmos.azure.com/`, { + connectTimeoutMS: 120000, + tls: true, + retryWrites: false, + maxIdleTimeMS: 120000, + authMechanism: 'MONGODB-OIDC', + authMechanismProperties: { + OIDC_CALLBACK: (params: OIDCCallbackParams) => AzureIdentityTokenCallback(params, credential), + ALLOWED_HOSTS: ['*.azure.com'] + } +
} + ); + } + + return { aiClient, dbClient }; +} + +export async function readFileReturnJson(filePath: string): Promise<JsonData[]> { + + console.log(`Reading JSON file from ${filePath}`); + + const fileAsString = await fs.readFile(filePath, "utf-8"); + return JSON.parse(fileAsString); +} + +export async function insertData(config: { batchSize: number }, collection: Collection, data: Document[]) { + console.log(`Processing in batches of ${config.batchSize}...`); + const totalBatches = Math.ceil(data.length / config.batchSize); + + let inserted = 0; + let failed = 0; + + for (let i = 0; i < totalBatches; i++) { + const start = i * config.batchSize; + const end = Math.min(start + config.batchSize, data.length); + const batch = data.slice(start, end); + + try { + const result = await collection.insertMany(batch, { ordered: false }); + inserted += result.insertedCount || 0; + console.log(`Batch ${i + 1} complete: ${result.insertedCount} inserted`); + } catch (error: any) { + if (error?.writeErrors) { + console.error(`Error in batch ${i + 1}: ${error?.writeErrors.length} failures`); + failed += error?.writeErrors.length; + inserted += batch.length - error?.writeErrors.length; + } else { + console.error(`Error in batch ${i + 1}:`, error); + failed += batch.length; + } + } + + // Small pause between batches to reduce resource contention + if (i < totalBatches - 1) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + } + + // Create standard field indexes + const indexColumns = ["HotelId", "Category", "Description", "Description_fr"]; + for (const col of indexColumns) { + const indexSpec: Record<string, number> = {}; + indexSpec[col] = 1; + await collection.createIndex(indexSpec); + } + + return { total: data.length, inserted, failed }; +} + +export function printSearchResults(insertSummary: any, vectorIndexSummary: any, searchResults: Document[]) { + console.log(`\nInsert summary: ${JSON.stringify(insertSummary)}`); + console.log(`Vector index: ${JSON.stringify(vectorIndexSummary)}`); +
+ if (!searchResults || searchResults.length === 0) { + console.log('No search results found.'); + return; + } + + searchResults.forEach((result: Document, index: number) => { + const { document, score } = result; + console.log(`${index + 1}. HotelName: ${document.HotelName}, Score: ${score.toFixed(4)}`); + }); +} + +/** + * Print a side-by-side comparison table of vector search results across collections + */ +export function printComparisonTable( + results: Array<{ + collectionName: string; + algorithm: string; + similarity: string; + searchResults: any[]; + latencyMs: number; + }> +): void { + console.log('\n╔══════════════════════════════════════════════════════════════════════════════════╗'); + console.log('║ Vector Algorithm Comparison Results ║'); + console.log('╠══════════════════════════════════════════════════════════════════════════════════╣'); + + // Header + console.log( + '║ ' + + 'Algorithm'.padEnd(12) + + 'Similarity'.padEnd(14) + + 'Top Result'.padEnd(24) + + 'Score'.padEnd(12) + + 'Latency(ms)'.padEnd(14) + + '║' + ); + console.log('╠══════════════════════════════════════════════════════════════════════════════════╣'); + + for (const r of results) { + const topResult = r.searchResults[0]; + const topName = topResult ? String(topResult.document?.HotelName ?? 'N/A').substring(0, 22) : 'N/A'; + const topScore = topResult ? topResult.score.toFixed(4) : 'N/A'; + + console.log( + '║ ' + + r.algorithm.padEnd(12) + + r.similarity.padEnd(14) + + topName.padEnd(24) + + topScore.padEnd(12) + + r.latencyMs.toFixed(0).padEnd(14) + + '║' + ); + } + + console.log('╚══════════════════════════════════════════════════════════════════════════════════╝'); + + // Detailed results per collection + for (const r of results) { + console.log(`\n--- ${r.algorithm} / ${r.similarity} (${r.collectionName}) ---`); + if (r.searchResults.length === 0) { + console.log(' No results.'); + continue; + } + r.searchResults.forEach((item: Document, i: number) => { + console.log(` ${i + 1}.
${item.document.HotelName}, Score: ${item.score.toFixed(4)}`); + }); + console.log(` Latency: ${r.latencyMs.toFixed(0)}ms`); + } +} diff --git a/ai/select-algorithm-typescript/tsconfig.json b/ai/select-algorithm-typescript/tsconfig.json new file mode 100644 index 0000000..3cb9aaa --- /dev/null +++ b/ai/select-algorithm-typescript/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "NodeNext", + "moduleResolution": "nodenext", + "declaration": true, + "outDir": "./dist", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "noImplicitAny": false, + "forceConsistentCasingInFileNames": true, + "sourceMap": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} From 7a8d212555fb5936a4734c85ecc7e68e83b251b6 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Fri, 15 May 2026 12:34:46 -0700 Subject: [PATCH 02/11] Implement Khelan's PM feedback for article 2/3 quickstarts - Rename MONGO_CLUSTER_NAME to DOCUMENTDB_CLUSTER_NAME in all 5 language samples - Add DOCUMENTDB_CLUSTER_NAME dual-output in Bicep (preserves backward compat) - Replace Data Explorer cleanup guidance with VS Code extension - Strengthen algorithm guidance: DiskANN recommended for enterprise (16K dims, disk-based) - Remove python-dotenv from pip install (repo rule #10) - Fix Python filename refs (select_algorithm.py -> compare_all.py) - Revert out-of-scope vector-search-* changes to origin/main Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/quickstart.md | 72 ++++++++-------- ai/select-algorithm-go/README.md | 6 +- ai/select-algorithm-go/quickstart.md | 48 ++++++----- ai/select-algorithm-go/src/utils.go | 4 +- ai/select-algorithm-java/README.md | 4 +- ai/select-algorithm-java/quickstart.md | 79 +++++++++-------- .../documentdb/selectalgorithm/Utils.java | 4 +- ai/select-algorithm-python/README.md | 2 +- 
ai/select-algorithm-python/quickstart.md | 84 ++++++++++--------- ai/select-algorithm-python/src/utils.py | 4 +- ai/select-algorithm-typescript/README.md | 2 +- ai/select-algorithm-typescript/quickstart.md | 59 +++++++------ .../src/select-algorithm.ts | 2 +- ai/select-algorithm-typescript/src/utils.ts | 4 +- .../com/azure/documentdb/samples/DiskAnn.java | 2 +- .../com/azure/documentdb/samples/HNSW.java | 2 +- .../com/azure/documentdb/samples/IVF.java | 2 +- ai/vector-search-typescript/src/utils.ts | 2 +- infra/main.bicep | 1 + 19 files changed, 204 insertions(+), 179 deletions(-) diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md index d0b761b..81f9bf2 100644 --- a/ai/select-algorithm-dotnet/quickstart.md +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -149,7 +149,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ ```bash export AZURE_OPENAI_EMBEDDING_ENDPOINT="https://.openai.azure.com" export AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-small" - export MONGO_CLUSTER_NAME="" + export DOCUMENTDB_CLUSTER_NAME="" export AZURE_TENANT_ID="" export DATA_FILE_WITH_VECTORS="../../data/Hotels_Vector.json" ``` @@ -159,7 +159,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ ```powershell $env:AZURE_OPENAI_EMBEDDING_ENDPOINT="https://.openai.azure.com" $env:AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-small" - $env:MONGO_CLUSTER_NAME="" + $env:DOCUMENTDB_CLUSTER_NAME="" $env:AZURE_TENANT_ID="" $env:DATA_FILE_WITH_VECTORS="../../data/Hotels_Vector.json" ``` @@ -466,41 +466,40 @@ IVF COS Historic Downtown Inn 0.8342 52 Latency: 52ms ``` -## Understanding the results +### Choosing the right algorithm -Use this guidance to choose the right vector search algorithm for your workload: +Use this comparison to select the best algorithm for your workload: -| Algorithm | Best for | Index creation | Search speed | Memory usage | Accuracy | 
-|-----------|----------|---------------|--------------|--------------|----------| -| **DiskANN** | Large datasets, disk-based storage | Slow | Fast | Low (disk-based) | High | -| **HNSW** | Real-time search, high throughput | Medium | Fastest | High (memory-intensive) | Very high | -| **IVF** | Cost-sensitive, approximate search | Fast | Medium | Low | Medium | +**IVF** (inverted file index): +- Best for: Test environments, demos, and small clusters +- Pros: Fast to build, low resource requirements, works on any cluster tier +- Cons: Lower recall compared to graph-based algorithms at scale +- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall -### Similarity functions +**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: +- Best for: Enterprise production workloads on M30+ clusters +- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover +- Cons: Requires M30+ cluster tier +- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall -| Function | Formula | Best for | -|----------|---------|----------| -| **COS** (Cosine) | Angle between vectors | Text embeddings, normalized vectors | -| **L2** (Euclidean) | Distance between points | Image embeddings, coordinate data | -| **IP** (Inner Product) | Dot product | Recommendation systems, unnormalized data | +**HNSW** (hierarchical navigable small world): +- Best for: Enterprise production workloads on M30+ clusters requiring highest recall +- Pros: Excellent recall, fast queries +- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage +- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall -### Tuning 
parameters +> [!TIP] +> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. -Each algorithm has tuning parameters that control the accuracy/performance tradeoff: +### Choosing the right similarity function -**DiskANN:** -- `maxDegree`: Higher values (20-64) improve accuracy but increase memory -- `lBuild`: Higher values (10-100) improve index quality but slow build time -- `lSearch`: Higher values (100-200) improve search accuracy but slow queries +The similarity function should match your embedding model and use case: -**HNSW:** -- `m`: Higher values (16-48) improve accuracy but increase memory -- `efConstruction`: Higher values (64-200) improve index quality but slow build time -- `efSearch`: Higher values (80-200) improve search accuracy but slow queries +- **COS (Cosine similarity)**: Best for text embeddings and most OpenAI models. Measures angle between vectors (range: -1 to 1, higher is more similar) +- **L2 (Euclidean distance)**: Measures straight-line distance between vectors (lower is more similar). Good for spatial data +- **IP (Inner product)**: Measures alignment between vectors. Good when vector magnitudes are meaningful -**IVF:** -- `numLists`: More lists improve speed but may reduce accuracy -- `nProbes`: Higher values (1-10) improve accuracy but slow queries +For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. 
## Troubleshooting @@ -515,26 +514,31 @@ Each algorithm has tuning parameters that control the accuracy/performance trade ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure portal. +When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. ### [mongosh](#tab/mongosh) Connect to your DocumentDB cluster and drop the database: ```bash -mongosh "" +mongosh "mongodb+srv://<cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript use Hotels db.dropDatabase() ``` -### [Azure portal](#tab/portal) +### [VS Code extension](#tab/vscode) -1. Navigate to your DocumentDB resource in the Azure portal. -2. Select **Data Explorer**. -3. Right-click the **Hotels** database and select **Delete Database**. +1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. --- +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources. + ## Related content - [Vector search overview](./vector-search.md) diff --git a/ai/select-algorithm-go/README.md b/ai/select-algorithm-go/README.md index a76ab08..f03828e 100644 --- a/ai/select-algorithm-go/README.md +++ b/ai/select-algorithm-go/README.md @@ -34,7 +34,7 @@ This sample demonstrates how to compare different vector search algorithms (IVF, Required variables: ```env - MONGO_CLUSTER_NAME=your-cluster-name + DOCUMENTDB_CLUSTER_NAME=your-cluster-name AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small AZURE_DOCUMENTDB_DATABASENAME=Hotels @@ -112,7 +112,7 @@ go run ./src/...
| Variable | Default | Description | |--------------|----------------------------------|---------------------------------| -| `MONGO_CLUSTER_NAME` | *(required)* | DocumentDB cluster name | +| `DOCUMENTDB_CLUSTER_NAME` | *(required)* | DocumentDB cluster name | | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | *(required)* | Azure OpenAI endpoint | | `AZURE_OPENAI_EMBEDDING_MODEL` | `text-embedding-3-small` | Embedding model name | | `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Database name | @@ -180,7 +180,7 @@ The MongoDB OIDC auth uses the `https://ossrdbms-aad.database.windows.net/.defau **"OIDC authentication failed"** - Run `az login` and ensure you're authenticated - Verify your Azure identity has RBAC permissions on the DocumentDB cluster -- Check that `MONGO_CLUSTER_NAME` matches your cluster name +- Check that `DOCUMENTDB_CLUSTER_NAME` matches your cluster name **"DiskANN indexes require a higher cluster tier"** - DiskANN requires M40+ cluster tier diff --git a/ai/select-algorithm-go/quickstart.md b/ai/select-algorithm-go/quickstart.md index c9db665..4f8ee20 100644 --- a/ai/select-algorithm-go/quickstart.md +++ b/ai/select-algorithm-go/quickstart.md @@ -167,7 +167,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ LOAD_SIZE_BATCH=100 # DocumentDB Configuration - MONGO_CLUSTER_NAME=your-cluster-name + DOCUMENTDB_CLUSTER_NAME=your-cluster-name # Algorithm Selection # ALGORITHM: "all" | "diskann" | "hnsw" | "ivf" @@ -183,7 +183,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL - - `MONGO_CLUSTER_NAME`: Your Azure DocumentDB cluster name (not the full connection string, just the name) + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name (not the full connection string, just the 
name) Verify the `.env` file was created: @@ -392,23 +392,26 @@ The comparison table shows how different algorithms perform on the same dataset Use this comparison to select the best algorithm for your workload: -**DiskANN** (disk-based approximate nearest neighbor): -- Best for: Large datasets that don't fit in memory -- Pros: Memory efficient, good recall with high dimensions -- Cons: Requires disk I/O, slower build time +**IVF** (inverted file index): +- Best for: Test environments, demos, and small clusters +- Pros: Fast to build, low resource requirements, works on any cluster tier +- Cons: Lower recall compared to graph-based algorithms at scale +- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall + +**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: +- Best for: Enterprise production workloads on M30+ clusters +- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover +- Cons: Requires M30+ cluster tier - Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall **HNSW** (hierarchical navigable small world): -- Best for: High-speed queries with excellent recall -- Pros: Fastest queries, excellent recall, stable performance -- Cons: Higher memory usage than DiskANN +- Best for: Enterprise production workloads on M30+ clusters requiring highest recall +- Pros: Excellent recall, fast queries +- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage - Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall -**IVF** (inverted file index): -- Best for: Large datasets with good clustering properties -- Pros: Fast queries, low memory overhead -- Cons: Recall 
depends on `numLists` and `nProbes` tuning -- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall +> [!TIP] +> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. ### Choosing the right similarity function @@ -468,26 +471,31 @@ SIMILARITY=COS ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure portal. +When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. ### [mongosh](#tab/mongosh) Connect to your DocumentDB cluster and drop the database: ```bash -mongosh "" +mongosh "mongodb+srv://<cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript use Hotels db.dropDatabase() ``` -### [Azure portal](#tab/portal) +### [VS Code extension](#tab/vscode) -1. Navigate to your DocumentDB resource in the Azure portal -2. Select **Data Explorer** -3. Right-click the **Hotels** database and select **Delete Database** +1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. --- +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources.
+ ## Related content - [Vector search overview](./vector-search.md) diff --git a/ai/select-algorithm-go/src/utils.go b/ai/select-algorithm-go/src/utils.go index 8b415db..aa099f5 100644 --- a/ai/select-algorithm-go/src/utils.go +++ b/ai/select-algorithm-go/src/utils.go @@ -51,7 +51,7 @@ func LoadConfig() *Config { batchSize, _ := strconv.Atoi(getEnvOrDefault("LOAD_SIZE_BATCH", "100")) return &Config{ - ClusterName: getEnvOrDefault("MONGO_CLUSTER_NAME", ""), + ClusterName: getEnvOrDefault("DOCUMENTDB_CLUSTER_NAME", ""), DatabaseName: getEnvOrDefault("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"), DataFile: getEnvOrDefault("DATA_FILE_WITH_VECTORS", "data/Hotels_Vector.json"), VectorField: getEnvOrDefault("EMBEDDED_FIELD", "DescriptionVector"), @@ -74,7 +74,7 @@ func getEnvOrDefault(key, defaultValue string) string { // GetClientsPasswordless creates MongoDB and Azure OpenAI clients with passwordless authentication func GetClientsPasswordless(ctx context.Context, config *Config) (*mongo.Client, openai.Client, error) { if config.ClusterName == "" { - return nil, openai.Client{}, fmt.Errorf("MONGO_CLUSTER_NAME environment variable is required") + return nil, openai.Client{}, fmt.Errorf("DOCUMENTDB_CLUSTER_NAME environment variable is required") } // Create Azure credential diff --git a/ai/select-algorithm-java/README.md b/ai/select-algorithm-java/README.md index d84bd48..2449f40 100644 --- a/ai/select-algorithm-java/README.md +++ b/ai/select-algorithm-java/README.md @@ -29,7 +29,7 @@ This sample demonstrates how to compare all three vector search index algorithms ``` 2. 
Update `.env` with your Azure resource details (if not using `azd`): - - `MONGO_CLUSTER_NAME` — your DocumentDB cluster name + - `DOCUMENTDB_CLUSTER_NAME` — your DocumentDB cluster name - `AZURE_OPENAI_EMBEDDING_ENDPOINT` — your Azure OpenAI endpoint - `AZURE_OPENAI_EMBEDDING_MODEL` — deployment name (e.g., `text-embedding-3-small`) - `DATA_FILE_WITH_VECTORS` — path to the pre-computed vectors JSON file @@ -78,7 +78,7 @@ $env:ALGORITHM="compare"; mvn exec:java | Variable | Default | Description | |----------|---------|-------------| -| `MONGO_CLUSTER_NAME` | (required) | DocumentDB cluster name | +| `DOCUMENTDB_CLUSTER_NAME` | (required) | DocumentDB cluster name | | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | (required) | Azure OpenAI endpoint | | `AZURE_OPENAI_EMBEDDING_MODEL` | (required) | Embedding model deployment name | | `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to vectors JSON file | diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md index f90bd8b..fdd3c7b 100644 --- a/ai/select-algorithm-java/quickstart.md +++ b/ai/select-algorithm-java/quickstart.md @@ -206,7 +206,7 @@ This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure D ```bash # Azure DocumentDB cluster name for passwordless authentication - MONGO_CLUSTER_NAME= + DOCUMENTDB_CLUSTER_NAME= # Azure managed identity principal ID for authentication AZURE_MANAGED_IDENTITY_PRINCIPAL_ID= @@ -232,7 +232,7 @@ This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure D Replace the placeholder values with your Azure resource information: - - `MONGO_CLUSTER_NAME`: Your Azure DocumentDB cluster name + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name - `AZURE_MANAGED_IDENTITY_PRINCIPAL_ID`: Your managed identity principal ID - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL @@ -447,48 +447,40 @@ IVF L2 58.90 ## Understanding the results -### Algorithm characteristics +### 
Choosing the right algorithm -**DiskANN** - Disk-based approximate nearest neighbor search -- Good balance of speed and accuracy -- Suitable for large datasets that don't fit in memory -- Parameters: `maxDegree=32` (graph connectivity), `lBuild=50` (build quality), `lSearch=100` (query accuracy) +Use this comparison to select the best algorithm for your workload: -**HNSW** - Hierarchical Navigable Small World -- Memory-based hierarchical graph -- Excellent for real-time applications requiring low latency -- Parameters: `m=16` (connections per layer), `efConstruction=64` (build quality), `efSearch=80` (query accuracy) +**IVF** (inverted file index): +- Best for: Test environments, demos, and small clusters +- Pros: Fast to build, low resource requirements, works on any cluster tier +- Cons: Lower recall compared to graph-based algorithms at scale +- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall -**IVF** - Inverted File Index -- Cluster-based partitioning approach -- Fast search via centroid comparison -- Parameters: `numLists=1` (number of clusters), `nProbes=1` (clusters to search) +**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: +- Best for: Enterprise production workloads on M30+ clusters +- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover +- Cons: Requires M30+ cluster tier +- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall -### Similarity functions +**HNSW** (hierarchical navigable small world): +- Best for: Enterprise production workloads on M30+ clusters requiring highest recall +- Pros: Excellent recall, fast queries +- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage +- 
Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall -**COS (Cosine)** - Measures angle between vectors -- Best for text embeddings (like those from OpenAI models) -- Scale-invariant (ignores vector magnitude) -- Range: -1 to 1 (1 = identical direction) +> [!TIP] +> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. -**L2 (Euclidean)** - Measures straight-line distance -- Sensitive to vector magnitude -- Good for embeddings where scale matters -- Range: 0 to infinity (0 = identical) +### Choosing the right similarity function -**IP (Inner Product)** - Dot product of vectors -- Fast to compute -- Can be used with normalized vectors -- Range: -infinity to infinity +The similarity function should match your embedding model and use case: -### Choosing the right configuration +- **COS (Cosine similarity)**: Best for text embeddings and most OpenAI models. Measures angle between vectors (range: -1 to 1, higher is more similar) +- **L2 (Euclidean distance)**: Measures straight-line distance between vectors (lower is more similar). Good for spatial data +- **IP (Inner product)**: Measures alignment between vectors. Good when vector magnitudes are meaningful -Use the comparison results to guide your selection: - -1. **For real-time applications**: Choose HNSW if latency is critical -2. **For large datasets**: Choose DiskANN if your data exceeds available memory -3. **For fast batch processing**: Choose IVF if you can tolerate slightly lower accuracy -4. 
**For text embeddings**: Use COS similarity function (most common with OpenAI embeddings) +For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. ## Troubleshooting @@ -501,26 +493,31 @@ Use the comparison results to guide your selection: ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure portal. +When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. ### [mongosh](#tab/mongosh) Connect to your DocumentDB cluster and drop the database: ```bash -mongosh "" +mongosh "mongodb+srv://<cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript use Hotels db.dropDatabase() ``` -### [Azure portal](#tab/portal) +### [VS Code extension](#tab/vscode) -1. Navigate to your DocumentDB resource in the Azure portal -2. Select **Data Explorer** -3. Right-click the **Hotels** database and select **Delete Database** +1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. --- +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources.
+ ## Related content - [Vector search overview](./vector-search.md) diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java index 8ed19d0..b8b761e 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java @@ -34,9 +34,9 @@ public static String getEnv(String key) { } public static MongoClient getMongoClient() { - String clusterName = getEnv("MONGO_CLUSTER_NAME"); + String clusterName = getEnv("DOCUMENTDB_CLUSTER_NAME"); if (clusterName == null) { - throw new IllegalStateException("MONGO_CLUSTER_NAME environment variable is required"); + throw new IllegalStateException("DOCUMENTDB_CLUSTER_NAME environment variable is required"); } String connectionUri = String.format( diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md index 3393ce5..1fe7746 100644 --- a/ai/select-algorithm-python/README.md +++ b/ai/select-algorithm-python/README.md @@ -80,7 +80,7 @@ The script creates a single `hotels` collection, loads data once, then for each | Variable | Default | Description | |----------|---------|-------------| -| `MONGO_CLUSTER_NAME` | (required) | DocumentDB cluster name | +| `DOCUMENTDB_CLUSTER_NAME` | (required) | DocumentDB cluster name | | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | (required) | Azure OpenAI endpoint | | `AZURE_OPENAI_EMBEDDING_MODEL` | (required) | Embedding model deployment name | | `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to vectors JSON file | diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md index fba490e..536c654 100644 --- a/ai/select-algorithm-python/quickstart.md +++ b/ai/select-algorithm-python/quickstart.md @@ -104,13 +104,12 @@ Find the [sample 
code](https://github.com/Azure-Samples/documentdb-samples/tree/ 3. Install the required packages: ```bash - pip install "pymongo>=4.7" openai==1.55.3 azure-identity==1.15.0 python-dotenv==1.0.0 + pip install "pymongo>=4.7" openai==1.55.3 azure-identity==1.15.0 ``` - `pymongo`: MongoDB driver for Python (≥4.7 required for OIDC authentication) - `openai`: OpenAI client library to create vectors - `azure-identity`: Azure Identity library for passwordless authentication - - `python-dotenv`: Environment variable management from .env files Verify the packages are installed: @@ -145,12 +144,12 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ LOAD_SIZE_BATCH=100 # Azure DocumentDB Connection Settings - MONGO_CLUSTER_NAME= + DOCUMENTDB_CLUSTER_NAME= # Azure DocumentDB Database Name AZURE_DOCUMENTDB_DATABASENAME=Hotels - # Algorithm Selection (used by select_algorithm.py) + # Algorithm Selection (used by compare_all.py) # ALGORITHM: "all" | "diskann" | "hnsw" | "ivf" ALGORITHM=all @@ -161,7 +160,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL - - `MONGO_CLUSTER_NAME`: Your Azure DocumentDB cluster name + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate Python apps to Azure services by using the Azure SDK for Python](/azure/developer/python/sdk/authentication/overview). 
@@ -192,7 +191,7 @@ Create the following project structure: │ └── Hotels_Vector.json # Hotel data with vector embeddings └── select-algorithm/ ├── src/ - │ ├── select_algorithm.py # Main comparison script + │ ├── compare_all.py # Main comparison script │ └── utils.py # Shared utility functions └── .env # Environment variables ``` @@ -246,7 +245,7 @@ The utilities provide essential functions for: Execute the comparison script to test all algorithms with cosine similarity: ```bash -python src/select_algorithm.py +python src/compare_all.py ``` The output shows the comparison across all three algorithms: @@ -342,34 +341,34 @@ To override environment variables at the command line: ```bash # Test only DiskANN across all similarity functions -ALGORITHM=diskann SIMILARITY=all python src/select_algorithm.py +ALGORITHM=diskann SIMILARITY=all python src/compare_all.py ``` ```bash # Test all algorithms with L2 distance -ALGORITHM=all SIMILARITY=L2 python src/select_algorithm.py +ALGORITHM=all SIMILARITY=L2 python src/compare_all.py ``` ```bash # Test HNSW with inner product -ALGORITHM=hnsw SIMILARITY=IP python src/select_algorithm.py +ALGORITHM=hnsw SIMILARITY=IP python src/compare_all.py ``` ### [PowerShell](#tab/powershell) ```powershell # Test only DiskANN across all similarity functions -$env:ALGORITHM="diskann"; $env:SIMILARITY="all"; python src/select_algorithm.py +$env:ALGORITHM="diskann"; $env:SIMILARITY="all"; python src/compare_all.py ``` ```powershell # Test all algorithms with L2 distance -$env:ALGORITHM="all"; $env:SIMILARITY="L2"; python src/select_algorithm.py +$env:ALGORITHM="all"; $env:SIMILARITY="L2"; python src/compare_all.py ``` ```powershell # Test HNSW with inner product -$env:ALGORITHM="hnsw"; $env:SIMILARITY="IP"; python src/select_algorithm.py +$env:ALGORITHM="hnsw"; $env:SIMILARITY="IP"; python src/compare_all.py ``` --- @@ -385,33 +384,40 @@ The comparison table helps you choose the best configuration for your workload: - **Score**: Similarity score 
using the selected function. Higher scores indicate better matches. - **Top Result**: The highest-scoring hotel for the query. Consistency across algorithms indicates stable results. -Algorithm selection guidelines: +### Choosing the right algorithm -- **DiskANN**: Best for large datasets where memory is limited. Stores index on disk while maintaining good performance. -- **HNSW**: Best for high-accuracy requirements and fast search. Requires more memory but provides excellent recall. -- **IVF**: Best for very large datasets where some recall can be traded for speed. Uses clustering for efficient search. +Use this comparison to select the best algorithm for your workload: -Similarity function selection: +**IVF** (inverted file index): +- Best for: Test environments, demos, and small clusters +- Pros: Fast to build, low resource requirements, works on any cluster tier +- Cons: Lower recall compared to graph-based algorithms at scale +- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall -- **COS (Cosine)**: Best for text embeddings. Normalizes vectors and measures angle between them. -- **L2 (Euclidean)**: Measures straight-line distance. Sensitive to vector magnitude. -- **IP (Inner Product)**: Dot product similarity. Useful when vector magnitude is meaningful. 
+**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: +- Best for: Enterprise production workloads on M30+ clusters +- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover +- Cons: Requires M30+ cluster tier +- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall -Tuning parameters: +**HNSW** (hierarchical navigable small world): +- Best for: Enterprise production workloads on M30+ clusters requiring highest recall +- Pros: Excellent recall, fast queries +- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage +- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall -DiskANN tuning: -- `maxDegree`: Higher values improve accuracy but increase memory usage (default: 32) -- `lBuild`: Higher values improve index quality but slow down index creation (default: 50) -- `lSearch`: Higher values improve recall but slow down queries (default: 100) +> [!TIP] +> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. -HNSW tuning: -- `m`: Number of connections per layer. Higher improves recall (default: 16) -- `efConstruction`: Candidates during build. Higher improves quality (default: 64) -- `efSearch`: Candidates during search. Higher improves recall (default: 80) +### Choosing the right similarity function -IVF tuning: -- `numLists`: Number of clusters. 
Higher speeds up search but may reduce recall (default: 1) -- `nProbes`: Clusters searched at query time. Higher improves recall but slows queries (default: 1) +The similarity function should match your embedding model and use case: + +- **COS (Cosine similarity)**: Best for text embeddings and most OpenAI models. Measures angle between vectors (range: -1 to 1, higher is more similar) +- **L2 (Euclidean distance)**: Measures straight-line distance between vectors (lower is more similar). Good for spatial data +- **IP (Inner product)**: Measures alignment between vectors. Good when vector magnitudes are meaningful + +For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. ## Troubleshooting @@ -425,14 +431,14 @@ IVF tuning: ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure portal. +When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. ### [mongosh](#tab/mongosh) Connect to your DocumentDB cluster and drop the database: ```bash -mongosh "mongodb+srv://.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +mongosh "mongodb+srv://<cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC ``` ```javascript @@ -440,11 +446,11 @@ use Hotels db.dropDatabase() ``` -### [Azure portal](#tab/portal) +### [VS Code extension](#tab/vscode) -1. Navigate to your DocumentDB resource in the Azure portal. -2. Select **Data Explorer**. -3. Right-click the **Hotels** database and select **Delete Database**. +1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**.
--- diff --git a/ai/select-algorithm-python/src/utils.py b/ai/select-algorithm-python/src/utils.py index ee905f7..b27e217 100644 --- a/ai/select-algorithm-python/src/utils.py +++ b/ai/select-algorithm-python/src/utils.py @@ -30,9 +30,9 @@ def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: def get_clients_passwordless() -> Tuple[MongoClient, AzureOpenAI]: """Create MongoDB and Azure OpenAI clients using passwordless auth.""" - cluster_name = os.getenv("MONGO_CLUSTER_NAME") + cluster_name = os.getenv("DOCUMENTDB_CLUSTER_NAME") if not cluster_name: - raise ValueError("MONGO_CLUSTER_NAME environment variable is required") + raise ValueError("DOCUMENTDB_CLUSTER_NAME environment variable is required") credential = DefaultAzureCredential() diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md index b9140cb..73414a8 100644 --- a/ai/select-algorithm-typescript/README.md +++ b/ai/select-algorithm-typescript/README.md @@ -41,7 +41,7 @@ Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB using | Variable | Description | |---|---| - | `MONGO_CLUSTER_NAME` | Your DocumentDB cluster name | + | `DOCUMENTDB_CLUSTER_NAME` | Your DocumentDB cluster name | | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | | `AZURE_OPENAI_EMBEDDING_MODEL` | Embedding model deployment name | | `AZURE_OPENAI_EMBEDDING_API_VERSION` | Azure OpenAI API version | diff --git a/ai/select-algorithm-typescript/quickstart.md b/ai/select-algorithm-typescript/quickstart.md index b5e9738..35b9aa7 100644 --- a/ai/select-algorithm-typescript/quickstart.md +++ b/ai/select-algorithm-typescript/quickstart.md @@ -183,7 +183,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ LOAD_SIZE_BATCH=100 # Azure DocumentDB Connection Settings - MONGO_CLUSTER_NAME= + DOCUMENTDB_CLUSTER_NAME= # Azure DocumentDB Database Name AZURE_DOCUMENTDB_DATABASENAME=Hotels @@ -217,7 +217,7 @@ Find the [sample 
code](https://github.com/Azure-Samples/documentdb-samples/tree/ For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL - - `MONGO_CLUSTER_NAME`: Your Azure DocumentDB cluster name + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate JavaScript apps to Azure services using the Azure SDK for JavaScript](/azure/developer/javascript/sdk/authentication/overview). @@ -370,15 +370,32 @@ The comparison table demonstrates key behaviors of vector search in DocumentDB: - **Score separation (Diff column)** shows confidence. A larger positive diff means the search clearly distinguishes the best match from the second-best. This metric helps evaluate result quality regardless of the absolute score values. 
-Algorithm selection guidelines for production: +### Choosing the right algorithm -| Algorithm | Best for | Tradeoff | -|-----------|----------|----------| -| **DiskANN** | Large datasets (millions+) | Stores index on disk, lower memory | -| **HNSW** | High-accuracy requirements | More memory, excellent recall | -| **IVF** | Very large datasets with limited memory | Faster build, possible recall reduction | +Use this comparison to select the best algorithm for your workload: -Similarity function selection: +**IVF** (inverted file index): +- Best for: Test environments, demos, and small clusters +- Pros: Fast to build, low resource requirements, works on any cluster tier +- Cons: Lower recall compared to graph-based algorithms at scale +- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall + +**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: +- Best for: Enterprise production workloads on M30+ clusters +- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover +- Cons: Requires M30+ cluster tier +- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall + +**HNSW** (hierarchical navigable small world): +- Best for: Enterprise production workloads on M30+ clusters requiring highest recall +- Pros: Excellent recall, fast queries +- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage +- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall + +> [!TIP] +> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. 
DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. + +### Choosing the right similarity function | Function | Score meaning | Best for | |----------|-------------|----------| @@ -386,15 +403,7 @@ Similarity function selection: | **L2 (Euclidean)** | Lower = more similar (distance) | When magnitude matters | | **IP (Inner Product)** | Higher = more similar | Equivalent to COS for normalized vectors | -Tuning parameters affect the recall/latency tradeoff at both index build and query time: - -| Algorithm | Build parameters | Search parameters | -|-----------|-----------------|-------------------| -| **DiskANN** | `maxDegree` (32), `lBuild` (50) | `lSearch` (100) | -| **HNSW** | `m` (16), `efConstruction` (64) | `efSearch` (80) | -| **IVF** | `numLists` (1) | `nProbes` (1) | - -Higher build values improve index quality but slow creation. Higher search values improve recall but increase latency. +For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. ## Troubleshooting @@ -409,14 +418,14 @@ Higher build values improve index quality but slow creation. Higher search value ## Clean up resources -When you're done, you can remove the database using `mongosh` or the Azure portal. +When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. 
### [mongosh](#tab/mongosh) Connect to your DocumentDB cluster and drop the database: ```bash -mongosh "mongodb+srv://.mongocluster.cosmos.azure.com/" --authenticationMechanism MONGODB-OIDC +mongosh "mongodb+srv://<cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC ``` ```javascript @@ -424,15 +433,15 @@ use Hotels db.dropDatabase() ``` -### [Azure portal](#tab/portal) +### [VS Code extension](#tab/vscode) -1. Navigate to your DocumentDB resource in the Azure portal. -2. Select **Data Explorer**. -3. Right-click the **Hotels** database and select **Delete Database**. +1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. --- -If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group to remove all associated resources. +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources.
## Related content diff --git a/ai/select-algorithm-typescript/src/select-algorithm.ts b/ai/select-algorithm-typescript/src/select-algorithm.ts index e082d21..8596448 100644 --- a/ai/select-algorithm-typescript/src/select-algorithm.ts +++ b/ai/select-algorithm-typescript/src/select-algorithm.ts @@ -9,7 +9,7 @@ const __dirname = dirname(__filename); // Validate required environment variables at startup const requiredEnvVars = [ - 'MONGO_CLUSTER_NAME', + 'DOCUMENTDB_CLUSTER_NAME', 'AZURE_OPENAI_EMBEDDING_ENDPOINT', 'AZURE_OPENAI_EMBEDDING_MODEL', 'DATA_FILE_WITH_VECTORS' diff --git a/ai/select-algorithm-typescript/src/utils.ts b/ai/select-algorithm-typescript/src/utils.ts index f10ea77..d2ddcf7 100644 --- a/ai/select-algorithm-typescript/src/utils.ts +++ b/ai/select-algorithm-typescript/src/utils.ts @@ -33,10 +33,10 @@ export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClie // Validate all required environment variables upfront const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!; const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; - const clusterName = process.env.MONGO_CLUSTER_NAME!; + const clusterName = process.env.DOCUMENTDB_CLUSTER_NAME!; if (!endpoint || !deployment || !clusterName) { - throw new Error('Missing required environment variables: AZURE_OPENAI_EMBEDDING_ENDPOINT, AZURE_OPENAI_EMBEDDING_MODEL, MONGO_CLUSTER_NAME'); + throw new Error('Missing required environment variables: AZURE_OPENAI_EMBEDDING_ENDPOINT, AZURE_OPENAI_EMBEDDING_MODEL, DOCUMENTDB_CLUSTER_NAME'); } console.log(`Using Azure OpenAI Embedding Deployment/Model: ${deployment}`); diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java index 676630b..59dcc76 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java @@ 
-94,7 +94,7 @@ private MongoClient createMongoClient() { .withMechanismProperty("OIDC_CALLBACK", callback); var connectionString = new ConnectionString( - String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + String.format("mongodb+srv://%s@%s.global.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", managedIdentityPrincipalId, clusterName) ); diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java index 146fc27..e077d91 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java @@ -94,7 +94,7 @@ private MongoClient createMongoClient() { .withMechanismProperty("OIDC_CALLBACK", callback); var connectionString = new ConnectionString( - String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + String.format("mongodb+srv://%s@%s.global.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", managedIdentityPrincipalId, clusterName) ); diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java index e800107..7962115 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java @@ -94,7 +94,7 @@ private MongoClient createMongoClient() { .withMechanismProperty("OIDC_CALLBACK", callback); var connectionString = new ConnectionString( - 
String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + String.format("mongodb+srv://%s@%s.global.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", managedIdentityPrincipalId, clusterName) ); diff --git a/ai/vector-search-typescript/src/utils.ts b/ai/vector-search-typescript/src/utils.ts index 1e4abe9..a7fe3bb 100644 --- a/ai/vector-search-typescript/src/utils.ts +++ b/ai/vector-search-typescript/src/utils.ts @@ -80,7 +80,7 @@ export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClie // For DocumentDB with DefaultAzureCredential (uses signed-in user) { dbClient = new MongoClient( - `mongodb+srv://${clusterName}.mongocluster.cosmos.azure.com/`, { + `mongodb+srv://${clusterName}.global.mongocluster.cosmos.azure.com/`, { connectTimeoutMS: 120000, tls: true, retryWrites: false, diff --git a/infra/main.bicep b/infra/main.bicep index 8e6778d..f2c6cfb 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -254,6 +254,7 @@ output AZURE_DOCUMENTDB_DATABASENAME string = databaseName output AZURE_DOCUMENTDB_COLLECTION string = collectionName output AZURE_DOCUMENTDB_INDEX_NAME string = indexName output MONGO_CLUSTER_NAME string = documentDbCluster.outputs.clusterName +output DOCUMENTDB_CLUSTER_NAME string = documentDbCluster.outputs.clusterName output AZURE_DOCUMENTDB_ADMIN_USERNAME string = documentDbAdminUsername // Configuration for embedding creation and vector search From 8588776fc941d77be214aac36d419344c7c10357 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 09:22:33 -0700 Subject: [PATCH 03/11] fix: standardize VS Code extension name across quickstarts Use 'Azure Databases extension' consistently in all 5 language quickstarts to match the actual marketplace listing (ms-azuretools.vscode-cosmosdb). 
The section intro previously said 'Azure DocumentDB extension' while the link tab already used the correct 'Azure Databases extension' name. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/quickstart.md | 2 +- ai/select-algorithm-go/quickstart.md | 2 +- ai/select-algorithm-java/quickstart.md | 2 +- ai/select-algorithm-python/quickstart.md | 2 +- ai/select-algorithm-typescript/quickstart.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md index 81f9bf2..d27278f 100644 --- a/ai/select-algorithm-dotnet/quickstart.md +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -514,7 +514,7 @@ For the `text-embedding-3-small` model used in this quickstart, **COS (cosine si ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. ### [mongosh](#tab/mongosh) diff --git a/ai/select-algorithm-go/quickstart.md b/ai/select-algorithm-go/quickstart.md index 4f8ee20..4fd84f7 100644 --- a/ai/select-algorithm-go/quickstart.md +++ b/ai/select-algorithm-go/quickstart.md @@ -471,7 +471,7 @@ SIMILARITY=COS ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. 
### [mongosh](#tab/mongosh) diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md index fdd3c7b..b879815 100644 --- a/ai/select-algorithm-java/quickstart.md +++ b/ai/select-algorithm-java/quickstart.md @@ -493,7 +493,7 @@ For the `text-embedding-3-small` model used in this quickstart, **COS (cosine si ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. ### [mongosh](#tab/mongosh) diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md index 536c654..86955b4 100644 --- a/ai/select-algorithm-python/quickstart.md +++ b/ai/select-algorithm-python/quickstart.md @@ -431,7 +431,7 @@ For the `text-embedding-3-small` model used in this quickstart, **COS (cosine si ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. ### [mongosh](#tab/mongosh) diff --git a/ai/select-algorithm-typescript/quickstart.md b/ai/select-algorithm-typescript/quickstart.md index 35b9aa7..7b2b847 100644 --- a/ai/select-algorithm-typescript/quickstart.md +++ b/ai/select-algorithm-typescript/quickstart.md @@ -418,7 +418,7 @@ For the `text-embedding-3-small` model used in this quickstart, **COS (cosine si ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure DocumentDB extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. 
### [mongosh](#tab/mongosh) From 9261b7b7116a0f3d0f2d916c536c00cdf87cb724 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 12:11:07 -0700 Subject: [PATCH 04/11] fix: update VS Code extension to DocumentDB for VS Code Replace Azure Databases extension (ms-azuretools.vscode-cosmosdb) with DocumentDB for VS Code (ms-azuretools.vscode-documentdb) per PM feedback from Khelan Modi, to align with the recommended developer experience. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-dotnet/quickstart.md | 4 ++-- ai/select-algorithm-go/quickstart.md | 4 ++-- ai/select-algorithm-java/quickstart.md | 4 ++-- ai/select-algorithm-python/quickstart.md | 4 ++-- ai/select-algorithm-typescript/quickstart.md | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md index d27278f..122b617 100644 --- a/ai/select-algorithm-dotnet/quickstart.md +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -514,7 +514,7 @@ For the `text-embedding-3-small` model used in this quickstart, **COS (cosine si ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. ### [mongosh](#tab/mongosh) @@ -531,7 +531,7 @@ db.dropDatabase() ### [VS Code extension](#tab/vscode) -1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. 2. Connect to your Azure DocumentDB cluster. 3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**.
diff --git a/ai/select-algorithm-go/quickstart.md b/ai/select-algorithm-go/quickstart.md index 4fd84f7..edbb20c 100644 --- a/ai/select-algorithm-go/quickstart.md +++ b/ai/select-algorithm-go/quickstart.md @@ -471,7 +471,7 @@ SIMILARITY=COS ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. ### [mongosh](#tab/mongosh) @@ -488,7 +488,7 @@ db.dropDatabase() ### [VS Code extension](#tab/vscode) -1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. 2. Connect to your Azure DocumentDB cluster. 3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md index b879815..81c3cbb 100644 --- a/ai/select-algorithm-java/quickstart.md +++ b/ai/select-algorithm-java/quickstart.md @@ -493,7 +493,7 @@ For the `text-embedding-3-small` model used in this quickstart, **COS (cosine si ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. ### [mongosh](#tab/mongosh) @@ -510,7 +510,7 @@ db.dropDatabase() ### [VS Code extension](#tab/vscode) -1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. 2. Connect to your Azure DocumentDB cluster. 
3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md index 86955b4..70036ba 100644 --- a/ai/select-algorithm-python/quickstart.md +++ b/ai/select-algorithm-python/quickstart.md @@ -431,7 +431,7 @@ For the `text-embedding-3-small` model used in this quickstart, **COS (cosine si ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. ### [mongosh](#tab/mongosh) @@ -448,7 +448,7 @@ db.dropDatabase() ### [VS Code extension](#tab/vscode) -1. Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. 2. Connect to your Azure DocumentDB cluster. 3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. diff --git a/ai/select-algorithm-typescript/quickstart.md b/ai/select-algorithm-typescript/quickstart.md index 7b2b847..afcee79 100644 --- a/ai/select-algorithm-typescript/quickstart.md +++ b/ai/select-algorithm-typescript/quickstart.md @@ -418,7 +418,7 @@ For the `text-embedding-3-small` model used in this quickstart, **COS (cosine si ## Clean up resources -When you're done, you can remove the database using mongosh or the Azure Databases extension for Visual Studio Code. +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. ### [mongosh](#tab/mongosh) @@ -435,7 +435,7 @@ db.dropDatabase() ### [VS Code extension](#tab/vscode) -1. 
Install the [Azure Databases extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-cosmosdb) for Visual Studio Code. +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. 2. Connect to your Azure DocumentDB cluster. 3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. From a2c7ace9dd0d84d50c51f069019d6414b92482a5 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 12:13:46 -0700 Subject: [PATCH 05/11] docs: add DocumentDB for VS Code as required extension in copilot-instructions - Add rule 14: always use ms-azuretools.vscode-documentdb - Remove ms-azuretools.vscode-cosmosdb exception from rule 1 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/copilot-instructions.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index c446bd6..40ed28a 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -111,7 +111,7 @@ All samples MUST use these environment variable names and defaults: ## Rules -1. **No Cosmos DB references.**Never use "Cosmos DB", "cosmosdb", "MongoDB vCore", or "mongo.cosmos.azure.com". Always use "Azure DocumentDB" and "documentdb.azure.com". Exception: `mongocluster.cosmos.azure.com` (hostname), `cosmosSearch` (API command), and `ms-azuretools.vscode-cosmosdb` (VS Code extension) are valid and NOT Cosmos references. +1. **No Cosmos DB references.** Never use "Cosmos DB", "cosmosdb", "MongoDB vCore", or "mongo.cosmos.azure.com". Always use "Azure DocumentDB" and "documentdb.azure.com". Exception: `mongocluster.cosmos.azure.com` (hostname) and `cosmosSearch` (API command) are valid and NOT Cosmos references. 2. **Vector field name is DescriptionVector.** Never default to "contentVector". 3. 
**Data file path from env var.** Code reads `DATA_FILE_WITH_VECTORS`. The default depends on the sample category: vector-search samples use `../data/Hotels_Vector.json` (shared data directory one level up), while select-algorithm samples use `data/Hotels_Vector.json` (local copy in each sample). .NET copies data locally to `data/Hotels_Vector.json` in the build output. 4. **Batch size is LOAD_SIZE_BATCH=100.** Do not use BATCH_SIZE or other variants. @@ -124,3 +124,4 @@ All samples MUST use these environment variable names and defaults: 11. **Collection naming:** `hotels_{algorithm}` (e.g., `hotels_ivf`, `hotels_hnsw`, `hotels_diskann`). Index naming: `vectorIndex_{algorithm}`. 12. **Vector search uses k=5.** All samples return top 5 results. Do not parameterize k unless explicitly required. 13. **Use the Global read-write hostname.** All samples MUST use the Global read-write connection string format: `.global.mongocluster.cosmos.azure.com`. The `.global.` form auto-follows the active write region after a replica promotion. The non-`.global.` form pins to one cluster and silently becomes read-only after failover — reserve that for read-scale-out scenarios only. (Confirmed by Khelan Modi, DocumentDB PM.) +14. **VS Code extension is DocumentDB for VS Code.** Always reference [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) (`ms-azuretools.vscode-documentdb`). Never reference the Azure Databases extension (`ms-azuretools.vscode-cosmosdb`). 
From bdca085de3ff29a4f522db01bdde450324ad1cb1 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 12:22:09 -0700 Subject: [PATCH 06/11] feat: consolidate algorithm guidance into shared include - Create ai/includes/choosing-algorithm.md with enhanced content - Add quick-reference decision table (IVF/DiskANN/HNSW by scenario) - Elevate DiskANN-as-default recommendation with IMPORTANT callout - Add operational benefits: easier backups, faster recovery - Add dimension future-proofing context (models evolving past 8K) - Replace duplicated sections in all 5 quickstarts with include ref - Addresses Khelan Modi feedback points #3 and #4 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/includes/choosing-algorithm.md | 41 ++++++++++++++++++++ ai/select-algorithm-dotnet/quickstart.md | 35 +---------------- ai/select-algorithm-go/quickstart.md | 35 +---------------- ai/select-algorithm-java/quickstart.md | 35 +---------------- ai/select-algorithm-python/quickstart.md | 35 +---------------- ai/select-algorithm-typescript/quickstart.md | 35 +---------------- 6 files changed, 46 insertions(+), 170 deletions(-) create mode 100644 ai/includes/choosing-algorithm.md diff --git a/ai/includes/choosing-algorithm.md b/ai/includes/choosing-algorithm.md new file mode 100644 index 0000000..3879862 --- /dev/null +++ b/ai/includes/choosing-algorithm.md @@ -0,0 +1,41 @@ +### Choosing the right algorithm + +> [!IMPORTANT] +> For production workloads, start with **DiskANN** on an M30+ cluster. DiskANN supports higher embedding dimensions, uses less cluster memory, and is less likely to require an index redesign as your models evolve. 
+ +Use this quick-reference table to select the right algorithm for your workload: + +| Scenario | Algorithm | Cluster tier | Max dimensions | +|----------|-----------|--------------|----------------| +| Dev/test, demos, small datasets | **IVF** | Any (free tier OK) | 2,000 | +| Production (default) | **DiskANN** | M30+ | 16,000 | +| Production (max recall priority) | **HNSW** | M30+ | 8,000 | + +**IVF** (inverted file index): +- Best for: Test environments, demos, and small clusters +- Pros: Fast to build, low resource requirements, works on any cluster tier +- Cons: Lower recall compared to graph-based algorithms at scale +- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall + +**DiskANN** (disk-based approximate nearest neighbor) — *recommended for production*: +- Best for: Production workloads on M30+ clusters +- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk freeing cluster memory for reads and writes, lighter index updates, easier backups, faster recovery +- Cons: Requires M30+ cluster tier +- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall +- Why default: As embedding models evolve (some already exceed 8,000 dimensions), DiskANN avoids costly index redesigns. Its disk-based architecture also means your cluster memory stays available for operational workloads rather than index storage. 
+ +**HNSW** (hierarchical navigable small world): +- Best for: Production workloads on M30+ clusters where maximum recall is the top priority +- Pros: Excellent recall, fast queries +- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage since the full graph lives in RAM +- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall + +### Choosing the right similarity function + +| Function | Score meaning | Best for | +|----------|-------------|----------| +| **COS (Cosine)** | Higher = more similar (-1 to 1) | Text embeddings (normalized vectors) | +| **L2 (Euclidean)** | Lower = more similar (distance) | When magnitude matters | +| **IP (Inner Product)** | Higher = more similar | Equivalent to COS for normalized vectors | + +For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. 
diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md index 122b617..8878d81 100644 --- a/ai/select-algorithm-dotnet/quickstart.md +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -466,40 +466,7 @@ IVF COS Historic Downtown Inn 0.8342 52 Latency: 52ms ``` -### Choosing the right algorithm - -Use this comparison to select the best algorithm for your workload: - -**IVF** (inverted file index): -- Best for: Test environments, demos, and small clusters -- Pros: Fast to build, low resource requirements, works on any cluster tier -- Cons: Lower recall compared to graph-based algorithms at scale -- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall - -**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: -- Best for: Enterprise production workloads on M30+ clusters -- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover -- Cons: Requires M30+ cluster tier -- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall - -**HNSW** (hierarchical navigable small world): -- Best for: Enterprise production workloads on M30+ clusters requiring highest recall -- Pros: Excellent recall, fast queries -- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage -- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall - -> [!TIP] -> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. 
DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. - -### Choosing the right similarity function - -The similarity function should match your embedding model and use case: - -- **COS (Cosine similarity)**: Best for text embeddings and most OpenAI models. Measures angle between vectors (range: -1 to 1, higher is more similar) -- **L2 (Euclidean distance)**: Measures straight-line distance between vectors (lower is more similar). Good for spatial data -- **IP (Inner product)**: Measures alignment between vectors. Good when vector magnitudes are meaningful - -For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] ## Troubleshooting diff --git a/ai/select-algorithm-go/quickstart.md b/ai/select-algorithm-go/quickstart.md index edbb20c..2c5579b 100644 --- a/ai/select-algorithm-go/quickstart.md +++ b/ai/select-algorithm-go/quickstart.md @@ -388,40 +388,7 @@ The comparison table shows how different algorithms perform on the same dataset - **Score**: Similarity score (higher is better for COS and IP, lower is better for L2) - **Latency**: Query execution time in milliseconds -### Choosing the right algorithm - -Use this comparison to select the best algorithm for your workload: - -**IVF** (inverted file index): -- Best for: Test environments, demos, and small clusters -- Pros: Fast to build, low resource requirements, works on any cluster tier -- Cons: Lower recall compared to graph-based algorithms at scale -- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall - -**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise 
production*: -- Best for: Enterprise production workloads on M30+ clusters -- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover -- Cons: Requires M30+ cluster tier -- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall - -**HNSW** (hierarchical navigable small world): -- Best for: Enterprise production workloads on M30+ clusters requiring highest recall -- Pros: Excellent recall, fast queries -- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage -- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall - -> [!TIP] -> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. - -### Choosing the right similarity function - -The similarity function should match your embedding model and use case: - -- **COS (Cosine similarity)**: Best for text embeddings and most OpenAI models. Measures angle between vectors (range: -1 to 1, higher is more similar) -- **L2 (Euclidean distance)**: Measures straight-line distance between vectors (lower is more similar). Good for spatial data -- **IP (Inner product)**: Measures alignment between vectors. Good when vector magnitudes are meaningful - -For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. 
+[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] ## Experiment with different configurations diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md index 81c3cbb..e405ffa 100644 --- a/ai/select-algorithm-java/quickstart.md +++ b/ai/select-algorithm-java/quickstart.md @@ -447,40 +447,7 @@ IVF L2 58.90 ## Understanding the results -### Choosing the right algorithm - -Use this comparison to select the best algorithm for your workload: - -**IVF** (inverted file index): -- Best for: Test environments, demos, and small clusters -- Pros: Fast to build, low resource requirements, works on any cluster tier -- Cons: Lower recall compared to graph-based algorithms at scale -- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall - -**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: -- Best for: Enterprise production workloads on M30+ clusters -- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover -- Cons: Requires M30+ cluster tier -- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall - -**HNSW** (hierarchical navigable small world): -- Best for: Enterprise production workloads on M30+ clusters requiring highest recall -- Pros: Excellent recall, fast queries -- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage -- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall - -> [!TIP] -> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. 
DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. - -### Choosing the right similarity function - -The similarity function should match your embedding model and use case: - -- **COS (Cosine similarity)**: Best for text embeddings and most OpenAI models. Measures angle between vectors (range: -1 to 1, higher is more similar) -- **L2 (Euclidean distance)**: Measures straight-line distance between vectors (lower is more similar). Good for spatial data -- **IP (Inner product)**: Measures alignment between vectors. Good when vector magnitudes are meaningful - -For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] ## Troubleshooting diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md index 70036ba..8a6dd10 100644 --- a/ai/select-algorithm-python/quickstart.md +++ b/ai/select-algorithm-python/quickstart.md @@ -384,40 +384,7 @@ The comparison table helps you choose the best configuration for your workload: - **Score**: Similarity score using the selected function. Higher scores indicate better matches. - **Top Result**: The highest-scoring hotel for the query. Consistency across algorithms indicates stable results. 
-### Choosing the right algorithm - -Use this comparison to select the best algorithm for your workload: - -**IVF** (inverted file index): -- Best for: Test environments, demos, and small clusters -- Pros: Fast to build, low resource requirements, works on any cluster tier -- Cons: Lower recall compared to graph-based algorithms at scale -- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall - -**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: -- Best for: Enterprise production workloads on M30+ clusters -- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover -- Cons: Requires M30+ cluster tier -- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall - -**HNSW** (hierarchical navigable small world): -- Best for: Enterprise production workloads on M30+ clusters requiring highest recall -- Pros: Excellent recall, fast queries -- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage -- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall - -> [!TIP] -> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. - -### Choosing the right similarity function - -The similarity function should match your embedding model and use case: - -- **COS (Cosine similarity)**: Best for text embeddings and most OpenAI models. 
Measures angle between vectors (range: -1 to 1, higher is more similar) -- **L2 (Euclidean distance)**: Measures straight-line distance between vectors (lower is more similar). Good for spatial data -- **IP (Inner product)**: Measures alignment between vectors. Good when vector magnitudes are meaningful - -For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] ## Troubleshooting diff --git a/ai/select-algorithm-typescript/quickstart.md b/ai/select-algorithm-typescript/quickstart.md index afcee79..c17038f 100644 --- a/ai/select-algorithm-typescript/quickstart.md +++ b/ai/select-algorithm-typescript/quickstart.md @@ -370,40 +370,7 @@ The comparison table demonstrates key behaviors of vector search in DocumentDB: - **Score separation (Diff column)** shows confidence. A larger positive diff means the search clearly distinguishes the best match from the second-best. This metric helps evaluate result quality regardless of the absolute score values. 
-### Choosing the right algorithm - -Use this comparison to select the best algorithm for your workload: - -**IVF** (inverted file index): -- Best for: Test environments, demos, and small clusters -- Pros: Fast to build, low resource requirements, works on any cluster tier -- Cons: Lower recall compared to graph-based algorithms at scale -- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall - -**DiskANN** (disk-based approximate nearest neighbor) — *recommended for enterprise production*: -- Best for: Enterprise production workloads on M30+ clusters -- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk leaving cluster memory available for regular reads and writes, uses lighter updates that help the system stay smoother and easier to back up and recover -- Cons: Requires M30+ cluster tier -- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall - -**HNSW** (hierarchical navigable small world): -- Best for: Enterprise production workloads on M30+ clusters requiring highest recall -- Pros: Excellent recall, fast queries -- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage -- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall - -> [!TIP] -> For enterprise production workloads, start with **DiskANN** unless you have a specific reason to prefer HNSW. DiskANN supports higher dimensions (16,000 vs 8,000), uses less cluster memory, and requires fewer index maintenance operations — making it the safer long-term default that's less likely to need an index redesign as your embedding models evolve. 
- -### Choosing the right similarity function - -| Function | Score meaning | Best for | -|----------|-------------|----------| -| **COS (Cosine)** | Higher = more similar (0–1) | Text embeddings (normalized vectors) | -| **L2 (Euclidean)** | Lower = more similar (distance) | When magnitude matters | -| **IP (Inner Product)** | Higher = more similar | Equivalent to COS for normalized vectors | - -For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] ## Troubleshooting From 39d007043d0d64077aff0b41d94d9454c4879fea Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 12:25:39 -0700 Subject: [PATCH 07/11] docs: add sample review checklist from PM feedback Checklist covers branding/naming, tooling references, index selection guidance, and DiskANN-as-default requirements. Derived from Khelan Modi (DocumentDB PM) feedback on PR #74. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/copilot-instructions.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 40ed28a..4dbf0d3 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -125,3 +125,33 @@ All samples MUST use these environment variable names and defaults: 12. **Vector search uses k=5.** All samples return top 5 results. Do not parameterize k unless explicitly required. 13. **Use the Global read-write hostname.** All samples MUST use the Global read-write connection string format: `.global.mongocluster.cosmos.azure.com`. The `.global.` form auto-follows the active write region after a replica promotion. 
The non-`.global.` form pins to one cluster and silently becomes read-only after failover — reserve that for read-scale-out scenarios only. (Confirmed by Khelan Modi, DocumentDB PM.) 14. **VS Code extension is DocumentDB for VS Code.** Always reference [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) (`ms-azuretools.vscode-documentdb`). Never reference the Azure Databases extension (`ms-azuretools.vscode-cosmosdb`). + +## Sample Review Checklist + +Use this checklist when creating new samples or reviewing existing ones. Derived from PM (Khelan Modi) feedback. + +### Branding & Naming +- [ ] Environment variables use `DOCUMENTDB_CLUSTER_NAME` (not `MONGO_CLUSTER_NAME`) for select-algorithm samples +- [ ] All references say "Azure DocumentDB" — no "Cosmos DB" or "MongoDB vCore" +- [ ] Connection hostname uses `.global.mongocluster.cosmos.azure.com` format + +### Tooling References +- [ ] VS Code extension references point to [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) (`ms-azuretools.vscode-documentdb`) +- [ ] No references to Data Explorer for data browsing — use the VS Code extension instead +- [ ] No references to the old Azure Databases extension (`ms-azuretools.vscode-cosmosdb`) + +### Index Selection Guidance +- [ ] IVF is positioned for dev/test, demos, and small clusters (works on any tier) +- [ ] DiskANN is the default recommendation for production (M30+ clusters) +- [ ] HNSW is positioned for production when maximum recall is the top priority (M30+) +- [ ] Decision table or clear guidance helps readers pick the right algorithm quickly + +### DiskANN as Default +- [ ] DiskANN recommendation is prominent (not buried in a footnote) +- [ ] Higher dimension support called out (up to 16,000 vs HNSW's 8,000) +- [ ] Memory efficiency explained (index on disk, frees RAM for read/write ops) +- [ ] Operational benefits mentioned (lighter 
updates, easier backups, faster recovery) +- [ ] Future-proofing noted (less likely to need index redesign as models evolve) + +### Optional Enhancements +- [ ] Consider mentioning DocumentDB agent kit (`npx skills add Azure/documentdb-agent-kit`) where appropriate — currently beta/optional From bad40606ac9f604d631ecc25b898cadf08db7f5f Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 12:58:50 -0700 Subject: [PATCH 08/11] fix: address PR review findings - reliability, docs, consistency - Add bounded retry logic (5 attempts, 2s backoff) for index readiness in all 5 languages - Fix Go: validate LOAD_SIZE_BATCH/EMBEDDING_DIMENSIONS > 0, track comparison failures - Fix TypeScript: exit non-zero on total failure, remove 'all' as valid algo/similarity value - Fix Python quickstart: correct download URL path (ai/data/ not data/) - Standardize data file path guidance across all quickstarts - Remove ALGORITHM=all / SIMILARITY=all from all docs (use unset for all combos) - Fix quickstart entrypoints to match actual code (TS, Java, Go, .NET) - Replace .NET appsettings real values with placeholders, document Section__Key overrides - Align copilot-instructions: DiskANN 32/50 for select-algorithm, document naming exception Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/copilot-instructions.md | 7 +- ai/select-algorithm-dotnet/CompareAll.cs | 43 +++++- ai/select-algorithm-dotnet/appsettings.json | 4 +- ai/select-algorithm-dotnet/quickstart.md | 129 +++++++++--------- ai/select-algorithm-go/output/compare_all.txt | 2 + ai/select-algorithm-go/quickstart.md | 51 +------ ai/select-algorithm-go/src/compare_all.go | 53 +++++-- ai/select-algorithm-go/src/main.go | 5 +- ai/select-algorithm-go/src/utils.go | 25 +++- ai/select-algorithm-java/quickstart.md | 94 ++----------- .../selectalgorithm/CompareAll.java | 46 ++++++- ai/select-algorithm-python/quickstart.md | 47 ++----- ai/select-algorithm-python/src/compare_all.py | 47 
++++++- ai/select-algorithm-typescript/quickstart.md | 25 ++-- .../src/compare-all.ts | 63 +++++---- .../src/select-algorithm.ts | 58 ++++++-- 16 files changed, 371 insertions(+), 328 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 4dbf0d3..50bac00 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -105,9 +105,10 @@ All samples MUST use these environment variable names and defaults: - efSearch: 40 ### DiskANN -- maxDegree: 20 -- lBuild: 10 +- vector-search samples: maxDegree: 20, lBuild: 10 +- select-algorithm compare-all samples: maxDegree: 32, lBuild: 50 - lSearch: 40 +- Select-algorithm samples use higher values for meaningful comparison results. ## Rules @@ -121,7 +122,7 @@ All samples MUST use these environment variable names and defaults: 8. **Output files are committed.** Each sample has an `output/` directory with expected output for each algorithm (`ivf.txt`, `hnsw.txt`, `diskann.txt`). Update these when output format changes. 9. **DocumentDB supports all index types at any dataset size.** IVF, HNSW, and DiskANN are all available — do not imply tier restrictions limit algorithm availability. 10. **No dotenv libraries.** Do NOT use `python-dotenv`, `godotenv`, `dotenv` (npm), or any `.env` file-loading library. Environment variables must be passed via the CLI invocation, not loaded from `.env` files at runtime. This keeps samples explicit and avoids hidden configuration. -11. **Collection naming:** `hotels_{algorithm}` (e.g., `hotels_ivf`, `hotels_hnsw`, `hotels_diskann`). Index naming: `vectorIndex_{algorithm}`. +11. **Collection naming:** Standard per-algorithm samples use `hotels_{algorithm}` (e.g., `hotels_ivf`, `hotels_hnsw`, `hotels_diskann`). Standard index naming is `vectorIndex_{algorithm}`. Compare-all samples that drop and recreate a single collection use collection `hotels` and index naming `vector_{algorithm}_{metric}` (for example, `vector_ivf_cos`). 
TypeScript `select-algorithm.ts` remains a separate per-collection mode. 12. **Vector search uses k=5.** All samples return top 5 results. Do not parameterize k unless explicitly required. 13. **Use the Global read-write hostname.** All samples MUST use the Global read-write connection string format: `.global.mongocluster.cosmos.azure.com`. The `.global.` form auto-follows the active write region after a replica promotion. The non-`.global.` form pins to one cluster and silently becomes read-only after failover — reserve that for read-scale-out scenarios only. (Confirmed by Khelan Modi, DocumentDB PM.) 14. **VS Code extension is DocumentDB for VS Code.** Always reference [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) (`ms-azuretools.vscode-documentdb`). Never reference the Azure Databases extension (`ms-azuretools.vscode-cosmosdb`). diff --git a/ai/select-algorithm-dotnet/CompareAll.cs b/ai/select-algorithm-dotnet/CompareAll.cs index 8a4dec3..86e4117 100644 --- a/ai/select-algorithm-dotnet/CompareAll.cs +++ b/ai/select-algorithm-dotnet/CompareAll.cs @@ -80,13 +80,15 @@ public static void Run(AppConfiguration appConfig) CreateIndex(collection, vectorField, config); Console.WriteLine($" ✓ {config.Name} created"); - // 3. Wait for index to build - Thread.Sleep(5000); - - // 4. Search - var searchResults = RunVectorSearch(collection, queryVector, vectorField, config.Name, topK); + // 3. Search with retries while the index becomes available + var searchResults = RunVectorSearchWithRetry(collection, queryVector, vectorField, config.Name, topK); + if (searchResults.Count == 0) + { + results.Add(new SearchResult(GetAlgoDisplay(config.Kind), config.Similarity, "(failed)", 0.0, "(failed)", 0.0)); + continue; + } - // 5. Extract top 2 results and record + // 4. 
Extract top 2 results and record var algoDisplay = GetAlgoDisplay(config.Kind); var top1Name = "-"; var top1Score = 0.0; var top2Name = "-"; var top2Score = 0.0; @@ -237,6 +239,35 @@ private static List RunVectorSearch( return collection.Aggregate(pipeline).ToList(); } + private static List RunVectorSearchWithRetry( + IMongoCollection collection, + float[] queryVector, + string vectorField, + string indexName, + int topK) + { + const int maxRetries = 5; + const int retryDelayMs = 2000; + + for (var attempt = 0; attempt <= maxRetries; attempt++) + { + var results = RunVectorSearch(collection, queryVector, vectorField, indexName, topK); + if (results.Count > 0) + { + return results; + } + + if (attempt < maxRetries) + { + Console.WriteLine($" No results for {indexName} yet. Retrying in 2 seconds ({attempt + 1}/{maxRetries})..."); + Thread.Sleep(retryDelayMs); + } + } + + Console.WriteLine($" Search for {indexName} did not return results after {maxRetries} retries. Recording as failed."); + return []; + } + private static void PrintComparisonTable(List results) { Console.WriteLine(); diff --git a/ai/select-algorithm-dotnet/appsettings.json b/ai/select-algorithm-dotnet/appsettings.json index 5572a48..6e57d30 100644 --- a/ai/select-algorithm-dotnet/appsettings.json +++ b/ai/select-algorithm-dotnet/appsettings.json @@ -1,10 +1,10 @@ { "AzureOpenAI": { - "Endpoint": "https://oaidctfqpct77ndi.openai.azure.com/", + "Endpoint": "https://.openai.azure.com", "EmbeddingModel": "text-embedding-3-small" }, "MongoDB": { - "ClusterName": "docdb-dctfqpct77ndi", + "ClusterName": "", "DatabaseName": "Hotels", "LoadBatchSize": 100 }, diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md index 8878d81..a0828b2 100644 --- a/ai/select-algorithm-dotnet/quickstart.md +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -142,35 +142,37 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ dotnet list package ``` -3. 
Create environment variables for authentication. The sample uses DefaultAzureCredential for passwordless authentication: +3. Create environment variables for authentication and configuration overrides. The sample uses `DefaultAzureCredential` for passwordless authentication, and .NET maps environment variables to `appsettings.json` keys by using the `Section__Key` format: ### [Bash](#tab/bash) ```bash - export AZURE_OPENAI_EMBEDDING_ENDPOINT="https://.openai.azure.com" - export AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-small" - export DOCUMENTDB_CLUSTER_NAME="" + export AzureOpenAI__Endpoint="https://<your-openai-resource>.openai.azure.com" + export AzureOpenAI__EmbeddingModel="text-embedding-3-small" + export MongoDB__ClusterName="<your-cluster-name>" + export DataFiles__WithVectors="data/Hotels_Vector.json" export AZURE_TENANT_ID="<your-tenant-id>" - export DATA_FILE_WITH_VECTORS="../../data/Hotels_Vector.json" ``` ### [PowerShell](#tab/powershell) ```powershell - $env:AZURE_OPENAI_EMBEDDING_ENDPOINT="https://.openai.azure.com" - $env:AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-small" - $env:DOCUMENTDB_CLUSTER_NAME="" + $env:AzureOpenAI__Endpoint="https://<your-openai-resource>.openai.azure.com" + $env:AzureOpenAI__EmbeddingModel="text-embedding-3-small" + $env:MongoDB__ClusterName="<your-cluster-name>" + $env:DataFiles__WithVectors="data/Hotels_Vector.json" $env:AZURE_TENANT_ID="<your-tenant-id>" - $env:DATA_FILE_WITH_VECTORS="../../data/Hotels_Vector.json" ``` --- Replace the placeholder values with your own information: - - ``: Your Azure OpenAI resource name - - ``: Your Azure DocumentDB cluster name + - `<your-openai-resource>`: Your Azure OpenAI resource name + - `<your-cluster-name>`: Your Azure DocumentDB cluster name - `<your-tenant-id>`: Your Microsoft Entra tenant ID + These environment variables override the matching values in `appsettings.json`. For example, `MongoDB__ClusterName` overrides `MongoDB:ClusterName` and `AzureOpenAI__Endpoint` overrides `AzureOpenAI:Endpoint`. + You should always prefer passwordless authentication. 
For more information on setting up managed identity and the full range of your authentication options, see [Authenticate .NET apps to Azure services by using the Azure SDK for .NET](/dotnet/azure/sdk/authentication). 4. Sign in with Azure CLI for passwordless authentication: @@ -199,15 +201,33 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ ```json { - "DatabaseName": "Hotels", - "EmbeddedField": "DescriptionVector", - "EmbeddingDimensions": 1536, - "LoadBatchSize": 100, - "SearchQuery": "quintessential lodging near running trails, eateries, retail", - "TopK": 5 + "AzureOpenAI": { + "Endpoint": "https://.openai.azure.com", + "EmbeddingModel": "text-embedding-3-small" + }, + "MongoDB": { + "ClusterName": "", + "DatabaseName": "Hotels", + "LoadBatchSize": 100 + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536, + "EmbeddingSizeBatch": 16 + }, + "VectorSearch": { + "Query": "quintessential lodging near running trails, eateries, retail", + "Similarity": "", + "TopK": 5 + }, + "DataFiles": { + "WithVectors": "data/Hotels_Vector.json" + } } ``` + You can keep placeholder values in `appsettings.json` and override them at runtime with environment variables such as `AzureOpenAI__Endpoint` and `MongoDB__ClusterName`. + ## Create code files Continue the project by creating code files for vector search comparison. When you are done, the project structure should look like this: @@ -216,13 +236,15 @@ Continue the project by creating code files for vector search comparison. 
When y ├── data/ │ └── Hotels_Vector.json # Hotel data with vector embeddings └── select-algorithm-dotnet/ - ├── Services/ - │ └── VectorComparisonService.cs # Service to compare vector algorithms + ├── CompareAll.cs # Main comparison logic + ├── Models/ + │ ├── Configuration.cs # Configuration models + │ └── HotelData.cs # Hotel data model ├── Utilities/ - │ └── Utils.cs # Shared utility functions + │ └── AzureIdentityTokenHandler.cs # OIDC token handler ├── Program.cs # Main application entry point + ├── Utils.cs # Shared utility functions ├── appsettings.json # Configuration settings - ├── global.json # .NET SDK version specification └── SelectAlgorithm.csproj # Project file ``` @@ -231,14 +253,14 @@ Continue the project by creating code files for vector search comparison. When y ### [Bash](#tab/bash) ```bash - mkdir Services + mkdir Models mkdir Utilities ``` ### [PowerShell](#tab/powershell) ```powershell - New-Item -ItemType Directory -Name Services + New-Item -ItemType Directory -Name Models New-Item -ItemType Directory -Name Utilities ``` @@ -249,17 +271,21 @@ Continue the project by creating code files for vector search comparison. 
When y ### [Bash](#tab/bash) ```bash - touch Services/VectorComparisonService.cs - touch Utilities/Utils.cs - touch global.json + touch CompareAll.cs + touch Utils.cs + touch Models/Configuration.cs + touch Models/HotelData.cs + touch Utilities/AzureIdentityTokenHandler.cs ``` ### [PowerShell](#tab/powershell) ```powershell - New-Item -ItemType File -Path Services\VectorComparisonService.cs - New-Item -ItemType File -Path Utilities\Utils.cs - New-Item -ItemType File -Name global.json + New-Item -ItemType File -Name CompareAll.cs + New-Item -ItemType File -Name Utils.cs + New-Item -ItemType File -Path Models\Configuration.cs + New-Item -ItemType File -Path Models\HotelData.cs + New-Item -ItemType File -Path Utilities\AzureIdentityTokenHandler.cs ``` --- @@ -276,7 +302,7 @@ This main entry point: - Loads configuration from appsettings.json and environment variables - Sets up dependency injection with logging infrastructure - Initializes Azure OpenAI and DocumentDB clients using passwordless authentication -- Creates a VectorComparisonService to test all algorithms +- Calls `CompareAll.Run()`, the main comparison logic in `CompareAll.cs` - Runs the comparison and prints results in a table format ### CompareAll.cs @@ -319,21 +345,6 @@ These supporting files provide: - Batch data insertion with error handling - Results formatting and display -### global.json - -Add this code to `global.json`: - -```json -{ - "sdk": { - "version": "9.0.200", - "rollForward": "latestFeature" - } -} -``` - -This file specifies the .NET SDK version requirements for the project. - ## Run the code 1. Build the project: @@ -342,51 +353,41 @@ This file specifies the .NET SDK version requirements for the project. dotnet build ``` -2. Run the application to compare all algorithms with COS similarity (default): +2.
Run the `SelectAlgorithm.csproj` project to compare all 9 algorithm × similarity combinations: ```bash dotnet run ``` - The application creates three collections (`hotels_diskann_cos`, `hotels_hnsw_cos`, `hotels_ivf_cos`), inserts data, creates vector indexes, and performs searches on each. + The application loads the sample data once, then creates and tests all 9 algorithm × similarity combinations sequentially. -3. To compare all algorithms with all similarity functions, set environment variables: +3. Leave `ALGORITHM` and `SIMILARITY` unset to compare all 9 algorithm × similarity combinations. ### [Bash](#tab/bash) ```bash - export ALGORITHM=all - export SIMILARITY=all dotnet run ``` ### [PowerShell](#tab/powershell) ```powershell - $env:ALGORITHM="all" - $env:SIMILARITY="all" dotnet run ``` --- - This creates nine collections (3 algorithms x 3 similarity functions) and compares all combinations. - -4. To test a specific algorithm with a specific similarity function: +4. Repeat `dotnet run` whenever you want to rerun the comparison: ### [Bash](#tab/bash) ```bash - export ALGORITHM=diskann - export SIMILARITY=COS dotnet run ``` ### [PowerShell](#tab/powershell) ```powershell - $env:ALGORITHM="diskann" - $env:SIMILARITY="COS" dotnet run ``` @@ -434,11 +435,11 @@ Executing vector search...
========================================================================================== Vector Algorithm Comparison Results ========================================================================================== -Algorithm Similarity Top Result Score Latency(ms) ------------------------------------------------------------------------------------------- -DiskANN COS Historic Downtown Inn 0.8342 45 -HNSW COS Historic Downtown Inn 0.8342 38 -IVF COS Historic Downtown Inn 0.8342 52 +Algorithm Similarity Top 1 Result Score Top 2 Result Score Diff +------------------------------------------------------------------------------------------------------------------------ +DiskANN COS Historic Downtown Inn 0.8342 Mountain Trail Lodge 0.7891 0.0451 +HNSW COS Historic Downtown Inn 0.8342 Mountain Trail Lodge 0.7891 0.0451 +IVF COS Historic Downtown Inn 0.8342 Mountain Trail Lodge 0.7891 0.0451 ========================================================================================== --- DiskANN / COS (hotels_diskann_cos) --- diff --git a/ai/select-algorithm-go/output/compare_all.txt b/ai/select-algorithm-go/output/compare_all.txt index 0eeb9a3..509f4b4 100644 --- a/ai/select-algorithm-go/output/compare_all.txt +++ b/ai/select-algorithm-go/output/compare_all.txt @@ -36,4 +36,6 @@ Running 9 vector index comparisons (create→search→drop)... 
│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ └──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ +Summary: 9 succeeded, 0 failed + Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-go/quickstart.md b/ai/select-algorithm-go/quickstart.md index 2c5579b..1d37476 100644 --- a/ai/select-algorithm-go/quickstart.md +++ b/ai/select-algorithm-go/quickstart.md @@ -161,7 +161,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-openai-resource.openai.azure.com/ # Data File Configuration - DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json EMBEDDED_FIELD=DescriptionVector EMBEDDING_DIMENSIONS=1536 LOAD_SIZE_BATCH=100 @@ -169,12 +169,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ # DocumentDB Configuration DOCUMENTDB_CLUSTER_NAME=your-cluster-name - # Algorithm Selection - # ALGORITHM: "all" | "diskann" | "hnsw" | "ivf" - ALGORITHM=all - - # SIMILARITY: "all" | "COS" | "L2" | "IP" - SIMILARITY=COS + # Leave ALGORITHM and SIMILARITY unset to run all combinations # Database name AZURE_DOCUMENTDB_DATABASENAME=Hotels @@ -258,7 +253,7 @@ This code provides a complete vector algorithm comparison application with these - **Passwordless authentication**: Uses `DefaultAzureCredential` for both Azure OpenAI and DocumentDB via OIDC - **Three vector algorithms**: Implements DiskANN, HNSW, and IVF with algorithm-specific tuning parameters - **Three similarity functions**: Supports COS (cosine), L2 (Euclidean), and IP (inner product) -- **Flexible configuration**: Use environment variables to compare all algorithms or test specific combinations +- **Single compare-all entry point**: Runs all 9 algorithm × similarity combinations in one pass - **Performance measurement**: Tracks query latency for each 
algorithm/similarity pair - **Comparison output**: Generates a formatted table showing results side by side - **Production-ready patterns**: Includes batched insertion, error handling, and connection pooling @@ -288,7 +283,7 @@ Get-Content .env | ForEach-Object { After sourcing the environment variables, run the application: ```bash -go run src/main.go +go run ./src/ ``` The application will: @@ -312,7 +307,7 @@ Vector Algorithm Comparison Search query: "quintessential lodging near running trails, eateries, retail" Initializing MongoDB and Azure OpenAI clients... -Loading data from ../data/Hotels_Vector.json... +Loading data from data/Hotels_Vector.json... Loaded 50 documents Generating query embedding... Query embedding: 1536 dimensions @@ -390,41 +385,9 @@ The comparison table shows how different algorithms perform on the same dataset [!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] -## Experiment with different configurations - -You can compare different combinations by setting environment variables: - -**Compare all algorithms with cosine similarity (default):** +## Run all combinations -```bash -# .env file -ALGORITHM=all -SIMILARITY=COS -``` - -**Compare all algorithms with all similarity functions (9 collections):** - -```bash -# .env file -ALGORITHM=all -SIMILARITY=all -``` - -**Test only DiskANN with all similarity functions:** - -```bash -# .env file -ALGORITHM=diskann -SIMILARITY=all -``` - -**Test only cosine similarity across all algorithms:** - -```bash -# .env file -ALGORITHM=all -SIMILARITY=COS -``` +Leave `ALGORITHM` and `SIMILARITY` unset to run all 9 algorithm × similarity combinations. 
## Troubleshooting diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go index 81cb7ef..eda792a 100644 --- a/ai/select-algorithm-go/src/compare_all.go +++ b/ai/select-algorithm-go/src/compare_all.go @@ -99,6 +99,8 @@ func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, // 4. Create→search→drop each index sequentially (DocumentDB only allows one vector index per field) fmt.Printf("\nRunning %d vector index comparisons (create→search→drop)...\n", len(specs)) var results []CompareResult + successfulComparisons := 0 + failedComparisons := 0 for _, spec := range specs { // Drop all existing vector indexes on this field @@ -121,26 +123,14 @@ func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, Metric: spec.Metric, Error: createErr, }) + failedComparisons++ fmt.Printf(" ⚠ %s: %v\n", spec.IndexName, createErr) continue } fmt.Printf(" ✓ %s created\n", spec.IndexName) - // Wait for index to become ready - time.Sleep(10 * time.Second) - - // Search using simple cosmosSearch (with retry for index readiness) - var searchResults []SearchResult - var searchErr error - for searchAttempt := 0; searchAttempt < 3; searchAttempt++ { - if searchAttempt > 0 { - time.Sleep(5 * time.Second) - } - searchResults, searchErr = vectorSearchSimple(ctx, collection, queryEmbedding, config.VectorField, topK) - if searchErr == nil && len(searchResults) > 0 { - break - } - } + // Search using simple cosmosSearch with bounded retry for index readiness. 
+ searchResults, searchErr := runVectorSearchWithRetry(ctx, collection, queryEmbedding, config.VectorField, topK) top1Name, top1Score := extractResult(searchResults, 0) top2Name, top2Score := extractResult(searchResults, 1) @@ -156,15 +146,48 @@ func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, Error: searchErr, } results = append(results, cr) + if searchErr != nil { + failedComparisons++ + } else { + successfulComparisons++ + } } // 6. Print comparison table fmt.Println() printComparisonTable(results) + fmt.Printf("\nSummary: %d succeeded, %d failed\n", successfulComparisons, failedComparisons) + if successfulComparisons == 0 { + return fmt.Errorf("all %d comparisons failed", failedComparisons) + } return nil } +func runVectorSearchWithRetry(ctx context.Context, collection *mongo.Collection, queryEmbedding []float64, vectorField string, topK int) ([]SearchResult, error) { + const maxAttempts = 6 + const retryDelay = 2 * time.Second + + var searchResults []SearchResult + var searchErr error + + for attempt := 1; attempt <= maxAttempts; attempt++ { + searchResults, searchErr = vectorSearchSimple(ctx, collection, queryEmbedding, vectorField, topK) + if searchErr == nil { + if len(searchResults) > 0 { + return searchResults, nil + } + searchErr = fmt.Errorf("search returned no results") + } + + if attempt < maxAttempts { + time.Sleep(retryDelay) + } + } + + return searchResults, searchErr +} + // buildIndexSpecs creates the 9 index specifications func buildIndexSpecs(vectorField string, dimensions int, metrics []string) []indexSpec { var specs []indexSpec diff --git a/ai/select-algorithm-go/src/main.go b/ai/select-algorithm-go/src/main.go index 85e7e6e..0596aa1 100644 --- a/ai/select-algorithm-go/src/main.go +++ b/ai/select-algorithm-go/src/main.go @@ -10,7 +10,10 @@ func main() { fmt.Println("Starting vector algorithm comparison...") ctx := context.Background() - config := LoadConfig() + config, err := LoadConfig() + if err != nil { + 
log.Fatalf("Invalid configuration: %v", err) + } fmt.Println("\nInitializing clients with passwordless authentication...") mongoClient, azureOpenAIClient, err := GetClientsPasswordless(ctx, config) diff --git a/ai/select-algorithm-go/src/utils.go b/ai/select-algorithm-go/src/utils.go index aa099f5..c358892 100644 --- a/ai/select-algorithm-go/src/utils.go +++ b/ai/select-algorithm-go/src/utils.go @@ -46,9 +46,16 @@ type InsertStats struct { } // LoadConfig loads configuration from environment variables -func LoadConfig() *Config { - dimensions, _ := strconv.Atoi(getEnvOrDefault("EMBEDDING_DIMENSIONS", "1536")) - batchSize, _ := strconv.Atoi(getEnvOrDefault("LOAD_SIZE_BATCH", "100")) +func LoadConfig() (*Config, error) { + dimensions, err := parsePositiveIntEnv("EMBEDDING_DIMENSIONS", "1536") + if err != nil { + return nil, err + } + + batchSize, err := parsePositiveIntEnv("LOAD_SIZE_BATCH", "100") + if err != nil { + return nil, err + } return &Config{ ClusterName: getEnvOrDefault("DOCUMENTDB_CLUSTER_NAME", ""), @@ -60,7 +67,19 @@ func LoadConfig() *Config { BatchSize: batchSize, Similarity: getEnvOrDefault("SIMILARITY", ""), Algorithm: strings.ToLower(getEnvOrDefault("ALGORITHM", "")), + }, nil +} + +func parsePositiveIntEnv(key, defaultValue string) (int, error) { + value := getEnvOrDefault(key, defaultValue) + parsedValue, err := strconv.Atoi(value) + if err != nil { + return 0, fmt.Errorf("%s must be a positive integer, got %q", key, value) + } + if parsedValue <= 0 { + return 0, fmt.Errorf("%s must be greater than 0, got %q", key, value) } + return parsedValue, nil } // getEnvOrDefault returns environment variable value or default if not set diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md index e405ffa..cf674af 100644 --- a/ai/select-algorithm-java/quickstart.md +++ b/ai/select-algorithm-java/quickstart.md @@ -192,7 +192,7 @@ This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure D 
exec-maven-plugin 3.1.0 - com.azure.documentdb.selectalgorithm.SelectAlgorithm + com.azure.documentdb.selectalgorithm.Main @@ -216,18 +216,14 @@ This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure D AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small # Data file path (relative to project root) - DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json # Vector configuration EMBEDDED_FIELD=DescriptionVector EMBEDDING_DIMENSIONS=1536 LOAD_SIZE_BATCH=100 - # Algorithm selection: all, diskann, hnsw, ivf - ALGORITHM=all - - # Similarity function: COS, L2, IP, all - SIMILARITY=COS + # Leave ALGORITHM and SIMILARITY unset to run all combinations ``` Replace the placeholder values with your Azure resource information: @@ -271,7 +267,8 @@ select-algorithm-quickstart/ │ └── azure/ │ └── documentdb/ │ └── selectalgorithm/ -│ ├── SelectAlgorithm.java # Main comparison logic +│ ├── CompareAll.java # Main comparison logic +│ ├── Main.java # Entry point that runs CompareAll │ └── Utils.java # Shared utility functions ├── pom.xml # Maven dependencies └── .env # Environment variables @@ -329,95 +326,24 @@ This main comparison logic provides: Verify: The build output ends with `BUILD SUCCESS`. -2. Run the comparison for all algorithms with cosine similarity (default): - - ```bash - mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.SelectAlgorithm" - ``` - -3. 
Run the comparison for a specific algorithm: - - ### [Bash](#tab/bash) - - ```bash - # Test only DiskANN - ALGORITHM=diskann mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.SelectAlgorithm" - - # Test only HNSW - ALGORITHM=hnsw mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.SelectAlgorithm" - - # Test only IVF - ALGORITHM=ivf mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.SelectAlgorithm" - ``` - - ### [PowerShell](#tab/powershell) - - ```powershell - # Test only DiskANN - $env:ALGORITHM="diskann" - mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.SelectAlgorithm" - - # Test only HNSW - $env:ALGORITHM="hnsw" - mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.SelectAlgorithm" - - # Test only IVF - $env:ALGORITHM="ivf" - mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.SelectAlgorithm" - ``` - - --- - -4. Run the comparison for all similarity functions: - - ### [Bash](#tab/bash) +2. Run the comparison entry point. 
`Main.java` calls `CompareAll.run()` and executes all 9 algorithm × similarity combinations: ```bash - # Test all algorithms with all similarity functions - ALGORITHM=all SIMILARITY=all mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.SelectAlgorithm" - - # Test DiskANN with all similarity functions - ALGORITHM=diskann SIMILARITY=all mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.SelectAlgorithm" + mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" ``` - ### [PowerShell](#tab/powershell) - - ```powershell - # Test all algorithms with all similarity functions - $env:ALGORITHM="all" - $env:SIMILARITY="all" - mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.SelectAlgorithm" - - # Test DiskANN with all similarity functions - $env:ALGORITHM="diskann" - $env:SIMILARITY="all" - mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.SelectAlgorithm" - ``` - - --- - -5. Run the comparison for a specific similarity function: +3. Leave `ALGORITHM` and `SIMILARITY` unset to run all combinations. 
### [Bash](#tab/bash) ```bash - # Test all algorithms with L2 (Euclidean) distance - SIMILARITY=L2 mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.SelectAlgorithm" - - # Test all algorithms with IP (inner product) - SIMILARITY=IP mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.SelectAlgorithm" + mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" ``` ### [PowerShell](#tab/powershell) ```powershell - # Test all algorithms with L2 (Euclidean) distance - $env:SIMILARITY="L2" - mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.SelectAlgorithm" - - # Test all algorithms with IP (inner product) - $env:SIMILARITY="IP" - mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.SelectAlgorithm" + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" ``` --- diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java index f632350..65b17b4 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -83,14 +83,15 @@ public static void run() { createIndex(database, collection, vectorField, dimensions, algo, metric); System.out.printf(" ✓ %s created%n", indexName); - // 3. Wait for index to build - try { Thread.sleep(5000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } - - // 4. Search - List searchResults = performSearch( - collection, vectorAsDoubles, vectorField, topK); + // 3. 
Search with retries while the index becomes available + List searchResults = performSearchWithRetry( + collection, vectorAsDoubles, vectorField, topK, indexName); + if (searchResults.isEmpty()) { + results.add(new SearchResult(algo.toUpperCase(), metric, "(failed)", 0.0, "(failed)", 0.0)); + continue; + } - // 5. Extract top 2 results + // 4. Extract top 2 results String top1Name = "-"; double top1Score = 0.0; String top2Name = "-"; double top2Score = 0.0; if (!searchResults.isEmpty()) { @@ -192,6 +193,37 @@ private static List performSearch(MongoCollection collection return results; } + private static List performSearchWithRetry(MongoCollection collection, + List vectorAsDoubles, + String vectorField, + int topK, + String indexName) { + int maxRetries = 5; + int retryDelayMs = 2000; + + for (int attempt = 0; attempt <= maxRetries; attempt++) { + List results = performSearch(collection, vectorAsDoubles, vectorField, topK); + if (!results.isEmpty()) { + return results; + } + + if (attempt < maxRetries) { + System.out.printf(" No results for %s yet. Retrying in 2 seconds (%d/%d)...%n", + indexName, attempt + 1, maxRetries); + try { + Thread.sleep(retryDelayMs); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + } + + System.out.printf(" Search for %s did not return results after %d retries. 
Recording as failed.%n", + indexName, maxRetries); + return List.of(); + } + private static void printComparisonTable(List results) { System.out.println("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐"); System.out.printf("│ %-9s│ %-7s│ %-27s│ %-7s│ %-27s│ %-7s│ %-6s│%n", diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md index 8a6dd10..df64f04 100644 --- a/ai/select-algorithm-python/quickstart.md +++ b/ai/select-algorithm-python/quickstart.md @@ -33,14 +33,14 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ ```bash mkdir -p data - curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/main/data/Hotels_Vector.json + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json ``` ### [PowerShell](#tab/powershell) ```powershell New-Item -ItemType Directory -Force -Path data - Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/main/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" ``` --- @@ -138,7 +138,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com # Data File Paths and Vector Configuration - DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json EMBEDDED_FIELD=DescriptionVector EMBEDDING_DIMENSIONS=1536 LOAD_SIZE_BATCH=100 @@ -149,12 +149,8 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ # Azure DocumentDB Database Name AZURE_DOCUMENTDB_DATABASENAME=Hotels - # Algorithm Selection (used by compare_all.py) - # 
ALGORITHM: "all" | "diskann" | "hnsw" | "ivf" - ALGORITHM=all + # Leave ALGORITHM and SIMILARITY unset to run all combinations - # SIMILARITY: "all" | "COS" | "L2" | "IP" - SIMILARITY=COS ``` For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: @@ -260,7 +256,7 @@ Vector Algorithm Comparison Initializing MongoDB and Azure OpenAI clients... -Loading data from ../data/Hotels_Vector.json... +Loading data from data/Hotels_Vector.json... Loaded 50 documents Generating query embedding... Query embedding: 1536 dimensions @@ -333,49 +329,24 @@ Closing database connection... Database connection closed ``` -### Test specific combinations +### Run all combinations -To override environment variables at the command line: +Leave `ALGORITHM` and `SIMILARITY` unset to run all 9 combinations (3 algorithms × 3 similarity functions): ### [Bash](#tab/bash) ```bash -# Test only DiskANN across all similarity functions -ALGORITHM=diskann SIMILARITY=all python src/compare_all.py -``` - -```bash -# Test all algorithms with L2 distance -ALGORITHM=all SIMILARITY=L2 python src/compare_all.py -``` - -```bash -# Test HNSW with inner product -ALGORITHM=hnsw SIMILARITY=IP python src/compare_all.py +python src/compare_all.py ``` ### [PowerShell](#tab/powershell) ```powershell -# Test only DiskANN across all similarity functions -$env:ALGORITHM="diskann"; $env:SIMILARITY="all"; python src/compare_all.py -``` - -```powershell -# Test all algorithms with L2 distance -$env:ALGORITHM="all"; $env:SIMILARITY="L2"; python src/compare_all.py -``` - -```powershell -# Test HNSW with inner product -$env:ALGORITHM="hnsw"; $env:SIMILARITY="IP"; python src/compare_all.py +python src/compare_all.py ``` --- -> [!NOTE] -> When using `SIMILARITY=all`, the script tests all three similarity functions (COS, L2, IP) for each selected algorithm. 
Combined with `ALGORITHM=all`, this runs all 9 combinations (3 algorithms × 3 similarity functions). Each combination creates a separate collection, so the full run takes longer. - ### Understanding the results The comparison table helps you choose the best configuration for your workload: diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py index b5c22d8..0bee778 100644 --- a/ai/select-algorithm-python/src/compare_all.py +++ b/ai/select-algorithm-python/src/compare_all.py @@ -115,6 +115,35 @@ def vector_search_with_index(collection, query_embedding: List[float], return results +def vector_search_with_retry(collection, query_embedding: List[float], + vector_field: str, top_k: int, + index_name_value: str) -> List[Dict[str, Any]]: + """Retry vector search until results are available or retries are exhausted.""" + max_retries = 5 + retry_delay_seconds = 2 + + for attempt in range(max_retries + 1): + results = vector_search_with_index( + collection, query_embedding, vector_field, top_k + ) + if results: + return results + + if attempt < max_retries: + print( + f" No results for '{index_name_value}' yet. " + f"Retrying in {retry_delay_seconds} seconds " + f"({attempt + 1}/{max_retries})..." + ) + time.sleep(retry_delay_seconds) + + print( + f" Search for '{index_name_value}' did not return results " + f"after {max_retries} retries. Recording as failed." 
+ ) + return [] + + def main(): print("=" * 70) print(" Compare All Algorithms — 9 Combinations") @@ -165,12 +194,22 @@ def main(): config["dimensions"], metric, extra_params ) print(f" Created index '{name}'") - time.sleep(5) # Increased wait time - # Search (no index name needed) - results = vector_search_with_index( - collection, query_embedding, config["vector_field"], top_k + results = vector_search_with_retry( + collection, query_embedding, config["vector_field"], top_k, name ) + if not results: + table_rows.append([ + algo_label, + metric, + "(failed)", + f"{0:.4f}", + "(failed)", + f"{0:.4f}", + f"{0:.4f}", + ]) + continue + top1_name = results[0].get("document", results[0]).get("HotelName", "Unknown") if len(results) > 0 else "(no results)" top1_score = results[0].get("score", 0) if len(results) > 0 else 0 top2_name = results[1].get("document", results[1]).get("HotelName", "Unknown") if len(results) > 1 else "(no results)" diff --git a/ai/select-algorithm-typescript/quickstart.md b/ai/select-algorithm-typescript/quickstart.md index c17038f..7662df4 100644 --- a/ai/select-algorithm-typescript/quickstart.md +++ b/ai/select-algorithm-typescript/quickstart.md @@ -164,7 +164,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ "type": "module", "scripts": { "build": "tsc", - "start": "node --env-file .env dist/select-algorithm.js" + "start": "node --env-file .env dist/compare-all.js" } } ``` @@ -177,7 +177,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com # Data File Paths and Vector Configuration - DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json EMBEDDED_FIELD=DescriptionVector EMBEDDING_DIMENSIONS=1536 LOAD_SIZE_BATCH=100 @@ -188,12 +188,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ # Azure DocumentDB Database Name 
AZURE_DOCUMENTDB_DATABASENAME=Hotels - # Algorithm Selection (used by select-algorithm.ts) - # ALGORITHM: "all" | "diskann" | "hnsw" | "ivf" - ALGORITHM=all - - # SIMILARITY: "all" | "COS" | "L2" | "IP" - SIMILARITY=all + # Leave ALGORITHM and SIMILARITY unset to run all combinations ``` Verify the `.env` file was created: @@ -230,7 +225,7 @@ Create the following project structure: │ └── Hotels_Vector.json # Hotel data with vector embeddings └── select-algorithm-typescript/ ├── src/ - │ ├── select-algorithm.ts # Main comparison script + │ ├── compare-all.ts # Main comparison script │ └── utils.ts # Shared utility functions ├── tsconfig.json ├── package.json @@ -255,9 +250,9 @@ New-Item -ItemType Directory -Name src ## Create the algorithm comparison code -Create the `src/select-algorithm.ts` file with the following code: +Create the `src/compare-all.ts` file with the following code: -:::code language="typescript" source="~/../documentdb-samples/ai/select-algorithm-typescript/src/select-algorithm.ts" ::: +:::code language="typescript" source="~/../documentdb-samples/ai/select-algorithm-typescript/src/compare-all.ts" ::: This script orchestrates the algorithm comparison by: @@ -345,15 +340,11 @@ Database connection closed > [!NOTE] > Latency values are approximate and vary by environment. Scores may differ slightly depending on your Azure OpenAI embedding deployment. 
-### Test individual algorithms +### Run all combinations -To test a specific algorithm, update the `ALGORITHM` and `SIMILARITY` values in your `.env` file: +Leave `ALGORITHM` and `SIMILARITY` unset to run all 9 algorithm × similarity combinations: ```bash -# Edit .env to set specific values, for example: -# ALGORITHM=ivf -# SIMILARITY=COS - npm run build npm start ``` diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts index 9f0abaa..00fbfd9 100644 --- a/ai/select-algorithm-typescript/src/compare-all.ts +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -121,37 +121,48 @@ async function main() { await db.command(indexOptions); console.log(` ✓ ${indexName} created`); - // 3. Wait for index to be ready - await new Promise(r => setTimeout(r, 5000)); + // 3. Search with bounded retry while the new index becomes ready + const searchPipeline = [ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: baseConfig.embeddedField, + k: topK + } + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]; - // 4. 
Search with retry (index may need more time) let searchResults: any[] = []; - for (let attempt = 0; attempt < 3; attempt++) { - if (attempt > 0) { - await new Promise(r => setTimeout(r, 5000)); - } + let lastSearchError: unknown; + await new Promise(r => setTimeout(r, 1000)); + for (let attempt = 1; attempt <= 5; attempt++) { try { - searchResults = await collection.aggregate([ - { - $search: { - cosmosSearch: { - vector: queryVector, - path: baseConfig.embeddedField, - k: topK - } - } - }, - { - $project: { - score: { $meta: 'searchScore' }, - document: '$$ROOT' - } - } - ]).toArray(); - if (searchResults.length > 0) break; + searchResults = await collection.aggregate(searchPipeline).toArray(); + if (searchResults.length > 0 || attempt === 5) { + break; + } + console.log(` ...search returned no results yet, retrying (${attempt}/5)`); } catch (e) { - if (attempt === 2) throw e; + lastSearchError = e; + if (attempt === 5) { + throw e; + } + console.log(` ...search not ready yet, retrying (${attempt}/5)`); } + + await new Promise(r => setTimeout(r, 2000)); + } + + if (searchResults.length === 0 && lastSearchError) { + throw lastSearchError; } // Record top 2 results diff --git a/ai/select-algorithm-typescript/src/select-algorithm.ts b/ai/select-algorithm-typescript/src/select-algorithm.ts index 8596448..38451af 100644 --- a/ai/select-algorithm-typescript/src/select-algorithm.ts +++ b/ai/select-algorithm-typescript/src/select-algorithm.ts @@ -24,9 +24,16 @@ if (missing.length > 0) { type Algorithm = 'diskann' | 'hnsw' | 'ivf'; type Similarity = 'COS' | 'L2' | 'IP'; +type SimilarityEnv = 'cos' | 'l2' | 'ip'; const ALGORITHMS: Algorithm[] = ['diskann', 'hnsw', 'ivf']; const SIMILARITIES: Similarity[] = ['COS', 'L2', 'IP']; +const SIMILARITY_ENV_VALUES: SimilarityEnv[] = ['cos', 'l2', 'ip']; +const SIMILARITY_BY_ENV: Record<SimilarityEnv, Similarity> = { + cos: 'COS', + l2: 'L2', + ip: 'IP', +}; const ALGORITHM_LABELS: Record<Algorithm, string> = { diskann: 'DiskANN', @@ -120,27 +127,37 @@ function
getSearchPipeline( /** * Determine which collections to create/query based on ALGORITHM and SIMILARITY env vars. + * Leave either env var unset or empty to run all valid combinations. + * Valid ALGORITHM values: ivf, hnsw, diskann + * Valid SIMILARITY values: cos, l2, ip * Collection naming: hotels_{algorithm}_{similarity} */ function getTargetCollections( algorithmEnv: string, similarityEnv: string ): Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> { - const algorithms: Algorithm[] = - !algorithmEnv ? ALGORITHMS : [algorithmEnv as Algorithm]; - const similarities: Similarity[] = - !similarityEnv ? SIMILARITIES : [similarityEnv as Similarity]; + const algorithms: Algorithm[] = !algorithmEnv + ? ALGORITHMS + : (() => { + if (!ALGORITHMS.includes(algorithmEnv as Algorithm)) { + throw new Error(`Invalid ALGORITHM '${algorithmEnv}'. Must be one of: ${ALGORITHMS.join(', ')}`); + } + return [algorithmEnv as Algorithm]; + })(); + + const similarities: Similarity[] = !similarityEnv + ? SIMILARITIES + : (() => { + if (!SIMILARITY_ENV_VALUES.includes(similarityEnv as SimilarityEnv)) { + throw new Error(`Invalid SIMILARITY '${similarityEnv}'. Must be one of: ${SIMILARITY_ENV_VALUES.join(', ')}`); + } + return [SIMILARITY_BY_ENV[similarityEnv as SimilarityEnv]]; + })(); const targets: Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> = []; for (const alg of algorithms) { - if (!ALGORITHMS.includes(alg)) { - throw new Error(`Invalid ALGORITHM '${alg}'. Must be one of: ${ALGORITHMS.join(', ')}`); - } for (const sim of similarities) { - if (!SIMILARITIES.includes(sim)) { - throw new Error(`Invalid SIMILARITY '${sim}'. 
Must be one of: ${SIMILARITIES.join(', ')}`); - } targets.push({ collectionName: `hotels_${alg}_${sim.toLowerCase()}`, algorithm: alg, @@ -170,15 +187,15 @@ async function main() { const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; const batchSize = parseInt(process.env.LOAD_SIZE_BATCH || '100', 10); const algorithmEnv = (process.env.ALGORITHM || '').trim().toLowerCase(); - const similarityEnv = (process.env.SIMILARITY || '').trim().toUpperCase(); + const similarityEnv = (process.env.SIMILARITY || '').trim().toLowerCase(); const searchQuery = 'quintessential lodging near running trails, eateries, retail'; const targets = getTargetCollections(algorithmEnv, similarityEnv); console.log(`\n🔬 Vector Algorithm Comparison`); console.log(` Database: ${dbName}`); - console.log(` Algorithms: ${algorithmEnv}`); - console.log(` Similarity: ${similarityEnv}`); + console.log(` Algorithms: ${algorithmEnv || ALGORITHMS.join(', ')}`); + console.log(` Similarity: ${similarityEnv || SIMILARITY_ENV_VALUES.join(', ')}`); console.log(` Collections to query: ${targets.map(t => t.collectionName).join(', ')}`); console.log(` Search query: "${searchQuery}"\n`); @@ -212,6 +229,7 @@ async function main() { searchResults: any[]; latencyMs: number; }> = []; + const failedTargets: Array<{ collectionName: string; error: string }> = []; for (const target of targets) { console.log(`\n━━━ ${ALGORITHM_LABELS[target.algorithm]} / ${target.similarity} ━━━`); @@ -263,13 +281,25 @@ async function main() { console.log(`✓ ${searchResults.length} results, ${latencyMs}ms`); } catch (error) { - console.error(`✗ Error with ${target.collectionName}:`, (error as Error).message); + const message = (error as Error).message; + failedTargets.push({ collectionName: target.collectionName, error: message }); + console.error(`✗ Error with ${target.collectionName}:`, message); + } + } + + if (failedTargets.length > 0) { + console.error(`\nFailure summary: ${failedTargets.length} of ${targets.length} target 
collection(s) failed.`); + for (const failure of failedTargets) { + console.error(` - ${failure.collectionName}: ${failure.error}`); } } // Print comparison table if (comparisonResults.length > 0) { printComparisonTable(comparisonResults); + } else { + console.error('\nNo comparison results were produced. All target collections failed.'); + process.exitCode = 1; } } catch (error) { console.error('App failed:', error); From 965681d3c4cfbb9a703cfc40cec6c857baac81e6 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 13:19:14 -0700 Subject: [PATCH 09/11] fix: cross-language consistency - exit codes, quickstart accuracy, env patterns - Add exit-code-on-all-fail to .NET, Java, Python (matching Go/TS) - Replace all 5 quickstart output blocks with actual output/*.txt content - Fix file tree layouts to match actual project structure - Fix version refs: .NET 8 (not 9), Java 17 (not 21) - Remove dotenv/.env-file patterns (Java dotenv, TS --env-file) - Fix devcontainer extensions: vscode-cosmosdb -> vscode-documentdb - Fix Python CosmosDB branding -> DocumentDB - Standardize TS retry to 6 attempts, remove fixed waits - Make TS scalar indexes optional (skip in compare-all) - Clarify compare-all always runs 9 combos (ignores ALGORITHM/SIMILARITY) - Add Diff column explanation to all quickstarts Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .devcontainer/typescript/devcontainer.json | 6 +- .../.devcontainer/devcontainer.json | 2 +- ai/select-algorithm-dotnet/CompareAll.cs | 12 + ai/select-algorithm-dotnet/quickstart.md | 173 ++++++------ ai/select-algorithm-go/quickstart.md | 168 +++++------- ai/select-algorithm-java/quickstart.md | 250 +++++++++--------- .../selectalgorithm/CompareAll.java | 14 +- ai/select-algorithm-python/quickstart.md | 216 +++++++-------- ai/select-algorithm-python/src/compare_all.py | 8 + ai/select-algorithm-python/src/utils.py | 4 +- ai/select-algorithm-typescript/package.json | 2 +- 
ai/select-algorithm-typescript/quickstart.md | 177 ++++++------- .../src/compare-all.ts | 21 +- ai/select-algorithm-typescript/src/utils.ts | 20 +- 14 files changed, 511 insertions(+), 562 deletions(-) diff --git a/.devcontainer/typescript/devcontainer.json b/.devcontainer/typescript/devcontainer.json index d627844..a4db17d 100644 --- a/.devcontainer/typescript/devcontainer.json +++ b/.devcontainer/typescript/devcontainer.json @@ -11,10 +11,8 @@ "customizations": { "vscode": { "extensions": [ - "ms-azuretools.vscode-cosmosdb", - "buildwithlayer.mongodb-integration-expert-qS6DB", - "mongodb.mongodb-vscode", - "ms-azuretools.vscode-documentdb" + "ms-azuretools.vscode-documentdb", + "mongodb.mongodb-vscode" ] } } diff --git a/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json index aafd623..fcda282 100644 --- a/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json +++ b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json @@ -18,7 +18,7 @@ "ms-dotnettools.csdevkit", "ms-dotnettools.vscodeintellicode-csharp", "ms-azuretools.vscode-azureresourcegroups", - "ms-azuretools.vscode-cosmosdb", + "ms-azuretools.vscode-documentdb", "mongodb.mongodb-vscode" ], "settings": { diff --git a/ai/select-algorithm-dotnet/CompareAll.cs b/ai/select-algorithm-dotnet/CompareAll.cs index 86e4117..9eb9c75 100644 --- a/ai/select-algorithm-dotnet/CompareAll.cs +++ b/ai/select-algorithm-dotnet/CompareAll.cs @@ -107,8 +107,20 @@ public static void Run(AppConfiguration appConfig) results.Add(new SearchResult(algoDisplay, config.Similarity, top1Name, top1Score, top2Name, top2Score)); } + var successCount = results.Count(r => r.Top1Name != "(failed)"); + // Print comparison table PrintComparisonTable(results); + + if (successCount == 0) + { + Console.WriteLine("\n❌ All 9 comparisons failed — no algorithm returned results."); + Environment.ExitCode = 1; + } + else + { + Console.WriteLine($"\nSummary: {successCount} 
succeeded, {9 - successCount} failed"); + } } finally { diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md index a0828b2..b9a9997 100644 --- a/ai/select-algorithm-dotnet/quickstart.md +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -23,7 +23,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ [!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] -- [.NET 9.0 SDK](https://dotnet.microsoft.com/download/dotnet/9.0) or later. .NET 9.0 is a Standard Term Support (STS) release. Use the latest available .NET SDK for long-term production workloads. +- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later. ## Create data file with vectors @@ -86,7 +86,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ ```bash mkdir select-algorithm-dotnet cd select-algorithm-dotnet - dotnet new console --framework net9.0 + dotnet new console --framework net8.0 ``` ### [PowerShell](#tab/powershell) @@ -94,7 +94,7 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ ```powershell New-Item -ItemType Directory -Name select-algorithm-dotnet Set-Location select-algorithm-dotnet - dotnet new console --framework net9.0 + dotnet new console --framework net8.0 ``` --- @@ -233,19 +233,26 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ Continue the project by creating code files for vector search comparison. 
When you are done, the project structure should look like this: ``` +select-algorithm-dotnet/ +├── .devcontainer/ +│ └── devcontainer.json ├── data/ -│ └── Hotels_Vector.json # Hotel data with vector embeddings -└── select-algorithm-dotnet/ - ├── CompareAll.cs # Main comparison logic - ├── Models/ - │ ├── Configuration.cs # Configuration models - │ └── HotelData.cs # Hotel data model - ├── Utilities/ - │ └── AzureIdentityTokenHandler.cs # OIDC token handler - ├── Program.cs # Main application entry point - ├── Utils.cs # Shared utility functions - ├── appsettings.json # Configuration settings - └── SelectAlgorithm.csproj # Project file +│ └── README.md +├── Models/ +│ ├── Configuration.cs +│ └── HotelData.cs +├── output/ +│ └── compare_all.txt +├── Utilities/ +│ └── AzureIdentityTokenHandler.cs +├── .gitignore +├── appsettings.json +├── CompareAll.cs +├── Program.cs +├── quickstart.md +├── README.md +├── SelectAlgorithm.csproj +└── Utils.cs ``` 1. Create the directory structure: @@ -361,21 +368,7 @@ These supporting files provide: The application loads the sample data once, then creates and tests all 9 algorithm × similarity combinations sequentially. -3. Leave `ALGORITHM` and `SIMILARITY` unset to compare all 9 algorithm × similarity combinations. - - ### [Bash](#tab/bash) - - ```bash - dotnet run - ``` - - ### [PowerShell](#tab/powershell) - - ```powershell - dotnet run - ``` - - --- +3. The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. 4. Repeat `dotnet run` whenever you want to rerun the flat `SelectAlgorithm.csproj` entry point: @@ -395,78 +388,60 @@ These supporting files provide: ### Expected output -The application displays progress logs and a comparison table. 
Results vary based on data and server load: +The application displays progress logs and a comparison table: ``` -Vector Algorithm Comparison - Database: Hotels - Algorithms: all - Similarity: COS - Collections to query: hotels_diskann_cos, hotels_hnsw_cos, hotels_ivf_cos - Search query: "quintessential lodging near running trails, eateries, retail" - -Generating query embedding... -Query embedding: 1536 dimensions - ---- DiskANN / COS --- -Collection: hotels_diskann_cos -Created collection: hotels_diskann_cos -Inserted: 50/50 -Created vector index: vectorIndex_diskann_cos -Executing vector search... -[OK] 5 results, 45ms - ---- HNSW / COS --- -Collection: hotels_hnsw_cos -Created collection: hotels_hnsw_cos -Inserted: 50/50 -Created vector index: vectorIndex_hnsw_cos -Executing vector search... -[OK] 5 results, 38ms - ---- IVF / COS --- -Collection: hotels_ivf_cos -Created collection: hotels_ivf_cos -Inserted: 50/50 -Created vector index: vectorIndex_ivf_cos -Executing vector search... -[OK] 5 results, 52ms - -========================================================================================== - Vector Algorithm Comparison Results -========================================================================================== -Algorithm Similarity Top 1 Result Score Top 2 Result Score Diff ------------------------------------------------------------------------------------------------------------------------- -DiskANN COS Historic Downtown Inn 0.8342 Mountain Trail Lodge 0.7891 0.0451 -HNSW COS Historic Downtown Inn 0.8342 Mountain Trail Lodge 0.7891 0.0451 -IVF COS Historic Downtown Inn 0.8342 Mountain Trail Lodge 0.7891 0.0451 -========================================================================================== - ---- DiskANN / COS (hotels_diskann_cos) --- - 1. Historic Downtown Inn, Score: 0.8342 - 2. Mountain Trail Lodge, Score: 0.7891 - 3. Riverside Retreat, Score: 0.7654 - 4. Urban Fitness Suites, Score: 0.7210 - 5. 
Lakeside Wellness Resort, Score: 0.7045 - Latency: 45ms - ---- HNSW / COS (hotels_hnsw_cos) --- - 1. Historic Downtown Inn, Score: 0.8342 - 2. Mountain Trail Lodge, Score: 0.7891 - 3. Riverside Retreat, Score: 0.7654 - 4. Urban Fitness Suites, Score: 0.7210 - 5. Lakeside Wellness Resort, Score: 0.7045 - Latency: 38ms - ---- IVF / COS (hotels_ivf_cos) --- - 1. Historic Downtown Inn, Score: 0.8342 - 2. Mountain Trail Lodge, Score: 0.7891 - 3. Riverside Retreat, Score: 0.7654 - 4. Urban Fitness Suites, Score: 0.7210 - 5. Lakeside Wellness Resort, Score: 0.7045 - Latency: 52ms +============================================================ + Compare All Algorithms × Metrics + 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP +============================================================ +Dropped existing 'hotels' collection (if any) + +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Query: "luxury hotel near the beach" +Top K: 5 +Embedding generated (reused for all searches) + +Running 9 algorithm × metric combinations... 
+ ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 
0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection 'hotels' ``` +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. + [!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] ## Troubleshooting @@ -475,7 +450,7 @@ IVF COS Historic Downtown Inn 0.8342 Mountain Tr |-------|----------| | `TimeoutException` during connection | Verify your connection string and environment variables. Ensure your IP is in the DocumentDB firewall rules. | | `AuthenticationException` | Check that `DefaultAzureCredential` can acquire a token. Run `az login` to refresh your credentials. | -| Build errors with .NET version | Ensure you have .NET 9.0 or later installed. Run `dotnet --version` to check. | +| Build errors with .NET version | Ensure you have .NET 8.0 or later installed. Run `dotnet --version` to check. | | `BsonSerializationException` | Ensure your model classes match the document structure in the collection. | | Empty search results | The vector index might not be ready yet. The sample includes retry logic, but if you still see empty results, wait a few seconds and retry. | | `IndexOptionsConflict` (code 85) | DocumentDB doesn't allow multiple vector indexes of the same kind on the same field. Drop the existing index before creating a new one. 
| diff --git a/ai/select-algorithm-go/quickstart.md b/ai/select-algorithm-go/quickstart.md index 1d37476..bd9963b 100644 --- a/ai/select-algorithm-go/quickstart.md +++ b/ai/select-algorithm-go/quickstart.md @@ -169,7 +169,8 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ # DocumentDB Configuration DOCUMENTDB_CLUSTER_NAME=your-cluster-name - # Leave ALGORITHM and SIMILARITY unset to run all combinations + # The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). + # The ALGORITHM and SIMILARITY environment variables are used only by the single-algorithm mode. # Database name AZURE_DOCUMENTDB_DATABASENAME=Hotels @@ -220,16 +221,20 @@ New-Item -ItemType File -Path src/main.go When you're done, the project structure should look like this: -``` +```text +select-algorithm-go/ ├── data/ -│ ├── Hotels.json # Source hotel data (without vectors) -│ └── Hotels_Vector.json # Hotel data with vector embeddings -└── select-algorithm-go/ - ├── src/ - │ └── main.go # Main application comparing all algorithms - ├── go.mod # Go module dependencies - ├── go.sum # Dependency checksums - └── .env # Environment configuration +│ └── README.md +├── output/ +│ └── compare_all.txt +├── src/ +│ ├── compare_all.go +│ ├── main.go +│ └── utils.go +├── .gitignore +├── go.mod +├── quickstart.md +└── README.md ``` ## Create the algorithm comparison code @@ -253,9 +258,9 @@ This code provides a complete vector algorithm comparison application with these - **Passwordless authentication**: Uses `DefaultAzureCredential` for both Azure OpenAI and DocumentDB via OIDC - **Three vector algorithms**: Implements DiskANN, HNSW, and IVF with algorithm-specific tuning parameters - **Three similarity functions**: Supports COS (cosine), L2 (Euclidean), and IP (inner product) -- **Single compare-all entry point**: Runs all 9 algorithm × similarity combinations in one pass -- **Performance measurement**: Tracks query latency for each algorithm/similarity 
pair -- **Comparison output**: Generates a formatted table showing results side by side +- **Single compare-all entry point**: Always runs all 9 algorithm × similarity combinations in one pass +- **Index lifecycle automation**: Creates, queries, and drops each vector index in sequence +- **Comparison output**: Generates a formatted table showing the top two results and score gap for each combination - **Production-ready patterns**: Includes batched insertion, error handling, and connection pooling ## Run the code @@ -289,105 +294,76 @@ go run ./src/ The application will: 1. Connect to Azure DocumentDB and Azure OpenAI using passwordless authentication -2. Create separate collections for each algorithm/similarity combination -3. Insert the hotel data into each collection -4. Create a vector index on each collection with algorithm-specific parameters -5. Generate an embedding for the search query -6. Execute vector searches across all collections -7. Display a comparison table with results and latencies +2. Load the hotel data and insert it into the `hotels` collection +3. Generate an embedding for the search query +4. Run all 9 vector index comparisons by creating, querying, and dropping each index in sequence +5. Display a comparison table with the top two results and score gap for each combination +6. Drop the `hotels` collection during cleanup Expected output: -``` -Vector Algorithm Comparison - Database: Hotels - Algorithms: all - Similarity: COS - Collections to query: hotels_diskann_cos, hotels_hnsw_cos, hotels_ivf_cos - Search query: "quintessential lodging near running trails, eateries, retail" - -Initializing MongoDB and Azure OpenAI clients... +```text +====================================================================== + COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations) +====================================================================== +Query: "luxury hotel near the beach" +Top-K: 5 + Loading data from data/Hotels_Vector.json... 
-Loaded 50 documents -Generating query embedding... -Query embedding: 1536 dimensions - -━━━ DiskANN / COS ━━━ -Collection: hotels_diskann_cos -Created collection: hotels_diskann_cos -Inserted: 50/50 -Created vector index: vectorIndex_diskann_cos -Executing vector search... -[OK] 5 results, 42ms - -━━━ HNSW / COS ━━━ -Collection: hotels_hnsw_cos -Created collection: hotels_hnsw_cos -Inserted: 50/50 -Created vector index: vectorIndex_hnsw_cos -Executing vector search... -[OK] 5 results, 38ms - -━━━ IVF / COS ━━━ -Collection: hotels_ivf_cos -Created collection: hotels_ivf_cos -Inserted: 50/50 -Created vector index: vectorIndex_ivf_cos -Executing vector search... -[OK] 5 results, 35ms - -╔══════════════════════════════════════════════════════════════════════════════════╗ -║ Vector Algorithm Comparison Results ║ -╠══════════════════════════════════════════════════════════════════════════════════╣ -║ Algorithm Similarity Top Result Score Latency(ms) ║ -╠══════════════════════════════════════════════════════════════════════════════════╣ -║ DiskANN COS Secret Point Motel 0.8562 42 ║ -║ HNSW COS Secret Point Motel 0.8562 38 ║ -║ IVF COS Secret Point Motel 0.8562 35 ║ -╚══════════════════════════════════════════════════════════════════════════════════╝ - ---- DiskANN / COS (hotels_diskann_cos) --- - 1. Secret Point Motel, Score: 0.8562 - 2. Countryside Hotel, Score: 0.8457 - 3. Downtown Modern Hotel, Score: 0.8398 - 4. Old Century Hotel, Score: 0.8321 - 5. Save-the-Light Deluxe Inn, Score: 0.8298 - Latency: 42ms - ---- HNSW / COS (hotels_hnsw_cos) --- - 1. Secret Point Motel, Score: 0.8562 - 2. Countryside Hotel, Score: 0.8457 - 3. Downtown Modern Hotel, Score: 0.8398 - 4. Old Century Hotel, Score: 0.8321 - 5. Save-the-Light Deluxe Inn, Score: 0.8298 - Latency: 38ms - ---- IVF / COS (hotels_ivf_cos) --- - 1. Secret Point Motel, Score: 0.8562 - 2. Countryside Hotel, Score: 0.8457 - 3. Downtown Modern Hotel, Score: 0.8398 - 4. Old Century Hotel, Score: 0.8321 - 5. 
Save-the-Light Deluxe Inn, Score: 0.8298 - Latency: 35ms - -Done. +Loaded 50 documents with embeddings +Insertion completed: 50 inserted, 0 failed + +Generating embedding for query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running 9 vector index comparisons (create→search→drop)... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + +Cleanup: dropped collection 'hotels' ``` +The **Diff** column shows the score gap between the top-1 and top-2 results. 
A smaller diff indicates the algorithm found results with more similar relevance scores. + ## Understanding the results The comparison table shows how different algorithms perform on the same dataset with the same query: - **Algorithm**: DiskANN, HNSW, or IVF -- **Similarity**: The distance metric (COS, L2, or IP) -- **Top Result**: The highest-scoring hotel from the search -- **Score**: Similarity score (higher is better for COS and IP, lower is better for L2) -- **Latency**: Query execution time in milliseconds +- **Metric**: The similarity metric (COS, L2, or IP) +- **Top 1 Result**: The highest-ranked hotel for that algorithm and metric +- **Score**: The relevance score for the corresponding result +- **Top 2 Result**: The second-highest-ranked hotel for that algorithm and metric +- **Diff**: The score gap between the top two results [!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] ## Run all combinations -Leave `ALGORITHM` and `SIMILARITY` unset to run all 9 algorithm × similarity combinations. +The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. 
## Troubleshooting diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md index cf674af..2f59830 100644 --- a/ai/select-algorithm-java/quickstart.md +++ b/ai/select-algorithm-java/quickstart.md @@ -21,7 +21,7 @@ This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure D [!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] -- [Java 21 or higher](/java/openjdk/download) +- [Java 17 or higher](/java/openjdk/download) - [Maven 3.6 or higher](https://maven.apache.org/download.cgi) @@ -82,16 +82,16 @@ This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure D ### [Bash](#tab/bash) ```bash - mkdir select-algorithm-quickstart - cd select-algorithm-quickstart + mkdir select-algorithm-java + cd select-algorithm-java code . ``` ### [PowerShell](#tab/powershell) ```powershell - New-Item -ItemType Directory -Name select-algorithm-quickstart - Set-Location select-algorithm-quickstart + New-Item -ItemType Directory -Name select-algorithm-java + Set-Location select-algorithm-java code . ``` @@ -116,64 +116,41 @@ This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure D 3. 
Create a `pom.xml` file in the root directory with the following content: ```xml + 4.0.0 - com.azure.documentdb.samples + com.azure.documentdb select-algorithm-java - 1.0-SNAPSHOT - Azure DocumentDB Vector Algorithm Comparison + 1.0.0 + jar + + DocumentDB Select Algorithm - Java + Demonstrates IVF, HNSW, and DiskANN vector search indexes with Azure DocumentDB - 21 - 21 - 21 + 17 + 17 UTF-8 - - - - com.azure - azure-sdk-bom - 1.2.29 - pom - import - - - - org.mongodb mongodb-driver-sync - 5.6.2 + 5.4.0 com.azure azure-identity + 1.16.0 com.azure azure-ai-openai - - - com.fasterxml.jackson.core - jackson-databind - 2.18.2 - - - io.github.cdimascio - dotenv-java - 3.0.2 - - - org.slf4j - slf4j-simple - 2.0.17 - runtime + 1.0.0-beta.16 @@ -184,94 +161,101 @@ This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure D maven-compiler-plugin 3.13.0 - 21 + 17 + 17 org.codehaus.mojo exec-maven-plugin - 3.1.0 + 3.4.1 com.azure.documentdb.selectalgorithm.Main + + + + compare + + + + org.codehaus.mojo + exec-maven-plugin + 3.4.1 + + com.azure.documentdb.selectalgorithm.CompareAll + + + + + + ``` Verify: Run `mvn dependency:resolve` to confirm all dependencies resolve without errors. -4. 
Create a `.env` filein the project root for environment variables: - - ```bash - # Azure DocumentDB cluster name for passwordless authentication - DOCUMENTDB_CLUSTER_NAME= - - # Azure managed identity principal ID for authentication - AZURE_MANAGED_IDENTITY_PRINCIPAL_ID= - - # Azure OpenAI endpoint and model configuration - AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-openai-resource.openai.azure.com/ - AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small - - # Data file path (relative to project root) - DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json - - # Vector configuration - EMBEDDED_FIELD=DescriptionVector - EMBEDDING_DIMENSIONS=1536 - LOAD_SIZE_BATCH=100 - - # Leave ALGORITHM and SIMILARITY unset to run all combinations - ``` - - Replace the placeholder values with your Azure resource information: - - - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name - - `AZURE_MANAGED_IDENTITY_PRINCIPAL_ID`: Your managed identity principal ID - - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL - - Verify the `.env` file was created: +4. 
Set environment variables in your shell before running the sample: ### [Bash](#tab/bash) ```bash - cat .env + export DOCUMENTDB_CLUSTER_NAME= + export AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com/ + export AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + export AZURE_DOCUMENTDB_DATABASENAME=Hotels + export DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json + export EMBEDDED_FIELD=DescriptionVector + export EMBEDDING_DIMENSIONS=1536 + export LOAD_SIZE_BATCH=100 ``` ### [PowerShell](#tab/powershell) ```powershell - Get-Content .env + $env:DOCUMENTDB_CLUSTER_NAME="" + $env:AZURE_OPENAI_EMBEDDING_ENDPOINT="https://.openai.azure.com/" + $env:AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-small" + $env:AZURE_DOCUMENTDB_DATABASENAME="Hotels" + $env:DATA_FILE_WITH_VECTORS="data/Hotels_Vector.json" + $env:EMBEDDED_FIELD="DescriptionVector" + $env:EMBEDDING_DIMENSIONS="1536" + $env:LOAD_SIZE_BATCH="100" ``` --- - You should see your configuration values including the Azure OpenAI endpoint and cluster name. + Replace the placeholder values with your Azure resource information: - This sample uses passwordless authenticationwith `DefaultAzureCredential`, which requires your identity to have proper RBAC roles assigned. For more information on authentication options, see [Authenticate Java apps to Azure services by using the Azure SDK for Java](/azure/developer/java/sdk/authentication/overview). + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + - `AZURE_OPENAI_EMBEDDING_MODEL`: Your Azure OpenAI embedding deployment name + + The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + + This sample uses passwordless authentication with `DefaultAzureCredential`, which requires your identity to have proper RBAC roles assigned. 
For more information on authentication options, see [Authenticate Java apps to Azure services by using the Azure SDK for Java](/azure/developer/java/sdk/authentication/overview). ## Create code files When you are done, the project structure should look like this: ```text -select-algorithm-quickstart/ +select-algorithm-java/ ├── data/ -│ └── Hotels_Vector.json # Hotel data with vector embeddings -├── src/ -│ └── main/ -│ └── java/ -│ └── com/ -│ └── azure/ -│ └── documentdb/ -│ └── selectalgorithm/ -│ ├── CompareAll.java # Main comparison logic -│ ├── Main.java # Entry point that runs CompareAll -│ └── Utils.java # Shared utility functions -├── pom.xml # Maven dependencies -└── .env # Environment variables +│ └── README.md +├── output/ +│ └── compare_all.txt +├── src/main/java/com/azure/documentdb/selectalgorithm/ +│ ├── CompareAll.java +│ ├── Main.java +│ └── Utils.java +├── .gitignore +├── pom.xml +├── quickstart.md +└── README.md ``` ## Create the algorithm comparison code @@ -284,7 +268,7 @@ Create `src/main/java/com/azure/documentdb/selectalgorithm/Utils.java` and paste This utility class provides: -- **Environment variable management**: Loads configuration from `.env` file or system environment +- **Environment variable management**: Reads configuration from environment variables by using `System.getenv()` - **Passwordless authentication**: Uses `DefaultAzureCredential` for both MongoDB and Azure OpenAI - **MongoDB client creation**: Configures OIDC authentication for DocumentDB - **Azure OpenAI client creation**: Sets up the OpenAI client for embedding generation @@ -326,13 +310,7 @@ This main comparison logic provides: Verify: The build output ends with `BUILD SUCCESS`. -2. Run the comparison entry point. `Main.java` calls `CompareAll.run()` and executes all 9 algorithm × similarity combinations: - - ```bash - mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" - ``` - -3. 
Leave `ALGORITHM` and `SIMILARITY` unset to run all combinations. +2. Run the comparison entry point. `Main.java` calls `CompareAll.run()` and always executes all 9 combinations (3 algorithms × 3 metrics): ### [Bash](#tab/bash) @@ -348,28 +326,64 @@ This main comparison logic provides: --- -The program displays a comparison table showing average latency for each algorithm and similarity function combination: + The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + +The program prints output similar to the following: ```text -================================================================================ -Vector Index Algorithm Comparison Results -================================================================================ -Algorithm Similarity Avg Latency (ms) --------------------------------------------------------------------------------- -DISKANN COS 42.30 -DISKANN IP 38.70 -DISKANN L2 45.10 -HNSW COS 31.50 -HNSW IP 29.80 -HNSW L2 34.20 -IVF COS 55.60 -IVF IP 52.10 -IVF L2 58.90 -================================================================================ +============================================== + Azure DocumentDB - Compare All Algorithms +============================================== + Query: "luxury hotel near the beach" + Top K: 5 + Metrics: COS, L2, IP + Algos: IVF, HNSW, DiskANN + + Loading data from: data/Hotels_Vector.json + Loaded 50 documents + Collection reset. + + Generating embedding for: "luxury hotel near the beach" + Embedding generated (1536 dimensions) + + Running 9 algorithm × metric combinations... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + + Cleanup: dropping comparison collection... 
+ Cleanup: dropped collection 'hotels' + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ 
+└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ ``` -> [!NOTE] -> The latency values shown above are illustrative. Actual results depend on your DocumentDB cluster configuration, region, network latency, and dataset size. +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. ## Understanding the results @@ -379,7 +393,7 @@ IVF L2 58.90 | Issue | Solution | |-------|----------| -| `MongoTimeoutException` | Verify your connection string in `.env`. Ensure your IP is in the DocumentDB firewall rules. | +| `MongoTimeoutException` | Verify the `DOCUMENTDB_CLUSTER_NAME` environment variable, and ensure your IP is in the DocumentDB firewall rules. | | `MongoSecurityException` | Check credentials in connection string. | | Maven build failures | Run `mvn dependency:resolve` to check for missing dependencies. Ensure Java 17+ is installed. | | `No plugin found for prefix 'exec'` | Add `exec-maven-plugin` to your `pom.xml` as shown in this article. 
| diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java index 65b17b4..66281ed 100644 --- a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -108,14 +108,22 @@ public static void run() { } } + // Print comparison table + printComparisonTable(results); + + int successCount = (int) results.stream().filter(r -> !r.top1Name().equals("(failed)")).count(); + if (successCount == 0) { + System.out.println("\n❌ All 9 comparisons failed — no algorithm returned results."); + System.exit(1); + } else { + System.out.printf("%nSummary: %d succeeded, %d failed%n", successCount, 9 - successCount); + } + // Cleanup: drop the comparison collection System.out.println("\n Cleanup: dropping comparison collection..."); collection.drop(); System.out.println(" Cleanup: dropped collection 'hotels'"); } - - // Print comparison table - printComparisonTable(results); } private static void dropVectorIndexes(MongoCollection collection, String vectorField) { diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md index df64f04..f4d02c9 100644 --- a/ai/select-algorithm-python/quickstart.md +++ b/ai/select-algorithm-python/quickstart.md @@ -129,67 +129,64 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ You should see `pymongo` with a version of 4.7 or greater. -4. 
Create a `.env` file for environment variables in the project root: - - ```bash - # Azure OpenAI Embedding Settings - AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small - AZURE_OPENAI_EMBEDDING_API_VERSION=2024-10-21 - AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com - - # Data File Paths and Vector Configuration - DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json - EMBEDDED_FIELD=DescriptionVector - EMBEDDING_DIMENSIONS=1536 - LOAD_SIZE_BATCH=100 - - # Azure DocumentDB Connection Settings - DOCUMENTDB_CLUSTER_NAME= - - # Azure DocumentDB Database Name - AZURE_DOCUMENTDB_DATABASENAME=Hotels - - # Leave ALGORITHM and SIMILARITY unset to run all combinations - - ``` - - For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: - - - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL - - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name - - You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate Python apps to Azure services by using the Azure SDK for Python](/azure/developer/python/sdk/authentication/overview). - - Verify the `.env` file was created: +4. 
Set the required environment variables in your current shell session before you run the sample: ### [Bash](#tab/bash) ```bash - cat .env + export AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + export AZURE_OPENAI_EMBEDDING_API_VERSION=2024-10-21 + export AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com + export DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json + export EMBEDDED_FIELD=DescriptionVector + export EMBEDDING_DIMENSIONS=1536 + export LOAD_SIZE_BATCH=100 + export DOCUMENTDB_CLUSTER_NAME= + export AZURE_DOCUMENTDB_DATABASENAME=Hotels ``` ### [PowerShell](#tab/powershell) ```powershell - Get-Content .env + $env:AZURE_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small" + $env:AZURE_OPENAI_EMBEDDING_API_VERSION = "2024-10-21" + $env:AZURE_OPENAI_EMBEDDING_ENDPOINT = "https://.openai.azure.com" + $env:DATA_FILE_WITH_VECTORS = "data/Hotels_Vector.json" + $env:EMBEDDED_FIELD = "DescriptionVector" + $env:EMBEDDING_DIMENSIONS = "1536" + $env:LOAD_SIZE_BATCH = "100" + $env:DOCUMENTDB_CLUSTER_NAME = "" + $env:AZURE_DOCUMENTDB_DATABASENAME = "Hotels" ``` --- - You should see your connection string and Azure OpenAI endpoint values. + For the passwordless authentication used in this article, replace the placeholder values with your own information: + + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name + + The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + + You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate Python apps to Azure services by using the Azure SDK for Python](/azure/developer/python/sdk/authentication/overview). 
## Create code files Create the following project structure: ``` +select-algorithm-python/ ├── data/ -│ └── Hotels_Vector.json # Hotel data with vector embeddings -└── select-algorithm/ - ├── src/ - │ ├── compare_all.py # Main comparison script - │ └── utils.py # Shared utility functions - └── .env # Environment variables +│ └── README.md +├── output/ +│ └── compare_all.txt +├── src/ +│ ├── compare_all.py +│ └── utils.py +├── .gitignore +├── quickstart.md +├── README.md +└── requirements.txt ``` Create the `src` directory: @@ -238,100 +235,69 @@ The utilities provide essential functions for: ## Run the code -Execute the comparison script to test all algorithms with cosine similarity: +Execute the comparison script to run all 9 combinations: ```bash python src/compare_all.py ``` -The output shows the comparison across all three algorithms: +The output matches `output/compare_all.txt`: ``` -Vector Algorithm Comparison - Database: Hotels - Algorithms: all - Similarity: COS - Collections to query: hotels_diskann_cos, hotels_hnsw_cos, hotels_ivf_cos - Search query: "quintessential lodging near running trails, eateries, retail" - -Initializing MongoDB and Azure OpenAI clients... - -Loading data from data/Hotels_Vector.json... -Loaded 50 documents -Generating query embedding... -Query embedding: 1536 dimensions - ---- DiskANN / COS --- -Collection: hotels_diskann_cos -Created collection: hotels_diskann_cos -Inserting 50 documents in batches of 100... -Batch 1 completed: 50 documents inserted -Inserted: 50/50 -Created vector index: vectorIndex_diskann_cos -Executing vector search... -Success: 5 results, 145ms - ---- HNSW / COS --- -Collection: hotels_hnsw_cos -Created collection: hotels_hnsw_cos -Inserting 50 documents in batches of 100... -Batch 1 completed: 50 documents inserted -Inserted: 50/50 -Created vector index: vectorIndex_hnsw_cos -Executing vector search... 
-Success: 5 results, 132ms - ---- IVF / COS --- -Collection: hotels_ivf_cos -Created collection: hotels_ivf_cos -Inserting 50 documents in batches of 100... -Batch 1 completed: 50 documents inserted -Inserted: 50/50 -Created vector index: vectorIndex_ivf_cos -Executing vector search... -Success: 5 results, 128ms - -========================================================================================== - Vector Algorithm Comparison Results -========================================================================================== -Algorithm Similarity Top Result Score Latency(ms) ------------------------------------------------------------------------------------------- -DiskANN COS Twin Dome Motel 0.8947 145 -HNSW COS Twin Dome Motel 0.8947 132 -IVF COS Twin Dome Motel 0.8947 128 -========================================================================================== - ---- DiskANN / COS (hotels_diskann_cos) --- - 1. Twin Dome Motel, Score: 0.8947 - 2. Triple Landscape Hotel, Score: 0.8898 - 3. Smile Hotel, Score: 0.8855 - 4. Gastronomic Landscape Hotel, Score: 0.8797 - 5. Twin Landscape Resort, Score: 0.8772 - Latency: 145ms - ---- HNSW / COS (hotels_hnsw_cos) --- - 1. Twin Dome Motel, Score: 0.8947 - 2. Triple Landscape Hotel, Score: 0.8898 - 3. Smile Hotel, Score: 0.8855 - 4. Gastronomic Landscape Hotel, Score: 0.8797 - 5. Twin Landscape Resort, Score: 0.8772 - Latency: 132ms - ---- IVF / COS (hotels_ivf_cos) --- - 1. Twin Dome Motel, Score: 0.8947 - 2. Triple Landscape Hotel, Score: 0.8898 - 3. Smile Hotel, Score: 0.8855 - 4. Gastronomic Landscape Hotel, Score: 0.8797 - 5. Twin Landscape Resort, Score: 0.8772 - Latency: 128ms - -Closing database connection... 
-Database connection closed +====================================================================== + Compare All Algorithms — 9 Combinations + (3 Algorithms × 3 Similarity Metrics) +====================================================================== + + Query: "luxury hotel near the beach" + Top K: 5 + +Dropped existing 'hotels' collection (if any) +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Generating embedding for query... +Running 9 vector searches... + + Created index 'vector_ivf_cos' + Created index 'vector_ivf_l2' + Created index 'vector_ivf_ip' + Created index 'vector_hnsw_cos' + Created index 'vector_hnsw_l2' + Created index 'vector_hnsw_ip' + Created index 'vector_diskann_cos' + Created index 'vector_diskann_l2' + Created index 'vector_diskann_ip' ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| Algorithm | Metric | Top 1 Result | Score | Top 2 Result | Score | Diff | ++=============+==========+==========================+=========+===================+=========+========+ +| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | 
++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ + +Cleanup: dropped collection 'hotels' ``` +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. + ### Run all combinations -Leave `ALGORITHM` and `SIMILARITY` unset to run all 9 combinations (3 algorithms × 3 similarity functions): +The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. ### [Bash](#tab/bash) @@ -361,7 +327,7 @@ The comparison table helps you choose the best configuration for your workload: | Issue | Solution | |-------|----------| -| `ServerSelectionTimeoutError` | Verify your connection string in `.env`. Ensure your IP is in the DocumentDB firewall rules. | +| `ServerSelectionTimeoutError` | Verify that your environment variables are set in the current shell. Ensure your IP is in the DocumentDB firewall rules. 
| | `AuthenticationFailed` | Check that your connection string includes the correct username and password, or that your Microsoft Entra token is valid. | | `pymongo.errors.OperationFailure` | Ensure the database and collection exist. Check that the vector index was created successfully. | | `ModuleNotFoundError: No module named 'pymongo'` | Activate your virtual environment and run `pip install "pymongo>=4.7"`. | diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py index 0bee778..1f7dcb1 100644 --- a/ai/select-algorithm-python/src/compare_all.py +++ b/ai/select-algorithm-python/src/compare_all.py @@ -8,6 +8,7 @@ Metrics: COS, L2, IP """ import os +import sys import time from typing import Dict, List, Any @@ -230,6 +231,13 @@ def main(): "Top 2 Result", "Score", "Diff"] print(tabulate(table_rows, headers=headers, tablefmt="grid")) + success_count = sum(1 for row in table_rows if row[2] != "(failed)") + if success_count == 0: + print("\n❌ All 9 comparisons failed — no algorithm returned results.") + sys.exit(1) + else: + print(f"\nSummary: {success_count} succeeded, {9 - success_count} failed") + finally: # Cleanup: drop the comparison collection try: diff --git a/ai/select-algorithm-python/src/utils.py b/ai/select-algorithm-python/src/utils.py index b27e217..de0f8b4 100644 --- a/ai/select-algorithm-python/src/utils.py +++ b/ai/select-algorithm-python/src/utils.py @@ -4,10 +4,10 @@ import warnings from typing import Dict, List, Any, Optional, Tuple -# Suppress the PyMongo CosmosDB cluster detection warning +# Suppress the PyMongo DocumentDB cluster detection warning warnings.filterwarnings( "ignore", - message="You appear to be connected to a CosmosDB cluster.*", + message="You appear to be connected to a DocumentDB cluster.*", ) from pymongo import MongoClient, InsertOne diff --git a/ai/select-algorithm-typescript/package.json b/ai/select-algorithm-typescript/package.json index df5b82b..5c1f24a 100644 --- 
a/ai/select-algorithm-typescript/package.json +++ b/ai/select-algorithm-typescript/package.json @@ -6,7 +6,7 @@ "scripts": { "env:init": "azd env get-values > .env", "build": "tsc", - "start": "node --env-file .env dist/compare-all.js" + "start": "node dist/compare-all.js" }, "dependencies": { "@azure/identity": "^4.11.1", diff --git a/ai/select-algorithm-typescript/quickstart.md b/ai/select-algorithm-typescript/quickstart.md index 7662df4..b04fc58 100644 --- a/ai/select-algorithm-typescript/quickstart.md +++ b/ai/select-algorithm-typescript/quickstart.md @@ -164,56 +164,46 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ "type": "module", "scripts": { "build": "tsc", - "start": "node --env-file .env dist/compare-all.js" + "start": "node dist/compare-all.js" } } ``` -6. Create a `.env` file for environment variables in the project root: - - ```bash - # Azure OpenAI Embedding Settings - AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small - AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com - - # Data File Paths and Vector Configuration - DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json - EMBEDDED_FIELD=DescriptionVector - EMBEDDING_DIMENSIONS=1536 - LOAD_SIZE_BATCH=100 - - # Azure DocumentDB Connection Settings - DOCUMENTDB_CLUSTER_NAME= - - # Azure DocumentDB Database Name - AZURE_DOCUMENTDB_DATABASENAME=Hotels - - # Leave ALGORITHM and SIMILARITY unset to run all combinations - ``` - - Verify the `.env` file was created: +6. 
Set the required environment variables in your shell before running the sample: ### [Bash](#tab/bash) ```bash - cat .env + export AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + export AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com + export DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json + export EMBEDDED_FIELD=DescriptionVector + export EMBEDDING_DIMENSIONS=1536 + export LOAD_SIZE_BATCH=100 + export DOCUMENTDB_CLUSTER_NAME= + export AZURE_DOCUMENTDB_DATABASENAME=Hotels ``` ### [PowerShell](#tab/powershell) ```powershell - Get-Content .env + $env:AZURE_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small" + $env:AZURE_OPENAI_EMBEDDING_ENDPOINT = "https://.openai.azure.com" + $env:DATA_FILE_WITH_VECTORS = "data/Hotels_Vector.json" + $env:EMBEDDED_FIELD = "DescriptionVector" + $env:EMBEDDING_DIMENSIONS = "1536" + $env:LOAD_SIZE_BATCH = "100" + $env:DOCUMENTDB_CLUSTER_NAME = "" + $env:AZURE_DOCUMENTDB_DATABASENAME = "Hotels" ``` - --- - - You should see your configuration values including the Azure OpenAI endpoint and cluster name. - - For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: + Replace the placeholder values with your own information: - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name + The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate JavaScript apps to Azure services using the Azure SDK for JavaScript](/azure/developer/javascript/sdk/authentication/overview). 
## Create code files @@ -221,15 +211,21 @@ Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/ Create the following project structure: ``` +select-algorithm-typescript/ ├── data/ -│ └── Hotels_Vector.json # Hotel data with vector embeddings -└── select-algorithm-typescript/ - ├── src/ - │ ├── compare-all.ts # Main comparison script - │ └── utils.ts # Shared utility functions - ├── tsconfig.json - ├── package.json - └── .env # Environment variables +│ └── README.md +├── output/ +│ └── compare_all.txt +├── src/ +│ ├── compare-all.ts +│ ├── select-algorithm.ts +│ └── utils.ts +├── .gitignore +├── package.json +├── package-lock.json +├── quickstart.md +├── README.md +└── tsconfig.json ``` Create the `src` directory: @@ -288,61 +284,58 @@ npm start The output shows the comparison across all algorithms and similarity metrics: ``` -Vector Algorithm Comparison - Database: Hotels - Algorithms: all - Similarity: all - Collections to query: hotels_diskann_cos, hotels_diskann_l2, hotels_diskann_ip, hotels_hnsw_cos, ... - Search query: "quintessential lodging near running trails, eateries, retail" - -Generating query embedding... -Query embedding: 1536 dimensions - ---- DiskANN / COS --- -Collection: hotels_diskann_cos -Created collection: hotels_diskann_cos -Inserted: 50/50 -Created vector index: vectorIndex_diskann_cos -Executing vector search... -Success: 5 results, 142ms - -... 
- -========================================================================================== - Vector Algorithm Comparison Results -========================================================================================== -Algorithm Similarity Top Result Score Latency(ms) ------------------------------------------------------------------------------------------- -DiskANN COS Ocean Water Resort & 0.6184 142 -DiskANN L2 Ocean Water Resort & 0.8736 128 -DiskANN IP Ocean Water Resort & 0.6184 135 -HNSW COS Ocean Water Resort & 0.6184 119 -HNSW L2 Ocean Water Resort & 0.8736 115 -HNSW IP Ocean Water Resort & 0.6184 121 -IVF COS Ocean Water Resort & 0.6184 108 -IVF L2 Ocean Water Resort & 0.8736 105 -IVF IP Ocean Water Resort & 0.6184 110 -========================================================================================== - ---- DiskANN / COS (hotels_diskann_cos) --- - 1. Ocean Water Resort & Spa, Score: 0.6184 - 2. Windy Ocean Motel, Score: 0.5056 - 3. Gastronomic Landscape Hotel, Score: 0.4892 - 4. Sublime Palace Hotel, Score: 0.4753 - 5. Luxury Lion Resort, Score: 0.4612 - Latency: 142ms -... - -Closing database connection... +Using Azure OpenAI Embedding Deployment/Model: text-embedding-3-small +Reading JSON file from data/Hotels_Vector.json +Loaded 50 documents +Processing in batches of 50... +Batch 1 complete: 50 inserted + +Query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running searches (top 5 results)... 
✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ 
+├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection "hotels" Database connection closed ``` +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. + > [!NOTE] > Latency values are approximate and vary by environment. Scores may differ slightly depending on your Azure OpenAI embedding deployment. ### Run all combinations -Leave `ALGORITHM` and `SIMILARITY` unset to run all 9 algorithm × similarity combinations: +The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. ```bash npm run build @@ -357,9 +350,9 @@ The comparison table demonstrates key behaviors of vector search in DocumentDB: - **COS and IP produce identical scores** (0.6184 / 0.5056) because the `text-embedding-3-small` model outputs normalized (unit-length) vectors. For normalized vectors, cosine similarity equals inner product mathematically. -- **L2 (Euclidean distance) scores are inverted.** Higher L2 scores mean *more* distance — the #1 result has the *lowest* score (0.8736 = closest to query). This explains the negative Diff value (-0.1208). +- **L2 (Euclidean distance) scores represent distance.** In this output, the top result has the lower L2 score (0.8736) and the second result is farther away (0.9943). -- **Score separation (Diff column)** shows confidence. A larger positive diff means the search clearly distinguishes the best match from the second-best. This metric helps evaluate result quality regardless of the absolute score values. 
+- **Score separation (Diff column)** shows the gap between the top two results. A smaller diff indicates the algorithm found results with more similar relevance scores. [!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] @@ -367,12 +360,12 @@ The comparison table demonstrates key behaviors of vector search in DocumentDB: | Issue | Solution | |-------|----------| -| `MongoServerSelectionError` | Verify your connection string in `.env`. Ensure your IP is in the DocumentDB firewall rules. | -| `MongoServerError: Authentication failed` | Check credentials in connection string. Verify you've run `az login` for passwordless auth. | +| `MongoServerSelectionError` | Verify your `DOCUMENTDB_CLUSTER_NAME` environment variable and ensure your IP is in the DocumentDB firewall rules. | +| `MongoServerError: Authentication failed` | Check your authentication setup and verify you've run `az login` for passwordless auth. | | TypeScript compilation errors | Run `npx tsc --version` to verify TypeScript is installed. Check `tsconfig.json` settings match the values shown in this article. | | `Cannot find module` errors | Run `npm install` to ensure all dependencies are installed. | -| `Embedding dimension mismatch` | Verify `AZURE_OPENAI_EMBEDDING_MODEL` in `.env` matches the model deployed in your Azure OpenAI resource. | -| Empty search results | The vector index may not be ready yet. The code includes retry logic, but if the dataset is large, increase the wait time. | +| `Embedding dimension mismatch` | Verify that the `AZURE_OPENAI_EMBEDDING_MODEL` environment variable matches the model deployed in your Azure OpenAI resource. | +| Empty search results | The vector index may not be ready yet. The code makes up to 6 search attempts in total, waiting 2 seconds between attempts. 
| ## Clean up resources diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts index 00fbfd9..2978c94 100644 --- a/ai/select-algorithm-typescript/src/compare-all.ts +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -45,7 +45,7 @@ async function main() { const db = dbClient.db(baseConfig.dbName); // Drop collection if it exists for a clean start - let collections = await db.listCollections({ name: collectionName }).toArray(); + const collections = await db.listCollections({ name: collectionName }).toArray(); if (collections.length > 0) { try { const col = db.collection(collectionName); @@ -57,13 +57,11 @@ async function main() { } catch {} } } - await new Promise(r => setTimeout(r, 2000)); await db.dropCollection(collectionName); console.log(`Dropped existing collection: ${collectionName}`); } catch (e: any) { console.log(`Cleanup note: ${e.message.split('\n')[0]}`); } - await new Promise(r => setTimeout(r, 10000)); } // Load data once for reuse @@ -72,7 +70,7 @@ async function main() { // Insert data into collection const collection = db.collection(collectionName); - await insertData(baseConfig, collection, data); + await insertData(baseConfig, collection, data, false); // Generate one embedding for the query console.log(`\nQuery: "${queryText}"`); @@ -100,10 +98,6 @@ async function main() { try { await collection.dropIndex(idx.name); droppedAny = true; } catch {} } } - if (droppedAny) { - await new Promise(r => setTimeout(r, 2000)); - } - // 2. 
Create this specific index const indexOptions = { createIndexes: collectionName, @@ -142,20 +136,19 @@ async function main() { let searchResults: any[] = []; let lastSearchError: unknown; - await new Promise(r => setTimeout(r, 1000)); - for (let attempt = 1; attempt <= 5; attempt++) { + for (let attempt = 1; attempt <= 6; attempt++) { try { searchResults = await collection.aggregate(searchPipeline).toArray(); - if (searchResults.length > 0 || attempt === 5) { + if (searchResults.length > 0 || attempt === 6) { break; } - console.log(` ...search returned no results yet, retrying (${attempt}/5)`); + console.log(` ...search returned no results yet, retrying (${attempt}/6)`); } catch (e) { lastSearchError = e; - if (attempt === 5) { + if (attempt === 6) { throw e; } - console.log(` ...search not ready yet, retrying (${attempt}/5)`); + console.log(` ...search not ready yet, retrying (${attempt}/6)`); } await new Promise(r => setTimeout(r, 2000)); diff --git a/ai/select-algorithm-typescript/src/utils.ts b/ai/select-algorithm-typescript/src/utils.ts index d2ddcf7..09ec3dd 100644 --- a/ai/select-algorithm-typescript/src/utils.ts +++ b/ai/select-algorithm-typescript/src/utils.ts @@ -85,7 +85,12 @@ export async function readFileReturnJson(filePath: string): Promise return JSON.parse(fileAsString); } -export async function insertData(config: { batchSize: number }, collection: Collection, data: Document[]) { +export async function insertData( + config: { batchSize: number }, + collection: Collection, + data: Document[], + createScalarIndexes: boolean = true +) { console.log(`Processing in batches of ${config.batchSize}...`); const totalBatches = Math.ceil(data.length / config.batchSize); @@ -118,12 +123,13 @@ export async function insertData(config: { batchSize: number }, collection: Coll } } - // Create standard field indexes - const indexColumns = ["HotelId", "Category", "Description", "Description_fr"]; - for (const col of indexColumns) { - const indexSpec: Record = {}; - 
indexSpec[col] = 1; - await collection.createIndex(indexSpec); + if (createScalarIndexes) { + const indexColumns = ["HotelId", "Category", "Description", "Description_fr"]; + for (const col of indexColumns) { + const indexSpec: Record = {}; + indexSpec[col] = 1; + await collection.createIndex(indexSpec); + } } return { total: data.length, inserted, failed }; From c721052ce7bf805c1b1bac7784c3967eb4c6c624 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 14:56:48 -0700 Subject: [PATCH 10/11] fix: update output files and quickstarts with actual run results - Python: updated scores to match pymongo float precision (0.6183/0.5057/0.8735/0.9942) - .NET: added Summary line and Done footer to output - Java: fixed output order (table before cleanup), DISKANN casing, added Summary - devcontainer: removed stale vscode-cosmosdb extension - appsettings.json: reverted to placeholder values Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .devcontainer/devcontainer.json | 3 --- ai/select-algorithm-dotnet/appsettings.json | 26 +++++++++---------- .../output/compare_all.txt | 4 +++ ai/select-algorithm-dotnet/quickstart.md | 4 +++ .../output/compare_all.txt | 18 ++++++++----- ai/select-algorithm-java/quickstart.md | 18 ++++++++----- .../output/compare_all.txt | 20 +++++++------- ai/select-algorithm-python/quickstart.md | 20 +++++++------- 8 files changed, 65 insertions(+), 48 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index c003a88..bdf441a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -11,9 +11,6 @@ "customizations": { "vscode": { "extensions": [ - "ms-azuretools.vscode-cosmosdb", - "buildwithlayer.mongodb-integration-expert-qS6DB", - "mongodb.mongodb-vscode", "ms-azuretools.vscode-documentdb" ] } diff --git a/ai/select-algorithm-dotnet/appsettings.json b/ai/select-algorithm-dotnet/appsettings.json index 6e57d30..68ee696 100644 --- 
a/ai/select-algorithm-dotnet/appsettings.json +++ b/ai/select-algorithm-dotnet/appsettings.json @@ -1,24 +1,24 @@ { + "MongoDB": { + "DatabaseName": "Hotels", + "ClusterName": "", + "LoadBatchSize": 50 + }, + "VectorSearch": { + "Similarity": "", + "TopK": 5, + "Query": "luxury hotel near the beach" + }, "AzureOpenAI": { - "Endpoint": "https://.openai.azure.com", + "Endpoint": "https://.openai.azure.com/", "EmbeddingModel": "text-embedding-3-small" }, - "MongoDB": { - "ClusterName": "", - "DatabaseName": "Hotels", - "LoadBatchSize": 100 + "DataFiles": { + "WithVectors": "../data/Hotels_Vector.json" }, "Embedding": { "EmbeddedField": "DescriptionVector", "Dimensions": 1536, "EmbeddingSizeBatch": 16 - }, - "VectorSearch": { - "Query": "quintessential lodging near running trails, eateries, retail", - "Similarity": "", - "TopK": 5 - }, - "DataFiles": { - "WithVectors": "data/Hotels_Vector.json" } } diff --git a/ai/select-algorithm-dotnet/output/compare_all.txt b/ai/select-algorithm-dotnet/output/compare_all.txt index 4f4d995..74a325e 100644 --- a/ai/select-algorithm-dotnet/output/compare_all.txt +++ b/ai/select-algorithm-dotnet/output/compare_all.txt @@ -44,4 +44,8 @@ Running 9 algorithm × metric combinations... │ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ └──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ +Summary: 9 succeeded, 0 failed + Cleanup: dropped collection 'hotels' + +Done! diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md index b9a9997..e94c2da 100644 --- a/ai/select-algorithm-dotnet/quickstart.md +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -437,7 +437,11 @@ Running 9 algorithm × metric combinations... 
│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ └──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ +Summary: 9 succeeded, 0 failed + Cleanup: dropped collection 'hotels' + +Done! ``` The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. diff --git a/ai/select-algorithm-java/output/compare_all.txt b/ai/select-algorithm-java/output/compare_all.txt index 7794fd5..cbde87e 100644 --- a/ai/select-algorithm-java/output/compare_all.txt +++ b/ai/select-algorithm-java/output/compare_all.txt @@ -23,10 +23,6 @@ ✓ vector_diskann_cos created ✓ vector_diskann_l2 created ✓ vector_diskann_ip created - - Cleanup: dropping comparison collection... - Cleanup: dropped collection 'hotels' - ┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ │ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ @@ -42,9 +38,17 @@ ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ │ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ -│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DISKANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ -│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ DISKANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ 
├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ -│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DISKANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ └──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + + Cleanup: dropping comparison collection... + Cleanup: dropped collection 'hotels' +============================================== + Comparison complete. +============================================== diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md index 2f59830..0460f94 100644 --- a/ai/select-algorithm-java/quickstart.md +++ b/ai/select-algorithm-java/quickstart.md @@ -356,10 +356,6 @@ The program prints output similar to the following: ✓ vector_diskann_cos created ✓ vector_diskann_l2 created ✓ vector_diskann_ip created - - Cleanup: dropping comparison collection... 
- Cleanup: dropped collection 'hotels' - ┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ │ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ @@ -375,12 +371,20 @@ The program prints output similar to the following: ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ │ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ -│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DISKANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ -│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ DISKANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ ├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ -│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DISKANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ └──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + + Cleanup: dropping comparison collection... + Cleanup: dropped collection 'hotels' +============================================== + Comparison complete. +============================================== ``` The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. 
diff --git a/ai/select-algorithm-python/output/compare_all.txt b/ai/select-algorithm-python/output/compare_all.txt index aa96c4f..8719419 100644 --- a/ai/select-algorithm-python/output/compare_all.txt +++ b/ai/select-algorithm-python/output/compare_all.txt @@ -25,23 +25,25 @@ Running 9 vector searches... +-------------+----------+--------------------------+---------+-------------------+---------+--------+ | Algorithm | Metric | Top 1 Result | Score | Top 2 Result | Score | Diff | +=============+==========+==========================+=========+===================+=========+========+ -| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| IVF | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | +| IVF | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| IVF | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| IVF | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| HNSW | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| HNSW | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| HNSW | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | +| HNSW | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| HNSW 
| IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| HNSW | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| DiskANN | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| DiskANN | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| DiskANN | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | +| DiskANN | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| DiskANN | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| DiskANN | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ +Summary: 9 succeeded, 0 failed + Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md index f4d02c9..07304b6 100644 --- a/ai/select-algorithm-python/quickstart.md +++ b/ai/select-algorithm-python/quickstart.md @@ -271,25 +271,27 @@ Running 9 vector searches... 
+-------------+----------+--------------------------+---------+-------------------+---------+--------+ | Algorithm | Metric | Top 1 Result | Score | Top 2 Result | Score | Diff | +=============+==========+==========================+=========+===================+=========+========+ -| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| IVF | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | +| IVF | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| IVF | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| IVF | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| HNSW | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| HNSW | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| HNSW | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | +| HNSW | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| HNSW | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| HNSW | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| DiskANN | COS | Ocean Water 
Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| DiskANN | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| DiskANN | L2 | Ocean Water Resort & Spa | 0.8736 | Windy Ocean Motel | 0.9943 | 0.1208 | +| DiskANN | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ -| DiskANN | IP | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5056 | 0.1128 | +| DiskANN | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | +-------------+----------+--------------------------+---------+-------------------+---------+--------+ +Summary: 9 succeeded, 0 failed + Cleanup: dropped collection 'hotels' ``` From 8746b4895b90632513488eddc844c459840ab5b1 Mon Sep 17 00:00:00 2001 From: "Dina Berry (She/her)" Date: Wed, 20 May 2026 15:03:03 -0700 Subject: [PATCH 11/11] fix: Java quickstart accuracy - vector indexes only, fix combination count phrasing Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ai/select-algorithm-java/quickstart.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md index 0460f94..62a1e09 100644 --- a/ai/select-algorithm-java/quickstart.md +++ b/ai/select-algorithm-java/quickstart.md @@ -296,7 +296,7 @@ This main comparison logic provides: - **Algorithm comparison logic**: Tests all combinations of algorithms and similarity functions - **Collection management**: Creates separate collections for each configuration - **Data loading**: Inserts hotel data in batches -- **Index creation**: Creates both standard and vector indexes +- **Index creation**: Creates vector indexes for each algorithm and metric combination - **Performance 
measurement**: Measures average query latency - **Results display**: Outputs comparison table @@ -310,7 +310,7 @@ This main comparison logic provides: Verify: The build output ends with `BUILD SUCCESS`. -2. Run the comparison entry point. `Main.java` calls `CompareAll.run()` and always executes all 9 algorithm × 3 metric combinations: +2. Run the comparison entry point. `Main.java` calls `CompareAll.run()` and always executes all 9 combinations (3 algorithms × 3 metrics): ### [Bash](#tab/bash)