diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index c003a88..bdf441a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -11,9 +11,6 @@ "customizations": { "vscode": { "extensions": [ - "ms-azuretools.vscode-cosmosdb", - "buildwithlayer.mongodb-integration-expert-qS6DB", - "mongodb.mongodb-vscode", "ms-azuretools.vscode-documentdb" ] } diff --git a/.devcontainer/typescript/devcontainer.json b/.devcontainer/typescript/devcontainer.json index d627844..a4db17d 100644 --- a/.devcontainer/typescript/devcontainer.json +++ b/.devcontainer/typescript/devcontainer.json @@ -11,10 +11,8 @@ "customizations": { "vscode": { "extensions": [ - "ms-azuretools.vscode-cosmosdb", - "buildwithlayer.mongodb-integration-expert-qS6DB", - "mongodb.mongodb-vscode", - "ms-azuretools.vscode-documentdb" + "ms-azuretools.vscode-documentdb", + "mongodb.mongodb-vscode" ] } } diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index c446bd6..50bac00 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -105,13 +105,14 @@ All samples MUST use these environment variable names and defaults: - efSearch: 40 ### DiskANN -- maxDegree: 20 -- lBuild: 10 +- vector-search samples: maxDegree: 20, lBuild: 10 +- select-algorithm compare-all samples: maxDegree: 32, lBuild: 50 - lSearch: 40 +- Select-algorithm samples use higher values for meaningful comparison results. ## Rules -1. **No Cosmos DB references.**Never use "Cosmos DB", "cosmosdb", "MongoDB vCore", or "mongo.cosmos.azure.com". Always use "Azure DocumentDB" and "documentdb.azure.com". Exception: `mongocluster.cosmos.azure.com` (hostname), `cosmosSearch` (API command), and `ms-azuretools.vscode-cosmosdb` (VS Code extension) are valid and NOT Cosmos references. +1. **No Cosmos DB references.** Never use "Cosmos DB", "cosmosdb", "MongoDB vCore", or "mongo.cosmos.azure.com". Always use "Azure DocumentDB" and "documentdb.azure.com". 
Exception: `mongocluster.cosmos.azure.com` (hostname) and `cosmosSearch` (API command) are valid and NOT Cosmos references. 2. **Vector field name is DescriptionVector.** Never default to "contentVector". 3. **Data file path from env var.** Code reads `DATA_FILE_WITH_VECTORS`. The default depends on the sample category: vector-search samples use `../data/Hotels_Vector.json` (shared data directory one level up), while select-algorithm samples use `data/Hotels_Vector.json` (local copy in each sample). .NET copies data locally to `data/Hotels_Vector.json` in the build output. 4. **Batch size is LOAD_SIZE_BATCH=100.** Do not use BATCH_SIZE or other variants. @@ -121,6 +122,37 @@ All samples MUST use these environment variable names and defaults: 8. **Output files are committed.** Each sample has an `output/` directory with expected output for each algorithm (`ivf.txt`, `hnsw.txt`, `diskann.txt`). Update these when output format changes. 9. **DocumentDB supports all index types at any dataset size.** IVF, HNSW, and DiskANN are all available — do not imply tier restrictions limit algorithm availability. 10. **No dotenv libraries.** Do NOT use `python-dotenv`, `godotenv`, `dotenv` (npm), or any `.env` file-loading library. Environment variables must be passed via the CLI invocation, not loaded from `.env` files at runtime. This keeps samples explicit and avoids hidden configuration. -11. **Collection naming:** `hotels_{algorithm}` (e.g., `hotels_ivf`, `hotels_hnsw`, `hotels_diskann`). Index naming: `vectorIndex_{algorithm}`. +11. **Collection naming:** Standard per-algorithm samples use `hotels_{algorithm}` (e.g., `hotels_ivf`, `hotels_hnsw`, `hotels_diskann`). Standard index naming is `vectorIndex_{algorithm}`. Compare-all samples that drop and recreate a single collection use collection `hotels` and index naming `vector_{algorithm}_{metric}` (for example, `vector_ivf_cos`). TypeScript `select-algorithm.ts` remains a separate per-collection mode. 12. 
**Vector search uses k=5.** All samples return top 5 results. Do not parameterize k unless explicitly required. 13. **Use the Global read-write hostname.** All samples MUST use the Global read-write connection string format: `.global.mongocluster.cosmos.azure.com`. The `.global.` form auto-follows the active write region after a replica promotion. The non-`.global.` form pins to one cluster and silently becomes read-only after failover — reserve that for read-scale-out scenarios only. (Confirmed by Khelan Modi, DocumentDB PM.) +14. **VS Code extension is DocumentDB for VS Code.** Always reference [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) (`ms-azuretools.vscode-documentdb`). Never reference the Azure Databases extension (`ms-azuretools.vscode-cosmosdb`). + +## Sample Review Checklist + +Use this checklist when creating new samples or reviewing existing ones. Derived from PM (Khelan Modi) feedback. + +### Branding & Naming +- [ ] Environment variables use `DOCUMENTDB_CLUSTER_NAME` (not `MONGO_CLUSTER_NAME`) for select-algorithm samples +- [ ] All references say "Azure DocumentDB" — no "Cosmos DB" or "MongoDB vCore" +- [ ] Connection hostname uses `.global.mongocluster.cosmos.azure.com` format + +### Tooling References +- [ ] VS Code extension references point to [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) (`ms-azuretools.vscode-documentdb`) +- [ ] No references to Data Explorer for data browsing — use the VS Code extension instead +- [ ] No references to the old Azure Databases extension (`ms-azuretools.vscode-cosmosdb`) + +### Index Selection Guidance +- [ ] IVF is positioned for dev/test, demos, and small clusters (works on any tier) +- [ ] DiskANN is the default recommendation for production (M30+ clusters) +- [ ] HNSW is positioned for production when maximum recall is the top priority (M30+) +- [ ] Decision table or clear guidance 
helps readers pick the right algorithm quickly + +### DiskANN as Default +- [ ] DiskANN recommendation is prominent (not buried in a footnote) +- [ ] Higher dimension support called out (up to 16,000 vs HNSW's 8,000) +- [ ] Memory efficiency explained (index on disk, frees RAM for read/write ops) +- [ ] Operational benefits mentioned (lighter updates, easier backups, faster recovery) +- [ ] Future-proofing noted (less likely to need index redesign as models evolve) + +### Optional Enhancements +- [ ] Consider mentioning DocumentDB agent kit (`npx skills add Azure/documentdb-agent-kit`) where appropriate — currently beta/optional diff --git a/.github/workflows/validate-samples.yml b/.github/workflows/validate-samples.yml index 7bd29ec..06f494c 100644 --- a/.github/workflows/validate-samples.yml +++ b/.github/workflows/validate-samples.yml @@ -31,6 +31,7 @@ jobs: sample: - vector-search-typescript - vector-search-agent-typescript + - select-algorithm-typescript steps: - name: Checkout code @@ -52,10 +53,16 @@ jobs: run: npm run build validate-dotnet: - name: .NET + name: .NET - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 continue-on-error: false + strategy: + fail-fast: false + matrix: + sample: + - documentdb-samples.sln + - ai/select-algorithm-dotnet/SelectAlgorithm.csproj steps: - name: Checkout code @@ -66,8 +73,8 @@ jobs: with: dotnet-version: '8.0.x' - - name: Build solution - run: dotnet build documentdb-samples.sln + - name: Build + run: dotnet build ${{ matrix.sample }} validate-go: name: Go - ${{ matrix.sample }} @@ -80,6 +87,7 @@ jobs: sample: - vector-search-go - vector-search-agent-go + - select-algorithm-go steps: - name: Checkout code @@ -102,14 +110,20 @@ jobs: go build -o /dev/null "$f" utils.go done else - go build ./... + go build -o /dev/null ./... 
fi validate-python: - name: Python + name: Python - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 continue-on-error: false + strategy: + fail-fast: false + matrix: + sample: + - vector-search-python + - select-algorithm-python steps: - name: Checkout code @@ -121,19 +135,25 @@ jobs: python-version: '3.11' - name: Install dependencies - working-directory: ai/vector-search-python + working-directory: ai/${{ matrix.sample }} run: pip install -r requirements.txt - name: Validate Python syntax - working-directory: ai/vector-search-python + working-directory: ai/${{ matrix.sample }} run: | find . -name "*.py" -exec python -m py_compile {} + validate-java: - name: Java + name: Java - ${{ matrix.sample }} runs-on: ubuntu-latest timeout-minutes: 10 continue-on-error: false + strategy: + fail-fast: false + matrix: + sample: + - vector-search-java + - select-algorithm-java steps: - name: Checkout code @@ -147,5 +167,5 @@ jobs: cache: 'maven' - name: Compile Java - working-directory: ai/vector-search-java + working-directory: ai/${{ matrix.sample }} run: mvn compile -DskipTests diff --git a/ai/includes/choosing-algorithm.md b/ai/includes/choosing-algorithm.md new file mode 100644 index 0000000..3879862 --- /dev/null +++ b/ai/includes/choosing-algorithm.md @@ -0,0 +1,41 @@ +### Choosing the right algorithm + +> [!IMPORTANT] +> For production workloads, start with **DiskANN** on an M30+ cluster. DiskANN supports higher embedding dimensions, uses less cluster memory, and is less likely to require an index redesign as your models evolve. 
+ +Use this quick-reference table to select the right algorithm for your workload: + +| Scenario | Algorithm | Cluster tier | Max dimensions | +|----------|-----------|--------------|----------------| +| Dev/test, demos, small datasets | **IVF** | Any (free tier OK) | 2,000 | +| Production (default) | **DiskANN** | M30+ | 16,000 | +| Production (max recall priority) | **HNSW** | M30+ | 8,000 | + +**IVF** (inverted file index): +- Best for: Test environments, demos, and small clusters +- Pros: Fast to build, low resource requirements, works on any cluster tier +- Cons: Lower recall compared to graph-based algorithms at scale +- Tune: Increase `numLists` for larger datasets, increase `nProbes` for better recall + +**DiskANN** (disk-based approximate nearest neighbor) — *recommended for production*: +- Best for: Production workloads on M30+ clusters +- Pros: Supports embeddings up to 16,000 dimensions, keeps most index data on disk freeing cluster memory for reads and writes, lighter index updates, easier backups, faster recovery +- Cons: Requires M30+ cluster tier +- Tune: Increase `maxDegree` and `lBuild` for better accuracy, increase `lSearch` for better recall +- Why default: As embedding models evolve (some already exceed 8,000 dimensions), DiskANN avoids costly index redesigns. Its disk-based architecture also means your cluster memory stays available for operational workloads rather than index storage. 
+ +**HNSW** (hierarchical navigable small world): +- Best for: Production workloads on M30+ clusters where maximum recall is the top priority +- Pros: Excellent recall, fast queries +- Cons: Requires M30+ cluster tier, supports embeddings up to 8,000 dimensions (vs 16,000 for DiskANN), higher memory usage since the full graph lives in RAM +- Tune: Increase `m` and `efConstruction` for better index quality, increase `efSearch` for better recall + +### Choosing the right similarity function + +| Function | Score meaning | Best for | +|----------|-------------|----------| +| **COS (Cosine)** | Higher = more similar (0–1) | Text embeddings (normalized vectors) | +| **L2 (Euclidean)** | Lower = more similar (distance) | When magnitude matters | +| **IP (Inner Product)** | Higher = more similar | Equivalent to COS for normalized vectors | + +For the `text-embedding-3-small` model used in this quickstart, **COS (cosine similarity) is recommended** because OpenAI embeddings are normalized and optimized for cosine similarity. 
diff --git a/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json new file mode 100644 index 0000000..fcda282 --- /dev/null +++ b/ai/select-algorithm-dotnet/.devcontainer/devcontainer.json @@ -0,0 +1,48 @@ +{ + "name": "Azure DocumentDB Select Algorithm - .NET 8", + "image": "mcr.microsoft.com/devcontainers/dotnet:1-8.0-bookworm", + + "features": { + "ghcr.io/devcontainers/features/azure-cli:1": {}, + "ghcr.io/devcontainers/features/github-cli:1": {}, + "ghcr.io/devcontainers/features/common-utils:2": { + "installZsh": true, + "configureZshAsDefaultShell": true, + "installOhMyZsh": true + } + }, + + "customizations": { + "vscode": { + "extensions": [ + "ms-dotnettools.csdevkit", + "ms-dotnettools.vscodeintellicode-csharp", + "ms-azuretools.vscode-azureresourcegroups", + "ms-azuretools.vscode-documentdb", + "mongodb.mongodb-vscode" + ], + "settings": { + "dotnet.completion.showCompletionItemsFromUnimportedNamespaces": true, + "files.exclude": { + "**/bin": true, + "**/obj": true + } + } + } + }, + + "postCreateCommand": "dotnet restore && dotnet build", + "remoteUser": "vscode", + + "containerEnv": { + "DOTNET_CLI_TELEMETRY_OPTOUT": "1", + "DOTNET_NOLOGO": "1" + }, + + "mounts": [ + "source=${localEnv:HOME}${localEnv:USERPROFILE}/.azure,target=/home/vscode/.azure,type=bind,consistency=cached" + ], + + "capAdd": ["SYS_PTRACE"], + "securityOpt": ["seccomp:unconfined"] +} diff --git a/ai/select-algorithm-dotnet/.gitignore b/ai/select-algorithm-dotnet/.gitignore new file mode 100644 index 0000000..de285c3 --- /dev/null +++ b/ai/select-algorithm-dotnet/.gitignore @@ -0,0 +1,7 @@ +bin/ +obj/ +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-dotnet/CompareAll.cs b/ai/select-algorithm-dotnet/CompareAll.cs new file mode 100644 index 0000000..9eb9c75 --- /dev/null +++ b/ai/select-algorithm-dotnet/CompareAll.cs @@ -0,0 +1,302 @@ +/// Unified 
comparison runner for all 9 combinations (3 algorithms × 3 similarity metrics).
+/// Runs the combinations one at a time (only one vector index exists at any moment)
+/// and prints a formatted comparison table of the top two hits per combination.
+
+namespace SelectAlgorithm;
+
+using MongoDB.Driver;
+using MongoDB.Bson;
+using OpenAI.Embeddings;
+using SelectAlgorithm.Models;
+
+/// <summary>
+/// Loads the hotel data into a scratch <c>hotels</c> collection, then for every
+/// algorithm × metric combination recreates the vector index, runs the same query
+/// vector against it, and records the two best hits. The scratch collection is
+/// dropped again when the run ends, whether it succeeded or not.
+/// </summary>
+public static class CompareAll
+{
+    /// <summary>Scratch collection that is dropped, reloaded, and dropped again by <see cref="Run"/>.</summary>
+    private const string CollectionName = "hotels";
+
+    /// <summary>Placeholder shown in the table for a combination that produced no results.</summary>
+    private const string FailedLabel = "(failed)";
+
+    /// <summary>One vector index definition: its name, cosmosSearch kind, metric, dimensions, and kind-specific options.</summary>
+    private record IndexConfig(string Name, string Kind, string Similarity, int Dimensions, BsonDocument ExtraParams);
+
+    /// <summary>Outcome of one combination; <c>Succeeded</c> is false when no hits were obtained.</summary>
+    private record SearchResult(string Algorithm, string Metric, bool Succeeded, string Top1Name, double Top1Score, string Top2Name, double Top2Score);
+
+    /// <summary>Maps a cosmosSearch index kind to the short label used in the table; unknown kinds pass through.</summary>
+    private static string GetAlgoDisplay(string kind) => kind switch
+    {
+        "vector-ivf" => "IVF",
+        "vector-hnsw" => "HNSW",
+        "vector-diskann" => "DiskANN",
+        _ => kind
+    };
+
+    /// <summary>
+    /// Runs the full comparison and prints the result table.
+    /// Sets <see cref="Environment.ExitCode"/> to 1 when every combination failed.
+    /// </summary>
+    /// <param name="appConfig">Bound application configuration (cluster, embedding, and data-file settings).</param>
+    public static void Run(AppConfiguration appConfig)
+    {
+        Console.WriteLine(new string('=', 60));
+        Console.WriteLine(" Compare All Algorithms × Metrics");
+        Console.WriteLine(" 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP");
+        Console.WriteLine(new string('=', 60));
+
+        // Use config values with env var overrides for compare-specific settings
+        var databaseName = appConfig.MongoDB.DatabaseName;
+        var dataFile = appConfig.DataFiles.WithVectors;
+        var vectorField = appConfig.Embedding.EmbeddedField;
+        var dimensions = ResolveDimensions(appConfig.Embedding.Dimensions);
+        var batchSize = appConfig.MongoDB.LoadBatchSize;
+        var queryText = Environment.GetEnvironmentVariable("QUERY_TEXT") ?? "luxury hotel near the beach";
+        var topK = int.Parse(Environment.GetEnvironmentVariable("TOP_K") ?? "5");
+
+        var mongoClient = Utils.GetMongoClientPasswordless(appConfig);
+
+        try
+        {
+            // Created inside the try so a bad OpenAI endpoint still disposes the Mongo client.
+            var embeddingClient = Utils.GetEmbeddingClient(appConfig);
+            var database = mongoClient.GetDatabase(databaseName);
+
+            // Drop collection for a clean comparison
+            database.DropCollection(CollectionName);
+            Console.WriteLine("Dropped existing 'hotels' collection (if any)");
+
+            var collection = database.GetCollection<BsonDocument>(CollectionName);
+
+            // Load data once into single collection
+            var data = Utils.ReadJsonFile(dataFile);
+            var documents = data.Where(d => d.Contains(vectorField)).ToList();
+            Console.WriteLine($"\nLoaded {documents.Count} documents with embeddings");
+            Utils.InsertData(collection, documents, batchSize);
+
+            // Generate ONE embedding for the query (reused for all 9 searches)
+            Console.WriteLine($"\nQuery: \"{queryText}\"");
+            Console.WriteLine($"Top K: {topK}");
+            var embeddingResult = embeddingClient.GenerateEmbedding(queryText);
+            var queryVector = embeddingResult.Value.ToFloats().ToArray();
+            Console.WriteLine("Embedding generated (reused for all searches)\n");
+
+            var configs = BuildIndexConfigs(dimensions);
+
+            // Run each config sequentially: drop→create→search.
+            // DocumentDB doesn't allow multiple vector indexes of the same kind on the same field.
+            Console.WriteLine("Running 9 algorithm × metric combinations...\n");
+            var results = new List<SearchResult>();
+            foreach (var config in configs)
+            {
+                results.Add(RunCombination(collection, queryVector, vectorField, config, topK));
+            }
+
+            var successCount = results.Count(r => r.Succeeded);
+
+            PrintComparisonTable(results);
+
+            if (successCount == 0)
+            {
+                Console.WriteLine("\n❌ All 9 comparisons failed — no algorithm returned results.");
+                Environment.ExitCode = 1;
+            }
+            else
+            {
+                Console.WriteLine($"\nSummary: {successCount} succeeded, {results.Count - successCount} failed");
+            }
+        }
+        finally
+        {
+            // Cleanup: drop the comparison collection (best effort, never masks the original error)
+            try
+            {
+                mongoClient.GetDatabase(databaseName).DropCollection(CollectionName);
+                Console.WriteLine("\nCleanup: dropped collection 'hotels'");
+            }
+            catch (Exception ex)
+            {
+                Console.WriteLine($"Cleanup warning: {ex.Message}");
+            }
+            mongoClient.Cluster.Dispose();
+        }
+    }
+
+    /// <summary>
+    /// Picks the index dimension count: a valid positive <c>EMBEDDING_DIMENSIONS</c> environment
+    /// variable wins (kept for backward compatibility), otherwise the configured value is used.
+    /// </summary>
+    private static int ResolveDimensions(int configuredDimensions)
+    {
+        var overrideText = Environment.GetEnvironmentVariable("EMBEDDING_DIMENSIONS");
+        return int.TryParse(overrideText, out var parsed) && parsed > 0 ? parsed : configuredDimensions;
+    }
+
+    /// <summary>
+    /// Runs one combination: drops existing vector indexes, creates this index, searches with retry.
+    /// A server-side failure (for example an index kind the cluster rejects) is reported and recorded
+    /// as a failed row instead of aborting the remaining combinations.
+    /// </summary>
+    private static SearchResult RunCombination(
+        IMongoCollection<BsonDocument> collection,
+        float[] queryVector,
+        string vectorField,
+        IndexConfig config,
+        int topK)
+    {
+        try
+        {
+            DropVectorIndexes(collection, vectorField);
+
+            CreateIndex(collection, vectorField, config);
+            Console.WriteLine($" ✓ {config.Name} created");
+
+            var hits = RunVectorSearchWithRetry(collection, queryVector, vectorField, config.Name, topK);
+            if (hits.Count == 0)
+            {
+                return FailedResult(config);
+            }
+
+            var (top1Name, top1Score) = ReadHit(hits, 0);
+            var (top2Name, top2Score) = ReadHit(hits, 1);
+            return new SearchResult(GetAlgoDisplay(config.Kind), config.Similarity, true, top1Name, top1Score, top2Name, top2Score);
+        }
+        catch (MongoException ex)
+        {
+            Console.WriteLine($" ✗ {config.Name} failed: {ex.Message}");
+            return FailedResult(config);
+        }
+    }
+
+    /// <summary>Builds the table row used for a combination that produced no results.</summary>
+    private static SearchResult FailedResult(IndexConfig config) =>
+        new(GetAlgoDisplay(config.Kind), config.Similarity, false, FailedLabel, 0.0, FailedLabel, 0.0);
+
+    /// <summary>
+    /// Reads hotel name and score of the hit at <paramref name="index"/>.
+    /// Returns ("-", 0.0) when fewer hits were returned; missing fields fall back to "Unknown" / 0.0.
+    /// </summary>
+    private static (string Name, double Score) ReadHit(List<BsonDocument> hits, int index)
+    {
+        if (index >= hits.Count)
+        {
+            return ("-", 0.0);
+        }
+
+        var doc = hits[index];
+        var name = doc.Contains("HotelName") ? doc["HotelName"].AsString : "Unknown";
+        var score = doc.Contains("score") ? doc["score"].ToDouble() : 0.0;
+        return (name, score);
+    }
+
+    /// <summary>
+    /// Builds the 9 index definitions in a fixed order: IVF, HNSW, DiskANN, each with COS, L2, IP.
+    /// </summary>
+    /// <param name="dimensions">Embedding dimension count written into every index definition.</param>
+    private static List<IndexConfig> BuildIndexConfigs(int dimensions)
+    {
+        string[] metrics = ["COS", "L2", "IP"];
+        var configs = new List<IndexConfig>();
+
+        // The factory gives every config its own BsonDocument instance.
+        void AddForAllMetrics(string shortName, string kind, Func<BsonDocument> extraParams)
+        {
+            foreach (var metric in metrics)
+            {
+                // Invariant casing: "IP".ToLower() yields a dotless i under Turkish cultures.
+                var name = $"vector_{shortName}_{metric.ToLowerInvariant()}";
+                configs.Add(new IndexConfig(name, kind, metric, dimensions, extraParams()));
+            }
+        }
+
+        AddForAllMetrics("ivf", "vector-ivf", () => new BsonDocument { { "numLists", 1 } });
+        AddForAllMetrics("hnsw", "vector-hnsw", () => new BsonDocument { { "m", 16 }, { "efConstruction", 64 } });
+        AddForAllMetrics("diskann", "vector-diskann", () => new BsonDocument { { "maxDegree", 32 }, { "lBuild", 50 } });
+
+        return configs;
+    }
+
+    /// <summary>
+    /// Best-effort removal of every index whose key maps <paramref name="vectorField"/> to "cosmosSearch".
+    /// Failures are logged and skipped so one stubborn index does not stop the others from being dropped.
+    /// </summary>
+    private static void DropVectorIndexes(IMongoCollection<BsonDocument> collection, string vectorField)
+    {
+        List<BsonDocument> indexes;
+        try
+        {
+            using var cursor = collection.Indexes.List();
+            indexes = cursor.ToList();
+        }
+        catch (MongoException ex)
+        {
+            Console.WriteLine($" Warning: could not list indexes: {ex.Message}");
+            return;
+        }
+
+        foreach (var idx in indexes)
+        {
+            var key = idx.GetValue("key", new BsonDocument());
+            if (!key.IsBsonDocument || !key.AsBsonDocument.TryGetValue(vectorField, out var fieldKind))
+            {
+                continue;
+            }
+
+            // Ordinary indexes store a number here; only vector indexes store the string "cosmosSearch".
+            if (!fieldKind.IsString || fieldKind.AsString != "cosmosSearch")
+            {
+                continue;
+            }
+
+            var name = idx.GetValue("name", "").AsString;
+            try
+            {
+                collection.Indexes.DropOne(name);
+            }
+            catch (MongoException ex)
+            {
+                Console.WriteLine($" Warning: could not drop index {name}: {ex.Message}");
+            }
+        }
+    }
+
+    /// <summary>
+    /// Creates the vector index described by <paramref name="config"/> on <paramref name="vectorField"/>
+    /// through a raw <c>createIndexes</c> command. An "already exists" reply is treated as success.
+    /// </summary>
+    private static void CreateIndex(IMongoCollection<BsonDocument> collection, string vectorField, IndexConfig config)
+    {
+        // Drop existing index with same name if present
+        try
+        {
+            collection.Indexes.DropOne(config.Name);
+        }
+        catch (MongoCommandException)
+        {
+            // Index doesn't exist, that's fine
+        }
+
+        var cosmosSearchOptions = new BsonDocument
+        {
+            { "kind", config.Kind },
+            { "dimensions", config.Dimensions },
+            { "similarity", config.Similarity }
+        };
+
+        foreach (var param in config.ExtraParams)
+        {
+            cosmosSearchOptions.Add(param);
+        }
+
+        var command = new BsonDocument
+        {
+            { "createIndexes", collection.CollectionNamespace.CollectionName },
+            { "indexes", new BsonArray
+                {
+                    new BsonDocument
+                    {
+                        { "name", config.Name },
+                        { "key", new BsonDocument(vectorField, "cosmosSearch") },
+                        { "cosmosSearchOptions", cosmosSearchOptions }
+                    }
+                }
+            }
+        };
+
+        try
+        {
+            collection.Database.RunCommand<BsonDocument>(command);
+        }
+        catch (MongoCommandException ex) when (ex.Message.Contains("already exists"))
+        {
+            // Index already exists with same config — idempotent
+        }
+    }
+
+    /// <summary>
+    /// Runs a single <c>$search</c>/<c>cosmosSearch</c> aggregation and projects hotel name and score.
+    /// <paramref name="indexName"/> is not sent to the server; it is accepted only so the signature
+    /// mirrors <see cref="RunVectorSearchWithRetry"/>.
+    /// </summary>
+    private static List<BsonDocument> RunVectorSearch(
+        IMongoCollection<BsonDocument> collection,
+        float[] queryVector,
+        string vectorField,
+        string indexName,
+        int topK)
+    {
+        var pipeline = new[]
+        {
+            new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument
+            {
+                { "vector", new BsonArray(queryVector.Select(f => (double)f)) },
+                { "path", vectorField },
+                { "k", topK }
+            })),
+            new BsonDocument("$project", new BsonDocument
+            {
+                { "HotelName", 1 },
+                { "score", new BsonDocument("$meta", "searchScore") }
+            })
+        };
+
+        return collection.Aggregate<BsonDocument>(pipeline).ToList();
+    }
+
+    /// <summary>
+    /// Repeats the search while it returns no documents: one initial attempt plus up to 5 retries,
+    /// 2 seconds apart. Returns an empty list when every attempt came back empty.
+    /// </summary>
+    private static List<BsonDocument> RunVectorSearchWithRetry(
+        IMongoCollection<BsonDocument> collection,
+        float[] queryVector,
+        string vectorField,
+        string indexName,
+        int topK)
+    {
+        const int maxRetries = 5;
+        const int retryDelayMs = 2000;
+
+        for (var attempt = 0; attempt <= maxRetries; attempt++)
+        {
+            var results = RunVectorSearch(collection, queryVector, vectorField, indexName, topK);
+            if (results.Count > 0)
+            {
+                return results;
+            }
+
+            if (attempt < maxRetries)
+            {
+                Console.WriteLine($" No results for {indexName} yet. Retrying in 2 seconds ({attempt + 1}/{maxRetries})...");
+                Thread.Sleep(retryDelayMs);
+            }
+        }
+
+        Console.WriteLine($" Search for {indexName} did not return results after {maxRetries} retries. Recording as failed.");
+        return [];
+    }
+
+    /// <summary>
+    /// Prints one boxed table row per result: algorithm, metric, top two hotel names with scores,
+    /// and the absolute score difference. Names longer than 27 characters are shortened with "...".
+    /// </summary>
+    private static void PrintComparisonTable(List<SearchResult> results)
+    {
+        Console.WriteLine();
+        Console.WriteLine("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐");
+        Console.WriteLine($"│ {"Algorithm",-9}│ {"Metric",-7}│ {"Top 1 Result",-27}│ {"Score",-7}│ {"Top 2 Result",-27}│ {"Score",-7}│ {"Diff",-6}│");
+        Console.WriteLine("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤");
+
+        for (var i = 0; i < results.Count; i++)
+        {
+            var r = results[i];
+            var diff = Math.Abs(r.Top1Score - r.Top2Score);
+            var top1Display = r.Top1Name.Length > 27 ? r.Top1Name[..24] + "..." : r.Top1Name;
+            var top2Display = r.Top2Name.Length > 27 ? r.Top2Name[..24] + "..." : r.Top2Name;
+            Console.WriteLine($"│ {r.Algorithm,-9}│ {r.Metric,-7}│ {top1Display,-27}│ {r.Top1Score,-7:F4}│ {top2Display,-27}│ {r.Top2Score,-7:F4}│ {diff,-6:F4}│");
+            if (i < results.Count - 1)
+                Console.WriteLine("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤");
+        }
+        Console.WriteLine("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘");
+    }
+}
diff --git a/ai/select-algorithm-dotnet/Models/Configuration.cs b/ai/select-algorithm-dotnet/Models/Configuration.cs new file mode 100644 index 0000000..cbca25b --- /dev/null +++ b/ai/select-algorithm-dotnet/Models/Configuration.cs @@ -0,0 +1,41 @@ +namespace SelectAlgorithm.Models; + +public class AppConfiguration +{ + public AzureOpenAIConfiguration AzureOpenAI { get; set; } = new(); + public MongoDBConfiguration MongoDB { get; set; } = new(); + public EmbeddingConfiguration Embedding { get; set; } = new(); + public VectorSearchConfiguration VectorSearch { get; set; } = new(); + public DataFilesConfiguration DataFiles { get; set; } = new(); +} + +public class AzureOpenAIConfiguration +{ + public string Endpoint { 
get; set; } = string.Empty; + public string EmbeddingModel { get; set; } = "text-embedding-3-small"; +} + +public class MongoDBConfiguration +{ + public string ClusterName { get; set; } = string.Empty; + public string DatabaseName { get; set; } = "Hotels"; + public int LoadBatchSize { get; set; } = 100; +} + +public class EmbeddingConfiguration +{ + public string EmbeddedField { get; set; } = "DescriptionVector"; + public int Dimensions { get; set; } = 1536; +} + +public class VectorSearchConfiguration +{ + public string Query { get; set; } = "luxury hotel near the beach"; + public string Similarity { get; set; } = ""; + public int TopK { get; set; } = 5; +} + +public class DataFilesConfiguration +{ + public string WithVectors { get; set; } = "data/Hotels_Vector.json"; +} diff --git a/ai/select-algorithm-dotnet/Models/HotelData.cs b/ai/select-algorithm-dotnet/Models/HotelData.cs new file mode 100644 index 0000000..4821ee3 --- /dev/null +++ b/ai/select-algorithm-dotnet/Models/HotelData.cs @@ -0,0 +1,19 @@ +using MongoDB.Bson; +using MongoDB.Bson.Serialization.Attributes; + +namespace SelectAlgorithm.Models; + +public class HotelData +{ + [BsonId] + [BsonRepresentation(BsonType.ObjectId)] + public string? Id { get; set; } + + public string HotelId { get; set; } = string.Empty; + public string HotelName { get; set; } = string.Empty; + public string Description { get; set; } = string.Empty; + public string Category { get; set; } = string.Empty; + + [BsonExtraElements] + public BsonDocument? 
ExtraElements { get; set; } +} diff --git a/ai/select-algorithm-dotnet/Program.cs b/ai/select-algorithm-dotnet/Program.cs new file mode 100644 index 0000000..37992ad --- /dev/null +++ b/ai/select-algorithm-dotnet/Program.cs @@ -0,0 +1,40 @@ +using Microsoft.Extensions.Configuration; +using SelectAlgorithm.Models; + +namespace SelectAlgorithm; + +class Program +{ + static void Main(string[] args) + { + Console.WriteLine(); + Console.WriteLine("Select Algorithm Demo - Azure DocumentDB Vector Search (.NET)"); + Console.WriteLine(new string('-', 60)); + Console.WriteLine(); + + var configuration = new ConfigurationBuilder() + .SetBasePath(Directory.GetCurrentDirectory()) + .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true) + .AddEnvironmentVariables() + .Build(); + + var appConfig = new AppConfiguration(); + configuration.Bind(appConfig); + + var command = args.Length > 0 ? args[0].ToLower() : "compare-all"; + + switch (command) + { + case "compare-all": + CompareAll.Run(appConfig); + break; + default: + Console.WriteLine($"Unknown command: {command}"); + Console.WriteLine("Usage: dotnet run -- compare-all"); + return; + } + + Console.WriteLine(); + Console.WriteLine("Done!"); + } +} diff --git a/ai/select-algorithm-dotnet/README.md b/ai/select-algorithm-dotnet/README.md new file mode 100644 index 0000000..2621f77 --- /dev/null +++ b/ai/select-algorithm-dotnet/README.md @@ -0,0 +1,137 @@ +# Select Algorithm - .NET (C#) + +Demonstrates three vector index algorithms available in Azure DocumentDB: + +| Algorithm | Best For | Cluster Tier | Key Parameters | +|-----------|----------|--------------|----------------| +| **IVF** | Dev/test, demos, and small datasets | Any | `numLists` | +| **HNSW** | Production where maximum recall is the top priority | M30+ | `m`, `efConstruction` | +| **DiskANN** | Production (default recommendation) | M30+ | `maxDegree`, `lBuild` | + +## Prerequisites + +- [.NET 8 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) +- Azure DocumentDB cluster +- Azure OpenAI resource with an 
embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. **Configure environment:** + + The .NET sample uses `appsettings.json` for configuration. After deploying with `azd up`, you can export values: + + ```bash + azd env get-values + ``` + + Then update `appsettings.json` with your Azure resource values. + +2. Edit `appsettings.json` with your configuration: + + ```json + { + "AzureOpenAI": { + "EmbeddingModel": "text-embedding-3-small", + "Endpoint": "https://<your-openai-resource>.openai.azure.com" + }, + "MongoDB": { + "ClusterName": "<your-cluster-name>", + "DatabaseName": "Hotels", + "LoadBatchSize": 100 + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536, + "EmbeddingSizeBatch": 16 + }, + "DataFiles": { + "WithVectors": "data/Hotels_Vector.json" + } + } + ``` + +3. Copy the shared data file into the local `data/` directory: + + ```bash + cp ../data/Hotels_Vector.json data/ + ``` + +4. Restore packages: + + ```bash + dotnet restore + ``` + +## Usage + +Run all 9 combinations (default): + +```bash +dotnet run +``` + +## Configuration + +| Setting (appsettings.json) | Default | Description | +|---------------------------|---------|-------------| +| `MongoDB:ClusterName` | (required) | DocumentDB cluster name | +| `AzureOpenAI:Endpoint` | (required) | Azure OpenAI endpoint | +| `AzureOpenAI:EmbeddingModel` | `text-embedding-3-small` | Embedding model deployment name | +| `DataFiles:WithVectors` | `data/Hotels_Vector.json` | Path to vectors JSON file | +| `Embedding:EmbeddedField` | `DescriptionVector` | Field name containing embeddings | +| `Embedding:Dimensions` | `1536` | Vector dimensions | +| `MongoDB:DatabaseName` | `Hotels` | Target database name | +| `MongoDB:LoadBatchSize` | `100` | Batch size for data loading | +| `Embedding:EmbeddingSizeBatch` | `16` | Batch size for embedding requests | + +**Additional environment variables for compare mode:** + +| Variable | Default | Description | +|----------|---------|-------------| +| `QUERY_TEXT` | `luxury hotel 
near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Show detailed per-result output | + +## How It Works + +1. **Connect** to DocumentDB using Microsoft Entra ID (OIDC) passwordless authentication +2. **Load** hotel documents with pre-computed embeddings from `Hotels_Vector.json` into a freshly dropped `hotels` collection +3. For each of the 9 algorithm/metric combinations: drops any existing vector index → creates the index → searches, retrying while the index becomes available +4. DocumentDB only allows one vector index per kind per field, so the combinations run sequentially with a single vector index present at a time +5. Prints a formatted comparison table with the top two results, their scores, and the score difference for each combination +6. **Cleans up** by dropping the `hotels` collection when the run ends + +## Index Parameters + +| Algorithm | Kind | Parameters | +|-----------|------|------------| +| IVF | `vector-ivf` | numLists=1 | +| HNSW | `vector-hnsw` | m=16, efConstruction=64 | +| DiskANN | `vector-diskann` | maxDegree=32, lBuild=50 | + +## Authentication + +This sample uses `DefaultAzureCredential` for both: +- **DocumentDB**: OIDC-based MongoDB authentication +- **Azure OpenAI**: Token-based authentication with `https://cognitiveservices.azure.com/.default` scope + +Ensure you are logged in with `az login` and have appropriate RBAC roles assigned. 
+ +## Project Structure + +``` +select-algorithm-dotnet/ +├── .devcontainer/ +│ └── devcontainer.json # Dev container configuration +├── Models/ +│ ├── Configuration.cs # App configuration model +│ └── HotelData.cs # Hotel document model +├── Utilities/ +│ └── AzureIdentityTokenHandler.cs # OIDC token handler +├── appsettings.json # Configuration file +├── CompareAll.cs # Unified 9-combination comparison runner +├── Program.cs # Entry point +├── README.md # This file +├── SelectAlgorithm.csproj # Project file +└── Utils.cs # Shared helpers (connection, embedding, search) +``` diff --git a/ai/select-algorithm-dotnet/SelectAlgorithm.csproj b/ai/select-algorithm-dotnet/SelectAlgorithm.csproj new file mode 100644 index 0000000..331e522 --- /dev/null +++ b/ai/select-algorithm-dotnet/SelectAlgorithm.csproj @@ -0,0 +1,23 @@ + + + Exe + net8.0 + enable + enable + SelectAlgorithm + + + + + + + + + + + + + PreserveNewest + + + diff --git a/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs b/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs new file mode 100644 index 0000000..eca94fd --- /dev/null +++ b/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs @@ -0,0 +1,32 @@ +using Azure.Core; +using MongoDB.Driver.Authentication.Oidc; + +namespace SelectAlgorithm.Utilities; + +internal sealed class AzureIdentityTokenHandler( + TokenCredential credential, + string? 
tenantId +) : IOidcCallback +{ + private readonly string[] scopes = ["https://ossrdbms-aad.database.windows.net/.default"]; + + public OidcAccessToken GetOidcAccessToken(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + AccessToken token = credential.GetToken( + new TokenRequestContext(scopes, tenantId: tenantId), + cancellationToken + ); + + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } + + public async Task GetOidcAccessTokenAsync(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + AccessToken token = await credential.GetTokenAsync( + new TokenRequestContext(scopes, parentRequestId: null, tenantId: tenantId), + cancellationToken + ); + + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } +} diff --git a/ai/select-algorithm-dotnet/Utils.cs b/ai/select-algorithm-dotnet/Utils.cs new file mode 100644 index 0000000..62590ad --- /dev/null +++ b/ai/select-algorithm-dotnet/Utils.cs @@ -0,0 +1,190 @@ +using MongoDB.Driver; +using MongoDB.Driver.Authentication.Oidc; +using MongoDB.Bson; +using MongoDB.Bson.Serialization; +using Azure.Identity; +using Azure.Core; +using Azure.AI.OpenAI; +using OpenAI.Embeddings; +using SelectAlgorithm.Models; + +namespace SelectAlgorithm; + +public class AzureOidcCallback : IOidcCallback +{ + private readonly DefaultAzureCredential _credential; + private static readonly string[] Scopes = { "https://ossrdbms-aad.database.windows.net/.default" }; + + public AzureOidcCallback(DefaultAzureCredential credential) => _credential = credential; + + public OidcAccessToken GetOidcAccessToken(OidcCallbackParameters parameters, CancellationToken cancellationToken) + { + var token = _credential.GetToken(new TokenRequestContext(Scopes), cancellationToken); + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } + + public async Task GetOidcAccessTokenAsync(OidcCallbackParameters parameters, 
CancellationToken cancellationToken) + { + var token = await _credential.GetTokenAsync(new TokenRequestContext(Scopes), cancellationToken); + return new OidcAccessToken(token.Token, token.ExpiresOn - DateTimeOffset.UtcNow); + } +} + +public static class Utils +{ + public static IMongoClient GetMongoClientPasswordless(AppConfiguration config) + { + var clusterName = config.MongoDB.ClusterName; + if (string.IsNullOrEmpty(clusterName)) + throw new InvalidOperationException("MongoDB:ClusterName is required in appsettings.json"); + + var credential = new DefaultAzureCredential(); + + var connectionString = $"mongodb+srv://{clusterName}.global.mongocluster.cosmos.azure.com/"; + var settings = MongoClientSettings.FromConnectionString(connectionString); + settings.ConnectTimeout = TimeSpan.FromSeconds(120); + settings.UseTls = true; + settings.RetryWrites = false; + + // Custom OIDC callback using DefaultAzureCredential + // Chains through CLI, managed identity, etc. + var oidcCallback = new AzureOidcCallback(credential); + settings.Credential = MongoCredential.CreateOidcCredential(oidcCallback, null); + + return new MongoClient(settings); + } + + public static EmbeddingClient GetEmbeddingClient(AppConfiguration config) + { + var endpoint = config.AzureOpenAI.Endpoint; + if (string.IsNullOrEmpty(endpoint)) + throw new InvalidOperationException("AzureOpenAI:Endpoint is required in appsettings.json"); + + var model = config.AzureOpenAI.EmbeddingModel; + + var credential = new DefaultAzureCredential(); + var azureClient = new AzureOpenAIClient(new Uri(endpoint), credential); + return azureClient.GetEmbeddingClient(model); + } + + public static List ReadJsonFile(string path) + { + if (!File.Exists(path)) + throw new FileNotFoundException($"Data file not found: {path}"); + + var json = File.ReadAllText(path); + return BsonSerializer.Deserialize>(json); + } + + public static void InsertData(IMongoCollection collection, List data, int batchSize) + { + var totalDocuments = 
data.Count; + var existingCount = collection.CountDocuments(new BsonDocument()); + + if (existingCount >= totalDocuments) + { + Console.WriteLine($"Collection already has {existingCount} documents, skipping insert"); + return; + } + + if (existingCount > 0) + { + collection.DeleteMany(new BsonDocument()); + } + + var insertedCount = 0; + for (var i = 0; i < totalDocuments; i += batchSize) + { + var batch = data.Skip(i).Take(batchSize).ToList(); + try + { + collection.InsertMany(batch, new InsertManyOptions { IsOrdered = false }); + insertedCount += batch.Count; + } + catch (MongoBulkWriteException) + { + // Some documents may have been inserted before the error + insertedCount += batch.Count; + } + Thread.Sleep(100); + } + + Console.WriteLine($"Inserted {insertedCount}/{totalDocuments} documents"); + } + + public static void DropVectorIndexes(IMongoCollection collection, string vectorField) + { + try + { + using var cursor = collection.Indexes.List(); + var indexes = cursor.ToList(); + foreach (var index in indexes) + { + if (index.Contains("key")) + { + var key = index["key"].AsBsonDocument; + if (key.Contains(vectorField) && key[vectorField].AsString == "cosmosSearch") + { + var indexName = index["name"].AsString; + collection.Indexes.DropOne(indexName); + Console.WriteLine($"Dropped existing vector index: {indexName}"); + } + } + } + } + catch (Exception ex) + { + Console.WriteLine($"Warning: Error dropping indexes: {ex.Message}"); + } + } + + public static List PerformVectorSearch( + IMongoCollection collection, + EmbeddingClient client, + string query, + string vectorField, + string model, + int topK = 5) + { + var embeddingResult = client.GenerateEmbedding(query); + var queryVector = embeddingResult.Value.ToFloats().ToArray(); + + var pipeline = new[] + { + new BsonDocument("$search", new BsonDocument("cosmosSearch", new BsonDocument + { + { "vector", new BsonArray(queryVector.Select(f => (double)f)) }, + { "path", vectorField }, + { "k", topK } + })), + new 
BsonDocument("$project", new BsonDocument + { + { "document", "$$ROOT" }, + { "score", new BsonDocument("$meta", "searchScore") } + }) + }; + + return collection.Aggregate(pipeline).ToList(); + } + + public static void PrintSearchResults(List results, string algorithm) + { + Console.WriteLine(); + Console.WriteLine(new string('=', 60)); + Console.WriteLine($" {algorithm} Search Results ({results.Count} found)"); + Console.WriteLine(new string('=', 60)); + + for (var i = 0; i < results.Count; i++) + { + var result = results[i]; + var doc = result.Contains("document") ? result["document"].AsBsonDocument : result; + var name = doc.Contains("HotelName") ? doc["HotelName"].AsString + : doc.Contains("name") ? doc["name"].AsString + : "Unknown"; + var score = result.Contains("score") ? result["score"].ToDouble() : 0.0; + Console.WriteLine($" {i + 1}. {name} (score: {score:F4})"); + } + + Console.WriteLine(); + } +} diff --git a/ai/select-algorithm-dotnet/appsettings.json b/ai/select-algorithm-dotnet/appsettings.json new file mode 100644 index 0000000..68ee696 --- /dev/null +++ b/ai/select-algorithm-dotnet/appsettings.json @@ -0,0 +1,24 @@ +{ + "MongoDB": { + "DatabaseName": "Hotels", + "ClusterName": "", + "LoadBatchSize": 50 + }, + "VectorSearch": { + "Similarity": "", + "TopK": 5, + "Query": "luxury hotel near the beach" + }, + "AzureOpenAI": { + "Endpoint": "https://.openai.azure.com/", + "EmbeddingModel": "text-embedding-3-small" + }, + "DataFiles": { + "WithVectors": "../data/Hotels_Vector.json" + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536, + "EmbeddingSizeBatch": 16 + } +} diff --git a/ai/select-algorithm-dotnet/data/README.md b/ai/select-algorithm-dotnet/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-dotnet/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. 
+ +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-dotnet/output/compare_all.txt b/ai/select-algorithm-dotnet/output/compare_all.txt new file mode 100644 index 0000000..74a325e --- /dev/null +++ b/ai/select-algorithm-dotnet/output/compare_all.txt @@ -0,0 +1,51 @@ +============================================================ + Compare All Algorithms × Metrics + 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP +============================================================ +Dropped existing 'hotels' collection (if any) + +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Query: "luxury hotel near the beach" +Top K: 5 +Embedding generated (reused for all searches) + +Running 9 algorithm × metric combinations... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 
0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + +Cleanup: dropped collection 'hotels' + +Done! diff --git a/ai/select-algorithm-dotnet/quickstart.md b/ai/select-algorithm-dotnet/quickstart.md new file mode 100644 index 0000000..e94c2da --- /dev/null +++ b/ai/select-algorithm-dotnet/quickstart.md @@ -0,0 +1,493 @@ +--- +title: Quickstart - Vector index with .NET +description: Compare DiskANN, HNSW, and IVF vector search algorithms in Azure DocumentDB using the .NET client library with passwordless authentication. 
+ms.devlang: csharp +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with .NET in Azure DocumentDB + +This article shows you how to compare all three vector search algorithms (DiskANN, HNSW, and IVF) in Azure DocumentDB using the .NET client library. The sample demonstrates how each algorithm performs with different similarity functions (COS, L2, IP) and helps you choose the right configuration for your workload. This quickstart uses a sample hotel dataset in a JSON file with pre-calculated vectors from the `text-embedding-3-small` model. + + + +Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/main/ai/select-algorithm-dotnet) on GitHub. + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later. + +## Create data file with vectors + +1. Create a new data directory for the hotels data file: + + ### [Bash](#tab/bash) + + ```bash + mkdir data + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name data + ``` + + --- + +2. 
Download the `Hotels_Vector.json` [raw data file with vectors](https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json) to your `data` directory: + + ### [Bash](#tab/bash) + + ```bash + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify the file downloaded successfully: + + ### [Bash](#tab/bash) + + ```bash + ls data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem data\Hotels_Vector.json + ``` + + --- + + You should see `Hotels_Vector.json` in the `data` directory. + +## Create a .NET project + +1. Create a new directory for your project and initialize the .NET console application: + + ### [Bash](#tab/bash) + + ```bash + mkdir select-algorithm-dotnet + cd select-algorithm-dotnet + dotnet new console --framework net8.0 + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name select-algorithm-dotnet + Set-Location select-algorithm-dotnet + dotnet new console --framework net8.0 + ``` + + --- + + Verify the project was created: + + ### [Bash](#tab/bash) + + ```bash + ls *.csproj + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem *.csproj + ``` + + --- + +2. 
Install the required NuGet packages: + + ```bash + dotnet add package Azure.AI.OpenAI --version 2.1.0 + dotnet add package Azure.Identity --version 1.17.1 + dotnet add package MongoDB.Driver --version 3.0.0 + dotnet add package Microsoft.Extensions.Configuration --version 9.0.0 + dotnet add package Microsoft.Extensions.Configuration.Binder --version 9.0.0 + dotnet add package Microsoft.Extensions.Configuration.EnvironmentVariables --version 9.0.0 + dotnet add package Microsoft.Extensions.Configuration.Json --version 9.0.0 + dotnet add package Microsoft.Extensions.DependencyInjection --version 9.0.0 + dotnet add package Microsoft.Extensions.Logging --version 9.0.0 + dotnet add package Microsoft.Extensions.Logging.Console --version 9.0.0 + ``` + + These packages provide: + - `Azure.AI.OpenAI`: Azure OpenAI client library to create vector embeddings + - `Azure.Identity`: Azure Identity library for passwordless authentication with DefaultAzureCredential + - `MongoDB.Driver`: MongoDB driver for .NET to interact with DocumentDB + - `Microsoft.Extensions.*`: Configuration, dependency injection, and logging infrastructure + + Verify installed packages: + + ```bash + dotnet list package + ``` + +3. Create environment variables for authentication and configuration overrides. 
The sample uses `DefaultAzureCredential` for passwordless authentication, and .NET maps environment variables to `appsettings.json` keys by using the `Section__Key` format: + + ### [Bash](#tab/bash) + + ```bash + export AzureOpenAI__Endpoint="https://<your-openai-resource>.openai.azure.com" + export AzureOpenAI__EmbeddingModel="text-embedding-3-small" + export MongoDB__ClusterName="<your-cluster-name>" + export DataFiles__WithVectors="data/Hotels_Vector.json" + export AZURE_TENANT_ID="<your-tenant-id>" + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + $env:AzureOpenAI__Endpoint="https://<your-openai-resource>.openai.azure.com" + $env:AzureOpenAI__EmbeddingModel="text-embedding-3-small" + $env:MongoDB__ClusterName="<your-cluster-name>" + $env:DataFiles__WithVectors="data/Hotels_Vector.json" + $env:AZURE_TENANT_ID="<your-tenant-id>" + ``` + + --- + + Replace the placeholder values with your own information: + - `<your-openai-resource>`: Your Azure OpenAI resource name + - `<your-cluster-name>`: Your Azure DocumentDB cluster name + - `<your-tenant-id>`: Your Microsoft Entra tenant ID + + These environment variables override the matching values in `appsettings.json`. For example, `MongoDB__ClusterName` overrides `MongoDB:ClusterName` and `AzureOpenAI__Endpoint` overrides `AzureOpenAI:Endpoint`. + + You should always prefer passwordless authentication. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate .NET apps to Azure services by using the Azure SDK for .NET](/dotnet/azure/sdk/authentication). + +4. Sign in with Azure CLI for passwordless authentication: + + ```bash + az login + ``` + +5. 
Create an `appsettings.json` configuration file: + + ### [Bash](#tab/bash) + + ```bash + touch appsettings.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType File -Name appsettings.json + ``` + + --- + + Add this content to `appsettings.json`: + + ```json + { + "AzureOpenAI": { + "Endpoint": "https://<your-openai-resource>.openai.azure.com", + "EmbeddingModel": "text-embedding-3-small" + }, + "MongoDB": { + "ClusterName": "<your-cluster-name>", + "DatabaseName": "Hotels", + "LoadBatchSize": 100 + }, + "Embedding": { + "EmbeddedField": "DescriptionVector", + "Dimensions": 1536, + "EmbeddingSizeBatch": 16 + }, + "VectorSearch": { + "Query": "luxury hotel near the beach", + "Similarity": "", + "TopK": 5 + }, + "DataFiles": { + "WithVectors": "data/Hotels_Vector.json" + } + } + ``` + + You can keep placeholder values in `appsettings.json` and override them at runtime with environment variables such as `AzureOpenAI__Endpoint` and `MongoDB__ClusterName`. + +## Create code files + +Continue the project by creating code files for vector search comparison. When you are done, the project structure should look like this: + +``` +select-algorithm-dotnet/ +├── .devcontainer/ +│ └── devcontainer.json +├── data/ +│ └── README.md +├── Models/ +│ ├── Configuration.cs +│ └── HotelData.cs +├── output/ +│ └── compare_all.txt +├── Utilities/ +│ └── AzureIdentityTokenHandler.cs +├── .gitignore +├── appsettings.json +├── CompareAll.cs +├── Program.cs +├── quickstart.md +├── README.md +├── SelectAlgorithm.csproj +└── Utils.cs +``` + +1. Create the directory structure: + + ### [Bash](#tab/bash) + + ```bash + mkdir Models + mkdir Utilities + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name Models + New-Item -ItemType Directory -Name Utilities + ``` + + --- + +2. 
Create the code files: + + ### [Bash](#tab/bash) + + ```bash + touch CompareAll.cs + touch Utils.cs + touch Models/Configuration.cs + touch Models/HotelData.cs + touch Utilities/AzureIdentityTokenHandler.cs + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType File -Name CompareAll.cs + New-Item -ItemType File -Name Utils.cs + New-Item -ItemType File -Path Models\Configuration.cs + New-Item -ItemType File -Path Models\HotelData.cs + New-Item -ItemType File -Path Utilities\AzureIdentityTokenHandler.cs + ``` + + --- + +## Create the algorithm comparison code + +### Program.cs + +Replace the contents of `Program.cs` with this code: + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Program.cs" ::: + +This main entry point: +- Loads configuration from appsettings.json and environment variables +- Sets up dependency injection with logging infrastructure +- Initializes Azure OpenAI and DocumentDB clients using passwordless authentication +- Calls `CompareAll.Run()` to execute the flat project entry point +- Runs the comparison and prints results in a table format + +### CompareAll.cs + +Add this code to `CompareAll.cs`: + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/CompareAll.cs" ::: + +This service: +- Manages the comparison workflow for all algorithms +- Creates collections and indexes for each algorithm/similarity combination +- Inserts data and executes vector searches +- Measures and collects latency metrics +- Configures algorithm-specific parameters for index creation and search + +### Supporting files + +Create the following supporting files in the project: + +#### Utils.cs + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Utils.cs" ::: + +#### Utilities/AzureIdentityTokenHandler.cs + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Utilities/AzureIdentityTokenHandler.cs" ::: + +#### 
Models/Configuration.cs + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Models/Configuration.cs" ::: + +#### Models/HotelData.cs + +:::code language="csharp" source="~/../documentdb-samples/ai/select-algorithm-dotnet/Models/HotelData.cs" ::: + +These supporting files provide: +- Passwordless authentication setup for Azure OpenAI and DocumentDB +- OIDC token handler for automatic token refresh +- JSON file reading and deserialization +- Batch data insertion with error handling +- Results formatting and display + +## Run the code + +1. Build the project: + + ```bash + dotnet build + ``` + +2. Run the flat `SelectAlgorithm.csproj` entry point to compare all 9 algorithm × similarity combinations: + + ```bash + dotnet run + ``` + + The application loads the sample data once, then creates and tests all 9 algorithm × similarity combinations sequentially. + +3. The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + +4. Repeat `dotnet run` whenever you want to rerun the flat `SelectAlgorithm.csproj` entry point: + + ### [Bash](#tab/bash) + + ```bash + dotnet run + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + dotnet run + ``` + + --- + +### Expected output + +The application displays progress logs and a comparison table: + +``` +============================================================ + Compare All Algorithms × Metrics + 9 combinations: IVF, HNSW, DiskANN × COS, L2, IP +============================================================ +Dropped existing 'hotels' collection (if any) + +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Query: "luxury hotel near the beach" +Top K: 5 +Embedding generated (reused for all searches) + +Running 9 algorithm × metric combinations... 
+ ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 
0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + +Cleanup: dropped collection 'hotels' + +Done! +``` + +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. + +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `TimeoutException` during connection | Verify your connection string and environment variables. Ensure your IP is in the DocumentDB firewall rules. | +| `AuthenticationException` | Check that `DefaultAzureCredential` can acquire a token. Run `az login` to refresh your credentials. | +| Build errors with .NET version | Ensure you have .NET 8.0 or later installed. Run `dotnet --version` to check. | +| `BsonSerializationException` | Ensure your model classes match the document structure in the collection. | +| Empty search results | The vector index might not be ready yet. The sample includes retry logic, but if you still see empty results, wait a few seconds and retry. | +| `IndexOptionsConflict` (code 85) | DocumentDB doesn't allow multiple vector indexes of the same kind on the same field. Drop the existing index before creating a new one. | + +## Clean up resources + +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. 
+ +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "mongodb+srv://<your-cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript +use Hotels +db.dropDatabase() +``` + +### [VS Code extension](#tab/vscode) + +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. + +--- + +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources. + +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-go/.gitignore b/ai/select-algorithm-go/.gitignore new file mode 100644 index 0000000..76985d9 --- /dev/null +++ b/ai/select-algorithm-go/.gitignore @@ -0,0 +1,7 @@ +*.exe +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md +Hotels_Vector.json diff --git a/ai/select-algorithm-go/README.md b/ai/select-algorithm-go/README.md new file mode 100644 index 0000000..f03828e --- /dev/null +++ b/ai/select-algorithm-go/README.md @@ -0,0 +1,199 @@ +# DocumentDB Vector Search - Go Algorithm Comparison Sample + +This sample demonstrates how to compare different vector search algorithms (IVF, HNSW, DiskANN) and similarity metrics (Cosine, L2, Inner Product) with Azure DocumentDB. 
+ +## Prerequisites + +- [Go 1.24+](https://golang.org/dl/) +- [Azure DocumentDB cluster](/azure/documentdb/) (M40+ tier for DiskANN) +- [Azure OpenAI resource](https://learn.microsoft.com/azure/ai-services/openai/) with an embedding model deployed +- [Azure CLI](https://learn.microsoft.com/cli/azure/) (for passwordless authentication) +- Pre-generated embeddings file (`Hotels_Vector.json`) — see the `vector-search-go` sample + +## Setup + +1. **Clone the repository** and navigate to this directory: + + ```bash + cd ai/select-algorithm-go + ``` + +2. **Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + + Required variables: + ```env + DOCUMENTDB_CLUSTER_NAME=your-cluster-name + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-resource.openai.azure.com + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + AZURE_DOCUMENTDB_DATABASENAME=Hotels + DATA_FILE_WITH_VECTORS=../data/Hotels_Vector.json + EMBEDDED_FIELD=DescriptionVector + EMBEDDING_DIMENSIONS=1536 + ``` + +3. **Copy the shared data file** into this directory: + + ```bash + cp ../data/Hotels_Vector.json . + ``` + + The `DATA_FILE_WITH_VECTORS` env var defaults to `../data/Hotels_Vector.json`. + +4. **Install dependencies**: + + ```bash + go mod download + ``` + +5. **Sign in to Azure** (for passwordless authentication): + + ```bash + az login + ``` + +## Usage + +### Compare All Algorithms + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single execution: + +```bash +go run ./src/... +``` + +This creates indexes sequentially (create/search/drop per combo — DocumentDB allows one vector index per kind per field) and prints a comparison table showing scores and top results. 
+ +**Output:** +``` +====================================================================== + COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations) +====================================================================== + ... +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +ALGORITHM SIMILARITY #1 RESULT #1 SCORE #2 RESULT #2 SCORE DIFF +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. +==================================================================================================== +``` + +### On Windows (PowerShell) + +```powershell +go run ./src/... 
+``` + +## Environment Variables + +| Variable | Default | Description | +|--------------|----------------------------------|---------------------------------| +| `DOCUMENTDB_CLUSTER_NAME` | *(required)* | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | *(required)* | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | `text-embedding-3-small` | Embedding model name | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Database name | +| `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to data file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Embedding vector dimensions | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data insertion | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query | +| `VERBOSE` | `false` | Show full results | + +## How It Works + +### Comparison Mode (`compare_all.go`) + +1. **Data Loading:** Loads hotel data with pre-generated embeddings +2. **Index Creation:** Creates vector indexes sequentially (one at a time): + - For each algorithm (IVF, HNSW, DiskANN) × each metric (COS, L2, IP): + - Create the index → wait for readiness → search → drop the index + - DocumentDB only allows one vector index per kind per field +3. **Query Execution:** Generates embedding once, reuses for all 9 searches +4. 
**Result Comparison:** Prints formatted table with #1/#2 results, scores, and diff + +## Index Parameters + +| Algorithm | Kind | Key Parameters | Values Used | +|-----------|-----------------|-----------------------------|-----------------------------| +| IVF | `vector-ivf` | `numLists` | 1 (optimized for small datasets) | +| HNSW | `vector-hnsw` | `m`, `efConstruction` | 16, 64 | +| DiskANN | `vector-diskann`| `maxDegree`, `lBuild` | 32, 50 | + +## Project Structure + +``` +select-algorithm-go/ +├── .env.example # Environment variable template +├── go.mod # Go module dependencies +├── go.sum # Go module checksums +├── output/ # Sample output files +├── README.md # This file +└── src/ + ├── main.go # Entry point + ├── utils.go # Shared config, auth, data, and search helpers + └── compare_all.go # Unified 9-combination comparison runner (create/search/drop) +``` + +## Authentication + +This sample uses **passwordless (OIDC) authentication** with `DefaultAzureCredential`. Ensure your Azure identity has: + +- **DocumentDB**: Appropriate RBAC role on the cluster +- **Azure OpenAI**: `Cognitive Services OpenAI User` role on the OpenAI resource + +The MongoDB OIDC auth uses the `https://ossrdbms-aad.database.windows.net/.default` scope, and the OpenAI client uses Azure token credentials. 
+ +## Important Notes + +- **COS/IP scores:** Higher = more similar (0–1 range) +- **L2 scores:** Lower = more similar (distance metric) +- **Sequential indexing:** DocumentDB requires create/search/drop per combo (one vector index per kind per field) +- **Cleanup:** The sample automatically drops collections on exit +- **bson.D ordering:** All MongoDB commands use `bson.D` (ordered) instead of `bson.M` (unordered) to avoid "multi-key map" errors + +## Troubleshooting + +**"OIDC authentication failed"** +- Run `az login` and ensure you're authenticated +- Verify your Azure identity has RBAC permissions on the DocumentDB cluster +- Check that `DOCUMENTDB_CLUSTER_NAME` matches your cluster name + +**"DiskANN indexes require a higher cluster tier"** +- DiskANN requires M40+ cluster tier +- Try IVF or HNSW instead, or upgrade your cluster + +**"No documents found with embeddings"** +- Ensure `DATA_FILE_WITH_VECTORS` points to the correct file +- Verify the file contains the field specified in `EMBEDDED_FIELD` +- Check that embeddings were generated with the correct dimensions + +## Learn More + +- [Azure DocumentDB Documentation](/azure/documentdb/) +- [Vector Search in DocumentDB](/azure/documentdb/vector-search) +- [Choosing a Vector Index Algorithm](/azure/documentdb/vector-search-algorithms) +- [Go MongoDB driver](https://pkg.go.dev/go.mongodb.org/mongo-driver) diff --git a/ai/select-algorithm-go/data/README.md b/ai/select-algorithm-go/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-go/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. 
diff --git a/ai/select-algorithm-go/go.mod b/ai/select-algorithm-go/go.mod new file mode 100644 index 0000000..f669ace --- /dev/null +++ b/ai/select-algorithm-go/go.mod @@ -0,0 +1,35 @@ +module documentdb-select-algorithm + +go 1.24.0 + +require ( + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 + github.com/openai/openai-go/v3 v3.12.0 + go.mongodb.org/mongo-driver v1.17.6 +) + +require ( + github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect + github.com/golang-jwt/jwt/v5 v5.3.0 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect + github.com/montanaflynn/stats v0.7.1 // indirect + github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect + github.com/tidwall/gjson v1.18.0 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.1 // indirect + github.com/tidwall/sjson v1.2.5 // indirect + github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.1.2 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect +) diff --git a/ai/select-algorithm-go/go.sum b/ai/select-algorithm-go/go.sum new file mode 100644 index 0000000..6263657 --- /dev/null +++ b/ai/select-algorithm-go/go.sum @@ -0,0 +1,95 @@ +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= 
+github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= +github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= +github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= +github.com/openai/openai-go/v3 v3.12.0 h1:NkrImaglFQeDycc/n/fEmpFV8kKr8snl9/8X2x4eHOg= +github.com/openai/openai-go/v3 v3.12.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod 
h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.mongodb.org/mongo-driver v1.17.6 h1:87JUG1wZfWsr6rIz3ZmpH90rL5tea7O3IHuSwHUpsss= +go.mongodb.org/mongo-driver v1.17.6/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 
+golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/ai/select-algorithm-go/output/compare_all.txt b/ai/select-algorithm-go/output/compare_all.txt new file mode 100644 index 0000000..509f4b4 --- /dev/null +++ b/ai/select-algorithm-go/output/compare_all.txt @@ -0,0 +1,41 @@ +====================================================================== + COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations) +====================================================================== +Query: "luxury hotel near the beach" +Top-K: 5 + +Loading data from data/Hotels_Vector.json... +Loaded 50 documents with embeddings +Insertion completed: 50 inserted, 0 failed + +Generating embedding for query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running 9 vector index comparisons (create→search→drop)... 
+ ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-go/quickstart.md b/ai/select-algorithm-go/quickstart.md new file mode 100644 index 0000000..bd9963b --- /dev/null +++ b/ai/select-algorithm-go/quickstart.md @@ -0,0 +1,409 @@ +--- +title: Quickstart - Vector index with Go +description: Compare DiskANN, HNSW, and IVF vector index algorithms using Go to select and tune the optimal index for your workload +ms.devlang: golang +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: 
sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with Go in Azure DocumentDB + +This quickstart walks you through building a Go application that compares all three vector index algorithms (DiskANN, HNSW, and IVF) side by side with different similarity functions to help you choose the best configuration for your workload. The sample uses a hotels dataset with pre-calculated embeddings from the `text-embedding-3-small` model. + + + +Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/main/ai/select-algorithm-go) on GitHub. + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [Go](https://go.dev/doc/install) 1.22 or greater + +## Create data file with vectors + +1. Create a new data directory for the hotels data file: + + ### [Bash](#tab/bash) + + ```bash + mkdir data + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name data + ``` + + --- + +2. Download the `Hotels_Vector.json` [raw data file with vectors](https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json) to your `data` directory: + + ### [Bash](#tab/bash) + + ```bash + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify the file was downloaded: + + ### [Bash](#tab/bash) + + ```bash + ls data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem data\Hotels_Vector.json + ``` + + --- + + You should see `Hotels_Vector.json` in the `data` directory. 
+ +## Create a Go project + +1. Create a new directory for your project and open it in Visual Studio Code: + + ### [Bash](#tab/bash) + + ```bash + mkdir select-algorithm-go + cd select-algorithm-go + code . + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name select-algorithm-go + Set-Location select-algorithm-go + code . + ``` + + --- + +2. Initialize a new Go module: + + ```bash + go mod init documentdb-vector-samples + ``` + + Verify the module was initialized: + + ### [Bash](#tab/bash) + + ```bash + cat go.mod + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Content go.mod + ``` + + --- + +3. Install the required packages: + + ```bash + go get github.com/Azure/azure-sdk-for-go/sdk/azcore@v1.20.0 + go get github.com/Azure/azure-sdk-for-go/sdk/azidentity@v1.13.1 + go get github.com/openai/openai-go/v3@v3.12.0 + go get go.mongodb.org/mongo-driver@v1.17.6 + go mod tidy + ``` + + - `azcore`: Core Azure SDK functionality for Go + - `azidentity`: Azure Identity library for passwordless authentication with DefaultAzureCredential + - `openai-go/v3`: OpenAI client library with Azure support to generate embeddings + - `mongo-driver`: Official MongoDB driver for Go to work with DocumentDB + + Verify the packages are installed: + + ### [Bash](#tab/bash) + + ```bash + go list -m all | grep mongo + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + go list -m all | Select-String mongo + ``` + + --- + +4. 
Create a `.env` file for environment variables in `select-algorithm-go`: + + ```bash + # Azure OpenAI Embedding Configuration + AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15 + AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-openai-resource.openai.azure.com/ + + # Data File Configuration + DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json + EMBEDDED_FIELD=DescriptionVector + EMBEDDING_DIMENSIONS=1536 + LOAD_SIZE_BATCH=100 + + # DocumentDB Configuration + DOCUMENTDB_CLUSTER_NAME=your-cluster-name + + # The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). + # The ALGORITHM and SIMILARITY environment variables are used only by the single-algorithm mode. + + # Database name + AZURE_DOCUMENTDB_DATABASENAME=Hotels + ``` + + For the passwordless authentication used in this article, replace the placeholder values in the `.env` file with your own information: + + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name (not the full connection string, just the name) + + Verify the `.env` file was created: + + ### [Bash](#tab/bash) + + ```bash + cat .env + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Content .env + ``` + + --- + + You should always prefer passwordless authentication. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate Go apps to Azure services by using the Azure SDK for Go](/azure/developer/go/azure-sdk-authentication). 
+ +## Create code files + +Create a `src` directory and add the main application file: + +### [Bash](#tab/bash) + +```bash +mkdir src +touch src/main.go +``` + +### [PowerShell](#tab/powershell) + +```powershell +New-Item -ItemType Directory -Name src +New-Item -ItemType File -Path src/main.go +``` + +--- + +When you're done, the project structure should look like this: + +```text +select-algorithm-go/ +├── data/ +│ └── README.md +├── output/ +│ └── compare_all.txt +├── src/ +│ ├── compare_all.go +│ ├── main.go +│ └── utils.go +├── .gitignore +├── go.mod +├── quickstart.md +└── README.md +``` + +## Create the algorithm comparison code + +Create the following source files in the `src` directory. + +### src/main.go + +:::code language="go" source="~/../documentdb-samples/ai/select-algorithm-go/src/main.go" ::: + +### src/compare_all.go + +:::code language="go" source="~/../documentdb-samples/ai/select-algorithm-go/src/compare_all.go" ::: + +### src/utils.go + +:::code language="go" source="~/../documentdb-samples/ai/select-algorithm-go/src/utils.go" ::: + +This code provides a complete vector algorithm comparison application with these key features: + +- **Passwordless authentication**: Uses `DefaultAzureCredential` for both Azure OpenAI and DocumentDB via OIDC +- **Three vector algorithms**: Implements DiskANN, HNSW, and IVF with algorithm-specific tuning parameters +- **Three similarity functions**: Supports COS (cosine), L2 (Euclidean), and IP (inner product) +- **Single compare-all entry point**: Always runs all 9 algorithm × similarity combinations in one pass +- **Index lifecycle automation**: Creates, queries, and drops each vector index in sequence +- **Comparison output**: Generates a formatted table showing the top two results and score gap for each combination +- **Production-ready patterns**: Includes batched insertion, error handling, and connection pooling + +## Run the code + +Before running the code, source your `.env` file to load environment 
variables into your shell session. + +### [Bash](#tab/bash) + +```bash +export $(grep -v '^#' .env | xargs) +``` + +### [PowerShell](#tab/powershell) + +```powershell +Get-Content .env | ForEach-Object { + if ($_ -match '^\s*([^#][^=]+)=(.*)') { + [System.Environment]::SetEnvironmentVariable($Matches[1].Trim(), $Matches[2].Trim()) + } +} +``` + +--- + +After sourcing the environment variables, run the application: + +```bash +go run ./src/ +``` + +The application will: + +1. Connect to Azure DocumentDB and Azure OpenAI using passwordless authentication +2. Load the hotel data and insert it into the `hotels` collection +3. Generate an embedding for the search query +4. Run all 9 vector index comparisons by creating, querying, and dropping each index in sequence +5. Display a comparison table with the top two results and score gap for each combination +6. Drop the `hotels` collection during cleanup + +Expected output: + +```text +====================================================================== + COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations) +====================================================================== +Query: "luxury hotel near the beach" +Top-K: 5 + +Loading data from data/Hotels_Vector.json... +Loaded 50 documents with embeddings +Insertion completed: 50 inserted, 0 failed + +Generating embedding for query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running 9 vector index comparisons (create→search→drop)... 
+ ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created + +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + +Cleanup: dropped collection 'hotels' +``` + +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. 
+ +## Understanding the results + +The comparison table shows how different algorithms perform on the same dataset with the same query: + +- **Algorithm**: DiskANN, HNSW, or IVF +- **Metric**: The similarity metric (COS, L2, or IP) +- **Top 1 Result**: The highest-ranked hotel for that algorithm and metric +- **Score**: The relevance score for the corresponding result +- **Top 2 Result**: The second-highest-ranked hotel for that algorithm and metric +- **Diff**: The score gap between the top two results + +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] + +## Run all combinations + +The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `server selection error` | Verify `DOCUMENTDB_CLUSTER_NAME` in `.env`. Ensure your IP is in the DocumentDB firewall rules. | +| `authentication failed` | Run `az login` so that `DefaultAzureCredential` can acquire a token, and verify that your Azure identity has access to the DocumentDB cluster. | +| `go: module not found` | Run `go mod tidy` to resolve dependencies. | +| Build errors | Ensure Go 1.22+ is installed. Run `go version` to check. | +| Empty search results | The vector index may not be ready yet. The code includes retry logic, but larger datasets may need more time. | + +## Clean up resources + +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster (replace `<cluster-name>` with your cluster name) and drop the database: + +```bash +mongosh "mongodb+srv://<cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript +use Hotels +db.dropDatabase() +``` + +### [VS Code extension](#tab/vscode) + +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. +2.
Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. + +--- + +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources. + +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-go/src/compare_all.go b/ai/select-algorithm-go/src/compare_all.go new file mode 100644 index 0000000..eda792a --- /dev/null +++ b/ai/select-algorithm-go/src/compare_all.go @@ -0,0 +1,348 @@ +package main + +import ( + "context" + "fmt" + "math" + "strconv" + "strings" + "time" + + "github.com/openai/openai-go/v3" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" +) + +// CompareResult holds the result of a single algorithm+metric search +type CompareResult struct { + Algorithm string + Metric string + Results []SearchResult + Top1Name string + Top1Score float64 + Top2Name string + Top2Score float64 + Error error +} + +// indexSpec defines one of the 9 combinations +type indexSpec struct { + Algorithm string + Kind string + Metric string + IndexName string + Options bson.D +} + +// RunCompareAll executes all 9 algorithm×metric combinations on a single collection +func RunCompareAll(ctx context.Context, config *Config, dbClient *mongo.Client, aiClient openai.Client) error { + queryText := getEnvOrDefault("QUERY_TEXT", "luxury hotel near the beach") + topK, _ := strconv.Atoi(getEnvOrDefault("TOP_K", "5")) + + fmt.Println("\n" + strings.Repeat("=", 70)) + fmt.Println(" COMPARE ALL: 3 Algorithms × 3 Similarity Metrics (9 combinations)") + fmt.Println(strings.Repeat("=", 70)) + fmt.Printf("Query: %q\n", queryText) + fmt.Printf("Top-K: %d\n", topK) + + // 1. 
Drop collection for clean comparison, then load data + database := dbClient.Database(config.DatabaseName) + collection := database.Collection("hotels") + + // Drop existing collection for a clean comparison + if err := collection.Drop(ctx); err != nil { + fmt.Printf("Note: could not drop collection (may not exist): %v\n", err) + } else { + fmt.Println("Dropped existing 'hotels' collection") + } + + // Ensure cleanup on exit + defer func() { + fmt.Println("\nCleanup: dropping comparison collection...") + if dropErr := collection.Drop(ctx); dropErr != nil { + fmt.Printf("Cleanup warning: %v\n", dropErr) + } else { + fmt.Println("Cleanup: dropped collection 'hotels'") + } + }() + + fmt.Printf("\nLoading data from %s...\n", config.DataFile) + data, err := ReadFileReturnJSON(config.DataFile) + if err != nil { + return fmt.Errorf("failed to load data: %v", err) + } + + documentsWithEmbeddings := FilterDocumentsWithEmbeddings(data, config.VectorField) + if len(documentsWithEmbeddings) == 0 { + return fmt.Errorf("no documents found with embeddings in field '%s'", config.VectorField) + } + fmt.Printf("Loaded %d documents with embeddings\n", len(documentsWithEmbeddings)) + + stats, err := PrepareCollection(ctx, collection, documentsWithEmbeddings, config.BatchSize) + if err != nil { + return err + } + fmt.Printf("Insertion completed: %d inserted, %d failed\n", stats.Inserted, stats.Failed) + + // 2. Generate ONE embedding for the query (reused for all 9 searches) + fmt.Printf("\nGenerating embedding for query: %q\n", queryText) + queryEmbedding, err := GenerateEmbedding(ctx, aiClient, queryText, config.ModelName) + if err != nil { + return fmt.Errorf("failed to generate query embedding: %v", err) + } + fmt.Printf("Embedding generated (%d dimensions)\n", len(queryEmbedding)) + + // 3. Define all 9 index specs + metrics := []string{"COS", "L2", "IP"} + specs := buildIndexSpecs(config.VectorField, config.Dimensions, metrics) + + // 4. 
Create→search→drop each index sequentially (DocumentDB only allows one vector index per field) + fmt.Printf("\nRunning %d vector index comparisons (create→search→drop)...\n", len(specs)) + var results []CompareResult + successfulComparisons := 0 + failedComparisons := 0 + + for _, spec := range specs { + // Drop all existing vector indexes on this field (the returned error is not checked here) + DropVectorIndexes(ctx, collection, config.VectorField) + + // Create this specific index with retry (drop may still be in progress) + var createErr error + for attempt := 0; attempt < 3; attempt++ { + if attempt > 0 { + time.Sleep(3 * time.Second) + } + createErr = createNamedVectorIndex(ctx, collection, config.VectorField, spec) + if createErr == nil { + break + } + } + if createErr != nil { + results = append(results, CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + Error: createErr, + }) + failedComparisons++ + fmt.Printf(" ⚠ %s: %v\n", spec.IndexName, createErr) + continue + } + fmt.Printf(" ✓ %s created\n", spec.IndexName) + + // Search using simple cosmosSearch with bounded retry for index readiness. + searchResults, searchErr := runVectorSearchWithRetry(ctx, collection, queryEmbedding, config.VectorField, topK) + + top1Name, top1Score := extractResult(searchResults, 0) + top2Name, top2Score := extractResult(searchResults, 1) + + cr := CompareResult{ + Algorithm: spec.Algorithm, + Metric: spec.Metric, + Results: searchResults, + Top1Name: top1Name, + Top1Score: top1Score, + Top2Name: top2Name, + Top2Score: top2Score, + Error: searchErr, + } + results = append(results, cr) + if searchErr != nil { + failedComparisons++ + } else { + successfulComparisons++ + } + } + + // 5. 
Print comparison table + fmt.Println() + printComparisonTable(results) + fmt.Printf("\nSummary: %d succeeded, %d failed\n", successfulComparisons, failedComparisons) + if successfulComparisons == 0 { + return fmt.Errorf("all %d comparisons failed", failedComparisons) + } + + return nil +} + +func runVectorSearchWithRetry(ctx context.Context, collection *mongo.Collection, queryEmbedding []float64, vectorField string, topK int) ([]SearchResult, error) { + const maxAttempts = 6 + const retryDelay = 2 * time.Second + + var searchResults []SearchResult + var searchErr error + + for attempt := 1; attempt <= maxAttempts; attempt++ { + searchResults, searchErr = vectorSearchSimple(ctx, collection, queryEmbedding, vectorField, topK) + if searchErr == nil { + if len(searchResults) > 0 { + return searchResults, nil + } + searchErr = fmt.Errorf("search returned no results") + } + + if attempt < maxAttempts { + time.Sleep(retryDelay) + } + } + + return searchResults, searchErr +} + +// buildIndexSpecs creates the 9 index specifications +func buildIndexSpecs(vectorField string, dimensions int, metrics []string) []indexSpec { + var specs []indexSpec + + type algoConfig struct { + name string + kind string + options bson.D + } + + algos := []algoConfig{ + {"IVF", "vector-ivf", bson.D{{"numLists", 1}}}, + {"HNSW", "vector-hnsw", bson.D{{"m", 16}, {"efConstruction", 64}}}, + {"DiskANN", "vector-diskann", bson.D{{"maxDegree", 32}, {"lBuild", 50}}}, + } + + for _, algo := range algos { + for _, metric := range metrics { + metricLower := strings.ToLower(metric) + opts := bson.D{ + {"kind", algo.kind}, + {"dimensions", dimensions}, + {"similarity", metric}, + } + for _, o := range algo.options { + opts = append(opts, o) + } + + specs = append(specs, indexSpec{ + Algorithm: algo.name, + Kind: algo.kind, + Metric: metric, + IndexName: fmt.Sprintf("vector_%s_%s", strings.ToLower(algo.name), metricLower), + Options: opts, + }) + } + } + + return specs +} + +// createNamedVectorIndex creates 
a single named vector index +func createNamedVectorIndex(ctx context.Context, collection *mongo.Collection, vectorField string, spec indexSpec) error { + indexCommand := bson.D{ + {"createIndexes", collection.Name()}, + {"indexes", []bson.D{ + { + {"name", spec.IndexName}, + {"key", bson.D{ + {vectorField, "cosmosSearch"}, + }}, + {"cosmosSearchOptions", spec.Options}, + }, + }}, + } + + var result bson.M + err := collection.Database().RunCommand(ctx, indexCommand).Decode(&result) + if err != nil { + if strings.Contains(err.Error(), "already exists") || strings.Contains(err.Error(), "IndexAlreadyExists") { + return nil + } + return err + } + return nil +} + +// vectorSearchSimple performs a vector search using the active vector index +func vectorSearchSimple(ctx context.Context, collection *mongo.Collection, embedding []float64, vectorField string, topK int) ([]SearchResult, error) { + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": embedding, + "path": vectorField, + "k": topK, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, err + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, err + } + + return results, nil +} + +// extractResult returns the name and score of the result at the given index +func extractResult(results []SearchResult, idx int) (string, float64) { + if idx >= len(results) { + return "(no results)", 0 + } + doc := results[idx].Document.(bson.D) + var name string + for _, elem := range doc { + if elem.Key == "HotelName" { + name = fmt.Sprintf("%v", elem.Value) + break + } + } + if name == "" { + name = "Unknown" + } + return name, 
results[idx].Score +} + +// printComparisonTable outputs a formatted table of results +func printComparisonTable(results []CompareResult) { + fmt.Println("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐") + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %-6s │ %-26s │ %-6s │ %-5s │\n", + "Algorithm", "Metric", "Top 1 Result", "Score", "Top 2 Result", "Score", "Diff") + fmt.Println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤") + + for _, r := range results { + if r.Error != nil { + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %-6s │ %-26s │ %-6s │ %-5s │\n", + r.Algorithm, r.Metric, "ERROR", "-", "-", "-", "-") + continue + } + + top1 := r.Top1Name + if len(top1) > 26 { + top1 = top1[:26] + } + top2 := r.Top2Name + if len(top2) > 26 { + top2 = top2[:26] + } + diff := math.Abs(r.Top1Score - r.Top2Score) + + fmt.Printf("│ %-8s │ %-6s │ %-26s │ %6.4f │ %-26s │ %6.4f │%6.4f │\n", + r.Algorithm, r.Metric, top1, r.Top1Score, top2, r.Top2Score, diff) + } + + fmt.Println("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘") +} diff --git a/ai/select-algorithm-go/src/main.go b/ai/select-algorithm-go/src/main.go new file mode 100644 index 0000000..0596aa1 --- /dev/null +++ b/ai/select-algorithm-go/src/main.go @@ -0,0 +1,31 @@ +package main + +import ( + "context" + "fmt" + "log" +) + +func main() { + fmt.Println("Starting vector algorithm comparison...") + + ctx := context.Background() + config, err := LoadConfig() + if err != nil { + log.Fatalf("Invalid configuration: %v", err) + } + + fmt.Println("\nInitializing clients with passwordless authentication...") + mongoClient, azureOpenAIClient, err := GetClientsPasswordless(ctx, config) + if err != nil { + log.Fatalf("Failed to initialize clients: %v", err) + } + defer mongoClient.Disconnect(ctx) + + err = RunCompareAll(ctx, config, mongoClient, azureOpenAIClient) + if err 
!= nil { + log.Fatalf("Compare all failed: %v", err) + } + + fmt.Println("\nComparison completed successfully!") +} diff --git a/ai/select-algorithm-go/src/utils.go b/ai/select-algorithm-go/src/utils.go new file mode 100644 index 0000000..c358892 --- /dev/null +++ b/ai/select-algorithm-go/src/utils.go @@ -0,0 +1,404 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "os" + "strconv" + "strings" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/openai/openai-go/v3" + "github.com/openai/openai-go/v3/azure" + "github.com/openai/openai-go/v3/option" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" + "go.mongodb.org/mongo-driver/mongo/options" +) + +// Config holds the application configuration +type Config struct { + ClusterName string + DatabaseName string + DataFile string + VectorField string + ModelName string + Dimensions int + BatchSize int + Similarity string + Algorithm string +} + +// SearchResult represents a search result document +type SearchResult struct { + Document interface{} `bson:"document"` + Score float64 `bson:"score"` +} + +// InsertStats holds statistics about data insertion +type InsertStats struct { + Total int `json:"total"` + Inserted int `json:"inserted"` + Failed int `json:"failed"` +} + +// LoadConfig loads configuration from environment variables +func LoadConfig() (*Config, error) { + dimensions, err := parsePositiveIntEnv("EMBEDDING_DIMENSIONS", "1536") + if err != nil { + return nil, err + } + + batchSize, err := parsePositiveIntEnv("LOAD_SIZE_BATCH", "100") + if err != nil { + return nil, err + } + + return &Config{ + ClusterName: getEnvOrDefault("DOCUMENTDB_CLUSTER_NAME", ""), + DatabaseName: getEnvOrDefault("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"), + DataFile: getEnvOrDefault("DATA_FILE_WITH_VECTORS", "data/Hotels_Vector.json"), + VectorField: getEnvOrDefault("EMBEDDED_FIELD", "DescriptionVector"), + 
ModelName: getEnvOrDefault("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"), + Dimensions: dimensions, + BatchSize: batchSize, + Similarity: getEnvOrDefault("SIMILARITY", ""), + Algorithm: strings.ToLower(getEnvOrDefault("ALGORITHM", "")), + }, nil +} + +func parsePositiveIntEnv(key, defaultValue string) (int, error) { + value := getEnvOrDefault(key, defaultValue) + parsedValue, err := strconv.Atoi(value) + if err != nil { + return 0, fmt.Errorf("%s must be a positive integer, got %q", key, value) + } + if parsedValue <= 0 { + return 0, fmt.Errorf("%s must be greater than 0, got %q", key, value) + } + return parsedValue, nil +} + +// getEnvOrDefault returns environment variable value or default if not set +func getEnvOrDefault(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} + +// GetClientsPasswordless creates MongoDB and Azure OpenAI clients with passwordless authentication +func GetClientsPasswordless(ctx context.Context, config *Config) (*mongo.Client, openai.Client, error) { + if config.ClusterName == "" { + return nil, openai.Client{}, fmt.Errorf("DOCUMENTDB_CLUSTER_NAME environment variable is required") + } + + // Create Azure credential + credential, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return nil, openai.Client{}, fmt.Errorf("failed to create Azure credential: %v", err) + } + + // Connect to DocumentDB with OIDC authentication + mongoURI := fmt.Sprintf("mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", config.ClusterName) + + fmt.Println("Attempting OIDC authentication...") + mongoClient, err := connectWithOIDC(ctx, mongoURI, credential) + if err != nil { + return nil, openai.Client{}, fmt.Errorf("OIDC authentication failed: %v", err) + } + fmt.Println("OIDC authentication successful!") + + // Get Azure OpenAI endpoint + azureOpenAIEndpoint := os.Getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if azureOpenAIEndpoint == "" { + return nil, 
openai.Client{}, fmt.Errorf("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required") + } + + // Create Azure OpenAI client with credential-based authentication + openAIClient := openai.NewClient( + option.WithBaseURL(fmt.Sprintf("%s/openai/v1", azureOpenAIEndpoint)), + azure.WithTokenCredential(credential)) + + return mongoClient, openAIClient, nil +} + +// connectWithOIDC attempts to connect using OIDC authentication +func connectWithOIDC(ctx context.Context, mongoURI string, credential *azidentity.DefaultAzureCredential) (*mongo.Client, error) { + oidcCallback := func(ctx context.Context, args *options.OIDCArgs) (*options.OIDCCredential, error) { + scope := "https://ossrdbms-aad.database.windows.net/.default" + fmt.Printf("Getting token with scope: %s\n", scope) + token, err := credential.GetToken(ctx, policy.TokenRequestOptions{ + Scopes: []string{scope}, + }) + if err != nil { + return nil, fmt.Errorf("failed to get token with scope %s: %v", scope, err) + } + + fmt.Printf("Successfully obtained token\n") + + return &options.OIDCCredential{ + AccessToken: token.Token, + }, nil + } + + clientOptions := options.Client(). + ApplyURI(mongoURI). + SetConnectTimeout(30 * time.Second). + SetServerSelectionTimeout(30 * time.Second). + SetRetryWrites(false). 
+ SetAuth(options.Credential{ + AuthMechanism: "MONGODB-OIDC", + AuthMechanismProperties: map[string]string{ + "TOKEN_RESOURCE": "https://ossrdbms-aad.database.windows.net", + }, + OIDCMachineCallback: oidcCallback, + }) + + mongoClient, err := mongo.Connect(ctx, clientOptions) + if err != nil { + return nil, err + } + + return mongoClient, nil +} + +// ReadFileReturnJSON reads a JSON file and returns the data as a slice of maps +func ReadFileReturnJSON(filePath string) ([]map[string]interface{}, error) { + file, err := os.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("error reading file '%s': %v", filePath, err) + } + + var data []map[string]interface{} + err = json.Unmarshal(file, &data) + if err != nil { + return nil, fmt.Errorf("error parsing JSON in file '%s': %v", filePath, err) + } + + return data, nil +} + +// InsertData inserts data into a MongoDB collection in batches +func InsertData(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) { + totalDocuments := len(data) + insertedCount := 0 + failedCount := 0 + + fmt.Printf("Starting batch insertion of %d documents...\n", totalDocuments) + + for i := 0; i < totalDocuments; i += batchSize { + end := i + batchSize + if end > totalDocuments { + end = totalDocuments + } + + batch := data[i:end] + batchNum := (i / batchSize) + 1 + + documents := make([]interface{}, len(batch)) + for j, doc := range batch { + documents[j] = doc + } + + result, err := collection.InsertMany(ctx, documents, options.InsertMany().SetOrdered(false)) + if err != nil { + if bulkErr, ok := err.(mongo.BulkWriteException); ok { + errorCount := len(bulkErr.WriteErrors) + insertedCount += len(batch) - errorCount + failedCount += errorCount + fmt.Printf("Batch %d had errors: %d inserted, %d failed\n", batchNum, len(batch)-errorCount, errorCount) + for _, writeErr := range bulkErr.WriteErrors { + fmt.Printf(" Error: %s\n", writeErr.Message) + } + } else { + 
failedCount += len(batch) + fmt.Printf("Batch %d failed completely: %v\n", batchNum, err) + } + } else { + insertedCount += len(result.InsertedIDs) + fmt.Printf("Batch %d completed: %d documents inserted\n", batchNum, len(result.InsertedIDs)) + } + + time.Sleep(100 * time.Millisecond) + } + + return &InsertStats{ + Total: totalDocuments, + Inserted: insertedCount, + Failed: failedCount, + }, nil +} + +// DropVectorIndexes drops existing vector indexes on the specified field +func DropVectorIndexes(ctx context.Context, collection *mongo.Collection, vectorField string) error { + cursor, err := collection.Indexes().List(ctx) + if err != nil { + return fmt.Errorf("could not list indexes: %v", err) + } + defer cursor.Close(ctx) + + var vectorIndexes []string + for cursor.Next(ctx) { + var index bson.M + if err := cursor.Decode(&index); err != nil { + continue + } + + if key, ok := index["key"].(bson.M); ok { + if indexType, exists := key[vectorField]; exists && indexType == "cosmosSearch" { + if name, ok := index["name"].(string); ok { + vectorIndexes = append(vectorIndexes, name) + } + } + } + } + + for _, indexName := range vectorIndexes { + fmt.Printf("Dropping existing vector index: %s\n", indexName) + _, err := collection.Indexes().DropOne(ctx, indexName) + if err != nil { + fmt.Printf("Warning: Could not drop index %s: %v\n", indexName, err) + } + } + + if len(vectorIndexes) > 0 { + fmt.Printf("Dropped %d existing vector index(es)\n", len(vectorIndexes)) + } else { + fmt.Println("No existing vector indexes found to drop") + } + + return nil +} + +// PerformVectorSearch performs a vector search using the cosmosSearch aggregation pipeline +func PerformVectorSearch(ctx context.Context, collection *mongo.Collection, client openai.Client, query, vectorField, model string, topK int) ([]SearchResult, error) { + fmt.Printf("Performing vector search for: '%s'\n", query) + + queryEmbedding, err := GenerateEmbedding(ctx, client, query, model) + if err != nil { + return nil, 
fmt.Errorf("error generating embedding: %v", err) + } + + pipeline := []bson.M{ + { + "$search": bson.M{ + "cosmosSearch": bson.M{ + "vector": queryEmbedding, + "path": vectorField, + "k": topK, + }, + }, + }, + { + "$project": bson.M{ + "document": "$$ROOT", + "score": bson.M{"$meta": "searchScore"}, + }, + }, + } + + cursor, err := collection.Aggregate(ctx, pipeline) + if err != nil { + return nil, fmt.Errorf("error performing vector search: %v", err) + } + defer cursor.Close(ctx) + + var results []SearchResult + for cursor.Next(ctx) { + var result SearchResult + if err := cursor.Decode(&result); err != nil { + fmt.Printf("Warning: Could not decode result: %v\n", err) + continue + } + results = append(results, result) + } + + if err := cursor.Err(); err != nil { + return nil, fmt.Errorf("cursor error: %v", err) + } + + return results, nil +} + +// GenerateEmbedding generates an embedding for the given text using Azure OpenAI +func GenerateEmbedding(ctx context.Context, client openai.Client, text, modelName string) ([]float64, error) { + resp, err := client.Embeddings.New(ctx, openai.EmbeddingNewParams{ + Input: openai.EmbeddingNewParamsInputUnion{ + OfString: openai.String(text), + }, + Model: modelName, + }) + if err != nil { + return nil, fmt.Errorf("failed to generate embedding: %v", err) + } + + if len(resp.Data) == 0 { + return nil, fmt.Errorf("no embedding data received") + } + + embedding := make([]float64, len(resp.Data[0].Embedding)) + for i, v := range resp.Data[0].Embedding { + embedding[i] = float64(v) + } + + return embedding, nil +} + +// PrintSearchResults prints search results in a formatted way +func PrintSearchResults(results []SearchResult, algorithm string) { + if len(results) == 0 { + fmt.Println("No search results found.") + return + } + + fmt.Printf("\n%s Search Results (top %d):\n", strings.ToUpper(algorithm), len(results)) + fmt.Println(strings.Repeat("=", 80)) + + for i, result := range results { + doc := result.Document.(bson.D) + var 
hotelName string + for _, elem := range doc { + if elem.Key == "HotelName" { + hotelName = fmt.Sprintf("%v", elem.Value) + break + } + } + + fmt.Printf("%d. HotelName: %s, Score: %.4f\n", i+1, hotelName, result.Score) + } +} + +// FilterDocumentsWithEmbeddings returns only documents that contain the vector field +func FilterDocumentsWithEmbeddings(data []map[string]interface{}, vectorField string) []map[string]interface{} { + var filtered []map[string]interface{} + for _, doc := range data { + if _, exists := doc[vectorField]; exists { + filtered = append(filtered, doc) + } + } + return filtered +} + +// PrepareCollection clears existing data and inserts new documents +func PrepareCollection(ctx context.Context, collection *mongo.Collection, data []map[string]interface{}, batchSize int) (*InsertStats, error) { + fmt.Printf("Preparing collection '%s'...\n", collection.Name()) + + deleteResult, err := collection.DeleteMany(ctx, bson.M{}) + if err != nil { + return nil, fmt.Errorf("failed to clear existing data: %v", err) + } + if deleteResult.DeletedCount > 0 { + fmt.Printf("Cleared %d existing documents from collection\n", deleteResult.DeletedCount) + } + + stats, err := InsertData(ctx, collection, data, batchSize) + if err != nil { + return nil, fmt.Errorf("failed to insert data: %v", err) + } + + return stats, nil +} diff --git a/ai/select-algorithm-java/.gitignore b/ai/select-algorithm-java/.gitignore new file mode 100644 index 0000000..9ae5e73 --- /dev/null +++ b/ai/select-algorithm-java/.gitignore @@ -0,0 +1,7 @@ +target/ +.env +*.class + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-java/README.md b/ai/select-algorithm-java/README.md new file mode 100644 index 0000000..2449f40 --- /dev/null +++ b/ai/select-algorithm-java/README.md @@ -0,0 +1,128 @@ +# Select Algorithm - Java + +This sample demonstrates how to compare all three vector search index algorithms (IVF, HNSW, DiskANN) with Azure 
DocumentDB using the MongoDB Java driver. + +## Prerequisites + +- Java 17 or later +- Maven 3.8+ +- Azure DocumentDB cluster with vector search enabled +- Azure OpenAI resource with an embedding model deployed +- Azure CLI logged in (`az login`) for passwordless authentication + +## Setup + +1. Configure environment variables. + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This writes a `.env` file to the current directory with the resource names and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + +2. Update `.env` with your Azure resource details (if not using `azd`): + - `DOCUMENTDB_CLUSTER_NAME` — your DocumentDB cluster name + - `AZURE_OPENAI_EMBEDDING_ENDPOINT` — your Azure OpenAI endpoint + - `AZURE_OPENAI_EMBEDDING_MODEL` — deployment name (e.g., `text-embedding-3-small`) + - `DATA_FILE_WITH_VECTORS` — path to the pre-computed vectors JSON file + +3. Copy the shared data file into the `data` folder: + + ```bash + cp ../data/Hotels_Vector.json data/ 
+ ``` + +## Build + +```bash +mvn clean compile +``` + +## Run + +Compare all 9 algorithm × similarity combinations: + +```bash +mvn exec:java -Pcompare +``` + +Or via the `ALGORITHM` environment variable: + +```bash +ALGORITHM=compare mvn exec:java +``` + +On Windows (PowerShell): + +```powershell +$env:ALGORITHM="compare"; mvn exec:java +``` + +## Algorithms + +| Algorithm | Description | Best For | +|-----------|-------------|----------| +| **IVF** | Inverted File index — partitions vectors into clusters | Large datasets with batch queries | +| **HNSW** | Hierarchical Navigable Small World graph | Low-latency, high-recall searches | +| **DiskANN** | Disk-based Approximate Nearest Neighbor | Very large datasets that exceed memory | + +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `DOCUMENTDB_CLUSTER_NAME` | (required) | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | (required) | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | (required) | Embedding model deployment name | +| `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to vectors JSON file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field name containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Vector dimensions | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Target database name | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data loading | +| `EMBEDDING_SIZE_BATCH` | `16` | Batch size for embedding requests | +| `ALGORITHM` | (empty = all) | Which algorithm to run | +| `SIMILARITY` | (empty = all) | Similarity metric: `COS`, `L2`, `IP` | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Print detailed per-index results | + +## Authentication + +This sample uses **passwordless authentication** via `DefaultAzureCredential`: + +- **DocumentDB**: OIDC mechanism with Azure identity +- **Azure OpenAI**: Entra ID 
token-based auth + +Ensure your identity has the appropriate RBAC roles assigned on both resources. + +### What It Does + +1. Connects to DocumentDB and loads hotel data into a single `hotels` collection +2. Generates one embedding for the query text (reused for all searches) +3. For each of the 9 algorithm/metric combinations: creates the index → searches → drops the index +4. Runs the combinations one at a time, because DocumentDB allows only one vector index per field and the previous index must be dropped before the next one is created +5. Prints a formatted comparison table with scores, top results, and key insights + +### Index Parameters + +| Algorithm | Kind | Parameters | +|-----------|------|------------| +| IVF | `vector-ivf` | numLists=1 | +| HNSW | `vector-hnsw` | m=16, efConstruction=64 | +| DiskANN | `vector-diskann` | maxDegree=32, lBuild=50 | + +## Project Structure + +``` +src/main/java/com/azure/documentdb/selectalgorithm/ +├── Main.java — Entry point, runs CompareAll +├── Utils.java — Shared helpers (connection, embedding, data loading) +└── CompareAll.java — Unified comparison runner (all 9 combinations) +``` diff --git a/ai/select-algorithm-java/data/README.md b/ai/select-algorithm-java/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-java/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. 
diff --git a/ai/select-algorithm-java/output/compare_all.txt b/ai/select-algorithm-java/output/compare_all.txt new file mode 100644 index 0000000..cbde87e --- /dev/null +++ b/ai/select-algorithm-java/output/compare_all.txt @@ -0,0 +1,54 @@ +============================================== + Azure DocumentDB - Compare All Algorithms +============================================== + Query: "luxury hotel near the beach" + Top K: 5 + Metrics: COS, L2, IP + Algos: IVF, HNSW, DiskANN + + Loading data from: data/Hotels_Vector.json + Loaded 50 documents + Collection reset. + + Generating embedding for: "luxury hotel near the beach" + Embedding generated (1536 dimensions) + + Running 9 algorithm × metric combinations... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ 
+├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DISKANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DISKANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DISKANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + + Cleanup: dropping comparison collection... + Cleanup: dropped collection 'hotels' +============================================== + Comparison complete. 
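+============================================== diff --git a/ai/select-algorithm-java/pom.xml b/ai/select-algorithm-java/pom.xml new file mode 100644 index 0000000..99c57e9 --- /dev/null +++ b/ai/select-algorithm-java/pom.xml @@ -0,0 +1,79 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>com.azure.documentdb</groupId> + <artifactId>select-algorithm-java</artifactId> + <version>1.0.0</version> + <packaging>jar</packaging> + + <name>DocumentDB Select Algorithm - Java</name> + <description>Demonstrates IVF, HNSW, and DiskANN vector search indexes with Azure DocumentDB</description> + + <properties> + <maven.compiler.source>17</maven.compiler.source> + <maven.compiler.target>17</maven.compiler.target> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>org.mongodb</groupId> + <artifactId>mongodb-driver-sync</artifactId> + <version>5.4.0</version> + </dependency> + <dependency> + <groupId>com.azure</groupId> + <artifactId>azure-identity</artifactId> + <version>1.16.0</version> + </dependency> + <dependency> + <groupId>com.azure</groupId> + <artifactId>azure-ai-openai</artifactId> + <version>1.0.0-beta.16</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.13.0</version> + <configuration> + <source>17</source> + <target>17</target> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>exec-maven-plugin</artifactId> + <version>3.4.1</version> + <configuration> + <mainClass>com.azure.documentdb.selectalgorithm.Main</mainClass> + </configuration> + </plugin> + </plugins> + </build> + + <profiles> + <profile> + <id>compare</id> + <build> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>exec-maven-plugin</artifactId> + <version>3.4.1</version> + <configuration> + <mainClass>com.azure.documentdb.selectalgorithm.CompareAll</mainClass> + </configuration> + </plugin> + </plugins> + </build> + </profile> + </profiles> +</project> diff --git a/ai/select-algorithm-java/quickstart.md b/ai/select-algorithm-java/quickstart.md new file mode 100644 index 0000000..62a1e09 --- /dev/null +++ b/ai/select-algorithm-java/quickstart.md @@ -0,0 +1,436 @@ +--- +title: Quickstart - Vector index with Java +description: Test and compare DiskANN, HNSW, and IVF vector indexes in Azure DocumentDB using Java to select the best algorithm for your vector search workload. +ms.devlang: java +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with Java in Azure DocumentDB + +This quickstart compares vector index algorithms (DiskANN, HNSW, IVF) in Azure DocumentDB using Java to help you select the best configuration for your vector search workload. The sample uses the same hotel dataset with pre-calculated vectors as the other quickstarts to demonstrate performance differences across algorithms and similarity functions. 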
+ + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [Java 17 or higher](/java/openjdk/download) + +- [Maven 3.6 or higher](https://maven.apache.org/download.cgi) + +## Create data file with vectors + +1. Create a new data directory for the hotels data file: + + ### [Bash](#tab/bash) + + ```bash + mkdir data + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name data + ``` + + --- + +2. Download the `Hotels_Vector.json` data file with vectors to your `data` directory: + + ### [Bash](#tab/bash) + + ```bash + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify: Confirm the file exists and is not empty: + + ### [Bash](#tab/bash) + + ```bash + ls -lh data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-Item data\Hotels_Vector.json + ``` + + --- + +## Create a Java project + +1. Create a new directory for your project and open it in Visual Studio Code: + + ### [Bash](#tab/bash) + + ```bash + mkdir select-algorithm-java + cd select-algorithm-java + code . + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name select-algorithm-java + Set-Location select-algorithm-java + code . + ``` + + --- + +2. Create a standard Maven project structure: + + ### [Bash](#tab/bash) + + ```bash + mkdir -p src/main/java/com/azure/documentdb/selectalgorithm + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Path "src\main\java\com\azure\documentdb\selectalgorithm" -Force + ``` + + --- + +3. 
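Create a `pom.xml` file in the root directory with the following content: + + ```xml + <?xml version="1.0" encoding="UTF-8"?> + <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>com.azure.documentdb</groupId> + <artifactId>select-algorithm-java</artifactId> + <version>1.0.0</version> + <packaging>jar</packaging> + + <name>DocumentDB Select Algorithm - Java</name> + <description>Demonstrates IVF, HNSW, and DiskANN vector search indexes with Azure DocumentDB</description> + + <properties> + <maven.compiler.source>17</maven.compiler.source> + <maven.compiler.target>17</maven.compiler.target> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>org.mongodb</groupId> + <artifactId>mongodb-driver-sync</artifactId> + <version>5.4.0</version> + </dependency> + <dependency> + <groupId>com.azure</groupId> + <artifactId>azure-identity</artifactId> + <version>1.16.0</version> + </dependency> + <dependency> + <groupId>com.azure</groupId> + <artifactId>azure-ai-openai</artifactId> + <version>1.0.0-beta.16</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.13.0</version> + <configuration> + <source>17</source> + <target>17</target> + </configuration> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>exec-maven-plugin</artifactId> + <version>3.4.1</version> + <configuration> + <mainClass>com.azure.documentdb.selectalgorithm.Main</mainClass> + </configuration> + </plugin> + </plugins> + </build> + + <profiles> + <profile> + <id>compare</id> + <build> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>exec-maven-plugin</artifactId> + <version>3.4.1</version> + <configuration> + <mainClass>com.azure.documentdb.selectalgorithm.CompareAll</mainClass> + </configuration> + </plugin> + </plugins> + </build> + </profile> + </profiles> + </project> + ``` + + Verify: Run `mvn dependency:resolve` to confirm all dependencies resolve without errors. + +4. Set environment variables in your shell before running the sample: + + ### [Bash](#tab/bash) + + ```bash + export DOCUMENTDB_CLUSTER_NAME=<your-cluster-name> + export AZURE_OPENAI_EMBEDDING_ENDPOINT=https://<your-openai-resource>.openai.azure.com/ + export AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + export AZURE_DOCUMENTDB_DATABASENAME=Hotels + export DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json + export EMBEDDED_FIELD=DescriptionVector + export EMBEDDING_DIMENSIONS=1536 + export LOAD_SIZE_BATCH=100 + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + $env:DOCUMENTDB_CLUSTER_NAME="<your-cluster-name>" + $env:AZURE_OPENAI_EMBEDDING_ENDPOINT="https://<your-openai-resource>.openai.azure.com/" + $env:AZURE_OPENAI_EMBEDDING_MODEL="text-embedding-3-small" + $env:AZURE_DOCUMENTDB_DATABASENAME="Hotels" + $env:DATA_FILE_WITH_VECTORS="data/Hotels_Vector.json" + $env:EMBEDDED_FIELD="DescriptionVector" + $env:EMBEDDING_DIMENSIONS="1536" + $env:LOAD_SIZE_BATCH="100" + ``` + + --- + + Replace the placeholder values with your Azure resource information: + + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure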
OpenAI resource endpoint URL + - `AZURE_OPENAI_EMBEDDING_MODEL`: Your Azure OpenAI embedding deployment name + + The sample always runs all 9 combinations (3 algorithms × 3 metrics). The Java code doesn't read the `ALGORITHM` or `SIMILARITY` environment variables, so you don't need to set them. + + This sample uses passwordless authentication with `DefaultAzureCredential`, which requires your identity to have proper RBAC roles assigned. For more information on authentication options, see [Authenticate Java apps to Azure services by using the Azure SDK for Java](/azure/developer/java/sdk/authentication/overview). + +## Create code files + +When you are done, the project structure should look like this: + +```text +select-algorithm-java/ +├── data/ +│ └── README.md +├── output/ +│ └── compare_all.txt +├── src/main/java/com/azure/documentdb/selectalgorithm/ +│ ├── CompareAll.java +│ ├── Main.java +│ └── Utils.java +├── .gitignore +├── pom.xml +├── quickstart.md +└── README.md +``` + +## Create the algorithm comparison code + +### Create utility functions + +Create `src/main/java/com/azure/documentdb/selectalgorithm/Utils.java` and paste the following code: + +:::code language="java" source="~/../documentdb-samples/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java" ::: + +This utility class provides: + +- **Environment variable management**: Reads configuration from environment variables by using `System.getenv()` +- **Passwordless authentication**: Uses `DefaultAzureCredential` for both MongoDB and Azure OpenAI +- **MongoDB client creation**: Configures OIDC authentication for DocumentDB +- **Azure OpenAI client creation**: Sets up the OpenAI client for embedding generation +- **Data loading**: Reads hotel data from JSON file +- **Embedding generation**: Creates vector embeddings for text queries +- **Index configuration**: Generates algorithm-specific vector index options +- **Search configuration**: Generates algorithm-specific
search parameters +- **Results formatting**: Prints ranked search results with their scores + +### Create main comparison logic + +Create the following source files in `src/main/java/com/azure/documentdb/selectalgorithm/`: + +#### CompareAll.java + +:::code language="java" source="~/../documentdb-samples/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java" ::: + +#### Main.java + +:::code language="java" source="~/../documentdb-samples/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java" ::: + + +This main comparison logic provides: + +- **Algorithm comparison logic**: Tests all combinations of algorithms and similarity functions +- **Collection management**: Uses a single `hotels` collection and replaces the vector index for each configuration +- **Data loading**: Inserts hotel data in batches +- **Index creation**: Creates vector indexes for each algorithm and metric combination +- **Result comparison**: Records the top two results and their similarity scores for each combination +- **Results display**: Outputs comparison table + +## Run the code + +1. Compile the project: + + ```bash + mvn clean compile + ``` + + Verify: The build output ends with `BUILD SUCCESS`. + +2. Run the comparison entry point. `Main.java` calls `CompareAll.run()` and always executes all 9 combinations (3 algorithms × 3 metrics): + + ### [Bash](#tab/bash) + + ```bash + mvn exec:java -Dexec.mainClass="com.azure.documentdb.selectalgorithm.Main" + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + mvn exec:java "-Dexec.mainClass=com.azure.documentdb.selectalgorithm.Main" + ``` + + --- + + The sample has no single-algorithm mode: the Java code doesn't read the `ALGORITHM` or `SIMILARITY` environment variables, so setting them has no effect.
+ +The program prints output similar to the following: + +```text +============================================== + Azure DocumentDB - Compare All Algorithms +============================================== + Query: "luxury hotel near the beach" + Top K: 5 + Metrics: COS, L2, IP + Algos: IVF, HNSW, DiskANN + + Loading data from: data/Hotels_Vector.json + Loaded 50 documents + Collection reset. + + Generating embedding for: "luxury hotel near the beach" + Embedding generated (1536 dimensions) + + Running 9 algorithm × metric combinations... + ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ 
+├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DISKANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DISKANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DISKANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Summary: 9 succeeded, 0 failed + + Cleanup: dropping comparison collection... + Cleanup: dropped collection 'hotels' +============================================== + Comparison complete. +============================================== +``` + +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. + +## Understanding the results + +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `MongoTimeoutException` | Verify the `DOCUMENTDB_CLUSTER_NAME` environment variable, and ensure your IP is in the DocumentDB firewall rules. | +| `MongoSecurityException` | Check credentials in connection string. | +| Maven build failures | Run `mvn dependency:resolve` to check for missing dependencies. Ensure Java 17+ is installed. | +| `No plugin found for prefix 'exec'` | Add `exec-maven-plugin` to your `pom.xml` as shown in this article. 
| + +## Clean up resources + +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "mongodb+srv://<your-cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript +use Hotels +db.dropDatabase() +``` + +### [VS Code extension](#tab/vscode) + +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. + +--- + +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources. + +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java new file mode 100644 index 0000000..66281ed --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/CompareAll.java @@ -0,0 +1,263 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; +import org.bson.Document; + +import java.util.ArrayList; +import java.util.List; + +/** + * Unified comparison runner that executes all 9 combinations + * (3 algorithms x 3 similarity metrics) and prints a formatted table.
+ */ +public class CompareAll { + + private static final String COLLECTION_NAME = "hotels"; + private static final String[] ALGORITHMS = {"ivf", "hnsw", "diskann"}; + private static final String[] METRICS = {"COS", "L2", "IP"}; + + public static void main(String[] args) { + run(); + } + + public static void run() { + String queryText = Utils.getEnv("QUERY_TEXT", "luxury hotel near the beach"); + int topK = Integer.parseInt(Utils.getEnv("TOP_K", "5")); + + String databaseName = Utils.getEnv("AZURE_DOCUMENTDB_DATABASENAME", "Hotels"); + String dataFile = Utils.getEnv("DATA_FILE_WITH_VECTORS", "data/Hotels_Vector.json"); + String vectorField = Utils.getEnv("EMBEDDED_FIELD", "DescriptionVector"); + int dimensions = Integer.parseInt(Utils.getEnv("EMBEDDING_DIMENSIONS", "1536")); + String model = Utils.getEnv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"); + + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.printf(" Query: \"%s\"%n", queryText); + System.out.printf(" Top K: %d%n", topK); + System.out.printf(" Metrics: COS, L2, IP%n"); + System.out.printf(" Algos: IVF, HNSW, DiskANN%n"); + System.out.println(); + + List results = new ArrayList<>(); + + try (MongoClient mongoClient = Utils.getMongoClient()) { + MongoDatabase database = mongoClient.getDatabase(databaseName); + MongoCollection collection = database.getCollection(COLLECTION_NAME); + + // Load data ONCE into the single collection + System.out.println(" Loading data from: " + dataFile); + List data = Utils.readJsonFile(dataFile); + System.out.printf(" Loaded %d documents%n", data.size()); + + collection.drop(); + System.out.println(" Collection reset."); + Utils.insertData(collection, data, 100); + + // Generate ONE embedding for the query (reused for all 9 searches) + OpenAIClient aiClient = Utils.getOpenAIClient(); + 
System.out.printf("%n Generating embedding for: \"%s\"%n", queryText); + List queryVector = Utils.getEmbedding(aiClient, queryText, model); + System.out.printf(" Embedding generated (%d dimensions)%n%n", queryVector.size()); + + // Convert to doubles for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + // Run 9 algorithm × metric combinations sequentially (create→search→drop) + // DocumentDB does not allow multiple vector indexes of the same kind + // on the same field path simultaneously. + System.out.println(" Running 9 algorithm × metric combinations...\n"); + for (String algo : ALGORITHMS) { + for (String metric : METRICS) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + // 1. Drop all existing vector indexes + dropVectorIndexes(collection, vectorField); + + // 2. Create this specific index + createIndex(database, collection, vectorField, dimensions, algo, metric); + System.out.printf(" ✓ %s created%n", indexName); + + // 3. Search with retries while the index becomes available + List searchResults = performSearchWithRetry( + collection, vectorAsDoubles, vectorField, topK, indexName); + if (searchResults.isEmpty()) { + results.add(new SearchResult(algo.toUpperCase(), metric, "(failed)", 0.0, "(failed)", 0.0)); + continue; + } + + // 4. Extract top 2 results + String top1Name = "-"; double top1Score = 0.0; + String top2Name = "-"; double top2Score = 0.0; + if (!searchResults.isEmpty()) { + Document top1 = searchResults.get(0); + top1Name = top1.getString("HotelName") != null ? top1.getString("HotelName") : "-"; + top1Score = top1.getDouble("score") != null ? top1.getDouble("score") : 0.0; + } + if (searchResults.size() > 1) { + Document top2 = searchResults.get(1); + top2Name = top2.getString("HotelName") != null ? top2.getString("HotelName") : "-"; + top2Score = top2.getDouble("score") != null ? 
top2.getDouble("score") : 0.0; + } + results.add(new SearchResult(algo.toUpperCase(), metric, top1Name, top1Score, top2Name, top2Score)); + } + } + + // Print comparison table + printComparisonTable(results); + + int successCount = (int) results.stream().filter(r -> !r.top1Name().equals("(failed)")).count(); + if (successCount == 0) { + System.out.println("\n❌ All 9 comparisons failed — no algorithm returned results."); + System.exit(1); + } else { + System.out.printf("%nSummary: %d succeeded, %d failed%n", successCount, 9 - successCount); + } + + // Cleanup: drop the comparison collection + System.out.println("\n Cleanup: dropping comparison collection..."); + collection.drop(); + System.out.println(" Cleanup: dropped collection 'hotels'"); + } + } + + private static void dropVectorIndexes(MongoCollection collection, String vectorField) { + for (Document idx : collection.listIndexes()) { + String name = idx.getString("name"); + Document key = idx.get("key", Document.class); + if (key != null && "cosmosSearch".equals(key.getString(vectorField))) { + try { + collection.dropIndex(name); + } catch (Exception e) { + // Ignore if index doesn't exist + } + } + } + } + + private static void createIndex(MongoDatabase database, MongoCollection collection, + String vectorField, int dimensions, + String algo, String metric) { + String indexName = String.format("vector_%s_%s", algo, metric.toLowerCase()); + + Document cosmosSearchOptions = new Document() + .append("dimensions", dimensions) + .append("similarity", metric); + + switch (algo) { + case "ivf" -> cosmosSearchOptions + .append("kind", "vector-ivf") + .append("numLists", 1); + case "hnsw" -> cosmosSearchOptions + .append("kind", "vector-hnsw") + .append("m", 16) + .append("efConstruction", 64); + case "diskann" -> cosmosSearchOptions + .append("kind", "vector-diskann") + .append("maxDegree", 32) + .append("lBuild", 50); + } + + Document indexDefinition = new Document() + .append("name", indexName) + .append("key", new 
Document(vectorField, "cosmosSearch")) + .append("cosmosSearchOptions", cosmosSearchOptions); + + Document command = new Document("createIndexes", collection.getNamespace().getCollectionName()) + .append("indexes", List.of(indexDefinition)); + + try { + database.runCommand(command); + } catch (Exception e) { + // Idempotent: ignore if index already exists + if (!e.getMessage().contains("already exists")) { + throw e; + } + } + } + + private static List performSearch(MongoCollection collection, + List vectorAsDoubles, + String vectorField, int topK) { + Document searchStage = new Document("$search", new Document("cosmosSearch", new Document() + .append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + return results; + } + + private static List performSearchWithRetry(MongoCollection collection, + List vectorAsDoubles, + String vectorField, + int topK, + String indexName) { + int maxRetries = 5; + int retryDelayMs = 2000; + + for (int attempt = 0; attempt <= maxRetries; attempt++) { + List results = performSearch(collection, vectorAsDoubles, vectorField, topK); + if (!results.isEmpty()) { + return results; + } + + if (attempt < maxRetries) { + System.out.printf(" No results for %s yet. Retrying in 2 seconds (%d/%d)...%n", + indexName, attempt + 1, maxRetries); + try { + Thread.sleep(retryDelayMs); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + } + + System.out.printf(" Search for %s did not return results after %d retries. 
Recording as failed.%n", + indexName, maxRetries); + return List.of(); + } + + private static void printComparisonTable(List results) { + System.out.println("┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐"); + System.out.printf("│ %-9s│ %-7s│ %-27s│ %-7s│ %-27s│ %-7s│ %-6s│%n", + "Algorithm", "Metric", "Top 1 Result", "Score", "Top 2 Result", "Score", "Diff"); + System.out.println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + + for (int i = 0; i < results.size(); i++) { + SearchResult r = results.get(i); + double diff = Math.abs(r.top1Score() - r.top2Score()); + String top1Display = r.top1Name().length() > 27 ? r.top1Name().substring(0, 24) + "..." : r.top1Name(); + String top2Display = r.top2Name().length() > 27 ? r.top2Name().substring(0, 24) + "..." : r.top2Name(); + System.out.printf("│ %-9s│ %-7s│ %-27s│ %-7.4f│ %-27s│ %-7.4f│ %-6.4f│%n", + r.algorithm(), r.metric(), top1Display, r.top1Score(), top2Display, r.top2Score(), diff); + if (i < results.size() - 1) { + System.out.println("├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤"); + } + } + System.out.println("└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘"); + } + + private record SearchResult( + String algorithm, + String metric, + String top1Name, + double top1Score, + String top2Name, + double top2Score) { + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java new file mode 100644 index 0000000..5a9d54c --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Main.java @@ -0,0 +1,17 @@ +package com.azure.documentdb.selectalgorithm; + +public class Main { + + public static void main(String[] 
args) { + System.out.println("=============================================="); + System.out.println(" Azure DocumentDB - Compare All Algorithms"); + System.out.println("=============================================="); + System.out.println(); + + CompareAll.run(); + + System.out.println("=============================================="); + System.out.println(" Comparison complete."); + System.out.println("=============================================="); + } +} diff --git a/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java new file mode 100644 index 0000000..b8b761e --- /dev/null +++ b/ai/select-algorithm-java/src/main/java/com/azure/documentdb/selectalgorithm/Utils.java @@ -0,0 +1,190 @@ +package com.azure.documentdb.selectalgorithm; + +import com.azure.ai.openai.OpenAIClient; +import com.azure.ai.openai.OpenAIClientBuilder; +import com.azure.ai.openai.models.EmbeddingItem; +import com.azure.ai.openai.models.EmbeddingsOptions; +import com.azure.core.credential.AccessToken; +import com.azure.identity.DefaultAzureCredential; +import com.azure.identity.DefaultAzureCredentialBuilder; +import com.mongodb.ConnectionString; +import com.mongodb.MongoClientSettings; +import com.mongodb.MongoCredential; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoClients; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.model.InsertManyOptions; +import org.bson.Document; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class Utils { + + public static String getEnv(String key, String defaultValue) { + String value = System.getenv(key); + return (value != null && !value.isBlank()) ? 
value : defaultValue; + } + + public static String getEnv(String key) { + return getEnv(key, null); + } + + public static MongoClient getMongoClient() { + String clusterName = getEnv("DOCUMENTDB_CLUSTER_NAME"); + if (clusterName == null) { + throw new IllegalStateException("DOCUMENTDB_CLUSTER_NAME environment variable is required"); + } + + String connectionUri = String.format( + "mongodb+srv://%s.global.mongocluster.cosmos.azure.com/", clusterName); + + // Use custom OIDC callback with DefaultAzureCredential + // This chains through CLI, managed identity, etc. + DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build(); + String tokenResource = "https://ossrdbms-aad.database.windows.net/.default"; + + MongoCredential mongoCredential = MongoCredential.createOidcCredential(null) + .withMechanismProperty("OIDC_CALLBACK", (MongoCredential.OidcCallback) context -> { + AccessToken token = credential.getToken( + new com.azure.core.credential.TokenRequestContext() + .addScopes(tokenResource)).block(); + return new MongoCredential.OidcCallbackResult(token.getToken()); + }); + + MongoClientSettings settings = MongoClientSettings.builder() + .applyConnectionString(new ConnectionString(connectionUri)) + .credential(mongoCredential) + .retryWrites(false) + .build(); + + return MongoClients.create(settings); + } + + public static OpenAIClient getOpenAIClient() { + String endpoint = getEnv("AZURE_OPENAI_EMBEDDING_ENDPOINT"); + if (endpoint == null) { + throw new IllegalStateException("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required"); + } + + DefaultAzureCredential credential = new DefaultAzureCredentialBuilder().build(); + + return new OpenAIClientBuilder() + .endpoint(endpoint) + .credential(credential) + .buildClient(); + } + + public static List readJsonFile(String path) { + try { + String content = Files.readString(Path.of(path)); + // Parse JSON array of documents + @SuppressWarnings("unchecked") + List docs = 
Document.parse("{\"data\":" + content + "}").getList("data", Document.class); + return docs; + } catch (IOException e) { + throw new RuntimeException("Failed to read data file: " + path, e); + } + } + + public static void insertData(MongoCollection collection, List data, int batchSize) { + System.out.printf(" Inserting %d documents in batches of %d...%n", data.size(), batchSize); + InsertManyOptions options = new InsertManyOptions().ordered(false); + + for (int i = 0; i < data.size(); i += batchSize) { + List batch = data.subList(i, Math.min(i + batchSize, data.size())); + // Remove _id to avoid duplicate key errors on re-run + List cleaned = new ArrayList<>(); + for (Document doc : batch) { + Document copy = new Document(doc); + copy.remove("_id"); + cleaned.add(copy); + } + try { + collection.insertMany(cleaned, options); + } catch (Exception e) { + // Ignore duplicate key errors on re-insert + if (!e.getMessage().contains("duplicate key")) { + throw e; + } + } + System.out.printf(" Inserted batch %d-%d%n", i + 1, Math.min(i + batchSize, data.size())); + } + System.out.println(" Data insertion complete."); + } + + public static void dropVectorIndexes(MongoCollection collection, String vectorField) { + try { + for (Document idx : collection.listIndexes()) { + String name = idx.getString("name"); + if (name != null && name.contains(vectorField) && !name.equals("_id_")) { + System.out.printf(" Dropping existing index: %s%n", name); + collection.dropIndex(name); + } + } + } catch (Exception e) { + // Ignore errors when indexes don't exist + System.out.println(" No existing vector indexes to drop."); + } + } + + public static List getEmbedding(OpenAIClient client, String text, String model) { + EmbeddingsOptions options = new EmbeddingsOptions(List.of(text)); + List embeddings = client.getEmbeddings(model, options).getData(); + if (embeddings.isEmpty()) { + throw new RuntimeException("No embedding returned for query text"); + } + return 
embeddings.get(0).getEmbedding(); + } + + public static List performVectorSearch( + MongoCollection collection, + OpenAIClient aiClient, + String query, + String vectorField, + String model, + int topK) { + + System.out.printf(" Generating embedding for query: \"%s\"%n", query); + List queryVector = getEmbedding(aiClient, query, model); + System.out.printf(" Embedding generated (%d dimensions)%n", queryVector.size()); + + // Convert List to List for BSON + List vectorAsDoubles = queryVector.stream() + .map(Float::doubleValue) + .toList(); + + Document searchStage = new Document("$search", new Document("cosmosSearch", new Document() + .append("vector", vectorAsDoubles) + .append("path", vectorField) + .append("k", topK))); + + Document projectStage = new Document("$project", new Document() + .append("_id", 0) + .append("HotelName", 1) + .append("Description", 1) + .append("score", new Document("$meta", "searchScore"))); + + List pipeline = List.of(searchStage, projectStage); + List results = new ArrayList<>(); + collection.aggregate(pipeline).forEach(results::add); + + return results; + } + + public static void printResults(List results) { + System.out.println("\n === Search Results ==="); + for (int i = 0; i < results.size(); i++) { + Document doc = results.get(i); + System.out.printf(" %d. 
%s (score: %.4f)%n", + i + 1, + doc.getString("HotelName"), + doc.getDouble("score")); + System.out.printf(" %s%n", doc.getString("Description")); + } + System.out.println(); + } +} diff --git a/ai/select-algorithm-python/.gitignore b/ai/select-algorithm-python/.gitignore new file mode 100644 index 0000000..87965ce --- /dev/null +++ b/ai/select-algorithm-python/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.pyc +.env +.venv/ + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md diff --git a/ai/select-algorithm-python/README.md b/ai/select-algorithm-python/README.md new file mode 100644 index 0000000..1fe7746 --- /dev/null +++ b/ai/select-algorithm-python/README.md @@ -0,0 +1,96 @@ + +# Select Vector Algorithm (Python) + +Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB. Each algorithm is optimized for different dataset sizes and performance requirements. + +## Algorithm Selection Guide + +| Algorithm | Dataset Size | Cluster Tier | Key Parameters | +|-----------|-------------|--------------|----------------| +| IVF | < 10K docs | M10+ | numLists | +| HNSW | 10K-50K | M30+ | m, efConstruction | +| DiskANN | 50K+ | M40+ | maxDegree, lBuild | + +## Prerequisites + +- Azure subscription +- Azure DocumentDB cluster (M40+ for all algorithms, M10+ for IVF only) +- Azure OpenAI resource with `text-embedding-3-small` deployed +- Python 3.10+ +- Azure CLI (`az login` for passwordless auth) + +## Setup + +1. ### Configure environment variables + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file at the repository root with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + +2. Install dependencies: + ```bash + cd src + pip install -r ../requirements.txt + ``` + +3. 
Copy the shared data file from `ai/data` into this sample's `data` folder (run from the `ai/select-algorithm-python` directory): + + ```bash + cp ../data/Hotels_Vector.json data/ + ``` + +4. Ensure you're logged in to Azure: + ```bash + az login + ``` + +## Run + +Compare all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation: + +```bash +cd src +python compare_all.py +``` + +The script creates a single `hotels` collection, loads data once, then for each of the 9 algorithm/metric combinations: creates the index → searches → drops the index. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially. + +## Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `DOCUMENTDB_CLUSTER_NAME` | (required) | DocumentDB cluster name | +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | (required) | Azure OpenAI endpoint | +| `AZURE_OPENAI_EMBEDDING_MODEL` | (required) | Embedding model deployment name | +| `DATA_FILE_WITH_VECTORS` | `../data/Hotels_Vector.json` | Path to vectors JSON file | +| `EMBEDDED_FIELD` | `DescriptionVector` | Field name containing embeddings | +| `EMBEDDING_DIMENSIONS` | `1536` | Vector dimensions | +| `AZURE_DOCUMENTDB_DATABASENAME` | `Hotels` | Target database name | +| `LOAD_SIZE_BATCH` | `100` | Batch size for data loading | +| `EMBEDDING_SIZE_BATCH` | `16` | Batch size for embedding requests | +| `ALGORITHM` | (empty = all) | Which algorithm to run | +| `SIMILARITY` | (empty = all) | Similarity metric: `COS`, `L2`, `IP` | +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per search | +| `VERBOSE` | `false` | Show all k results per combo | diff --git a/ai/select-algorithm-python/data/README.md b/ai/select-algorithm-python/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-python/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample.
+ +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-python/output/compare_all.txt b/ai/select-algorithm-python/output/compare_all.txt new file mode 100644 index 0000000..8719419 --- /dev/null +++ b/ai/select-algorithm-python/output/compare_all.txt @@ -0,0 +1,49 @@ +====================================================================== + Compare All Algorithms — 9 Combinations + (3 Algorithms × 3 Similarity Metrics) +====================================================================== + + Query: "luxury hotel near the beach" + Top K: 5 + +Dropped existing 'hotels' collection (if any) +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Generating embedding for query... +Running 9 vector searches... + + Created index 'vector_ivf_cos' + Created index 'vector_ivf_l2' + Created index 'vector_ivf_ip' + Created index 'vector_hnsw_cos' + Created index 'vector_hnsw_l2' + Created index 'vector_hnsw_ip' + Created index 'vector_diskann_cos' + Created index 'vector_diskann_l2' + Created index 'vector_diskann_ip' ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| Algorithm | Metric | Top 1 Result | Score | Top 2 Result | Score | Diff | ++=============+==========+==========================+=========+===================+=========+========+ +| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | COS | Ocean Water Resort & Spa | 0.6184 
| Windy Ocean Motel | 0.5057 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ + +Summary: 9 succeeded, 0 failed + +Cleanup: dropped collection 'hotels' diff --git a/ai/select-algorithm-python/quickstart.md b/ai/select-algorithm-python/quickstart.md new file mode 100644 index 0000000..07304b6 --- /dev/null +++ b/ai/select-algorithm-python/quickstart.md @@ -0,0 +1,369 @@ +--- +title: Quickstart - Vector index with Python +description: Compare vector index algorithms and similarity functions using the Python SDK in Azure DocumentDB to optimize search performance for your workload. 
+ms.devlang: python +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with Python in Azure DocumentDB + +In this quickstart, you compare three vector index algorithms (DiskANN, HNSW, and IVF) and three similarity functions (cosine, L2, and inner product) to find the optimal configuration for your search workload. This quickstart uses a sample hotel dataset with pre-calculated embeddings from the `text-embedding-3-small` model. + + + +Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/main/ai/select-algorithm-python) on GitHub. + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [Python](https://www.python.org/downloads/) 3.10 or greater + +## Create data file with vectors + +1. Create a new data directory and download the hotels data file with vectors: + + ### [Bash](#tab/bash) + + ```bash + mkdir -p data + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Force -Path data + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify the file was downloaded: + + ### [Bash](#tab/bash) + + ```bash + ls data/ + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem data/ + ``` + + --- + + You should see `Hotels_Vector.json` in the `data` directory. + +## Create a Python project + +1. Create a new directory for your project and open it in Visual Studio Code: + + ### [Bash](#tab/bash) + + ```bash + mkdir -p select-algorithm + cd select-algorithm + code . 
+ ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Force -Path select-algorithm + Set-Location select-algorithm + code . + ``` + + --- + +2. In the terminal, create and activate a virtual environment: + + For Windows: + + ```powershell + python -m venv venv + venv\Scripts\activate + ``` + + For macOS/Linux: + + ```bash + python -m venv venv + source venv/bin/activate + ``` + +3. Install the required packages: + + ```bash + pip install "pymongo>=4.7" openai==1.55.3 azure-identity==1.15.0 + ``` + + - `pymongo`: MongoDB driver for Python (≥4.7 required for OIDC authentication) + - `openai`: OpenAI client library to create vectors + - `azure-identity`: Azure Identity library for passwordless authentication + + Verify the packages are installed: + + ### [Bash](#tab/bash) + + ```bash + pip list | grep pymongo + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + pip list | Select-String pymongo + ``` + + --- + + You should see `pymongo` with a version of 4.7 or greater. + +4. 
Set the required environment variables in your current shell session before you run the sample: + + ### [Bash](#tab/bash) + + ```bash + export AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + export AZURE_OPENAI_EMBEDDING_API_VERSION=2024-10-21 + export AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com + export DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json + export EMBEDDED_FIELD=DescriptionVector + export EMBEDDING_DIMENSIONS=1536 + export LOAD_SIZE_BATCH=100 + export DOCUMENTDB_CLUSTER_NAME= + export AZURE_DOCUMENTDB_DATABASENAME=Hotels + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + $env:AZURE_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small" + $env:AZURE_OPENAI_EMBEDDING_API_VERSION = "2024-10-21" + $env:AZURE_OPENAI_EMBEDDING_ENDPOINT = "https://.openai.azure.com" + $env:DATA_FILE_WITH_VECTORS = "data/Hotels_Vector.json" + $env:EMBEDDED_FIELD = "DescriptionVector" + $env:EMBEDDING_DIMENSIONS = "1536" + $env:LOAD_SIZE_BATCH = "100" + $env:DOCUMENTDB_CLUSTER_NAME = "" + $env:AZURE_DOCUMENTDB_DATABASENAME = "Hotels" + ``` + + --- + + For the passwordless authentication used in this article, replace the placeholder values with your own information: + + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name + + The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + + You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate Python apps to Azure services by using the Azure SDK for Python](/azure/developer/python/sdk/authentication/overview). 
+ +## Create code files + +Create the following project structure: + +``` +select-algorithm-python/ +├── data/ +│ └── README.md +├── output/ +│ └── compare_all.txt +├── src/ +│ ├── compare_all.py +│ └── utils.py +├── .gitignore +├── quickstart.md +├── README.md +└── requirements.txt +``` + +Create the `src` directory: + +### [Bash](#tab/bash) + +```bash +mkdir -p src +``` + +### [PowerShell](#tab/powershell) + +```powershell +New-Item -ItemType Directory -Force -Path src +``` + +--- + +## Create the algorithm comparison code + +Create the `src/compare_all.py` file with the following code: + +:::code language="python" source="~/../documentdb-samples/ai/select-algorithm-python/src/compare_all.py" ::: + +This script orchestrates the algorithm comparison by: + +- Loading configuration from environment variables +- Initializing MongoDB and Azure OpenAI clients with passwordless authentication +- Loading hotel data with pre-calculated embeddings +- Creating the `hotels` collection and inserting the data once, then, for each algorithm/similarity combination, dropping any existing vector index, creating the new index, and executing a search +- Comparing the top two results and their scores across all configurations +- Displaying results in a comparison table + +## Create utility functions + +Create the `src/utils.py` file with the following code: + +:::code language="python" source="~/../documentdb-samples/ai/select-algorithm-python/src/utils.py" ::: + +The utilities provide essential functions for: + +- Passwordless authentication to DocumentDB and Azure OpenAI using DefaultAzureCredential +- Reading JSON data files with error handling +- Batch insertion of documents with DocumentDB's 16 MB payload limit in mind +- Formatted display of search results showing each hotel name and score + +## Run the code + +Execute the comparison script to run all 9 combinations: + +```bash +python src/compare_all.py +``` + +The output matches `output/compare_all.txt`: + +``` +====================================================================== + Compare
All Algorithms — 9 Combinations + (3 Algorithms × 3 Similarity Metrics) +====================================================================== + + Query: "luxury hotel near the beach" + Top K: 5 + +Dropped existing 'hotels' collection (if any) +Loaded 50 documents with embeddings +Inserted 50/50 documents + +Generating embedding for query... +Running 9 vector searches... + + Created index 'vector_ivf_cos' + Created index 'vector_ivf_l2' + Created index 'vector_ivf_ip' + Created index 'vector_hnsw_cos' + Created index 'vector_hnsw_l2' + Created index 'vector_hnsw_ip' + Created index 'vector_diskann_cos' + Created index 'vector_diskann_l2' + Created index 'vector_diskann_ip' ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| Algorithm | Metric | Top 1 Result | Score | Top 2 Result | Score | Diff | ++=============+==========+==========================+=========+===================+=========+========+ +| IVF | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| IVF | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| HNSW | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean 
Motel | 0.5056 | 0.1127 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | COS | Ocean Water Resort & Spa | 0.6184 | Windy Ocean Motel | 0.5057 | 0.1128 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | L2 | Ocean Water Resort & Spa | 0.8735 | Windy Ocean Motel | 0.9942 | 0.1207 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ +| DiskANN | IP | Ocean Water Resort & Spa | 0.6183 | Windy Ocean Motel | 0.5056 | 0.1127 | ++-------------+----------+--------------------------+---------+-------------------+---------+--------+ + +Summary: 9 succeeded, 0 failed + +Cleanup: dropped collection 'hotels' +``` + +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. + +### Run all combinations + +The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + +### [Bash](#tab/bash) + +```bash +python src/compare_all.py +``` + +### [PowerShell](#tab/powershell) + +```powershell +python src/compare_all.py +``` + +--- + +### Understanding the results + +The comparison table helps you choose the best configuration for your workload: + +- **Diff**: Absolute difference between the top-1 and top-2 scores. A larger diff means the best match is more clearly separated from the runner-up. +- **Score**: Similarity score using the selected function. For COS and IP, higher scores indicate better matches; for L2 the score is a distance, so lower is better. +- **Top 1 Result / Top 2 Result**: The two best-matching hotels for the query. Consistency across algorithms indicates stable results.
+ +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `ServerSelectionTimeoutError` | Verify that your environment variables are set in the current shell. Ensure your IP is in the DocumentDB firewall rules. | +| `AuthenticationFailed` | Check that your connection string includes the correct username and password, or that your Microsoft Entra token is valid. | +| `pymongo.errors.OperationFailure` | Ensure the database and collection exist. Check that the vector index was created successfully. | +| `ModuleNotFoundError: No module named 'pymongo'` | Activate your virtual environment and run `pip install "pymongo>=4.7"`. | +| Empty search results | The vector index may not be ready yet. The script includes retry logic, but large datasets may require longer wait times. | + +## Clean up resources + +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "mongodb+srv://.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript +use Hotels +db.dropDatabase() +``` + +### [VS Code extension](#tab/vscode) + +1. Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. + +--- + +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources. 
+ +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-python/requirements.txt b/ai/select-algorithm-python/requirements.txt new file mode 100644 index 0000000..36e664e --- /dev/null +++ b/ai/select-algorithm-python/requirements.txt @@ -0,0 +1,11 @@ +# MongoDB driver for connecting to DocumentDB +pymongo>=4.7.0 + +# Azure OpenAI SDK for generating embeddings +openai>=1.0.0,<2.0.0 + +# Azure authentication library for passwordless connection +azure-identity>=1.15.0 + +# Formatted table output for compare_all.py +tabulate>=0.9.0 diff --git a/ai/select-algorithm-python/src/compare_all.py b/ai/select-algorithm-python/src/compare_all.py new file mode 100644 index 0000000..1f7dcb1 --- /dev/null +++ b/ai/select-algorithm-python/src/compare_all.py @@ -0,0 +1,253 @@ +""" +Compare All Algorithms — Unified comparison runner. + +Executes all 9 combinations (3 algorithms × 3 similarity metrics) in a single +invocation and prints a formatted comparison table. 
+ +Algorithms: IVF, HNSW, DiskANN +Metrics: COS, L2, IP +""" +import os +import sys +import time +from typing import Dict, List, Any + +from tabulate import tabulate +from utils import ( + get_clients_passwordless, get_config, read_file_return_json, + insert_data +) + +# Index definitions: (algo_label, kind, extra_params) +ALGORITHMS = [ + ("IVF", "vector-ivf", {"numLists": 1}), + ("HNSW", "vector-hnsw", {"m": 16, "efConstruction": 64}), + ("DiskANN", "vector-diskann", {"maxDegree": 32, "lBuild": 50}), +] + +METRICS = ["COS", "L2", "IP"] + + +def get_compare_config() -> Dict[str, Any]: + """Load comparison-specific configuration from environment variables.""" + config = get_config() + config["query_text"] = os.getenv("QUERY_TEXT", "luxury hotel near the beach") + config["top_k"] = int(os.getenv("TOP_K", "5")) + return config + + +def index_name(algo: str, metric: str) -> str: + """Generate canonical index name: vector_{algo}_{metric}.""" + return f"vector_{algo.lower()}_{metric.lower()}" + + +def get_existing_index_names(collection) -> List[str]: + """Return names of existing indexes on the collection.""" + return [idx["name"] for idx in collection.list_indexes()] + + +def drop_vector_indexes(collection, vector_field: str) -> None: + """Drop all existing vector indexes on *vector_field*.""" + for idx in collection.list_indexes(): + name = idx.get("name", "") + key = idx.get("key", {}) + if vector_field in key and key[vector_field] == "cosmosSearch": + collection.drop_index(name) + + +def create_vector_index(collection, name: str, kind: str, vector_field: str, + dimensions: int, similarity: str, + extra_params: Dict[str, Any]) -> None: + """Create a single vector index.""" + cosmos_options = { + "kind": kind, + "dimensions": dimensions, + "similarity": similarity, + **extra_params, + } + + index_command = { + "createIndexes": collection.name, + "indexes": [ + { + "name": name, + "key": {vector_field: "cosmosSearch"}, + "cosmosSearchOptions": cosmos_options, + } + ], 
+ } + collection.database.command(index_command) + + +def generate_embedding(azure_openai_client, query_text: str, + model_name: str) -> List[float]: + """Generate a single embedding for the query text.""" + response = azure_openai_client.embeddings.create( + input=[query_text], + model=model_name + ) + return response.data[0].embedding + + +def vector_search_with_index(collection, query_embedding: List[float], + vector_field: str, + top_k: int) -> List[Dict[str, Any]]: + """Run vector search using the single active index and return results.""" + pipeline = [ + { + "$search": { + "cosmosSearch": { + "vector": query_embedding, + "path": vector_field, + "k": top_k + } + } + }, + { + "$project": { + "document": "$$ROOT", + "score": {"$meta": "searchScore"} + } + } + ] + + results = list(collection.aggregate(pipeline)) + + return results + + +def vector_search_with_retry(collection, query_embedding: List[float], + vector_field: str, top_k: int, + index_name_value: str) -> List[Dict[str, Any]]: + """Retry vector search until results are available or retries are exhausted.""" + max_retries = 5 + retry_delay_seconds = 2 + + for attempt in range(max_retries + 1): + results = vector_search_with_index( + collection, query_embedding, vector_field, top_k + ) + if results: + return results + + if attempt < max_retries: + print( + f" No results for '{index_name_value}' yet. " + f"Retrying in {retry_delay_seconds} seconds " + f"({attempt + 1}/{max_retries})..." + ) + time.sleep(retry_delay_seconds) + + print( + f" Search for '{index_name_value}' did not return results " + f"after {max_retries} retries. Recording as failed." 
+ ) + return [] + + +def main(): + print("=" * 70) + print(" Compare All Algorithms — 9 Combinations") + print(" (3 Algorithms × 3 Similarity Metrics)") + print("=" * 70) + + config = get_compare_config() + query_text = config["query_text"] + top_k = config["top_k"] + + print(f"\n Query: \"{query_text}\"") + print(f" Top K: {top_k}\n") + + mongo_client, azure_openai_client = get_clients_passwordless() + + try: + database = mongo_client[config["database_name"]] + + # Drop collection for a clean comparison + database.drop_collection("hotels") + print("Dropped existing 'hotels' collection (if any)") + + # Create fresh collection and load data + collection = database["hotels"] + data = read_file_return_json(config["data_file"]) + documents = [doc for doc in data if config["vector_field"] in doc] + print(f"Loaded {len(documents)} documents with embeddings") + insert_data(collection, documents, config["batch_size"]) + + # Generate ONE embedding for the query + print("\nGenerating embedding for query...") + query_embedding = generate_embedding( + azure_openai_client, query_text, config["model_name"] + ) + + # Run all 9 searches sequentially (create→search→drop for each) + print("Running 9 vector searches...\n") + table_rows = [] + + for algo_label, kind, extra_params in ALGORITHMS: + for metric in METRICS: + name = index_name(algo_label, metric) + # Drop all vector indexes first + drop_vector_indexes(collection, config["vector_field"]) + # Create this specific index + create_vector_index( + collection, name, kind, config["vector_field"], + config["dimensions"], metric, extra_params + ) + print(f" Created index '{name}'") + results = vector_search_with_retry( + collection, query_embedding, config["vector_field"], top_k, name + ) + + if not results: + table_rows.append([ + algo_label, + metric, + "(failed)", + f"{0:.4f}", + "(failed)", + f"{0:.4f}", + f"{0:.4f}", + ]) + continue + + top1_name = results[0].get("document", results[0]).get("HotelName", "Unknown") if 
len(results) > 0 else "(no results)" + top1_score = results[0].get("score", 0) if len(results) > 0 else 0 + top2_name = results[1].get("document", results[1]).get("HotelName", "Unknown") if len(results) > 1 else "(no results)" + top2_score = results[1].get("score", 0) if len(results) > 1 else 0 + + table_rows.append([ + algo_label, + metric, + top1_name, + f"{top1_score:.4f}", + top2_name, + f"{top2_score:.4f}", + f"{abs(top1_score - top2_score):.4f}", + ]) + + # Print comparison table + headers = ["Algorithm", "Metric", "Top 1 Result", "Score", + "Top 2 Result", "Score", "Diff"] + print(tabulate(table_rows, headers=headers, tablefmt="grid")) + + success_count = sum(1 for row in table_rows if row[2] != "(failed)") + if success_count == 0: + print("\n❌ All 9 comparisons failed — no algorithm returned results.") + sys.exit(1) + else: + print(f"\nSummary: {success_count} succeeded, {9 - success_count} failed") + + finally: + # Cleanup: drop the comparison collection + try: + database = mongo_client[config["database_name"]] + database.drop_collection("hotels") + print("\nCleanup: dropped collection 'hotels'") + except Exception as e: + print(f"Cleanup warning: {e}") + mongo_client.close() + + +if __name__ == "__main__": + main() diff --git a/ai/select-algorithm-python/src/utils.py b/ai/select-algorithm-python/src/utils.py new file mode 100644 index 0000000..de0f8b4 --- /dev/null +++ b/ai/select-algorithm-python/src/utils.py @@ -0,0 +1,171 @@ +import json +import os +import time +import warnings +from typing import Dict, List, Any, Optional, Tuple + +# Suppress the PyMongo DocumentDB cluster detection warning +warnings.filterwarnings( + "ignore", + message="You appear to be connected to a DocumentDB cluster.*", +) + +from pymongo import MongoClient, InsertOne +from pymongo.collection import Collection +from pymongo.errors import BulkWriteError +from azure.identity import DefaultAzureCredential, get_bearer_token_provider +from pymongo.auth_oidc import OIDCCallback, 
OIDCCallbackContext, OIDCCallbackResult +from openai import AzureOpenAI + + +class AzureIdentityTokenCallback(OIDCCallback): + def __init__(self, credential): + self.credential = credential + + def fetch(self, context: OIDCCallbackContext) -> OIDCCallbackResult: + token = self.credential.get_token( + "https://ossrdbms-aad.database.windows.net/.default").token + return OIDCCallbackResult(access_token=token) + + +def get_clients_passwordless() -> Tuple[MongoClient, AzureOpenAI]: + """Create MongoDB and Azure OpenAI clients using passwordless auth.""" + cluster_name = os.getenv("DOCUMENTDB_CLUSTER_NAME") + if not cluster_name: + raise ValueError("DOCUMENTDB_CLUSTER_NAME environment variable is required") + + credential = DefaultAzureCredential() + + mongo_client = MongoClient( + f"mongodb+srv://{cluster_name}.global.mongocluster.cosmos.azure.com/", + connectTimeoutMS=120000, + tls=True, + retryWrites=False, + authMechanism="MONGODB-OIDC", + authMechanismProperties={"OIDC_CALLBACK": AzureIdentityTokenCallback(credential)} + ) + + azure_openai_endpoint = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") + if not azure_openai_endpoint: + raise ValueError("AZURE_OPENAI_EMBEDDING_ENDPOINT environment variable is required") + + token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default") + + azure_openai_client = AzureOpenAI( + azure_endpoint=azure_openai_endpoint, + azure_ad_token_provider=token_provider, + api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION", "2023-05-15") + ) + + return mongo_client, azure_openai_client + + +def get_config() -> Dict[str, Any]: + """Load configuration from environment variables.""" + return { + 'database_name': os.getenv('AZURE_DOCUMENTDB_DATABASENAME', 'Hotels'), + 'data_file': os.getenv('DATA_FILE_WITH_VECTORS', 'data/Hotels_Vector.json'), + 'vector_field': os.getenv('EMBEDDED_FIELD', 'DescriptionVector'), + 'model_name': os.getenv('AZURE_OPENAI_EMBEDDING_MODEL', 'text-embedding-3-small'), + 
'dimensions': int(os.getenv('EMBEDDING_DIMENSIONS', '1536')), + 'batch_size': int(os.getenv('LOAD_SIZE_BATCH', '100')), + 'similarity': os.getenv('SIMILARITY', ''), + } + + +def read_file_return_json(file_path: str) -> List[Dict[str, Any]]: + """Read a JSON file and return the parsed data.""" + try: + with open(file_path, 'r', encoding='utf-8') as file: + return json.load(file) + except FileNotFoundError: + print(f"Error: File '{file_path}' not found") + raise + + +def insert_data(collection: Collection, data: List[Dict[str, Any]], + batch_size: int = 100) -> Dict[str, Any]: + """Insert data into collection in batches, skipping if already populated.""" + total_documents = len(data) + + existing_count = collection.count_documents({}) + if existing_count >= total_documents: + print(f"Collection already has {existing_count} documents, skipping insert") + return {'total': total_documents, 'inserted': 0, 'skipped': True} + + if existing_count > 0: + collection.delete_many({}) + + inserted_count = 0 + for i in range(0, total_documents, batch_size): + batch = data[i:i + batch_size] + try: + operations = [InsertOne(doc) for doc in batch] + result = collection.bulk_write(operations, ordered=False) + inserted_count += result.inserted_count + except BulkWriteError as e: + inserted_count += e.details.get('nInserted', 0) + time.sleep(0.1) + + print(f"Inserted {inserted_count}/{total_documents} documents") + return {'total': total_documents, 'inserted': inserted_count, 'skipped': False} + + +def drop_vector_indexes(collection: Collection, vector_field: str) -> None: + """Drop any existing vector indexes on the specified field.""" + try: + indexes = list(collection.list_indexes()) + for index in indexes: + if 'key' in index and vector_field in index['key']: + if index['key'][vector_field] == 'cosmosSearch': + collection.drop_index(index['name']) + print(f"Dropped existing vector index: {index['name']}") + except Exception as e: + print(f"Warning: Error dropping indexes: {e}") + + 
+def perform_vector_search(collection: Collection, + azure_openai_client: AzureOpenAI, + query_text: str, + vector_field: str, + model_name: str, + top_k: int = 5) -> List[Dict[str, Any]]: + """Perform vector search using the $search aggregation stage.""" + embedding_response = azure_openai_client.embeddings.create( + input=[query_text], + model=model_name + ) + query_embedding = embedding_response.data[0].embedding + + pipeline = [ + { + "$search": { + "cosmosSearch": { + "vector": query_embedding, + "path": vector_field, + "k": top_k + } + } + }, + { + "$project": { + "document": "$$ROOT", + "score": {"$meta": "searchScore"} + } + } + ] + + return list(collection.aggregate(pipeline)) + + +def print_search_results(results: List[Dict[str, Any]], algorithm: str) -> None: + """Print formatted search results.""" + print(f"\n{'='*60}") + print(f" {algorithm} Search Results ({len(results)} found)") + print(f"{'='*60}") + for i, result in enumerate(results, 1): + doc = result.get('document', result) + name = doc.get('HotelName', doc.get('name', 'Unknown')) + score = result.get('score', 0) + print(f" {i}. {name} (score: {score:.4f})") + print() diff --git a/ai/select-algorithm-typescript/.gitignore b/ai/select-algorithm-typescript/.gitignore new file mode 100644 index 0000000..9a088e4 --- /dev/null +++ b/ai/select-algorithm-typescript/.gitignore @@ -0,0 +1,8 @@ +node_modules/ +dist/ +.env + +# Local data copy (user copies from ai/data/) +data/*.json +!data/README.md +Hotels_Vector.json diff --git a/ai/select-algorithm-typescript/README.md b/ai/select-algorithm-typescript/README.md new file mode 100644 index 0000000..73414a8 --- /dev/null +++ b/ai/select-algorithm-typescript/README.md @@ -0,0 +1,116 @@ +# Select Algorithm — TypeScript + +Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB using TypeScript. 
+ +## Prerequisites + +- [Node.js 20+](https://nodejs.org/) +- [Azure CLI](https://learn.microsoft.com/cli/azure/install-azure-cli) (for `az login`) +- An Azure DocumentDB cluster with vector search enabled +- An Azure OpenAI resource with an embedding model deployed + +## Setup + +1. **Install dependencies:** + + ```bash + npm install + ``` + +2. **Sign in to Azure** (for passwordless authentication): + + ```bash + az login + ``` + +3. **Configure environment variables:** + + After deploying with `azd up`, create a `.env` file with your provisioned resource values: + + ```bash + azd env get-values > .env + ``` + + This creates a `.env` file in the project folder with the connection strings and endpoints needed to run the sample. + + Alternatively, copy the example and fill in values manually: + + ```bash + cp .env.example .env + ``` + + | Variable | Description | + |---|---| + | `DOCUMENTDB_CLUSTER_NAME` | Your DocumentDB cluster name | + | `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Azure OpenAI endpoint URL | + | `AZURE_OPENAI_EMBEDDING_MODEL` | Embedding model deployment name | + | `AZURE_OPENAI_EMBEDDING_API_VERSION` | Azure OpenAI API version | + | `AZURE_DOCUMENTDB_DATABASENAME` | Database name (default: `Hotels`) | + | `DATA_FILE_WITH_VECTORS` | Path to JSON data file with vectors | + | `EMBEDDED_FIELD` | Field name containing the vector (default: `DescriptionVector`) | + | `EMBEDDING_DIMENSIONS` | Vector dimensions (default: `1536`) | + | `LOAD_SIZE_BATCH` | Batch size for data insertion | + | `SIMILARITY` | Similarity metric: `COS`, `L2`, or `IP` | + +5. **Copy the shared data file** into this directory: + + ```bash + cp ../data/Hotels_Vector.json . + ``` + + The `DATA_FILE_WITH_VECTORS` env var defaults to `../data/Hotels_Vector.json`. + +6. 
**Build the project:** + + ```bash + npm run build + ``` + +## Run + +Run all 9 combinations (3 algorithms × 3 similarity metrics) in a single invocation and view a formatted comparison table: + +```bash +npm start +``` + +**Environment variables** (optional overrides): + +| Variable | Default | Description | +|---|---|---| +| `QUERY_TEXT` | `luxury hotel near the beach` | Search query text | +| `TOP_K` | `5` | Number of results per combination | +| `VERBOSE` | `false` | When `true`, shows all k results per combo | + +The script creates a single `hotels` collection, loads data once, then for each of the 9 algorithm/metric combinations: creates the index → searches → drops the index. DocumentDB only allows one vector index per kind per field, so indexes are created sequentially. + +**Output:** +``` +==================================================================================================== + COMPARISON RESULTS +==================================================================================================== +Algorithm Similarity #1 Result #1 Score #2 Result #2 Score Diff +---------------------------------------------------------------------------------------------------- +IVF COS Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +IVF L2 Ocean Water Resort &.. 0.8736 Windy Ocean Motel 0.9943 -0.1208 +IVF IP Ocean Water Resort &.. 0.6184 Windy Ocean Motel 0.5056 0.1128 +... +==================================================================================================== + KEY INSIGHTS +==================================================================================================== + 🎯 Highest #1 score: IVF/COS (0.6184) + 📊 Biggest separation: IVF/COS (diff: 0.1128) + 🔑 All algorithms return the same top results — algorithm choice + affects performance at scale, not accuracy on small datasets. + 📐 COS and IP produce identical scores (normalized embeddings). + 📏 L2 scores are distances (lower = closer), not similarities. 
+==================================================================================================== +``` + +## Algorithm comparison + +| Algorithm | Index type | Best for | +|---|---|---| +| **IVF** | `vector-ivf` | Smaller datasets, lower memory usage | +| **HNSW** | `vector-hnsw` | Fast approximate search, balanced recall/speed | +| **DiskANN** | `vector-diskann` | Large-scale datasets, disk-based search | diff --git a/ai/select-algorithm-typescript/data/README.md b/ai/select-algorithm-typescript/data/README.md new file mode 100644 index 0000000..c918009 --- /dev/null +++ b/ai/select-algorithm-typescript/data/README.md @@ -0,0 +1,5 @@ +# Data Files + +Copy `Hotels_Vector.json` into this folder before running the sample. + +The file is available in the repository at `ai/data/Hotels_Vector.json`. diff --git a/ai/select-algorithm-typescript/output/compare_all.txt b/ai/select-algorithm-typescript/output/compare_all.txt new file mode 100644 index 0000000..8e34340 --- /dev/null +++ b/ai/select-algorithm-typescript/output/compare_all.txt @@ -0,0 +1,42 @@ +Using Azure OpenAI Embedding Deployment/Model: text-embedding-3-small +Reading JSON file from data/Hotels_Vector.json +Loaded 50 documents +Processing in batches of 50... +Batch 1 complete: 50 inserted + +Query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running searches (top 5 results)... 
✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ 
+├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection "hotels" +Database connection closed diff --git a/ai/select-algorithm-typescript/package-lock.json b/ai/select-algorithm-typescript/package-lock.json new file mode 100644 index 0000000..f0ceb74 --- /dev/null +++ b/ai/select-algorithm-typescript/package-lock.json @@ -0,0 +1,735 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "select-algorithm-typescript", + "version": "1.0.0", + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } + }, + "node_modules/@azure/abort-controller": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/@azure/abort-controller/-/abort-controller-2.1.2.tgz", + "integrity": "sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-auth": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-auth/-/core-auth-1.10.1.tgz", + "integrity": "sha512-ykRMW8PjVAn+RS6ww5cmK9U2CyH9p4Q88YJwvUslfuMmN98w/2rdGRLPqJYObapBCdzBVeDgYWdJnFPFb7qzpg==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-util": "^1.13.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-client": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/@azure/core-client/-/core-client-1.10.1.tgz", + "integrity": 
"sha512-Nh5PhEOeY6PrnxNPsEHRr9eimxLwgLlpmguQaHKBinFYA/RU9+kOYVOQqOrTsCL+KSxrLLl1gD8Dk5BFW/7l/w==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-rest-pipeline": "^1.22.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-rest-pipeline": { + "version": "1.23.0", + "resolved": "https://registry.npmjs.org/@azure/core-rest-pipeline/-/core-rest-pipeline-1.23.0.tgz", + "integrity": "sha512-Evs1INHo+jUjwHi1T6SG6Ua/LHOQBCLuKEEE6efIpt4ZOoNonaT1kP32GoOcdNDbfqsD2445CPri3MubBy5DEQ==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@azure/core-auth": "^1.10.0", + "@azure/core-tracing": "^1.3.0", + "@azure/core-util": "^1.13.0", + "@azure/logger": "^1.3.0", + "@typespec/ts-http-runtime": "^0.3.4", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-tracing": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/@azure/core-tracing/-/core-tracing-1.3.1.tgz", + "integrity": "sha512-9MWKevR7Hz8kNzzPLfX4EAtGM2b8mr50HPDBvio96bURP/9C+HjdH3sBlLSNNrvRAr5/k/svoH457gB5IKpmwQ==", + "license": "MIT", + "dependencies": { + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/core-util": { + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/@azure/core-util/-/core-util-1.13.1.tgz", + "integrity": "sha512-XPArKLzsvl0Hf0CaGyKHUyVgF7oDnhKoP85Xv6M4StF/1AhfORhZudHtOyf2s+FcbuQ9dPRAjB8J2KvRRMUK2A==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.1.2", + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/identity": { + "version": "4.13.1", + "resolved": "https://registry.npmjs.org/@azure/identity/-/identity-4.13.1.tgz", + 
"integrity": "sha512-5C/2WD5Vb1lHnZS16dNQRPMjN6oV/Upba+C9nBIs15PmOi6A3ZGs4Lr2u60zw4S04gi+u3cEXiqTVP7M4Pz3kw==", + "license": "MIT", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "@azure/core-auth": "^1.9.0", + "@azure/core-client": "^1.9.2", + "@azure/core-rest-pipeline": "^1.17.0", + "@azure/core-tracing": "^1.0.0", + "@azure/core-util": "^1.11.0", + "@azure/logger": "^1.0.0", + "@azure/msal-browser": "^5.5.0", + "@azure/msal-node": "^5.1.0", + "open": "^10.1.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/logger": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@azure/logger/-/logger-1.3.0.tgz", + "integrity": "sha512-fCqPIfOcLE+CGqGPd66c8bZpwAji98tZ4JI9i/mlTNTlsIWslCfpg48s/ypyLxZTump5sypjrKn2/kY7q8oAbA==", + "license": "MIT", + "dependencies": { + "@typespec/ts-http-runtime": "^0.3.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@azure/msal-browser": { + "version": "5.9.0", + "resolved": "https://registry.npmjs.org/@azure/msal-browser/-/msal-browser-5.9.0.tgz", + "integrity": "sha512-CzE+4PefDSJWj26zU7G1bKchlGRRHMBFreG4tAlGuzyI8hAPiYGobaJvZBgZBf6L63iphX7VH+ityL8VgEQz9Q==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-common": { + "version": "16.5.2", + "resolved": "https://registry.npmjs.org/@azure/msal-common/-/msal-common-16.5.2.tgz", + "integrity": "sha512-GkDEL6TYo3HgT3UuqakdgE9PZfc1hMki6+Hwgy1uddb/EauvAKfu85vVhuofRSo22D1xTnWt8Ucwfg4vSCVwvA==", + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-node": { + "version": "5.1.5", + "resolved": "https://registry.npmjs.org/@azure/msal-node/-/msal-node-5.1.5.tgz", + "integrity": "sha512-ObTeMoNPmq19X3z40et9Xvs4ZoWVeJg43PZMRLG5iwVL+2nCtAerG3YTDItqPp1CfXNwmCXBbg8jn1DOx65c3g==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "16.5.2", + 
"jsonwebtoken": "^9.0.0" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/@mongodb-js/saslprep": { + "version": "1.4.9", + "resolved": "https://registry.npmjs.org/@mongodb-js/saslprep/-/saslprep-1.4.9.tgz", + "integrity": "sha512-RXSxsokhAF/4nWys8An8npsqOI33Ex1Hlzqjw2pZOO+GKtMAR2noGnUdsFiGwsaO/xXI+56mtjTmDA3JXJsvmA==", + "license": "MIT", + "dependencies": { + "sparse-bitfield": "^3.0.3" + } + }, + "node_modules/@types/node": { + "version": "24.12.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.2.tgz", + "integrity": "sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.16.0" + } + }, + "node_modules/@types/webidl-conversions": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", + "integrity": "sha512-CiJJvcRtIgzadHCYXw7dqEnMNRjhGZlYK05Mj9OyktqV8uVT8fD2BFOB7S1uwBE3Kj2Z+4UyPmFw/Ixgw/LAlA==", + "license": "MIT" + }, + "node_modules/@types/whatwg-url": { + "version": "11.0.5", + "resolved": "https://registry.npmjs.org/@types/whatwg-url/-/whatwg-url-11.0.5.tgz", + "integrity": "sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==", + "license": "MIT", + "dependencies": { + "@types/webidl-conversions": "*" + } + }, + "node_modules/@typespec/ts-http-runtime": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@typespec/ts-http-runtime/-/ts-http-runtime-0.3.5.tgz", + "integrity": "sha512-yURCknZhvywvQItHMMmFSo+fq5arCUIyz/CVk7jD89MSai7dkaX8ufjCWp3NttLojoTVbcE72ri+be/TnEbMHw==", + "license": "MIT", + "dependencies": { + "http-proxy-agent": "^7.0.0", + "https-proxy-agent": "^7.0.0", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + 
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/bson": { + "version": "6.10.4", + "resolved": "https://registry.npmjs.org/bson/-/bson-6.10.4.tgz", + "integrity": "sha512-WIsKqkSC0ABoBJuT1LEX+2HEvNmNKKgnTAyd0fL8qzK4SH2i9NXg+t08YtdZp/V9IZ33cxe3iV4yM0qg8lMQng==", + "license": "Apache-2.0", + "engines": { + "node": ">=16.20.1" + } + }, + "node_modules/buffer-equal-constant-time": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", + "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", + "license": "BSD-3-Clause" + }, + "node_modules/bundle-name": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", + "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", + "license": "MIT", + "dependencies": { + "run-applescript": "^7.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/default-browser": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.5.0.tgz", + "integrity": "sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==", + "license": "MIT", + "dependencies": { + "bundle-name": "^4.1.0", + 
"default-browser-id": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/default-browser-id": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.1.tgz", + "integrity": "sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/define-lazy-prop": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", + "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ecdsa-sig-formatter": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", + "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + } + }, + "node_modules/http-proxy-agent": { + "version": "7.0.2", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", + "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.0", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "license": "MIT", + 
"dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/is-docker": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", + "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", + "license": "MIT", + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-inside-container": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", + "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", + "license": "MIT", + "dependencies": { + "is-docker": "^3.0.0" + }, + "bin": { + "is-inside-container": "cli.js" + }, + "engines": { + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-wsl": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.1.tgz", + "integrity": "sha512-e6rvdUCiQCAuumZslxRJWR/Doq4VpPR82kqclvcS0efgt430SlGIk05vdCN58+VrzgtIcfNODjozVielycD4Sw==", + "license": "MIT", + "dependencies": { + "is-inside-container": "^1.0.0" + }, + "engines": { + "node": ">=16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jsonwebtoken": { + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz", + "integrity": "sha512-MT/xP0CrubFRNLNKvxJ2BYfy53Zkm++5bX9dtuPbqAeQpTVe0MQTFhao8+Cp//EmJp244xt6Drw/GVEGCUj40g==", + "license": "MIT", + "dependencies": { + "jws": "^4.0.1", + "lodash.includes": "^4.3.0", + "lodash.isboolean": "^3.0.3", + "lodash.isinteger": "^4.0.4", + "lodash.isnumber": "^3.0.3", + "lodash.isplainobject": "^4.0.6", + "lodash.isstring": "^4.0.1", + 
"lodash.once": "^4.0.0", + "ms": "^2.1.1", + "semver": "^7.5.4" + }, + "engines": { + "node": ">=12", + "npm": ">=6" + } + }, + "node_modules/jwa": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", + "integrity": "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", + "license": "MIT", + "dependencies": { + "buffer-equal-constant-time": "^1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/jws": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", + "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", + "license": "MIT", + "dependencies": { + "jwa": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/lodash.includes": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/lodash.includes/-/lodash.includes-4.3.0.tgz", + "integrity": "sha512-W3Bx6mdkRTGtlJISOvVD/lbqjTlPPUDTMnlXZFnVwi9NKJ6tiAk6LVdlhZMm17VZisqhKcgzpO5Wz91PCt5b0w==", + "license": "MIT" + }, + "node_modules/lodash.isboolean": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isboolean/-/lodash.isboolean-3.0.3.tgz", + "integrity": "sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==", + "license": "MIT" + }, + "node_modules/lodash.isinteger": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/lodash.isinteger/-/lodash.isinteger-4.0.4.tgz", + "integrity": "sha512-DBwtEWN2caHQ9/imiNeEA5ys1JoRtRfY3d7V9wkqtbycnAmTvRRmbHKDV4a0EYc678/dia0jrte4tjYwVBaZUA==", + "license": "MIT" + }, + "node_modules/lodash.isnumber": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/lodash.isnumber/-/lodash.isnumber-3.0.3.tgz", + "integrity": "sha512-QYqzpfwO3/CWf3XP+Z+tkQsfaLL/EnUlXWVkIk5FUPc4sBdTehEqZONuyRt2P67PXAk+NXmTBcc97zw9t1FQrw==", + "license": "MIT" + }, + 
"node_modules/lodash.isplainobject": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/lodash.isplainobject/-/lodash.isplainobject-4.0.6.tgz", + "integrity": "sha512-oSXzaWypCMHkPC3NvBEaPHf0KsA5mvPrOPgQWDsbg8n7orZ290M0BmC/jgRZ4vcJ6DTAhjrsSYgdsW/F+MFOBA==", + "license": "MIT" + }, + "node_modules/lodash.isstring": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.isstring/-/lodash.isstring-4.0.1.tgz", + "integrity": "sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==", + "license": "MIT" + }, + "node_modules/lodash.once": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/lodash.once/-/lodash.once-4.1.1.tgz", + "integrity": "sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==", + "license": "MIT" + }, + "node_modules/memory-pager": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/memory-pager/-/memory-pager-1.5.0.tgz", + "integrity": "sha512-ZS4Bp4r/Zoeq6+NLJpP+0Zzm0pR8whtGPf1XExKLJBAczGMnSi3It14OiNCStjQjM6NU1okjQGSxgEZN8eBYKg==", + "license": "MIT" + }, + "node_modules/mongodb": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/mongodb/-/mongodb-6.21.0.tgz", + "integrity": "sha512-URyb/VXMjJ4da46OeSXg+puO39XH9DeQpWCslifrRn9JWugy0D+DvvBvkm2WxmHe61O/H19JM66p1z7RHVkZ6A==", + "license": "Apache-2.0", + "dependencies": { + "@mongodb-js/saslprep": "^1.3.0", + "bson": "^6.10.4", + "mongodb-connection-string-url": "^3.0.2" + }, + "engines": { + "node": ">=16.20.1" + }, + "peerDependencies": { + "@aws-sdk/credential-providers": "^3.188.0", + "@mongodb-js/zstd": "^1.1.0 || ^2.0.0", + "gcp-metadata": "^5.2.0", + "kerberos": "^2.0.1", + "mongodb-client-encryption": ">=6.0.0 <7", + "snappy": "^7.3.2", + "socks": "^2.7.1" + }, + "peerDependenciesMeta": { + "@aws-sdk/credential-providers": { + "optional": true + }, + "@mongodb-js/zstd": { + "optional": true + }, + "gcp-metadata": { + "optional": true + }, 
+ "kerberos": { + "optional": true + }, + "mongodb-client-encryption": { + "optional": true + }, + "snappy": { + "optional": true + }, + "socks": { + "optional": true + } + } + }, + "node_modules/mongodb-connection-string-url": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mongodb-connection-string-url/-/mongodb-connection-string-url-3.0.2.tgz", + "integrity": "sha512-rMO7CGo/9BFwyZABcKAWL8UJwH/Kc2x0g72uhDWzG48URRax5TCIcJ7Rc3RZqffZzO/Gwff/jyKwCU9TN8gehA==", + "license": "Apache-2.0", + "dependencies": { + "@types/whatwg-url": "^11.0.2", + "whatwg-url": "^14.1.0 || ^13.0.0" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/open": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/open/-/open-10.2.0.tgz", + "integrity": "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==", + "license": "MIT", + "dependencies": { + "default-browser": "^5.2.1", + "define-lazy-prop": "^3.0.0", + "is-inside-container": "^1.0.0", + "wsl-utils": "^0.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/openai": { + "version": "5.23.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-5.23.2.tgz", + "integrity": "sha512-MQBzmTulj+MM5O8SKEk/gL8a7s5mktS9zUtAkU257WjvobGc9nKcBuVwjyEEcb9SI8a8Y2G/mzn3vm9n1Jlleg==", + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": 
"sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/run-applescript": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.1.0.tgz", + "integrity": "sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/semver": { + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/sparse-bitfield": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/sparse-bitfield/-/sparse-bitfield-3.0.3.tgz", + "integrity": "sha512-kvzhi7vqKTfkh0PZU+2D2PIllw2ymqJKujUcyPMd9Y75Nv4nPbGJZXNhxsgdQab2BmlDct1YnfQCguEvHr7VsQ==", + "license": "MIT", + "dependencies": { + "memory-pager": "^1.0.2" + } + }, + "node_modules/tr46": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.1.1.tgz", + "integrity": "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==", + "license": "MIT", + "dependencies": 
{ + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "license": "0BSD" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "dev": true, + "license": "MIT" + }, + "node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/whatwg-url": { + "version": "14.2.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", + "integrity": "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==", + "license": "MIT", + "dependencies": { + "tr46": "^5.1.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/wsl-utils": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/wsl-utils/-/wsl-utils-0.1.0.tgz", + "integrity": "sha512-h3Fbisa2nKGPxCpm89Hk33lBLsnaGBvctQopaBSOW/uIs6FTe1ATyAnKFJrzVs9vpGdsTe73WF3V4lIsk4Gacw==", + "license": 
"MIT", + "dependencies": { + "is-wsl": "^3.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + } + } +} diff --git a/ai/select-algorithm-typescript/package.json b/ai/select-algorithm-typescript/package.json new file mode 100644 index 0000000..5c1f24a --- /dev/null +++ b/ai/select-algorithm-typescript/package.json @@ -0,0 +1,20 @@ +{ + "name": "select-algorithm-typescript", + "version": "1.0.0", + "description": "Compare IVF, HNSW, and DiskANN vector index algorithms in Azure DocumentDB", + "type": "module", + "scripts": { + "env:init": "azd env get-values > .env", + "build": "tsc", + "start": "node dist/compare-all.js" + }, + "dependencies": { + "@azure/identity": "^4.11.1", + "mongodb": "^6.18.0", + "openai": "^5.16.0" + }, + "devDependencies": { + "@types/node": "^24.3.0", + "typescript": "^5.9.2" + } +} diff --git a/ai/select-algorithm-typescript/quickstart.md b/ai/select-algorithm-typescript/quickstart.md new file mode 100644 index 0000000..b04fc58 --- /dev/null +++ b/ai/select-algorithm-typescript/quickstart.md @@ -0,0 +1,401 @@ +--- +title: Quickstart - Vector index with TypeScript +description: Compare vector index algorithms and similarity functions using TypeScript in Azure DocumentDB to optimize search performance for your workload. +ms.devlang: typescript +ms.topic: quickstart-sdk +ms.date: 05/07/2026 +ms.custom: sfi-ropc-nochange +ai-usage: ai-generated +author: diberry +ms.author: diberry +ms.service: azure-documentdb +--- + +# Quickstart: Vector index with TypeScript in Azure DocumentDB + +In this quickstart, you compare three vector index algorithms (DiskANN, HNSW, and IVF) and three similarity functions (cosine, L2, and inner product) to find the optimal configuration for your search workload. This quickstart uses a sample hotel dataset with pre-calculated embeddings from the `text-embedding-3-small` model. 
+ + + +Find the [sample code](https://github.com/Azure-Samples/documentdb-samples/tree/main/ai/select-algorithm-typescript) on GitHub. + +## Prerequisites + +[!INCLUDE[Prerequisites](includes/prerequisite-quickstart-vector-index.md)] + +- [Node.js LTS](https://nodejs.org/download/) +- [TypeScript](https://www.typescriptlang.org/download) 5.x or greater + +## Create data file with vectors + +1. Create a new data directory for the hotels data file: + + ### [Bash](#tab/bash) + + ```bash + mkdir data + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name data + ``` + + --- + +2. Download the `Hotels_Vector.json` data file with vectors to your `data` directory: + + ### [Bash](#tab/bash) + + ```bash + curl -o data/Hotels_Vector.json https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Invoke-WebRequest -Uri "https://raw.githubusercontent.com/Azure-Samples/documentdb-samples/refs/heads/main/ai/data/Hotels_Vector.json" -OutFile "data/Hotels_Vector.json" + ``` + + --- + + Verify the file was downloaded: + + ### [Bash](#tab/bash) + + ```bash + ls data/Hotels_Vector.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem data\Hotels_Vector.json + ``` + + --- + + You should see `Hotels_Vector.json` in the `data` directory. + +## Create a Node.js project + +1. Create a new directory for your project and open it in Visual Studio Code: + + ### [Bash](#tab/bash) + + ```bash + mkdir select-algorithm-typescript + cd select-algorithm-typescript + code . + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + New-Item -ItemType Directory -Name select-algorithm-typescript + Set-Location select-algorithm-typescript + code . + ``` + + --- + +2. 
Initialize a TypeScript Node.js project: + + ```bash + npm init -y + ``` + + Verify the project was initialized: + + ### [Bash](#tab/bash) + + ```bash + ls package.json + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + Get-ChildItem package.json + ``` + + --- + +3. Install the required packages: + + ```bash + npm install mongodb openai @azure/identity + npm install --save-dev typescript @types/node + ``` + + - `mongodb`: MongoDB driver for Node.js + - `openai`: OpenAI client library to create vectors + - `@azure/identity`: Azure Identity library for passwordless authentication + - `typescript`: TypeScript compiler + + Verify: `npm list` shows all installed packages without errors. + +4. Create a `tsconfig.json` file in the project root: + + ```json + { + "compilerOptions": { + "target": "ES2022", + "module": "Node16", + "moduleResolution": "Node16", + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "strict": true, + "rootDir": "./src", + "outDir": "./dist" + }, + "include": ["src/**/*"], + "exclude": ["node_modules"] + } + ``` + +5. Update your `package.json` to include: + + ```json + { + "type": "module", + "scripts": { + "build": "tsc", + "start": "node dist/compare-all.js" + } + } + ``` + +6. 
Set the required environment variables in your shell before running the sample: + + ### [Bash](#tab/bash) + + ```bash + export AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-3-small + export AZURE_OPENAI_EMBEDDING_ENDPOINT=https://<your-openai-resource-name>.openai.azure.com + export DATA_FILE_WITH_VECTORS=data/Hotels_Vector.json + export EMBEDDED_FIELD=DescriptionVector + export EMBEDDING_DIMENSIONS=1536 + export LOAD_SIZE_BATCH=100 + export DOCUMENTDB_CLUSTER_NAME=<your-cluster-name> + export AZURE_DOCUMENTDB_DATABASENAME=Hotels + ``` + + ### [PowerShell](#tab/powershell) + + ```powershell + $env:AZURE_OPENAI_EMBEDDING_MODEL = "text-embedding-3-small" + $env:AZURE_OPENAI_EMBEDDING_ENDPOINT = "https://<your-openai-resource-name>.openai.azure.com" + $env:DATA_FILE_WITH_VECTORS = "data/Hotels_Vector.json" + $env:EMBEDDED_FIELD = "DescriptionVector" + $env:EMBEDDING_DIMENSIONS = "1536" + $env:LOAD_SIZE_BATCH = "100" + $env:DOCUMENTDB_CLUSTER_NAME = "<your-cluster-name>" + $env:AZURE_DOCUMENTDB_DATABASENAME = "Hotels" + ``` + + Replace the placeholder values with your own information: + + - `AZURE_OPENAI_EMBEDDING_ENDPOINT`: Your Azure OpenAI resource endpoint URL + - `DOCUMENTDB_CLUSTER_NAME`: Your Azure DocumentDB cluster name + + The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + + You should always prefer passwordless authentication, but it requires additional setup. For more information on setting up managed identity and the full range of your authentication options, see [Authenticate JavaScript apps to Azure services using the Azure SDK for JavaScript](/azure/developer/javascript/sdk/authentication/overview). 
+ +## Create code files + +Create the following project structure: + +``` +select-algorithm-typescript/ +├── data/ +│ └── README.md +├── output/ +│ └── compare_all.txt +├── src/ +│ ├── compare-all.ts +│ ├── select-algorithm.ts +│ └── utils.ts +├── .gitignore +├── package.json +├── package-lock.json +├── quickstart.md +├── README.md +└── tsconfig.json +``` + +Create the `src` directory: + +### [Bash](#tab/bash) + +```bash +mkdir src +``` + +### [PowerShell](#tab/powershell) + +```powershell +New-Item -ItemType Directory -Name src +``` + +--- + +## Create the algorithm comparison code + +Create the `src/compare-all.ts` file with the following code: + +:::code language="typescript" source="~/../documentdb-samples/ai/select-algorithm-typescript/src/compare-all.ts" ::: + +This script orchestrates the algorithm comparison by: + +- Loading configuration from environment variables +- Initializing MongoDB and Azure OpenAI clients with passwordless authentication +- Loading hotel data with pre-calculated embeddings +- Testing each algorithm/similarity combination by creating a collection, inserting data, creating an index, and executing a search +- Measuring and comparing search performance across all configurations +- Displaying results in a comparison table + +## Create utility functions + +Create the `src/utils.ts` file with the following code: + +:::code language="typescript" source="~/../documentdb-samples/ai/select-algorithm-typescript/src/utils.ts" ::: + +The utilities provide essential functions for: + +- Passwordless authentication to DocumentDB and Azure OpenAI using DefaultAzureCredential +- Reading JSON data files +- Batch insertion of documents with DocumentDB's 16 MB payload limit in mind +- Formatted display of comparison results showing algorithm performance + +## Run the code + +Execute the comparison script to test all 9 algorithm × similarity combinations: + +```bash +npm run build +npm start +``` + +The output shows the comparison across all algorithms 
and similarity metrics: + +``` +Using Azure OpenAI Embedding Deployment/Model: text-embedding-3-small +Reading JSON file from data/Hotels_Vector.json +Loaded 50 documents +Processing in batches of 50... +Batch 1 complete: 50 inserted + +Query: "luxury hotel near the beach" +Embedding generated (1536 dimensions) + +Running searches (top 5 results)... ✓ vector_ivf_cos created + ✓ vector_ivf_l2 created + ✓ vector_ivf_ip created + ✓ vector_hnsw_cos created + ✓ vector_hnsw_l2 created + ✓ vector_hnsw_ip created + ✓ vector_diskann_cos created + ✓ vector_diskann_l2 created + ✓ vector_diskann_ip created +┌──────────┬────────┬────────────────────────────┬────────┬────────────────────────────┬────────┬───────┐ +│ Algorithm│ Metric │ Top 1 Result │ Score │ Top 2 Result │ Score │ Diff │ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ IVF │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ HNSW │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ 
+├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ COS │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ L2 │ Ocean Water Resort & Spa │ 0.8736 │ Windy Ocean Motel │ 0.9943 │ 0.1208│ +├──────────┼────────┼────────────────────────────┼────────┼────────────────────────────┼────────┼───────┤ +│ DiskANN │ IP │ Ocean Water Resort & Spa │ 0.6184 │ Windy Ocean Motel │ 0.5056 │ 0.1128│ +└──────────┴────────┴────────────────────────────┴────────┴────────────────────────────┴────────┴───────┘ + +Cleanup: dropped collection "hotels" +Database connection closed +``` + +The **Diff** column shows the score gap between the top-1 and top-2 results. A smaller diff indicates the algorithm found results with more similar relevance scores. + +> [!NOTE] +> Latency values are approximate and vary by environment. Scores may differ slightly depending on your Azure OpenAI embedding deployment. + +### Run all combinations + +The compare-all mode always runs all 9 combinations (3 algorithms × 3 metrics). The `ALGORITHM` and `SIMILARITY` environment variables are used only by the single-algorithm mode. + +```bash +npm run build +npm start +``` + +### Understanding the results + +The comparison table demonstrates key behaviors of vector search in DocumentDB: + +- **All algorithms return identical results on small datasets.** With 50 documents, every algorithm finds the same matches because the dataset fits entirely in memory regardless of index structure. Algorithm selection becomes important at scale (millions of documents) where tradeoffs in latency, memory, and recall diverge. + +- **COS and IP produce identical scores** (0.6184 / 0.5056) because the `text-embedding-3-small` model outputs normalized (unit-length) vectors. 
For normalized vectors, cosine similarity equals inner product mathematically. + +- **L2 (Euclidean distance) scores represent distance.** In this output, the top result has the lower L2 score (0.8736) and the second result is farther away (0.9943). + +- **Score separation (Diff column)** shows the gap between the top two results. A smaller diff indicates the algorithm found results with more similar relevance scores. + +[!INCLUDE[Choosing the right algorithm](../includes/choosing-algorithm.md)] + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `MongoServerSelectionError` | Verify your `DOCUMENTDB_CLUSTER_NAME` environment variable and ensure your IP is in the DocumentDB firewall rules. | +| `MongoServerError: Authentication failed` | Check your authentication setup and verify you've run `az login` for passwordless auth. | +| TypeScript compilation errors | Run `npx tsc --version` to verify TypeScript is installed. Check `tsconfig.json` settings match the values shown in this article. | +| `Cannot find module` errors | Run `npm install` to ensure all dependencies are installed. | +| `Embedding dimension mismatch` | Verify the `AZURE_OPENAI_EMBEDDING_MODEL` environment variable matches the model deployed in your Azure OpenAI resource. | +| Empty search results | The vector index may not be ready yet. The code retries up to 6 total attempts with a 2-second delay between attempts. | + +## Clean up resources + +When you're done, you can remove the database using mongosh or the DocumentDB for VS Code extension. + +### [mongosh](#tab/mongosh) + +Connect to your DocumentDB cluster and drop the database: + +```bash +mongosh "mongodb+srv://<your-cluster-name>.global.mongocluster.cosmos.azure.com/" --tls --authenticationMechanism MONGODB-OIDC +``` + +```javascript +use Hotels +db.dropDatabase() +``` + +### [VS Code extension](#tab/vscode) + +1. 
Install the [DocumentDB for VS Code](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-documentdb) extension. +2. Connect to your Azure DocumentDB cluster. +3. Expand the cluster, right-click the **Hotels** database, and select **Drop Database**. + +--- + +If you created an Azure DocumentDB cluster specifically for this quickstart, you can also delete the entire resource group in the Azure portal to remove all associated resources. + +## Related content + +- [Vector search overview](./vector-search.md) +- [ENN vector search](./enn-vector-search.md) +- [Product quantization](./product-quantization.md) diff --git a/ai/select-algorithm-typescript/src/compare-all.ts b/ai/select-algorithm-typescript/src/compare-all.ts new file mode 100644 index 0000000..2978c94 --- /dev/null +++ b/ai/select-algorithm-typescript/src/compare-all.ts @@ -0,0 +1,236 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, getConfig, insertData } from './utils.js'; +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +interface AlgorithmConfig { + name: string; + kind: string; + options: Record; +} + +interface SearchResult { + algorithm: string; + similarity: string; + top1Name: string; + top1Score: number; + top2Name: string; + top2Score: number; +} + +const ALGORITHMS: AlgorithmConfig[] = [ + { name: 'IVF', kind: 'vector-ivf', options: { numLists: 1 } }, + { name: 'HNSW', kind: 'vector-hnsw', options: { m: 16, efConstruction: 64 } }, + { name: 'DiskANN', kind: 'vector-diskann', options: { maxDegree: 32, lBuild: 50 } }, +]; + +const SIMILARITIES = ['COS', 'L2', 'IP']; + +async function main() { + const baseConfig = getConfig(); + const queryText = process.env.QUERY_TEXT || 'luxury hotel near the beach'; + const topK = parseInt(process.env.TOP_K || '5', 10); + const collectionName = 'hotels'; + + const { aiClient, 
dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) throw new Error('AI client is not configured.'); + if (!dbClient) throw new Error('Database client is not configured.'); + + await dbClient.connect(); + const db = dbClient.db(baseConfig.dbName); + + // Drop collection if it exists for a clean start + const collections = await db.listCollections({ name: collectionName }).toArray(); + if (collections.length > 0) { + try { + const col = db.collection(collectionName); + const existingIndexes = await col.listIndexes().toArray(); + for (const idx of existingIndexes) { + if (idx.name !== '_id_') { + try { + await col.dropIndex(idx.name); + } catch {} + } + } + await db.dropCollection(collectionName); + console.log(`Dropped existing collection: ${collectionName}`); + } catch (e: any) { + console.log(`Cleanup note: ${e.message.split('\n')[0]}`); + } + } + + // Load data once for reuse + const data = await readFileReturnJson(path.join(__dirname, '..', baseConfig.dataFile)); + console.log(`Loaded ${data.length} documents`); + + // Insert data into collection + const collection = db.collection(collectionName); + await insertData(baseConfig, collection, data, false); + + // Generate one embedding for the query + console.log(`\nQuery: "${queryText}"`); + const embeddingResponse = await aiClient.embeddings.create({ + model: baseConfig.deployment, + input: [queryText] + }); + const queryVector = embeddingResponse.data[0].embedding; + console.log(`Embedding generated (${queryVector.length} dimensions)`); + + // Sequential create→search→drop for each algorithm+similarity combo + // DocumentDB does not allow multiple vector indexes of the same kind on the same field + console.log(`\nRunning searches (top ${topK} results)...\n`); + const results: SearchResult[] = []; + + for (const algo of ALGORITHMS) { + for (const sim of SIMILARITIES) { + const indexName = `vector_${algo.kind.replace('vector-', '')}_${sim.toLowerCase()}`; + + // 1. 
Drop all existing vector indexes + const indexes = await collection.listIndexes().toArray(); + let droppedAny = false; + for (const idx of indexes) { + if (idx.key && idx.key[baseConfig.embeddedField] === 'cosmosSearch') { + try { await collection.dropIndex(idx.name); droppedAny = true; } catch {} + } + } + // 2. Create this specific index + const indexOptions = { + createIndexes: collectionName, + indexes: [{ + name: indexName, + key: { [baseConfig.embeddedField]: 'cosmosSearch' }, + cosmosSearchOptions: { + kind: algo.kind, + ...algo.options, + similarity: sim, + dimensions: baseConfig.embeddingDimensions + } + }] + }; + await db.command(indexOptions); + console.log(` ✓ ${indexName} created`); + + // 3. Search with bounded retry while the new index becomes ready + const searchPipeline = [ + { + $search: { + cosmosSearch: { + vector: queryVector, + path: baseConfig.embeddedField, + k: topK + } + } + }, + { + $project: { + score: { $meta: 'searchScore' }, + document: '$$ROOT' + } + } + ]; + + let searchResults: any[] = []; + let lastSearchError: unknown; + for (let attempt = 1; attempt <= 6; attempt++) { + try { + searchResults = await collection.aggregate(searchPipeline).toArray(); + if (searchResults.length > 0 || attempt === 6) { + break; + } + console.log(` ...search returned no results yet, retrying (${attempt}/6)`); + } catch (e) { + lastSearchError = e; + if (attempt === 6) { + throw e; + } + console.log(` ...search not ready yet, retrying (${attempt}/6)`); + } + + await new Promise(r => setTimeout(r, 2000)); + } + + if (searchResults.length === 0 && lastSearchError) { + throw lastSearchError; + } + + // Record top 2 results + const top1 = searchResults[0] as any; + const top2 = searchResults[1] as any; + results.push({ + algorithm: algo.name, + similarity: sim, + top1Name: top1?.document?.HotelName ?? '(none)', + top1Score: top1?.score ?? 0, + top2Name: top2?.document?.HotelName ?? '(none)', + top2Score: top2?.score ?? 
0, + }); + } + } + + // Print comparison table + printComparisonTable(results); + + } catch (error) { + console.error('Compare-all failed:', error); + process.exitCode = 1; + } finally { + // Cleanup: drop the comparison collection + if (dbClient) { + try { + const db = dbClient.db(baseConfig.dbName); + await db.dropCollection(collectionName); + console.log(`\nCleanup: dropped collection "${collectionName}"`); + } catch (cleanupErr) { + console.error('Cleanup warning:', cleanupErr); + } + await dbClient.close(); + console.log('Database connection closed'); + } + } +} + +function printComparisonTable(results: SearchResult[]) { + const algoW = 10; + const simW = 8; + const name1W = 28; + const score1W = 8; + const name2W = 28; + const score2W = 8; + const diffW = 7; + + const pad = (s: string, w: number) => s.length >= w ? s.slice(0, w) : s + ' '.repeat(w - s.length); + + const cols = [algoW, simW, name1W, score1W, name2W, score2W, diffW]; + const topLine = `┌${cols.map(w => '─'.repeat(w)).join('┬')}┐`; + const headerSep = `├${cols.map(w => '─'.repeat(w)).join('┼')}┤`; + const rowSep = `├${cols.map(w => '─'.repeat(w)).join('┼')}┤`; + const bottomLine = `└${cols.map(w => '─'.repeat(w)).join('┴')}┘`; + + console.log(topLine); + console.log( + `│${pad(' Algorithm', algoW)}│${pad(' Metric', simW)}│${pad(' Top 1 Result', name1W)}│${pad(' Score', score1W)}│${pad(' Top 2 Result', name2W)}│${pad(' Score', score2W)}│${pad(' Diff', diffW)}│` + ); + console.log(headerSep); + + results.forEach((r, i) => { + const diff = Math.abs(r.top1Score - r.top2Score).toFixed(4); + console.log( + `│${pad(` ${r.algorithm}`, algoW)}│${pad(` ${r.similarity}`, simW)}│${pad(` ${r.top1Name}`, name1W)}│${pad(` ${r.top1Score.toFixed(4)}`, score1W)}│${pad(` ${r.top2Name}`, name2W)}│${pad(` ${r.top2Score.toFixed(4)}`, score2W)}│${pad(` ${diff}`, diffW)}│` + ); + if (i < results.length - 1) { + console.log(rowSep); + } + }); + + console.log(bottomLine); +} + +main().catch(error => { + 
console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/select-algorithm.ts b/ai/select-algorithm-typescript/src/select-algorithm.ts new file mode 100644 index 0000000..38451af --- /dev/null +++ b/ai/select-algorithm-typescript/src/select-algorithm.ts @@ -0,0 +1,317 @@ +import path from 'path'; +import { readFileReturnJson, getClientsPasswordless, insertData, printComparisonTable } from './utils.js'; + +// ESM specific features - create __dirname equivalent +import { fileURLToPath } from "node:url"; +import { dirname } from "node:path"; +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +// Validate required environment variables at startup +const requiredEnvVars = [ + 'DOCUMENTDB_CLUSTER_NAME', + 'AZURE_OPENAI_EMBEDDING_ENDPOINT', + 'AZURE_OPENAI_EMBEDDING_MODEL', + 'DATA_FILE_WITH_VECTORS' +]; + +const missing = requiredEnvVars.filter(v => !process.env[v]); +if (missing.length > 0) { + console.error(`Missing required environment variables: ${missing.join(', ')}`); + console.error('See .env.example for required values.'); + process.exit(1); +} + +type Algorithm = 'diskann' | 'hnsw' | 'ivf'; +type Similarity = 'COS' | 'L2' | 'IP'; +type SimilarityEnv = 'cos' | 'l2' | 'ip'; + +const ALGORITHMS: Algorithm[] = ['diskann', 'hnsw', 'ivf']; +const SIMILARITIES: Similarity[] = ['COS', 'L2', 'IP']; +const SIMILARITY_ENV_VALUES: SimilarityEnv[] = ['cos', 'l2', 'ip']; +const SIMILARITY_BY_ENV: Record = { + cos: 'COS', + l2: 'L2', + ip: 'IP', +}; + +const ALGORITHM_LABELS: Record = { + diskann: 'DiskANN', + hnsw: 'HNSW', + ivf: 'IVF', +}; + +// Index creation configs per algorithm +function getIndexOptions( + collectionName: string, + indexName: string, + embeddedField: string, + dimensions: number, + algorithm: Algorithm, + similarity: Similarity +) { + const base = { + createIndexes: collectionName, + indexes: [ + { + name: indexName, + key: { [embeddedField]: 
'cosmosSearch' }, + cosmosSearchOptions: {} as Record, + }, + ], + }; + + switch (algorithm) { + case 'diskann': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-diskann', + dimensions, + similarity, + maxDegree: 32, + lBuild: 50, + }; + break; + case 'hnsw': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-hnsw', + dimensions, + similarity, + m: 16, + efConstruction: 64, + }; + break; + case 'ivf': + base.indexes[0].cosmosSearchOptions = { + kind: 'vector-ivf', + dimensions, + similarity, + numLists: 1, + }; + break; + } + + return base; +} + +// Algorithm-specific query params +function getSearchPipeline( + queryEmbedding: number[], + embeddedField: string, + k: number, + algorithm: Algorithm +) { + const cosmosSearch: Record = { + vector: queryEmbedding, + path: embeddedField, + k, + }; + + // Add algorithm-specific search params + switch (algorithm) { + case 'diskann': + cosmosSearch.lSearch = 100; + break; + case 'hnsw': + cosmosSearch.efSearch = 80; + break; + case 'ivf': + cosmosSearch.nProbes = 1; + break; + } + + return [ + { $search: { cosmosSearch } }, + { $project: { score: { $meta: "searchScore" }, document: "$$ROOT" } }, + ]; +} + +/** + * Determine which collections to create/query based on ALGORITHM and SIMILARITY env vars. + * Leave either env var unset or empty to run all valid combinations. + * Valid ALGORITHM values: ivf, hnsw, diskann + * Valid SIMILARITY values: cos, l2, ip + * Collection naming: hotels_{algorithm}_{similarity} + */ +function getTargetCollections( + algorithmEnv: string, + similarityEnv: string +): Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> { + const algorithms: Algorithm[] = !algorithmEnv + ? ALGORITHMS + : (() => { + if (!ALGORITHMS.includes(algorithmEnv as Algorithm)) { + throw new Error(`Invalid ALGORITHM '${algorithmEnv}'. 
Must be one of: ${ALGORITHMS.join(', ')}`); + } + return [algorithmEnv as Algorithm]; + })(); + + const similarities: Similarity[] = !similarityEnv + ? SIMILARITIES + : (() => { + if (!SIMILARITY_ENV_VALUES.includes(similarityEnv as SimilarityEnv)) { + throw new Error(`Invalid SIMILARITY '${similarityEnv}'. Must be one of: ${SIMILARITY_ENV_VALUES.join(', ')}`); + } + return [SIMILARITY_BY_ENV[similarityEnv as SimilarityEnv]]; + })(); + + const targets: Array<{ collectionName: string; algorithm: Algorithm; similarity: Similarity }> = []; + + for (const alg of algorithms) { + for (const sim of similarities) { + targets.push({ + collectionName: `hotels_${alg}_${sim.toLowerCase()}`, + algorithm: alg, + similarity: sim, + }); + } + } + + return targets; +} + +async function main() { + const { aiClient, dbClient } = getClientsPasswordless(); + + try { + if (!aiClient) { + throw new Error('Azure OpenAI client is not configured. Please check your environment variables.'); + } + if (!dbClient) { + throw new Error('Database client is not configured. 
Please check your environment variables.'); + } + + const dbName = process.env.AZURE_DOCUMENTDB_DATABASENAME || 'Hotels'; + const embeddedField = process.env.EMBEDDED_FIELD || 'DescriptionVector'; + const embeddingDimensions = parseInt(process.env.EMBEDDING_DIMENSIONS || '1536', 10); + const dataFile = process.env.DATA_FILE_WITH_VECTORS || 'data/Hotels_Vector.json'; + const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const batchSize = parseInt(process.env.LOAD_SIZE_BATCH || '100', 10); + const algorithmEnv = (process.env.ALGORITHM || '').trim().toLowerCase(); + const similarityEnv = (process.env.SIMILARITY || '').trim().toLowerCase(); + const searchQuery = 'quintessential lodging near running trails, eateries, retail'; + + const targets = getTargetCollections(algorithmEnv, similarityEnv); + + console.log(`\n🔬 Vector Algorithm Comparison`); + console.log(` Database: ${dbName}`); + console.log(` Algorithms: ${algorithmEnv || ALGORITHMS.join(', ')}`); + console.log(` Similarity: ${similarityEnv || SIMILARITY_ENV_VALUES.join(', ')}`); + console.log(` Collections to query: ${targets.map(t => t.collectionName).join(', ')}`); + console.log(` Search query: "${searchQuery}"\n`); + + await dbClient.connect(); + const db = dbClient.db(dbName); + + // Load data once (shared across collections) + const data = await readFileReturnJson(path.join(__dirname, '..', dataFile)); + + // Generate query embedding once (reuse across collections) + console.log('Generating query embedding...'); + const embeddingResponse = await aiClient.embeddings.create({ + model: deployment, + input: [searchQuery], + }); + const queryEmbedding = embeddingResponse.data[0].embedding; + if (queryEmbedding.length !== embeddingDimensions) { + throw new Error( + `Embedding dimension mismatch: expected ${embeddingDimensions}, got ${queryEmbedding.length}. 
` + + `Verify AZURE_OPENAI_EMBEDDING_MODEL matches the configured EMBEDDING_DIMENSIONS.` + ); + } + console.log(`Query embedding: ${queryEmbedding.length} dimensions\n`); + + const config = { batchSize }; + + const comparisonResults: Array<{ + collectionName: string; + algorithm: string; + similarity: string; + searchResults: any[]; + latencyMs: number; + }> = []; + const failedTargets: Array<{ collectionName: string; error: string }> = []; + + for (const target of targets) { + console.log(`\n━━━ ${ALGORITHM_LABELS[target.algorithm]} / ${target.similarity} ━━━`); + console.log(`Collection: ${target.collectionName}`); + + try { + // Create collection (drops existing to ensure clean state) + try { + await db.dropCollection(target.collectionName); + } catch { + // Collection may not exist yet + } + const collection = await db.createCollection(target.collectionName); + console.log('Created collection:', target.collectionName); + + // Insert data + const insertSummary = await insertData(config, collection, data); + console.log(`Inserted: ${insertSummary.inserted}/${insertSummary.total}`); + + // Create vector index + const indexName = `vectorIndex_${target.algorithm}_${target.similarity.toLowerCase()}`; + const indexOptions = getIndexOptions( + target.collectionName, + indexName, + embeddedField, + embeddingDimensions, + target.algorithm, + target.similarity + ); + await db.command(indexOptions); + console.log('Created vector index:', indexName); + + // Run vector search + console.log('Executing vector search...'); + const startTime = Date.now(); + + const pipeline = getSearchPipeline(queryEmbedding, embeddedField, 5, target.algorithm); + const searchResults = await collection.aggregate(pipeline).toArray(); + + const latencyMs = Date.now() - startTime; + + comparisonResults.push({ + collectionName: target.collectionName, + algorithm: ALGORITHM_LABELS[target.algorithm], + similarity: target.similarity, + searchResults, + latencyMs, + }); + + console.log(`✓ 
${searchResults.length} results, ${latencyMs}ms`); + } catch (error) { + const message = (error as Error).message; + failedTargets.push({ collectionName: target.collectionName, error: message }); + console.error(`✗ Error with ${target.collectionName}:`, message); + } + } + + if (failedTargets.length > 0) { + console.error(`\nFailure summary: ${failedTargets.length} of ${targets.length} target collection(s) failed.`); + for (const failure of failedTargets) { + console.error(` - ${failure.collectionName}: ${failure.error}`); + } + } + + // Print comparison table + if (comparisonResults.length > 0) { + printComparisonTable(comparisonResults); + } else { + console.error('\nNo comparison results were produced. All target collections failed.'); + process.exitCode = 1; + } + } catch (error) { + console.error('App failed:', error); + process.exitCode = 1; + } finally { + console.log('\nClosing database connection...'); + if (dbClient) await dbClient.close(); + console.log('Database connection closed'); + } +} + +main().catch(error => { + console.error('Unhandled error:', error); + process.exitCode = 1; +}); diff --git a/ai/select-algorithm-typescript/src/utils.ts b/ai/select-algorithm-typescript/src/utils.ts new file mode 100644 index 0000000..09ec3dd --- /dev/null +++ b/ai/select-algorithm-typescript/src/utils.ts @@ -0,0 +1,211 @@ +import { Collection, Document, MongoClient, OIDCResponse, OIDCCallbackParams } from 'mongodb'; +import { AzureOpenAI } from 'openai'; +import { promises as fs } from "fs"; +import { AccessToken, DefaultAzureCredential, TokenCredential, getBearerTokenProvider } from '@azure/identity'; + +// Define a type for JSON data +export type JsonData = Record; + +export function getConfig() { + return { + dbName: process.env.AZURE_DOCUMENTDB_DATABASENAME || 'Hotels', + dataFile: process.env.DATA_FILE_WITH_VECTORS || 'data/Hotels_Vector.json', + embeddedField: process.env.EMBEDDED_FIELD || 'DescriptionVector', + similarity: process.env.SIMILARITY || '', + 
embeddingDimensions: parseInt(process.env.EMBEDDING_DIMENSIONS || '1536', 10), + deployment: process.env.AZURE_OPENAI_EMBEDDING_MODEL || 'text-embedding-3-small', + batchSize: parseInt(process.env.LOAD_SIZE_BATCH || '100', 10) + }; +} + +export const AzureIdentityTokenCallback = async (params: OIDCCallbackParams, credential: TokenCredential): Promise => { + const tokenResponse: AccessToken | null = await credential.getToken(['https://ossrdbms-aad.database.windows.net/.default']); + return { + accessToken: tokenResponse?.token || '', + expiresInSeconds: Math.floor(((tokenResponse?.expiresOnTimestamp || 0) - Date.now()) / 1000) + }; +}; + +export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClient: MongoClient | null } { + let aiClient: AzureOpenAI | null = null; + let dbClient: MongoClient | null = null; + + // Validate all required environment variables upfront + const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!; + const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!; + const clusterName = process.env.DOCUMENTDB_CLUSTER_NAME!; + + if (!endpoint || !deployment || !clusterName) { + throw new Error('Missing required environment variables: AZURE_OPENAI_EMBEDDING_ENDPOINT, AZURE_OPENAI_EMBEDDING_MODEL, DOCUMENTDB_CLUSTER_NAME'); + } + + console.log(`Using Azure OpenAI Embedding Deployment/Model: ${deployment}`); + + const credential = new DefaultAzureCredential(); + + // For Azure OpenAI with DefaultAzureCredential + { + const scope = "https://cognitiveservices.azure.com/.default"; + const azureADTokenProvider = getBearerTokenProvider(credential, scope); + aiClient = new AzureOpenAI({ + apiVersion: process.env.AZURE_OPENAI_EMBEDDING_API_VERSION || "2023-05-15", + endpoint, + deployment, + azureADTokenProvider, + timeout: 30000, + maxRetries: 3, + }); + } + + // For DocumentDB with DefaultAzureCredential (uses signed-in user) + { + dbClient = new MongoClient( + 
`mongodb+srv://${clusterName}.global.mongocluster.cosmos.azure.com/`, { + connectTimeoutMS: 120000, + tls: true, + retryWrites: false, + maxIdleTimeMS: 120000, + authMechanism: 'MONGODB-OIDC', + authMechanismProperties: { + OIDC_CALLBACK: (params: OIDCCallbackParams) => AzureIdentityTokenCallback(params, credential), + ALLOWED_HOSTS: ['*.azure.com'] + } + } + ); + } + + return { aiClient, dbClient }; +} + +export async function readFileReturnJson(filePath: string): Promise { + + console.log(`Reading JSON file from ${filePath}`); + + const fileAsString = await fs.readFile(filePath, "utf-8"); + return JSON.parse(fileAsString); +} + +export async function insertData( + config: { batchSize: number }, + collection: Collection, + data: Document[], + createScalarIndexes: boolean = true +) { + console.log(`Processing in batches of ${config.batchSize}...`); + const totalBatches = Math.ceil(data.length / config.batchSize); + + let inserted = 0; + let failed = 0; + + for (let i = 0; i < totalBatches; i++) { + const start = i * config.batchSize; + const end = Math.min(start + config.batchSize, data.length); + const batch = data.slice(start, end); + + try { + const result = await collection.insertMany(batch, { ordered: false }); + inserted += result.insertedCount || 0; + console.log(`Batch ${i + 1} complete: ${result.insertedCount} inserted`); + } catch (error: any) { + if (error?.writeErrors) { + console.error(`Error in batch ${i + 1}: ${error?.writeErrors.length} failures`); + failed += error?.writeErrors.length; + inserted += batch.length - error?.writeErrors.length; + } else { + console.error(`Error in batch ${i + 1}:`, error); + failed += batch.length; + } + } + + // Small pause between batches to reduce resource contention + if (i < totalBatches - 1) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + } + + if (createScalarIndexes) { + const indexColumns = ["HotelId", "Category", "Description", "Description_fr"]; + for (const col of indexColumns) { + const 
indexSpec: Record = {}; + indexSpec[col] = 1; + await collection.createIndex(indexSpec); + } + } + + return { total: data.length, inserted, failed }; +} + +export function printSearchResults(insertSummary: any, vectorIndexSummary: any, searchResults: Document[]) { + console.log(`\nInsert summary: ${JSON.stringify(insertSummary)}`); + console.log(`Vector index: ${JSON.stringify(vectorIndexSummary)}`); + + if (!searchResults || searchResults.length === 0) { + console.log('No search results found.'); + return; + } + + searchResults.map((result: Document, index: number) => { + const { document, score } = result; + console.log(`${index + 1}. HotelName: ${document.HotelName}, Score: ${score.toFixed(4)}`); + }); +} + +/** + * Print a side-by-side comparison table of vector search results across collections + */ +export function printComparisonTable( + results: Array<{ + collectionName: string; + algorithm: string; + similarity: string; + searchResults: any[]; + latencyMs: number; + }> +): void { + console.log('\n╔══════════════════════════════════════════════════════════════════════════════════╗'); + console.log('║ Vector Algorithm Comparison Results ║'); + console.log('╠══════════════════════════════════════════════════════════════════════════════════╣'); + + // Header + console.log( + '║ ' + + 'Algorithm'.padEnd(12) + + 'Similarity'.padEnd(14) + + 'Top Result'.padEnd(24) + + 'Score'.padEnd(12) + + 'Latency(ms)'.padEnd(14) + + '║' + ); + console.log('╠══════════════════════════════════════════════════════════════════════════════════╣'); + + for (const r of results) { + const topResult = r.searchResults[0]; + const topName = topResult ? (topResult.document.HotelName as string).substring(0, 22) : 'N/A'; + const topScore = topResult ? 
topResult.score.toFixed(4) : 'N/A'; + + console.log( + '║ ' + + r.algorithm.padEnd(12) + + r.similarity.padEnd(14) + + topName.padEnd(24) + + topScore.padEnd(12) + + r.latencyMs.toFixed(0).padEnd(14) + + '║' + ); + } + + console.log('╚══════════════════════════════════════════════════════════════════════════════════╝'); + + // Detailed results per collection + for (const r of results) { + console.log(`\n--- ${r.algorithm} / ${r.similarity} (${r.collectionName}) ---`); + if (r.searchResults.length === 0) { + console.log(' No results.'); + continue; + } + r.searchResults.forEach((item: Document, i: number) => { + console.log(` ${i + 1}. ${item.document.HotelName}, Score: ${item.score.toFixed(4)}`); + }); + console.log(` Latency: ${r.latencyMs.toFixed(0)}ms`); + } +} diff --git a/ai/select-algorithm-typescript/tsconfig.json b/ai/select-algorithm-typescript/tsconfig.json new file mode 100644 index 0000000..3cb9aaa --- /dev/null +++ b/ai/select-algorithm-typescript/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "NodeNext", + "moduleResolution": "nodenext", + "declaration": true, + "outDir": "./dist", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "noImplicitAny": false, + "forceConsistentCasingInFileNames": true, + "sourceMap": true, + "resolveJsonModule": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java index 676630b..59dcc76 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/DiskAnn.java @@ -94,7 +94,7 @@ private MongoClient createMongoClient() { .withMechanismProperty("OIDC_CALLBACK", callback); var connectionString = new ConnectionString( - 
String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + String.format("mongodb+srv://%s@%s.global.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", managedIdentityPrincipalId, clusterName) ); diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java index 146fc27..e077d91 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/HNSW.java @@ -94,7 +94,7 @@ private MongoClient createMongoClient() { .withMechanismProperty("OIDC_CALLBACK", callback); var connectionString = new ConnectionString( - String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + String.format("mongodb+srv://%s@%s.global.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", managedIdentityPrincipalId, clusterName) ); diff --git a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java index e800107..7962115 100644 --- a/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java +++ b/ai/vector-search-java/src/main/java/com/azure/documentdb/samples/IVF.java @@ -94,7 +94,7 @@ private MongoClient createMongoClient() { .withMechanismProperty("OIDC_CALLBACK", callback); var connectionString = new ConnectionString( - String.format("mongodb+srv://%s@%s.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", + String.format("mongodb+srv://%s@%s.global.mongocluster.cosmos.azure.com/?authMechanism=MONGODB-OIDC&tls=true&retrywrites=false&maxIdleTimeMS=120000", 
managedIdentityPrincipalId, clusterName) ); diff --git a/ai/vector-search-typescript/src/utils.ts b/ai/vector-search-typescript/src/utils.ts index 1e4abe9..a7fe3bb 100644 --- a/ai/vector-search-typescript/src/utils.ts +++ b/ai/vector-search-typescript/src/utils.ts @@ -80,7 +80,7 @@ export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClie // For DocumentDB with DefaultAzureCredential (uses signed-in user) { dbClient = new MongoClient( - `mongodb+srv://${clusterName}.mongocluster.cosmos.azure.com/`, { + `mongodb+srv://${clusterName}.global.mongocluster.cosmos.azure.com/`, { connectTimeoutMS: 120000, tls: true, retryWrites: false, diff --git a/infra/main.bicep b/infra/main.bicep index 8e6778d..f2c6cfb 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -254,6 +254,7 @@ output AZURE_DOCUMENTDB_DATABASENAME string = databaseName output AZURE_DOCUMENTDB_COLLECTION string = collectionName output AZURE_DOCUMENTDB_INDEX_NAME string = indexName output MONGO_CLUSTER_NAME string = documentDbCluster.outputs.clusterName +output DOCUMENTDB_CLUSTER_NAME string = documentDbCluster.outputs.clusterName output AZURE_DOCUMENTDB_ADMIN_USERNAME string = documentDbAdminUsername // Configuration for embedding creation and vector search