Merge pull request #78 from Build5Nines/main

crpietschmann · web-flow · commit e9ed3aa0fbbe · 2025-11-23T11:32:19.000-05:00
merge main back to dev
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,12 @@ Add:
 - Added `IVectorTextResultItem.Similarity` and marked `IVectorTextResultItem.VectorComparison` obsolete. `VectorComparison` will be removed in the future.
 - Added more comment metadata to code
 
+## v2.1.2
+
+Fixed:
+
+- Fixed a bug when loading a saved database from a file/stream where `IntIdGenerator` or `NumericIdGenerator` lost the max Id, so adding new texts to the database overwrote existing texts. This specifically affected the `SharpVector.OpenAI` and `SharpVector.Ollama` libraries, but the fix is implemented within the core `Build5Nines.SharpVector` library.
+
 ## v2.1.1
 
 Add:
diff --git a/README.md b/README.md
@@ -1,19 +1,24 @@
-# Build5Nines SharpVector - The lightweight, in-memory, Semantic Search, Text Vector Database for any C# / .NET Applications
+# Build5Nines SharpVector - The lightweight, in-memory, local, Semantic Search, Text Vector Database for any C# / .NET Applications
 
 `Build5Nines.SharpVector` is an in-memory vector database library designed for .NET applications. It allows you to store, search, and manage text data using vector representations. The library is customizable and extensible, enabling support for different vector comparison methods, preprocessing techniques, and vectorization strategies.
 
 [![Release Build](https://github.com/Build5Nines/SharpVector/actions/workflows/build-release.yml/badge.svg)](https://github.com/Build5Nines/SharpVector/actions/workflows/build-release.yml)
 ![Libraries.io dependency status for GitHub repo](https://img.shields.io/librariesio/github/build5nines/sharpvector)
 
-[![NuGet](https://img.shields.io/nuget/v/Build5Nines.SharpVector.svg)](https://www.nuget.org/packages/Build5Nines.SharpVector/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
 ![Framework: .NET 8+](https://img.shields.io/badge/framework-.NET%208%2B-blue)
 ![Semantic Search: Enabled](https://img.shields.io/badge/semantic%20search-enabled-purple)
 ![Gen AI: Ready](https://img.shields.io/badge/Gen%20AI-ready-purple)
 
 Vector databases are used with Semantic Search and [Generative AI](https://build5nines.com/what-is-generative-ai/?utm_source=github&utm_medium=sharpvector) solutions augmenting the LLM (Large Language Model) with the ability to load additional context data with the AI prompt using the [RAG (Retrieval-Augmented Generation)](https://build5nines.com/what-is-retrieval-augmented-generation-rag/?utm_source=github&utm_medium=sharpvector) design pattern.
 
-While there are lots of large databases that can be used to build Vector Databases (like Azure CosmosDB, PostgreSQL w/ pgvector, Azure AI Search, Elasticsearch, and more), there are not many options for a lightweight vector database that can be embedded into any .NET application. Build5Nines SharpVector is the lightweight in-memory Text Vector Database for use in any .NET application that you're looking for!
+While there are lots of large databases that can be used to build Vector Databases (like Azure CosmosDB, PostgreSQL w/ pgvector, Azure AI Search, Elasticsearch, and more), there are not many options for a lightweight vector database that can be embedded into any .NET application to provide a local text vector database.
+
+> "For the in-memory vector database, we're using Build5Nines.SharpVector, an excellent open-source project by Chris Pietschmann. SharpVector makes it easy to store and retrieve vectorized data, making it an ideal choice for our sample RAG implementation."
+>
+> [Tulika Chaudharie, Principal Product Manager at Microsoft for Azure App Service](https://azure.github.io/AppService/2024/09/03/Phi3-vector.html)
+
+Build5Nines SharpVector is the lightweight, local, in-memory Text Vector Database for adding semantic search to any .NET application!
 
 ### [Documentation](https://sharpvector.build5nines.com) | [Get Started](https://sharpvector.build5nines.com/get-started/) | [Samples](https://sharpvector.build5nines.com/samples/)
 
diff --git a/docs/docs/index.md b/docs/docs/index.md
@@ -9,7 +9,6 @@ description: The lightweight, in-memory, semantic search, text vector database f
 [![Release Build](https://github.com/Build5Nines/SharpVector/actions/workflows/build-release.yml/badge.svg)](https://github.com/Build5Nines/SharpVector/actions/workflows/build-release.yml)
 ![Libraries.io dependency status for GitHub repo](https://img.shields.io/librariesio/github/build5nines/sharpvector)
 
-[![NuGet](https://img.shields.io/nuget/v/Build5Nines.SharpVector.svg)](https://www.nuget.org/packages/Build5Nines.SharpVector/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
 ![Framework: .NET 8+](https://img.shields.io/badge/framework-.NET%208%2B-blue)
 ![Semantic Search: Enabled](https://img.shields.io/badge/semantic%20search-enabled-purple)
diff --git a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj
@@ -9,7 +9,7 @@
     <PackageId>Build5Nines.SharpVector</PackageId>
     <PackageProjectUrl>https://sharpvector.build5nines.com</PackageProjectUrl>
     <RepositoryUrl>https://github.com/Build5Nines/SharpVector</RepositoryUrl>
-    <Version>2.1.1</Version>
+    <Version>2.1.2</Version>
     <Description>Lightweight In-memory Vector Database to embed in any .NET Applications</Description>
     <Copyright>Copyright (c) 2025 Build5Nines LLC</Copyright>
     <PackageReadmeFile>README.md</PackageReadmeFile>
diff --git a/src/Build5Nines.SharpVector/Id/ISequentialIdGenerator.cs b/src/Build5Nines.SharpVector/Id/ISequentialIdGenerator.cs
@@ -0,0 +1,15 @@
+namespace Build5Nines.SharpVector.Id;
+
+/// <summary>
+/// Interface for ID generators that support setting the most recent generated ID (sequential/numeric style).
+/// </summary>
+/// <remarks>
+/// Code that restores previously stored IDs (for example, after loading a saved vector store) can call
+/// <see cref="SetMostRecent"/> with the highest stored ID to re-seed the generator.
+/// </remarks>
+/// <typeparam name="TId">The ID type.</typeparam>
+public interface ISequentialIdGenerator<TId> : IIdGenerator<TId>
+    where TId : notnull
+{
+    /// <summary>
+    /// Sets the most recent ID value so the next generated ID will continue the sequence.
+    /// </summary>
+    /// <param name="mostRecentId">The most recently used/generated ID.</param>
+    void SetMostRecent(TId mostRecentId);
+}
diff --git a/src/Build5Nines.SharpVector/Id/NumericIdGenerator.cs b/src/Build5Nines.SharpVector/Id/NumericIdGenerator.cs
@@ -1,6 +1,6 @@
 namespace Build5Nines.SharpVector.Id;
 
-public class NumericIdGenerator<TId> : IIdGenerator<TId>
+public class NumericIdGenerator<TId> : ISequentialIdGenerator<TId>
     where TId : struct
 {
     public NumericIdGenerator()
@@ -22,4 +22,11 @@ public TId NewId() {
             return _lastId;
         }
     }
+
+    /// <inheritdoc />
+    public void SetMostRecent(TId mostRecentId)
+    {
+        // The assignment is performed while holding _lock.
+        lock (_lock)
+        {
+            _lastId = mostRecentId;
+        }
+    }
 }
diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs
@@ -11,6 +11,7 @@
 using Build5Nines.SharpVector.Embeddings;
 using System.Runtime.ExceptionServices;
 using System.Collections;
+using System.Linq;
 
 namespace Build5Nines.SharpVector;
 
@@ -351,8 +352,18 @@ await DatabaseFile.LoadDatabaseFromZipArchiveAsync(
             async (archive) =>
             {
                 await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore);
-
                 await DatabaseFile.LoadVocabularyStoreAsync(archive, VectorStore.VocabularyStore);
+
+                // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs
+                if (_idGenerator is ISequentialIdGenerator<TId> seqIdGen)
+                {
+                    // Re-seed the sequence only if there are existing IDs
+                    var ids = VectorStore.GetIds();
+                    if (ids.Any())
+                    {
+                        seqIdGen.SetMostRecent(ids.Max()!);
+                    }
+                }
             }
         );
     }
@@ -708,6 +719,17 @@ await DatabaseFile.LoadDatabaseFromZipArchiveAsync(
             async (archive) =>
             {
                 await DatabaseFile.LoadVectorStoreAsync(archive, VectorStore);
+
+                // Re-initialize the IdGenerator with the max Id value from the VectorStore if it supports sequential numeric IDs
+                if (_idGenerator is ISequentialIdGenerator<TId> seqIdGen)
+                {
+                    // Re-seed the sequence only if there are existing IDs
+                    var ids = VectorStore.GetIds();
+                    if (ids.Any())
+                    {
+                        seqIdGen.SetMostRecent(ids.Max()!);
+                    }
+                }
             }
         );
     }
diff --git a/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs b/src/SharpVectorOpenAITest/BasicOpenAIMemoryVectorDatabaseTest.cs
@@ -7,6 +7,9 @@
 using System.Threading;
 using System.Threading.Tasks;
 using System.Collections.Generic;
+using System.ClientModel.Primitives;
+using System.IO;
+using System;
 
 namespace Build5Nines.SharpVector.OpenAI.Tests
 {
@@ -20,9 +23,49 @@ public class BasicMemoryVectorDatabaseTest
         public void Setup()
         {
             _mockEmbeddingClient = new Mock<EmbeddingClient>();
+
+            // Mock the OpenAI EmbeddingClient to return a deterministic embedding vector
+            // GenerateEmbeddingAsync(string input, EmbeddingGenerationOptions? options = null, CancellationToken cancellationToken = default)
+            // returns ClientResult<OpenAIEmbedding>. We create one using the Model Factory helpers.
+            var embeddingVector = new float[] { 0.1f, 0.2f, 0.3f }; // small deterministic vector for tests
+            var openAiEmbedding = OpenAIEmbeddingsModelFactory.OpenAIEmbedding(index: 0, vector: embeddingVector);
+            // Create minimal concrete PipelineResponse implementation to satisfy ClientResult.FromValue without relying on Moq for abstract type
+            var response = new TestPipelineResponse();
+            var clientResult = ClientResult.FromValue(openAiEmbedding, response);
+
+            _mockEmbeddingClient
+                .Setup(c => c.GenerateEmbeddingAsync(
+                    It.IsAny<string>(),
+                    It.IsAny<EmbeddingGenerationOptions?>(),
+                    It.IsAny<CancellationToken>()))
+                .ReturnsAsync(clientResult);
+
             _database = new BasicOpenAIMemoryVectorDatabase(_mockEmbeddingClient.Object);
         }
 
+        /// <summary>
+        /// Minimal headers implementation for <see cref="TestPipelineResponse"/>: it exposes no headers at all.
+        /// </summary>
+        internal class EmptyPipelineResponseHeaders : PipelineResponseHeaders
+        {
+            /// <summary>Returns an enumerator over a freshly created, empty header list.</summary>
+            public override IEnumerator<KeyValuePair<string, string>> GetEnumerator()
+            {
+                var noHeaders = new List<KeyValuePair<string, string>>();
+                return noHeaders.GetEnumerator();
+            }
+
+            /// <summary>Always sets <paramref name="value"/> to null and returns false.</summary>
+            public override bool TryGetValue(string name, out string? value)
+            {
+                value = null;
+                return false;
+            }
+
+            /// <summary>Always sets <paramref name="values"/> to null and returns false.</summary>
+            public override bool TryGetValues(string name, out IEnumerable<string>? values)
+            {
+                values = null;
+                return false;
+            }
+        }
+
+        /// <summary>
+        /// Minimal <see cref="PipelineResponse"/> implementation: status 200 / "OK", an empty header set,
+        /// and empty content.
+        /// </summary>
+        internal class TestPipelineResponse : PipelineResponse
+        {
+            private readonly EmptyPipelineResponseHeaders _headers = new EmptyPipelineResponseHeaders();
+            private Stream? _contentStream = Stream.Null;
+
+            public override int Status
+            {
+                get { return 200; }
+            }
+
+            public override string ReasonPhrase
+            {
+                get { return "OK"; }
+            }
+
+            public override Stream? ContentStream
+            {
+                get { return _contentStream; }
+                set { _contentStream = value; }
+            }
+
+            protected override PipelineResponseHeaders HeadersCore
+            {
+                get { return _headers; }
+            }
+
+            // A new, empty BinaryData is produced on every read.
+            public override BinaryData Content
+            {
+                get { return BinaryData.FromBytes(Array.Empty<byte>()); }
+            }
+
+            public override BinaryData BufferContent(CancellationToken cancellationToken = default)
+            {
+                return Content;
+            }
+
+            public override ValueTask<BinaryData> BufferContentAsync(CancellationToken cancellationToken = default)
+            {
+                return new ValueTask<BinaryData>(Content);
+            }
+
+            public override void Dispose()
+            {
+                if (_contentStream != null)
+                {
+                    _contentStream.Dispose();
+                }
+            }
+        }
+
         [TestMethod]
         public void TestInitialization()
         {
@@ -40,5 +83,57 @@ public async Task Test_SaveLoad_01()
             await _database.LoadFromFileAsync(filename);
         }
 
+        /// <summary>
+        /// Saves a two-entry database, reloads the file into the same instance, adds a third text,
+        /// and checks that a search returns all three entries with their metadata intact.
+        /// </summary>
+        [TestMethod]
+        public async Task Test_SaveLoad_TestIds_01()
+        {
+            // Arrange: two entries before saving.
+            _database.AddText("Sample text for testing IDs.", "111");
+            _database.AddText("Another sample text for testing IDs.", "222");
+
+            var beforeSave = _database.Search("testing IDs");
+            Assert.AreEqual(2, beforeSave.Texts.Count());
+
+            // Act: round-trip through a file, then add one more entry.
+            const string path = "openai_test_saveload_testids_01.b59vdb";
+#pragma warning disable CS8604 // Possible null reference argument.
+            await _database.SaveToFileAsync(path);
+#pragma warning restore CS8604 // Possible null reference argument.
+
+            await _database.LoadFromFileAsync(path);
+
+            _database.AddText("A new text after loading to check ID assignment.", "333");
+
+            // Assert: nothing was overwritten by the entry added after loading.
+            var afterLoad = _database.Search("testing IDs");
+            Assert.AreEqual(3, afterLoad.Texts.Count());
+
+            var ordered = afterLoad.Texts.OrderBy(item => item.Metadata).ToArray();
+            var expectedMetadata = new[] { "111", "222", "333" };
+            for (var i = 0; i < expectedMetadata.Length; i++)
+            {
+                Assert.AreEqual(expectedMetadata[i], ordered[i].Metadata);
+            }
+        }
+
+        /// <summary>
+        /// Saves a two-entry database, loads the file into a brand-new database instance, adds a third
+        /// text, and checks that a search returns all three entries with their metadata intact.
+        /// </summary>
+        [TestMethod]
+        public async Task Test_SaveLoad_TestIds_02()
+        {
+            // Arrange: two entries before saving.
+            _database.AddText("Sample text for testing IDs.", "111");
+            _database.AddText("Another sample text for testing IDs.", "222");
+
+            var beforeSave = _database.Search("testing IDs");
+            Assert.AreEqual(2, beforeSave.Texts.Count());
+
+            // Act: save, load into a fresh instance, then add one more entry there.
+            const string path = "openai_test_saveload_testids_02.b59vdb";
+#pragma warning disable CS8604 // Possible null reference argument.
+            await _database.SaveToFileAsync(path);
+#pragma warning restore CS8604 // Possible null reference argument.
+
+            var reloaded = new BasicOpenAIMemoryVectorDatabase(_mockEmbeddingClient.Object);
+            await reloaded.LoadFromFileAsync(path);
+
+            reloaded.AddText("A new text after loading to check ID assignment.", "333");
+
+            // Assert: nothing was overwritten by the entry added after loading.
+            var afterLoad = reloaded.Search("testing IDs");
+            Assert.AreEqual(3, afterLoad.Texts.Count());
+
+            var ordered = afterLoad.Texts.OrderBy(item => item.Metadata).ToArray();
+            var expectedMetadata = new[] { "111", "222", "333" };
+            for (var i = 0; i < expectedMetadata.Length; i++)
+            {
+                Assert.AreEqual(expectedMetadata[i], ordered[i].Metadata);
+            }
+        }
     }
 }
diff --git a/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj b/src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj
@@ -10,7 +10,7 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Build5Nines.SharpVector" Version="[2.0.3,3.0.0)" />
+    <!-- <PackageReference Include="Build5Nines.SharpVector" Version="[2.0.3,3.0.0)" /> -->
     <PackageReference Include="coverlet.collector" Version="6.0.0" />
     <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.8.0" />
     <PackageReference Include="Moq" Version="4.20.72" />
@@ -25,6 +25,7 @@
 
   <ItemGroup>
     <ProjectReference Include="..\Build5Nines.SharpVector.OpenAI\Build5Nines.SharpVector.OpenAI.csproj" />
+    <ProjectReference Include="..\Build5Nines.SharpVector\Build5Nines.SharpVector.csproj" />
   </ItemGroup>
 
 </Project>
diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs
@@ -174,6 +174,40 @@ public void BasicMemoryVectorDatabase_SaveLoad_01()
         Assert.AreEqual(0.3396831452846527, results.Texts.First().Similarity);
     }
 
+    /// <summary>
+    /// Saves a two-entry database to a file, loads it into a new instance, adds a third text, and
+    /// verifies that the new entry receives ID 3 instead of overwriting an existing entry.
+    /// </summary>
+    [TestMethod]
+    public void BasicMemoryVectorDatabase_SaveLoad_TestIds()
+    {
+        var vdb = new BasicMemoryVectorDatabase();
+
+        // Load Vector Database with some sample text
+        vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "First");
+        vdb.AddText("Build5Nines is awesome!", "Second");
+        var results = vdb.Search("Lion King");
+
+        Assert.AreEqual(2, results.Texts.Count());
+
+        var filename = "BasicMemoryVectorDatabase_SaveLoad_TestIds.b59vdb";
+        vdb.SaveToFile(filename);
+
+        var newvdb = new BasicMemoryVectorDatabase();
+        newvdb.LoadFromFile(filename);
+
+        // Add a new text entry after loading
+        // This should get the next available ID (3) and not overwrite existing entries
+        newvdb.AddText("A new string that should be added, not replacing existing one.", "Third");
+
+        results = newvdb.Search("Lion King");
+
+        Assert.AreEqual(3, results.Texts.Count());
+        var listOfTexts = results.Texts.OrderBy(x => x.Id).ToArray();
+
+        // Assert.AreEqual takes (expected, actual); keeping that order makes failure messages read correctly.
+        Assert.AreEqual(1, listOfTexts[0].Id);
+        Assert.AreEqual("First", listOfTexts[0].Metadata);
+        Assert.AreEqual(2, listOfTexts[1].Id);
+        Assert.AreEqual("Second", listOfTexts[1].Metadata);
+        Assert.AreEqual(3, listOfTexts[2].Id);
+        Assert.AreEqual("Third", listOfTexts[2].Metadata);
+    }
+
     [TestMethod]
     public async Task BasicMemoryVectorDatabase_SaveLoadBinaryStreamAsync_01()
     {

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`namespace Build5Nines.SharpVector.Id;`
`2`	`2`
`3`		`-public class NumericIdGenerator<TId> : IIdGenerator<TId>`
	`3`	`+public class NumericIdGenerator<TId> : ISequentialIdGenerator<TId>`
`4`	`4`	`where TId : struct`
`5`	`5`	`{`
`6`	`6`	`public NumericIdGenerator()`
`@@ -22,4 +22,11 @@ public TId NewId() {`
`22`	`22`	`return _lastId;`
`23`	`23`	`}`
`24`	`24`	`}`
	`25`	`+`
	`26`	`+ public void SetMostRecent(TId mostRecentId)`
	`27`	`+ {`
	`28`	`+ lock(_lock) {`
	`29`	`+ _lastId = mostRecentId;`
	`30`	`+ }`
	`31`	`+ }`
`25`	`32`	`}`