Skip to content
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/classifai/indexers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
VectorStoreSearchOutput,
)
from .main import VectorStore
from .types import metric_settings

__all__ = [
"VectorStore",
Expand All @@ -18,4 +19,5 @@
"VectorStoreReverseSearchOutput",
"VectorStoreSearchInput",
"VectorStoreSearchOutput",
"metric_settings",
]
132 changes: 98 additions & 34 deletions src/classifai/indexers/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,13 @@
import shutil
import time
import uuid
from typing import get_args

import numpy as np
import polars as pl
from tqdm.autonotebook import tqdm

from ..vectorisers import VectoriserBase
Comment thread
rileyok-ons marked this conversation as resolved.
Outdated
from .dataclasses import (
VectorStoreEmbedInput,
VectorStoreEmbedOutput,
Expand All @@ -45,6 +47,7 @@
VectorStoreSearchInput,
VectorStoreSearchOutput,
)
from .types import metric_settings

# Configure logging for your application
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
Expand All @@ -53,13 +56,29 @@
logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)


def metricvalid(metric: metric_settings):
    """Validate that the given scoring metric is one of the supported options.

    Args:
        metric (metric_settings): The selected scoring metric for the VectorStore.

    Raises:
        ValueError: If ``metric`` is not one of "cosine", "dotprod", "cosinel2",
            "dotprodl2", "cosinel2squared" or "dotprodl2squared" (i.e. not a
            member of the ``metric_settings`` Literal).
    """
    # get_args unpacks the Literal alias into the tuple of allowed string values.
    valid_metrics = get_args(metric_settings)
    if metric not in valid_metrics:
        raise ValueError(f"The scoring metric input '{metric}' is not in the valid metrics {valid_metrics}")


class VectorStore:
"""A class to model and create 'VectorStore' objects for building and searching vector databases from CSV text files.

Attributes:
file_name (str): the original file with the knowledgebase to build the vector store
data_type (str): the data type of the original file (curently only csv supported)
vectoriser (object): A Vectoriser object from the corresponding ClassifAI Pacakge module
vectoriser (VectoriserBase): A Vectoriser object from the corresponding ClassifAI Pacakge module
scoring_metric(metric_settings): The metric to use for scoring
batch_size (int): the batch size to pass to the vectoriser when embedding
meta_data (dict[str:type]): key-value pairs of metadata to extract from the input file and their correpsonding types
output_dir (str): the path to the output directory where the VectorStore will be saved
Expand All @@ -74,7 +93,8 @@ def __init__( # noqa: PLR0913
self,
file_name,
data_type,
vectoriser,
vectoriser: VectoriserBase,
scoring_metric: metric_settings = "cosine",
batch_size=8,
meta_data=None,
output_dir=None,
Expand All @@ -87,8 +107,9 @@ def __init__( # noqa: PLR0913
Args:
file_name (str): The name of the input CSV file.
data_type (str): The type of input data (currently supports only "csv").
vectoriser (object): The vectoriser object used to transform text into
vectoriser (VectoriserBase): The vectoriser object used to transform text into
vector embeddings.
scoring_metric(metric_settings): The metric to use for scoring
batch_size (int, optional): The batch size for processing the input file and batching to
vectoriser. Defaults to 8.
meta_data (dict, optional): key,value pair metadata column names to extract from the input file and their types.
Expand All @@ -107,6 +128,7 @@ def __init__( # noqa: PLR0913
self.file_name = file_name
self.data_type = data_type
self.vectoriser = vectoriser
self.scoring_metric = scoring_metric
self.batch_size = batch_size
self.meta_data = meta_data if meta_data is not None else {}
self.output_dir = output_dir
Expand All @@ -119,6 +141,9 @@ def __init__( # noqa: PLR0913
if self.data_type not in ["csv"]:
raise ValueError(f"Data type '{self.data_type}' not supported. Choose from ['csv'].")

## validate scoring metric
metricvalid(self.scoring_metric)

if self.output_dir is None:
logging.info("No output directory specified, attempting to use input file name as output folder name.")

Expand Down Expand Up @@ -146,7 +171,7 @@ def __init__( # noqa: PLR0913
os.makedirs(self.output_dir, exist_ok=True)

self._create_vector_store_index()

self._check_norm_vdb()
logging.info("Gathering metadata and saving vector store / metadata...")

self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1]
Expand Down Expand Up @@ -347,7 +372,65 @@ def reverse_search(self, query: VectorStoreReverseSearchInput, n_results=100) ->

return result_df

def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> VectorStoreSearchOutput:
def _check_norm_vdb(self):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this functionality a lot, but I think it should be the vectoriser's job to output embeddings in the desired form, not the vector store changing them after the fact.
My preference would be to update the Vectorisers' .transform() methods to take an optional (default False) normalise argument, which applies this normalisation if set to True.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I disagree slightly here.

An informed user, who knows their embedding model already outputs normalized embeddings, should then be able to just use the dotproduct metric, which would give them the effects of cosine similarity without having to do the extra norm checks and steps they would need if they set to a cosine metric.

also i think its a good idea to keep the vectorisers pure and not overcomplicate the logic argument logic - whereas the vectorstore responsible for housing, reloading and metric calculations of the vectors probably should be keeping a note on whether the vectors are normalised or not

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An informed user, who knows their embedding model already outputs normalized embeddings, should then be able to just use the dotproduct metric, which would give them the effects of cosine similarity without having to do the extra norm checks and steps they would need if they set to a cosine metric.

I'm not sure I follow what you mean; if a user knows their embedding model already outputs normalised embeddings, they could just not set the normalise flag when creating the Vectoriser.

also i think its a good idea to keep the vectorisers pure and not overcomplicate the logic argument logic

This is an operation that happens directly on the vectors, a step before any use in a vector store or scoring. I think it fits in well with the task of the Vectoriser, and avoids the other issues you discussed - such as any need to duplicate vectors in the vector store and set/read metadata flags about whether the vector store is normalised.

Lets talk about it in our call later 👍

"""Normalise Vdb if using cosine similarity."""
if "cosine" in self.scoring_metric:
embeddings = self.vectors["embeddings"].to_numpy()
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

self.vectors.with_columns(pl.Series("embeddings", embeddings))

def score(
self, query: np.ndarray, n_results: int, query_ids_batch: list[str], query_text_batch: list[str]
) -> tuple[pl.DataFrame, np.ndarray]:
"""Perform Scoring and return Top Values.

Args:
query(np.ndarray): query for search
n_results(int): number of results to return
query_ids_batch(list[str]): ids of query batch
query_text_batch(list[str]): source text of query batch

Returns:
pl.DataFrame: The Polars DataFrame containing the top n most similar results to the query
"""
if self.scoring_metric.startswith("cosine"):
query = query / np.linalg.norm(query, axis=1, keepdims=True)

result = query @ self.vectors["embeddings"].to_numpy().T

# Get the top n_results indices for each query in the batch
idx = np.argpartition(result, -n_results, axis=1)[:, -n_results:]

# Sort top n_results indices by their scores in descending order
idx_sorted = np.zeros_like(idx)
scores = np.zeros_like(idx, dtype=float)

for j in range(idx.shape[0]):
row_scores = result[j, idx[j]]
sorted_indices = np.argsort(row_scores)[::-1]
idx_sorted[j] = idx[j, sorted_indices]
scores[j] = row_scores[sorted_indices]

if "l2" in self.scoring_metric:
scores = 2 * (1 - scores)
if not self.scoring_metric.endswith("squared"):
scores = np.sqrt(scores)

# Build a DataFrame for the current batch results
result_df = pl.DataFrame(
{
"query_id": np.repeat(query_ids_batch, n_results),
"query_text": np.repeat(query_text_batch, n_results),
"rank": np.tile(np.arange(n_results), len(query_text_batch)),
"score": scores.flatten(),
}
)
return result_df, idx_sorted

def search(
self, query: VectorStoreSearchInput, n_results: int = 10, batch_size: int = 8
) -> VectorStoreSearchOutput:
"""Searches the vector store using queries from a VectorStoreSearchInput object and returns
ranked results in VectorStoreSearchOutput object. In batches, converts users text queries into vector embeddings,
computes cosine similarity with stored document vectors, and retrieves the top results.
Expand Down Expand Up @@ -386,35 +469,11 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V
# Get the current batch of queries
query_text_batch = query.query.to_list()[i : i + batch_size]
query_ids_batch = query.id.to_list()[i : i + batch_size]

# Convert the current batch of queries to vectors
query_vectors = self.vectoriser.transform(query_text_batch)

# Compute cosine similarity between the query batch and document vectors
cosine = query_vectors @ self.vectors["embeddings"].to_numpy().T

# Get the top n_results indices for each query in the batch
idx = np.argpartition(cosine, -n_results, axis=1)[:, -n_results:]

# Sort top n_results indices by their scores in descending order
idx_sorted = np.zeros_like(idx)
scores = np.zeros_like(idx, dtype=float)

for j in range(idx.shape[0]):
row_scores = cosine[j, idx[j]]
sorted_indices = np.argsort(row_scores)[::-1]
idx_sorted[j] = idx[j, sorted_indices]
scores[j] = row_scores[sorted_indices]

# Build a DataFrame for the current batch results
result_df = pl.DataFrame(
{
"query_id": np.repeat(query_ids_batch, n_results),
"query_text": np.repeat(query_text_batch, n_results),
"rank": np.tile(np.arange(n_results), len(query_text_batch)),
"score": scores.flatten(),
}
)
# perform scoring and return frame and ids
result_df, idx_sorted = self.score(query_vectors, n_results, query_ids_batch, query_text_batch)

# Get the vector store results for the current batch
ranked_docs = self.vectors[idx_sorted.flatten().tolist()].select(["id", "text", *self.meta_data.keys()])
Expand Down Expand Up @@ -461,7 +520,7 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V
return result_df

@classmethod
def from_filespace(cls, folder_path, vectoriser):
def from_filespace(cls, folder_path, vectoriser: VectoriserBase, scoring_metric: metric_settings = "cosine"):
"""Creates a `VectorStore` instance from stored metadata and Parquet files.
This method reads the metadata and vectors from the specified folder,
validates the contents, and initializes a `VectorStore` object with the
Expand All @@ -474,7 +533,8 @@ def from_filespace(cls, folder_path, vectoriser):

Args:
folder_path (str): The folder path containing the metadata and Parquet files.
vectoriser (object): The vectoriser object used to transform text into vector embeddings.
vectoriser (VectoriserBase): The vectoriser object used to transform text into vector embeddings.
scoring_metric(metric_settings): The metric to use for scoring

Returns:
VectorStore: An instance of the `VectorStore` class.
Expand All @@ -491,6 +551,9 @@ def from_filespace(cls, folder_path, vectoriser):
with open(metadata_path, encoding="utf-8") as f:
metadata = json.load(f)

## validate scoring metric
metricvalid(scoring_metric)

# check that the correct keys exist in metadata
required_keys = [
"vectoriser_class",
Expand Down Expand Up @@ -544,12 +607,13 @@ def from_filespace(cls, folder_path, vectoriser):
vector_store.file_name = None
vector_store.data_type = None
vector_store.vectoriser = vectoriser
vector_store.scoring_metric = scoring_metric
vector_store.batch_size = None
vector_store.meta_data = deserialized_column_meta_data
vector_store.vectors = df
vector_store.vector_shape = metadata["vector_shape"]
vector_store.num_vectors = metadata["num_vectors"]
vector_store.vectoriser_class = metadata["vectoriser_class"]
vector_store.hooks = {}

vector_store._check_norm_vdb()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd favour a 'normalise once' approach -

  1. when the VDB is being constructed by _create_vector_store_index(), it checks if the user specified a metric that requires normalised vectors and normalises the created collection and then saves them to the polars df/parquet file.
  2. Then we'd record the 'metric' used in the metadata file
  3. when the parquet is loaded back in with from_filespace() we know to use the appropriate metric already as its stored in the metadata file and theres no need to redo the normalisation

so i'd also take the 'metric_setting' parameter out of the class method from_filespace() and rely just on the metadata file.

this would mean less operations every time we load the vectorstore in, after initial creation - potentially at the cost of losing the magnitude information and not being able to get it back without running the build step again with a different metric

Copy link
Copy Markdown
Contributor Author

@rileyok-ons rileyok-ons Jan 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved by adding normalize meta field, if choosing cosine with un-normed will norm but will warn user

return vector_store
5 changes: 5 additions & 0 deletions src/classifai/indexers/types.py
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if we scrapped all 6 of these and just had ['IP', 'L2'].

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think L2 squared and IP squared should be a downstream postprocessing hook as its just a common scoring operation

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seen this suggested previously, if we want this can sort

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm happy with that plan 👍

I'd like if we added an example to one of the notebooks showing a way of wrapping one of the Vectorisers to add normalisation though, to tide users over until we properly offer normalisation as an option.

I can add that to this PR tomorrow, if nobody objects.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if we scrapped all 6 of these and just had ['IP', 'L2'].

Would you be okay with renaming 'IP'->'dot' for this? I think 'dot' would be more easily understood by users via docstrings without needing to explore documentation etc. to find out / confirm IP = Inner Product

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest we just leave it - if users really really want it they can make their own custom vectoriser that wraps the hugging face vectoriser - but if you really wanted to you could update the custom_vectoriser demo notebook to have a section on this and show how to do it to the hugging face class?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do already have one user group requesting this functionality (and currently using a custom wrapped HF Vectoriser to achieve it), so I think it is worth adding to the docs.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So... they did use a custom vectoriser? 😀

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what the correct answer is, we definitely don't want to be adding a variant of every Vectoriser called VectoriserX_normalised, or a wrapper for each class. Maybe 1 utility wrapper that wraps round all our Vectoriser class imps.... but what is the benefit/tradeoffs of that new class, which we'd have to add more docs and ensure it's compatible forever, versus guiding users in how to do it with our existing custom vectoriser / base class architecture.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that I made for them as a one-off solution as the package doesn't yet offer that - I'm saying it would be useful to have that knowledge made accessible in the documentation for other users

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from typing import Literal, TypeAlias

metric_settings: TypeAlias = Literal[
Comment thread
rileyok-ons marked this conversation as resolved.
Outdated
"cosine", "dotprod", "cosinel2", "dotprodl2", "cosinel2squared", "dotprodl2squared"
Comment thread
rileyok-ons marked this conversation as resolved.
Outdated
]
Loading