From b11f2331810cc544a15326fa463ad597621c9bb9 Mon Sep 17 00:00:00 2001
From: Luke <206716137+lukeroantreeONS@users.noreply.github.com>
Date: Tue, 19 May 2026 15:50:12 +0100
Subject: [PATCH 1/5] feat(indexers): make persisting VectorStore to disk
 optional, defaulting to True

---
 src/classifai/indexers/main.py | 84 ++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 35 deletions(-)

diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py
index 9a92717..ea11ac6 100644
--- a/src/classifai/indexers/main.py
+++ b/src/classifai/indexers/main.py
@@ -92,6 +92,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         output_dir: str | None = None,
         overwrite: bool = False,
         hooks: dict | None = None,
+        persist_to_disk: bool = True,
     ):
         """Initializes the `VectorStore` object by processing the input CSV file and generating
         vector embeddings.
@@ -107,9 +108,14 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
                                 Defaults to `None`.
             output_dir (str): [optional] The directory where the `VectorStore` will be saved.
                                 Defaults to `None`, where input file name will be used.
+                                Note: ignored if `persist_to_disk=False`.
             overwrite (bool): [optional] If `True`, allows overwriting existing folders with the same name.
                                 Defaults to `False` to prevent accidental overwrites.
+                                Note: ignored if `persist_to_disk=False`.
             hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to `None`.
+            persist_to_disk (bool): [optional] If `True`, will save the `VectorStore` to disk after creation, if `False`, will
+                                just keep it in memory (for testing or ephemeral use cases).
+                                Defaults to `True`.
 
 
         Raises:
@@ -160,28 +166,34 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         self.num_vectors = None
         self.vectoriser_class = vectoriser.__class__.__name__
         self.hooks = {} if hooks is None else hooks
+        self.persist_to_disk = persist_to_disk
 
-        # ---- Output directory handling (filesystem problems) -> ConfigurationError
-        try:
-            if self.output_dir is None:
-                logging.info("No output directory specified, attempting to use input file name as output folder name.")
-                normalized_file_name = os.path.basename(os.path.splitext(self.file_name)[0])
-                self.output_dir = os.path.join(normalized_file_name)
-
-            if os.path.isdir(self.output_dir):
-                if overwrite:
-                    shutil.rmtree(self.output_dir)
-                else:
-                    raise ConfigurationError(
-                        "Output directory already exists. Pass overwrite=True to overwrite the folder.",
-                        context={"output_dir": self.output_dir},
+        if self.persist_to_disk:
+            # ---- Output directory handling (filesystem problems) -> ConfigurationError
+            try:
+                if self.output_dir is None:
+                    logging.info(
+                        "No output directory specified, attempting to use input file name as output folder name."
                     )
-            os.makedirs(self.output_dir, exist_ok=True)
-        except Exception as e:
-            raise ConfigurationError(
-                "Failed to prepare output directory.",
-                context={"output_dir": self.output_dir},
-            ) from e
+                    normalized_file_name = os.path.basename(os.path.splitext(self.file_name)[0])
+                    self.output_dir = os.path.join(normalized_file_name)
+
+                if os.path.isdir(self.output_dir):
+                    if overwrite:
+                        shutil.rmtree(self.output_dir)
+                    else:
+                        raise ConfigurationError(
+                            "Output directory already exists. Pass overwrite=True to overwrite the folder.",
+                            context={"output_dir": self.output_dir},
+                        )
+                os.makedirs(self.output_dir, exist_ok=True)
+            except Exception as e:
+                raise ConfigurationError(
+                    "Failed to prepare output directory.",
+                    context={"output_dir": self.output_dir},
+                ) from e
+        else:
+            logging.debug("persist_to_disk is set to False, the VectorStore will not be saved to disk after creation.")
 
         # ---- Build index (wrap every unexpected failure) -> IndexBuildError
         try:
@@ -202,23 +214,25 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
             ) from e
 
         # ---- Save + derived metadata (IO/format problems) -> IndexBuildError
-        try:
-            logging.info("Gathering metadata and saving vector store / metadata...")
-
-            self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1]
-            self.num_vectors = len(self.vectors)
+        self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1]
+        self.num_vectors = len(self.vectors)
 
-            self.vectors.write_parquet(os.path.join(self.output_dir, "vectors.parquet"))
-            self._save_metadata(os.path.join(self.output_dir, "metadata.json"))
+        if self.persist_to_disk:
+            try:
+                logging.info("Gathering metadata and saving vector store / metadata...")
+                self.vectors.write_parquet(os.path.join(self.output_dir, "vectors.parquet"))
+                self._save_metadata(os.path.join(self.output_dir, "metadata.json"))
 
-            logging.info("Vector Store created - files saved to %s", self.output_dir)
-        except ClassifaiError:
-            raise
-        except Exception as e:
-            raise IndexBuildError(
-                "Vector store was created but saving outputs failed.",
-                context={"cause_type": type(e).__name__, "cause_message": str(e)},
-            ) from e
+                logging.info("Vector Store created - files saved to %s", self.output_dir)
+            except ClassifaiError:
+                raise
+            except Exception as e:
+                raise IndexBuildError(
+                    "Vector store was created but saving outputs failed.",
+                    context={"cause_type": type(e).__name__, "cause_message": str(e)},
+                ) from e
+        else:
+            logging.debug("persist_to_disk is False, skipping saving VectorStore to disk.")
 
     def _save_metadata(self, path: str):
         """Saves metadata about the `VectorStore` to a JSON file.

From 91d103458f5e1c7b6d5e8838ca9d1f0dc38c9296 Mon Sep 17 00:00:00 2001
From: Luke <206716137+lukeroantreeONS@users.noreply.github.com>
Date: Thu, 21 May 2026 12:11:45 +0100
Subject: [PATCH 2/5] chore(indexers): rename persist_to_disk to skip_save (and
 reverse logic)

---
 src/classifai/indexers/main.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py
index ea11ac6..2ed3b31 100644
--- a/src/classifai/indexers/main.py
+++ b/src/classifai/indexers/main.py
@@ -92,7 +92,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         output_dir: str | None = None,
         overwrite: bool = False,
         hooks: dict | None = None,
-        persist_to_disk: bool = True,
+        skip_save: bool = False,
     ):
         """Initializes the `VectorStore` object by processing the input CSV file and generating
         vector embeddings.
@@ -108,14 +108,14 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
                                 Defaults to `None`.
             output_dir (str): [optional] The directory where the `VectorStore` will be saved.
                                 Defaults to `None`, where input file name will be used.
-                                Note: ignored if `persist_to_disk=False`.
+                                Note: ignored if `skip_save=True`.
             overwrite (bool): [optional] If `True`, allows overwriting existing folders with the same name.
                                 Defaults to `False` to prevent accidental overwrites.
-                                Note: ignored if `persist_to_disk=False`.
+                                Note: ignored if `skip_save=True`.
             hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to `None`.
-            persist_to_disk (bool): [optional] If `True`, will save the `VectorStore` to disk after creation, if `False`, will
+            skip_save (bool): [optional] If `False`, will save the `VectorStore` to disk after creation, if `True`, will
                                 just keep it in memory (for testing or ephemeral use cases).
-                                Defaults to `True`.
+                                Defaults to `False`.
 
 
         Raises:
@@ -166,9 +166,9 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         self.num_vectors = None
         self.vectoriser_class = vectoriser.__class__.__name__
         self.hooks = {} if hooks is None else hooks
-        self.persist_to_disk = persist_to_disk
+        self.skip_save = skip_save
 
-        if self.persist_to_disk:
+        if not self.skip_save:
             # ---- Output directory handling (filesystem problems) -> ConfigurationError
             try:
                 if self.output_dir is None:
@@ -193,7 +193,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
                     context={"output_dir": self.output_dir},
                 ) from e
         else:
-            logging.debug("persist_to_disk is set to False, the VectorStore will not be saved to disk after creation.")
+            logging.debug("skip_save is set to False, the VectorStore will not be saved to disk after creation.")
 
         # ---- Build index (wrap every unexpected failure) -> IndexBuildError
         try:
@@ -217,7 +217,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1]
         self.num_vectors = len(self.vectors)
 
-        if self.persist_to_disk:
+        if not self.skip_save:
             try:
                 logging.info("Gathering metadata and saving vector store / metadata...")
                 self.vectors.write_parquet(os.path.join(self.output_dir, "vectors.parquet"))
@@ -232,7 +232,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
                     context={"cause_type": type(e).__name__, "cause_message": str(e)},
                 ) from e
         else:
-            logging.debug("persist_to_disk is False, skipping saving VectorStore to disk.")
+            logging.debug("skip_save is False, skipping saving VectorStore to disk.")
 
     def _save_metadata(self, path: str):
         """Saves metadata about the `VectorStore` to a JSON file.

From 626fcfb5d6b3ee0797e3ed90abc4aa87f1b5c057 Mon Sep 17 00:00:00 2001
From: Luke <206716137+lukeroantreeONS@users.noreply.github.com>
Date: Thu, 21 May 2026 12:22:09 +0100
Subject: [PATCH 3/5] chore(indexers): update logging around skip_save
 parameter

---
 src/classifai/indexers/main.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py
index 2ed3b31..5e1193b 100644
--- a/src/classifai/indexers/main.py
+++ b/src/classifai/indexers/main.py
@@ -168,6 +168,17 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         self.hooks = {} if hooks is None else hooks
         self.skip_save = skip_save
 
+        if self.output_dir is not None and self.skip_save:
+            logging.warning(
+                "VectorStore creation: output_dir is set to %s but skip_save is True, so the VectorStore will not be saved to disk. output_dir will be ignored.",
+                self.output_dir,
+            )
+
+        if self.output_dir is not None and not isinstance(self.output_dir, str):
+            raise DataValidationError(
+                "output_dir must be a string or None.", context={"output_dir_type": type(self.output_dir).__name__}
+            )
+
         if not self.skip_save:
             # ---- Output directory handling (filesystem problems) -> ConfigurationError
             try:
@@ -193,7 +204,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
                     context={"output_dir": self.output_dir},
                 ) from e
         else:
-            logging.debug("skip_save is set to False, the VectorStore will not be saved to disk after creation.")
+            logging.debug("skip_save is set to True, the VectorStore will not be saved to disk after creation.")
 
         # ---- Build index (wrap every unexpected failure) -> IndexBuildError
         try:
@@ -232,7 +243,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
                     context={"cause_type": type(e).__name__, "cause_message": str(e)},
                 ) from e
         else:
-            logging.debug("skip_save is False, skipping saving VectorStore to disk.")
+            logging.debug("skip_save is True, skipping saving VectorStore to disk.")
 
     def _save_metadata(self, path: str):
         """Saves metadata about the `VectorStore` to a JSON file.

From d19db24b5c1d147aa47f905a91c444c676e0a4aa Mon Sep 17 00:00:00 2001
From: Luke <206716137+lukeroantreeONS@users.noreply.github.com>
Date: Thu, 21 May 2026 13:27:54 +0100
Subject: [PATCH 4/5] docs(indexers): Add description of VectorStore saving
 options

---
 DEMO/general_workflow_demo.ipynb | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/DEMO/general_workflow_demo.ipynb b/DEMO/general_workflow_demo.ipynb
index 8e80fe0..0f41606 100644
--- a/DEMO/general_workflow_demo.ipynb
+++ b/DEMO/general_workflow_demo.ipynb
@@ -110,7 +110,9 @@
    "metadata": {},
    "source": [
     "### The VectorStore class creates a vector database by converting a set of labelled texts to embeddings, using an associated Vectoriser.\n",
-    "#### Once created, it can be 'searched', using the vectoriser to embed queries as vectors and calculate their semantic similarity to the labelled texts in the VectorStore\n",
+    "#### Once created, it can be 'searched', using the vectoriser to embed queries as vectors and calculate their semantic similarity to the labelled texts in the VectorStore.\n",
+    "\n",
+    "#### By default, the vector database is persisted to a local directory named after the input filename. You can use the `output_dir` argument to change the location of the persisted vector database when creating the VectorStore. If the directory already exists, it will exit with a warning - you can pass the `overwrite=True` argument to permit it to overwrite an existing directory. If you don't want the vector database to be persisted at all, you can pass the `skip_save=True` argument - note that this takes precedence over `output_dir` and `overwrite`.\n",
     "![VectorStore_image](files/VectorStore.png)\n"
    ]
   },
@@ -123,11 +125,13 @@
     "from classifai.indexers import VectorStore\n",
     "\n",
     "my_vector_store = VectorStore(\n",
-    "    file_name=\"data/testdata.csv\",\n",
-    "    data_type=\"csv\",\n",
-    "    vectoriser=vectoriser,\n",
-    "    meta_data={\"colour\": str, \"language\": str},\n",
-    "    overwrite=True,\n",
+    "    file_name=\"data/testdata.csv\",  # required\n",
+    "    data_type=\"csv\",  # required\n",
+    "    vectoriser=vectoriser,  # required\n",
+    "    meta_data={\"colour\": str, \"language\": str},  # optional\n",
+    "    output_dir=\"testdata\",  # optional\n",
+    "    overwrite=True,  # optional\n",
+    "    skip_save=False,  # optional\n",
     ")"
    ]
   },
@@ -165,12 +169,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In the above cell, we're building the object that the VectorStore takes in to do the search process. Our input expects two columns of data, id and query, as above. And this data can be passed to our VectorStoreSearchInput class, as a dictionary <b> or alreadt as a Pandas dataframe. </b>\n",
+    "In the above cell, we're building the object that the VectorStore takes in to do the search process. Our input expects two columns of data, id and query, as above. We can create a VectorStoreSearchInput object by passing in this data as a dictionary <b> or as a Pandas dataframe. </b>\n",
     "\n",
-    "If you try to remove some of the data, say the 'id' column. Our data class object will inform you that you're missing some data. In this sense the data classes keep you right when working with the Package.\n",
+    "If you try to remove some of the data, say the 'id' column, the class constructor will inform you that you're missing some data. The data validation embedded within these data classes helps avoid unexpected behaviour when working with classifai.\n",
     "\n",
     "\n",
-    "Look at the type of the input_data object we created, notice that it is not of type Pandas, but our own custom type. Under the hood this is doing the additional work to validate the data your passing in."
+    "Look at the type of the input_data object we created; notice that it is not of type Pandas DataFrame, but our own custom datatype. You can think of these classes as dataframes with additional functionality to validate the data you pass in."
    ]
   },
   {
@@ -228,7 +232,7 @@
    "metadata": {},
    "source": [
     "### With reverse search you can do partial matching!\n",
-    "use the `partial match` flag to check if the **ids/labels** start with our query id"
+    "Use the `partial match` flag to check if the returned **doc_labels** start with our query"
    ]
   },
   {
@@ -259,7 +263,7 @@
    "source": [
     "### VectorStore Embed method\n",
     "\n",
-    "Its also possible to get the vector embeddings for each from some input text or queries by calling the VectorStore <i>.embed()</i> method.\n",
+    "It is also possible to get the vector embeddings for some input text or queries by calling the VectorStore <i>.embed()</i> method.\n",
     "\n",
     "Once again, this method has its own data class to inferace with: `VectorStoreEmbedInput`\n"
    ]
@@ -445,7 +449,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "classifai",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -459,7 +463,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.13.7"
+   "version": "3.12.4"
   }
  },
  "nbformat": 4,

From 508827bff31ffaf7986783626600b00e5f98a5e5 Mon Sep 17 00:00:00 2001
From: Luke <206716137+lukeroantreeONS@users.noreply.github.com>
Date: Fri, 22 May 2026 14:01:36 +0100
Subject: [PATCH 5/5] chore(indexers): move skip_save argument, make
 documentation text smaller

---
 DEMO/general_workflow_demo.ipynb | 4 ++--
 src/classifai/indexers/main.py   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/DEMO/general_workflow_demo.ipynb b/DEMO/general_workflow_demo.ipynb
index 0f41606..e66085d 100644
--- a/DEMO/general_workflow_demo.ipynb
+++ b/DEMO/general_workflow_demo.ipynb
@@ -112,7 +112,7 @@
     "### The VectorStore class creates a vector database by converting a set of labelled texts to embeddings, using an associated Vectoriser.\n",
     "#### Once created, it can be 'searched', using the vectoriser to embed queries as vectors and calculate their semantic similarity to the labelled texts in the VectorStore.\n",
     "\n",
-    "#### By default, the vector database is persisted to a local directory named after the input filename. You can use the `output_dir` argument to change the location of the persisted vector database when creating the VectorStore. If the directory already exists, it will exit with a warning - you can pass the `overwrite=True` argument to permit it to overwrite an existing directory. If you don't want the vector database to be persisted at all, you can pass the `skip_save=True` argument - note that this takes precedence over `output_dir` and `overwrite`.\n",
+    "By default, the vector database is persisted to a local directory named after the input filename. You can use the `output_dir` argument to change the location of the persisted vector database when creating the VectorStore. If the directory already exists, creating the VectorStore will fail with a `ConfigurationError` - you can pass the `overwrite=True` argument to permit it to overwrite an existing directory. If you don't want the vector database to be persisted at all, you can pass the `skip_save=True` argument - note that this takes precedence over `output_dir` and `overwrite`.\n",
     "![VectorStore_image](files/VectorStore.png)\n"
    ]
   },
@@ -463,7 +463,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.13.12"
   }
  },
  "nbformat": 4,
diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py
index 5e1193b..ab4f63e 100644
--- a/src/classifai/indexers/main.py
+++ b/src/classifai/indexers/main.py
@@ -91,8 +91,8 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         meta_data: dict | None = None,
         output_dir: str | None = None,
         overwrite: bool = False,
-        hooks: dict | None = None,
         skip_save: bool = False,
+        hooks: dict | None = None,
     ):
         """Initializes the `VectorStore` object by processing the input CSV file and generating
         vector embeddings.
@@ -112,10 +112,10 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
             overwrite (bool): [optional] If `True`, allows overwriting existing folders with the same name.
                                 Defaults to `False` to prevent accidental overwrites.
                                 Note: ignored if `skip_save=True`.
-            hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to `None`.
             skip_save (bool): [optional] If `False`, will save the `VectorStore` to disk after creation, if `True`, will
                                 just keep it in memory (for testing or ephemeral use cases).
                                 Defaults to `False`.
+            hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to `None`.
 
 
         Raises: