From b11f2331810cc544a15326fa463ad597621c9bb9 Mon Sep 17 00:00:00 2001 From: Luke <206716137+lukeroantreeONS@users.noreply.github.com> Date: Tue, 19 May 2026 15:50:12 +0100 Subject: [PATCH 1/5] feat(indexers): make persisting VectorStore to disk optional, defaulting to True --- src/classifai/indexers/main.py | 84 ++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 9a92717..ea11ac6 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -92,6 +92,7 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 output_dir: str | None = None, overwrite: bool = False, hooks: dict | None = None, + persist_to_disk: bool = True, ): """Initializes the `VectorStore` object by processing the input CSV file and generating vector embeddings. @@ -107,9 +108,14 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 Defaults to `None`. output_dir (str): [optional] The directory where the `VectorStore` will be saved. Defaults to `None`, where input file name will be used. + Note: ignored if `persist_to_disk=False`. overwrite (bool): [optional] If `True`, allows overwriting existing folders with the same name. Defaults to `False` to prevent accidental overwrites. + Note: ignored if `persist_to_disk=False`. hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to `None`. + persist_to_disk (bool): [optional] If `True`, will save the `VectorStore` to disk after creation, if `False`, will + just keep it in memory (for testing or ephemeral use cases). + Defaults to `True`. 
Raises: @@ -160,28 +166,34 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 self.num_vectors = None self.vectoriser_class = vectoriser.__class__.__name__ self.hooks = {} if hooks is None else hooks + self.persist_to_disk = persist_to_disk - # ---- Output directory handling (filesystem problems) -> ConfigurationError - try: - if self.output_dir is None: - logging.info("No output directory specified, attempting to use input file name as output folder name.") - normalized_file_name = os.path.basename(os.path.splitext(self.file_name)[0]) - self.output_dir = os.path.join(normalized_file_name) - - if os.path.isdir(self.output_dir): - if overwrite: - shutil.rmtree(self.output_dir) - else: - raise ConfigurationError( - "Output directory already exists. Pass overwrite=True to overwrite the folder.", - context={"output_dir": self.output_dir}, + if self.persist_to_disk: + # ---- Output directory handling (filesystem problems) -> ConfigurationError + try: + if self.output_dir is None: + logging.info( + "No output directory specified, attempting to use input file name as output folder name." ) - os.makedirs(self.output_dir, exist_ok=True) - except Exception as e: - raise ConfigurationError( - "Failed to prepare output directory.", - context={"output_dir": self.output_dir}, - ) from e + normalized_file_name = os.path.basename(os.path.splitext(self.file_name)[0]) + self.output_dir = os.path.join(normalized_file_name) + + if os.path.isdir(self.output_dir): + if overwrite: + shutil.rmtree(self.output_dir) + else: + raise ConfigurationError( + "Output directory already exists. 
Pass overwrite=True to overwrite the folder.", + context={"output_dir": self.output_dir}, + ) + os.makedirs(self.output_dir, exist_ok=True) + except Exception as e: + raise ConfigurationError( + "Failed to prepare output directory.", + context={"output_dir": self.output_dir}, + ) from e + else: + logging.debug("persist_to_disk is set to False, the VectorStore will not be saved to disk after creation.") # ---- Build index (wrap every unexpected failure) -> IndexBuildError try: @@ -202,23 +214,25 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 ) from e # ---- Save + derived metadata (IO/format problems) -> IndexBuildError - try: - logging.info("Gathering metadata and saving vector store / metadata...") - - self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1] - self.num_vectors = len(self.vectors) + self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1] + self.num_vectors = len(self.vectors) - self.vectors.write_parquet(os.path.join(self.output_dir, "vectors.parquet")) - self._save_metadata(os.path.join(self.output_dir, "metadata.json")) + if self.persist_to_disk: + try: + logging.info("Gathering metadata and saving vector store / metadata...") + self.vectors.write_parquet(os.path.join(self.output_dir, "vectors.parquet")) + self._save_metadata(os.path.join(self.output_dir, "metadata.json")) - logging.info("Vector Store created - files saved to %s", self.output_dir) - except ClassifaiError: - raise - except Exception as e: - raise IndexBuildError( - "Vector store was created but saving outputs failed.", - context={"cause_type": type(e).__name__, "cause_message": str(e)}, - ) from e + logging.info("Vector Store created - files saved to %s", self.output_dir) + except ClassifaiError: + raise + except Exception as e: + raise IndexBuildError( + "Vector store was created but saving outputs failed.", + context={"cause_type": type(e).__name__, "cause_message": str(e)}, + ) from e + else: + logging.debug("persist_to_disk is False, skipping 
saving VectorStore to disk.") def _save_metadata(self, path: str): """Saves metadata about the `VectorStore` to a JSON file. From 91d103458f5e1c7b6d5e8838ca9d1f0dc38c9296 Mon Sep 17 00:00:00 2001 From: Luke <206716137+lukeroantreeONS@users.noreply.github.com> Date: Thu, 21 May 2026 12:11:45 +0100 Subject: [PATCH 2/5] chore(indexers): rename persist_to_disk to skip_save (and reverse logic) --- src/classifai/indexers/main.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index ea11ac6..2ed3b31 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -92,7 +92,7 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 output_dir: str | None = None, overwrite: bool = False, hooks: dict | None = None, - persist_to_disk: bool = True, + skip_save: bool = False, ): """Initializes the `VectorStore` object by processing the input CSV file and generating vector embeddings. @@ -108,14 +108,14 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 Defaults to `None`. output_dir (str): [optional] The directory where the `VectorStore` will be saved. Defaults to `None`, where input file name will be used. - Note: ignored if `persist_to_disk=False`. + Note: ignored if `skip_save=True`. overwrite (bool): [optional] If `True`, allows overwriting existing folders with the same name. Defaults to `False` to prevent accidental overwrites. - Note: ignored if `persist_to_disk=False`. + Note: ignored if `skip_save=True`. hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to `None`. - persist_to_disk (bool): [optional] If `True`, will save the `VectorStore` to disk after creation, if `False`, will + skip_save (bool): [optional] If `False`, will save the `VectorStore` to disk after creation, if `True`, will just keep it in memory (for testing or ephemeral use cases). - Defaults to `True`. 
+ Defaults to `False`. Raises: @@ -166,9 +166,9 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 self.num_vectors = None self.vectoriser_class = vectoriser.__class__.__name__ self.hooks = {} if hooks is None else hooks - self.persist_to_disk = persist_to_disk + self.skip_save = skip_save - if self.persist_to_disk: + if not self.skip_save: # ---- Output directory handling (filesystem problems) -> ConfigurationError try: if self.output_dir is None: @@ -193,7 +193,7 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 context={"output_dir": self.output_dir}, ) from e else: - logging.debug("persist_to_disk is set to False, the VectorStore will not be saved to disk after creation.") + logging.debug("skip_save is set to False, the VectorStore will not be saved to disk after creation.") # ---- Build index (wrap every unexpected failure) -> IndexBuildError try: @@ -217,7 +217,7 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 self.vector_shape = self.vectors["embeddings"].to_numpy().shape[1] self.num_vectors = len(self.vectors) - if self.persist_to_disk: + if not self.skip_save: try: logging.info("Gathering metadata and saving vector store / metadata...") self.vectors.write_parquet(os.path.join(self.output_dir, "vectors.parquet")) @@ -232,7 +232,7 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 context={"cause_type": type(e).__name__, "cause_message": str(e)}, ) from e else: - logging.debug("persist_to_disk is False, skipping saving VectorStore to disk.") + logging.debug("skip_save is False, skipping saving VectorStore to disk.") def _save_metadata(self, path: str): """Saves metadata about the `VectorStore` to a JSON file. 
From 626fcfb5d6b3ee0797e3ed90abc4aa87f1b5c057 Mon Sep 17 00:00:00 2001 From: Luke <206716137+lukeroantreeONS@users.noreply.github.com> Date: Thu, 21 May 2026 12:22:09 +0100 Subject: [PATCH 3/5] chore(indexers): update logging around skip_save parameter --- src/classifai/indexers/main.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 2ed3b31..5e1193b 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -168,6 +168,17 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 self.hooks = {} if hooks is None else hooks self.skip_save = skip_save + if self.output_dir is not None and self.skip_save: + logging.warning( + "VectorStore creation: output_dir is set to %s but skip_save is True, so the VectorStore will not be saved to disk. output_dir will be ignored.", + self.output_dir, + ) + + if self.output_dir is not None and not isinstance(self.output_dir, str): + raise DataValidationError( + "output_dir must be a string or None.", context={"output_dir_type": type(self.output_dir).__name__} + ) + if not self.skip_save: # ---- Output directory handling (filesystem problems) -> ConfigurationError try: @@ -193,7 +204,7 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 context={"output_dir": self.output_dir}, ) from e else: - logging.debug("skip_save is set to False, the VectorStore will not be saved to disk after creation.") + logging.debug("skip_save is set to True, the VectorStore will not be saved to disk after creation.") # ---- Build index (wrap every unexpected failure) -> IndexBuildError try: @@ -232,7 +243,7 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 context={"cause_type": type(e).__name__, "cause_message": str(e)}, ) from e else: - logging.debug("skip_save is False, skipping saving VectorStore to disk.") + logging.debug("skip_save is True, skipping saving VectorStore to disk.") def _save_metadata(self, path: 
str): """Saves metadata about the `VectorStore` to a JSON file. From d19db24b5c1d147aa47f905a91c444c676e0a4aa Mon Sep 17 00:00:00 2001 From: Luke <206716137+lukeroantreeONS@users.noreply.github.com> Date: Thu, 21 May 2026 13:27:54 +0100 Subject: [PATCH 4/5] docs(indexers): Add description of VectorStore saving options --- DEMO/general_workflow_demo.ipynb | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/DEMO/general_workflow_demo.ipynb b/DEMO/general_workflow_demo.ipynb index 8e80fe0..0f41606 100644 --- a/DEMO/general_workflow_demo.ipynb +++ b/DEMO/general_workflow_demo.ipynb @@ -110,7 +110,9 @@ "metadata": {}, "source": [ "### The VectorStore class creates a vector database by converting a set of labelled texts to embeddings, using an associated Vectoriser.\n", - "#### Once created, it can be 'searched', using the vectoriser to embed queries as vectors and calculate their semantic similarity to the labelled texts in the VectorStore\n", + "#### Once created, it can be 'searched', using the vectoriser to embed queries as vectors and calculate their semantic similarity to the labelled texts in the VectorStore.\n", + "\n", + "#### By default, the vector database is persisted to a local directory named after the input filename. You can use the `output_dir` argument to change the location of the persisted vector database when creating the VectorStore. If the directory already exists, it will exit with a warning - you can pass the `overwrite=True` argument to permit it to overwrite an existing directory. 
If you don't want the vector database to be persisted at all, you can pass the `skip_save=True` argument - note that this takes precedence over `output_dir` and `overwrite`.\n", "![VectorStore_image](files/VectorStore.png)\n" ] }, @@ -123,11 +125,13 @@ "from classifai.indexers import VectorStore\n", "\n", "my_vector_store = VectorStore(\n", - " file_name=\"data/testdata.csv\",\n", - " data_type=\"csv\",\n", - " vectoriser=vectoriser,\n", - " meta_data={\"colour\": str, \"language\": str},\n", - " overwrite=True,\n", + " file_name=\"data/testdata.csv\", # required\n", + " data_type=\"csv\", # required\n", + " vectoriser=vectoriser, # required\n", + " meta_data={\"colour\": str, \"language\": str}, # optional\n", + " output_dir=\"testdata\", # optional\n", + " overwrite=True, # optional\n", + " skip_save=False, # optional\n", ")" ] }, @@ -165,12 +169,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the above cell, we're building the object that the VectorStore takes in to do the search process. Our input expects two columns of data, id and query, as above. And this data can be passed to our VectorStoreSearchInput class, as a dictionary or alreadt as a Pandas dataframe. \n", + "In the above cell, we're building the object that the VectorStore takes in to do the search process. Our input expects two columns of data, id and query, as above. We can create a VectorStoreSearchInput object by passing in this data as a dictionary or as a Pandas dataframe. \n", "\n", - "If you try to remove some of the data, say the 'id' column. Our data class object will inform you that you're missing some data. In this sense the data classes keep you right when working with the Package.\n", + "If you try to remove some of the data, say the 'id' column, the class constructor will inform you that you're missing some data. 
The data validation embedded within these data classes helps avoid unexpected behaviour when working with classifai.\n", "\n", "\n", - "Look at the type of the input_data object we created, notice that it is not of type Pandas, but our own custom type. Under the hood this is doing the additional work to validate the data your passing in." + "Look at the type of the input_data object we created; notice that it is not of type Pandas DataFrame, but our own custom datatype. You can think of these classes as dataframes with additional functionality to validate the data you pass in." ] }, { @@ -228,7 +232,7 @@ "metadata": {}, "source": [ "### With reverse search you can do partial matching!\n", - "use the `partial match` flag to check if the **ids/labels** start with our query id" + "Use the `partial match` flag to check if the returned **doc_labels** start with our query" ] }, { @@ -259,7 +263,7 @@ "source": [ "### VectorStore Embed method\n", "\n", - "Its also possible to get the vector embeddings for each from some input text or queries by calling the VectorStore .embed() method.\n", + "It is also possible to get the vector embeddings for some input text or queries by calling the VectorStore .embed() method.\n", "\n", "Once again, this method has its own data class to inferace with: `VectorStoreEmbedInput`\n" ] @@ -445,7 +449,7 @@ ], "metadata": { "kernelspec": { - "display_name": "classifai", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -459,7 +463,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.12.4" } }, "nbformat": 4, From 508827bff31ffaf7986783626600b00e5f98a5e5 Mon Sep 17 00:00:00 2001 From: Luke <206716137+lukeroantreeONS@users.noreply.github.com> Date: Fri, 22 May 2026 14:01:36 +0100 Subject: [PATCH 5/5] chore(indexers): move skip_save argument, make documentation text smaller --- DEMO/general_workflow_demo.ipynb | 4 ++-- 
src/classifai/indexers/main.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/DEMO/general_workflow_demo.ipynb b/DEMO/general_workflow_demo.ipynb index 0f41606..e66085d 100644 --- a/DEMO/general_workflow_demo.ipynb +++ b/DEMO/general_workflow_demo.ipynb @@ -112,7 +112,7 @@ "### The VectorStore class creates a vector database by converting a set of labelled texts to embeddings, using an associated Vectoriser.\n", "#### Once created, it can be 'searched', using the vectoriser to embed queries as vectors and calculate their semantic similarity to the labelled texts in the VectorStore.\n", "\n", - "#### By default, the vector database is persisted to a local directory named after the input filename. You can use the `output_dir` argument to change the location of the persisted vector database when creating the VectorStore. If the directory already exists, it will exit with a warning - you can pass the `overwrite=True` argument to permit it to overwrite an existing directory. If you don't want the vector database to be persisted at all, you can pass the `skip_save=True` argument - note that this takes precedence over `output_dir` and `overwrite`.\n", + "By default, the vector database is persisted to a local directory named after the input filename. You can use the `output_dir` argument to change the location of the persisted vector database when creating the VectorStore. If the directory already exists, VectorStore creation fails with a `ConfigurationError` - you can pass the `overwrite=True` argument to permit it to overwrite an existing directory. 
If you don't want the vector database to be persisted at all, you can pass the `skip_save=True` argument - note that this takes precedence over `output_dir` and `overwrite`.\n", "![VectorStore_image](files/VectorStore.png)\n" ] }, @@ -463,7 +463,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.13.12" } }, "nbformat": 4, diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 5e1193b..ab4f63e 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -91,8 +91,8 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 meta_data: dict | None = None, output_dir: str | None = None, overwrite: bool = False, - hooks: dict | None = None, skip_save: bool = False, + hooks: dict | None = None, ): """Initializes the `VectorStore` object by processing the input CSV file and generating vector embeddings. @@ -112,10 +112,10 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 overwrite (bool): [optional] If `True`, allows overwriting existing folders with the same name. Defaults to `False` to prevent accidental overwrites. Note: ignored if `skip_save=True`. - hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to `None`. skip_save (bool): [optional] If `False`, will save the `VectorStore` to disk after creation, if `True`, will just keep it in memory (for testing or ephemeral use cases). Defaults to `False`. + hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to `None`. Raises: