From d452921e676f50cebefafbc1f29de15d71c27d00 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 2 Jun 2026 14:27:54 +0200 Subject: [PATCH 01/15] adding Supabase to docs --- .../document-stores/supabasedocumentstore.mdx | 183 ++++++++++++++++++ docs-website/sidebars.js | 1 + 2 files changed, 184 insertions(+) create mode 100644 docs-website/docs/document-stores/supabasedocumentstore.mdx diff --git a/docs-website/docs/document-stores/supabasedocumentstore.mdx b/docs-website/docs/document-stores/supabasedocumentstore.mdx new file mode 100644 index 0000000000..1932502fef --- /dev/null +++ b/docs-website/docs/document-stores/supabasedocumentstore.mdx @@ -0,0 +1,183 @@ +--- +title: "SupabaseDocumentStore" +id: supabasedocumentstore +slug: "/supabasedocumentstore" +description: "Use Supabase as a document store in Haystack, with vector search (pgvector) or full-text search (PGroonga)." +--- + +# SupabaseDocumentStore + +
+ +| | | +| --- | --- | +| API reference | [Supabase](/reference/integrations-supabase) | +| GitHub link | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/ | + +
+ +[Supabase](https://supabase.com/) is an open-source backend platform built on PostgreSQL. The Supabase integration for Haystack provides two document stores: + +- **`SupabasePgvectorDocumentStore`** — vector similarity search using the [pgvector](https://github.com/pgvector/pgvector) PostgreSQL extension, which comes pre-installed on Supabase. +- **`SupabaseGroongaDocumentStore`** — multilingual full-text search using the [PGroonga](https://pgroonga.github.io/) PostgreSQL extension. No embeddings required. + +## Installation + +```shell +pip install supabase-haystack +``` + +## SupabasePgvectorDocumentStore + +`SupabasePgvectorDocumentStore` is a thin wrapper around [`PgvectorDocumentStore`](./pgvectordocumentstore.mdx) with Supabase-specific defaults: + +- Reads the connection string from the `SUPABASE_DB_URL` environment variable. +- Defaults `create_extension` to `False` since pgvector is pre-installed on Supabase. + +### Connection + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. + +:::tip[Use session mode (port 5432)] +Supabase offers two pooler ports: transaction mode (port 6543) and session mode (port 5432). For best compatibility with pgvector operations, use session mode or a direct connection. +::: + +```shell +export SUPABASE_DB_URL="postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:5432/postgres" +``` + +### Initialization + +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) +``` + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase#supabasepgvectordocumentstore). 
+ +### Supported Retrievers + +- [`SupabasePgvectorEmbeddingRetriever`](../pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx): Fetches documents from the store based on a query embedding. +- [`SupabasePgvectorKeywordRetriever`](../pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx): Fetches documents matching a keyword query using PostgreSQL's `ts_rank_cd` ranking. + +### Example: RAG pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types.policy import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersTextEmbedder, + SentenceTransformersDocumentEmbedder, +) +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.utils import Secret + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +# Index documents +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness.", + ), + Document( + content="In certain places, you can witness the phenomenon of bioluminescent waves.", + ), +] +embedder = SentenceTransformersDocumentEmbedder() +embedder.warm_up() +documents_with_embeddings = embedder.run(documents) +document_store.write_documents( + documents_with_embeddings["documents"], + policy=DuplicatePolicy.OVERWRITE, +) + +# Query pipeline +prompt_template = [ + ChatMessage.from_system("Answer the question based on the provided context."), + ChatMessage.from_user( + "Query: 
{{query}}\nDocuments:\n{% for doc in documents %}{{ doc.content }}\n{% endfor %}\nAnswer:", + ), +] + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component( + "retriever", + SupabasePgvectorEmbeddingRetriever(document_store=document_store), +) +query_pipeline.add_component( + "prompt_builder", + ChatPromptBuilder( + template=prompt_template, + required_variables=["query", "documents"], + ), +) +query_pipeline.add_component("generator", OpenAIChatGenerator(model="gpt-4o")) +query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") +query_pipeline.connect("retriever.documents", "prompt_builder.documents") +query_pipeline.connect("prompt_builder.prompt", "generator.messages") + +result = query_pipeline.run( + { + "text_embedder": {"text": "How many languages are there?"}, + "prompt_builder": {"query": "How many languages are there?"}, + }, +) +``` + +--- + +## SupabaseGroongaDocumentStore + +`SupabaseGroongaDocumentStore` uses [PGroonga](https://pgroonga.github.io/), a PostgreSQL extension for fast, multilingual full-text search. Unlike the pgvector store, it works with plain text queries and requires no embeddings. + +### Prerequisites + +PGroonga must be enabled in your Supabase project. Run the following SQL in the Supabase SQL editor: + +```sql +CREATE EXTENSION IF NOT EXISTS pgroonga; +``` + +You also need to create a SQL function that PGroonga uses for search. See the [integration README](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/) for the required function definition. 
+ +### Initialization + +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://[project-ref].supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) +document_store.warm_up() +``` + +:::note +`warm_up()` must be called before using the store. It initializes the Supabase client and creates the table and PGroonga index if they don't exist. +::: + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase#supabasegroongadocumentstore). + +### Supported Retrievers + +- [`SupabaseGroongaBM25Retriever`](../pipeline-components/retrievers/supabasegroongabm25retriever.mdx): Retrieves documents using PGroonga full-text search. Works without embeddings and can be combined with `SupabasePgvectorEmbeddingRetriever` for hybrid search pipelines. diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index ca80d62356..d1090879c7 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -138,6 +138,7 @@ export default { 'document-stores/pgvectordocumentstore', 'document-stores/pinecone-document-store', 'document-stores/qdrant-document-store', + 'document-stores/supabasedocumentstore', 'document-stores/valkeydocumentstore', 'document-stores/vespadocumentstore', 'document-stores/weaviatedocumentstore', From cc455cb0b9a813e5b7e6c46be1187d2096b50a80 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 2 Jun 2026 14:37:56 +0200 Subject: [PATCH 02/15] fixing links --- docs-website/docs/document-stores/supabasedocumentstore.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs-website/docs/document-stores/supabasedocumentstore.mdx b/docs-website/docs/document-stores/supabasedocumentstore.mdx index 1932502fef..15fafc00a8 100644 --- a/docs-website/docs/document-stores/supabasedocumentstore.mdx +++ b/docs-website/docs/document-stores/supabasedocumentstore.mdx @@ -62,8 +62,8 @@ To learn more about the initialization parameters, see the [API docs](/reference ### Supported Retrievers -- [`SupabasePgvectorEmbeddingRetriever`](../pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx): Fetches documents from the store based on a query embedding. -- [`SupabasePgvectorKeywordRetriever`](../pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx): Fetches documents matching a keyword query using PostgreSQL's `ts_rank_cd` ranking. +- [`SupabasePgvectorEmbeddingRetriever`](/reference/integrations-supabase#supabasepgvectorembeddingretriever): Fetches documents from the store based on a query embedding. +- [`SupabasePgvectorKeywordRetriever`](/reference/integrations-supabase#supabasepgvectorkeywordretriever): Fetches documents matching a keyword query using PostgreSQL's `ts_rank_cd` ranking. ### Example: RAG pipeline @@ -180,4 +180,4 @@ To learn more about the initialization parameters, see the [API docs](/reference ### Supported Retrievers -- [`SupabaseGroongaBM25Retriever`](../pipeline-components/retrievers/supabasegroongabm25retriever.mdx): Retrieves documents using PGroonga full-text search. Works without embeddings and can be combined with `SupabasePgvectorEmbeddingRetriever` for hybrid search pipelines. +- [`SupabaseGroongaBM25Retriever`](/reference/integrations-supabase#supabasegroongabm25retriever): Retrieves documents using PGroonga full-text search. 
Works without embeddings and can be combined with `SupabasePgvectorEmbeddingRetriever` for hybrid search pipelines. From 19f3e4ae7c8809a423a57a5d842de82bd062d1c0 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Tue, 2 Jun 2026 14:48:09 +0200 Subject: [PATCH 03/15] adding missing anchors --- .../reference/integrations-api/supabase.md | 346 ++++++++++++++++++ 1 file changed, 346 insertions(+) diff --git a/docs-website/reference/integrations-api/supabase.md b/docs-website/reference/integrations-api/supabase.md index 5044982839..d35f92c8f7 100644 --- a/docs-website/reference/integrations-api/supabase.md +++ b/docs-website/reference/integrations-api/supabase.md @@ -229,6 +229,135 @@ Deserializes the component from a dictionary. - SupabasePgvectorEmbeddingRetriever – Deserialized component. +## haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever + +### SupabaseGroongaBM25Retriever + +Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. + +This retriever works without embeddings — it searches documents using plain text queries. +It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. + +Note: async operations are not supported as the supabase-py sync client does not expose +awaitable query methods. Use the sync run() method instead. 
+ +Example usage: + +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", +) +document_store.warm_up() + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=10) +result = retriever.run(query="python programming") +print(result["documents"]) +``` + +#### __init__ + +```python +__init__( + *, + document_store: SupabaseGroongaDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE +) -> None +``` + +Initialize the SupabaseGroongaBM25Retriever. + +**Parameters:** + +- **document_store** (SupabaseGroongaDocumentStore) – An instance of SupabaseGroongaDocumentStore. +- **filters** (dict\[str, Any\] | None) – Optional filters applied to retrieved Documents. +- **top_k** (int) – Maximum number of Documents to return. Defaults to 10. +- **filter_policy** (str | FilterPolicy) – Policy to determine how filters are applied. + +**Raises:** + +- ValueError – If document_store is not an instance of SupabaseGroongaDocumentStore. + +#### run + +```python +run( + query: str, filters: dict[str, Any] | None = None, top_k: int | None = None +) -> dict[str, list[Document]] +``` + +Runs the retriever on the given query. + +**Parameters:** + +- **query** (str) – The text query to search for. +- **filters** (dict\[str, Any\] | None) – Optional runtime filters. Merged or replaced based on filter_policy. +- **top_k** (int | None) – Optional override for maximum number of documents to return. + +**Returns:** + +- dict\[str, list\[Document\]\] – Dictionary with key "documents" containing list of matching Documents. 
+ +#### run_async + +```python +run_async( + query: str, filters: dict[str, Any] | None = None, top_k: int | None = None +) -> dict[str, list[Document]] +``` + +Async version of run(). + +Note: supabase-py's sync client does not support native async queries. +This method runs the synchronous retrieval and returns the result. +For fully async support, consider using acreate_client() from supabase-py +and refactoring the document store accordingly. + +**Parameters:** + +- **query** (str) – The text query to search for. +- **filters** (dict\[str, Any\] | None) – Optional runtime filters. Merged or replaced based on filter_policy. +- **top_k** (int | None) – Optional override for maximum number of documents to return. + +**Returns:** + +- dict\[str, list\[Document\]\] – Dictionary with key "documents" containing list of matching Documents. + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serializes the component to a dictionary. + +**Returns:** + +- dict\[str, Any\] – Dictionary with serialized data. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> SupabaseGroongaBM25Retriever +``` + +Deserializes the component from a dictionary. + +**Parameters:** + +- **data** (dict\[str, Any\]) – Dictionary to deserialize from. + +**Returns:** + +- SupabaseGroongaBM25Retriever – Deserialized component. + ## haystack_integrations.components.retrievers.supabase.keyword_retriever ### SupabasePgvectorKeywordRetriever @@ -442,3 +571,220 @@ Deserializes the component from a dictionary. **Returns:** - SupabasePgvectorDocumentStore – Deserialized component. + +## haystack_integrations.document_stores.supabase.groonga_document_store + +### SupabaseGroongaDocumentStore + +Bases: DocumentStore + +A Document Store for Supabase using PGroonga for full-text search. + +PGroonga is a PostgreSQL extension for fast, multilingual full-text search. +Unlike vector search, this store works with plain text queries — no embeddings needed. 
+ +Prerequisites: + +- A Supabase project with PGroonga extension enabled. +- Enable PGroonga in your Supabase project by running: + `CREATE EXTENSION IF NOT EXISTS pgroonga;` + +Example usage: + +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", +) +document_store.warm_up() +``` + +#### __init__ + +```python +__init__( + *, + supabase_url: str, + supabase_key: Secret = Secret.from_env_var( + "SUPABASE_SERVICE_KEY", strict=False + ), + table_name: str = "haystack_groonga_documents", + recreate_table: bool = False +) -> None +``` + +Creates a new SupabaseGroongaDocumentStore instance. + +Note: Call warm_up() before using the store to initialize the client and table. + +**Parameters:** + +- **supabase_url** (str) – The URL of your Supabase project. + Format: `https://.supabase.co` +- **supabase_key** (Secret) – The service role key for your Supabase project. + Defaults to reading from the `SUPABASE_SERVICE_KEY` environment variable. +- **table_name** (str) – The name of the table to store documents in. + Defaults to `haystack_groonga_documents`. +- **recreate_table** (bool) – Whether to drop and recreate the table on startup. + Defaults to `False`. + +#### warm_up + +```python +warm_up() -> None +``` + +Initializes the Supabase client and sets up the table. + +Must be called before using the document store. + +#### count_documents + +```python +count_documents() -> int +``` + +Returns the number of documents in the store. + +**Returns:** + +- int – Number of documents. + +#### filter_documents + +```python +filter_documents(filters: dict[str, Any] | None = None) -> list[Document] +``` + +Returns documents matching the given filters. 
+ +Supports the standard Haystack filter syntax with the following operators: + +- Comparison: `==`, `!=`, `>`, `>=`, `<`, `<=`, `in`, `not in` +- Logical: `AND`, `OR`, `NOT` (`OR` and `NOT` support simple conditions + only — no nested logical operators inside them) + +**Known limitation:** For `!=` and `not in` on `meta.*` fields, documents +where the field is absent are included in the result (matching Python `None != value` +semantics). For `>` / `>=` / `<` / `<=`, documents where the field is absent +are excluded (SQL `NULL` comparison semantics). + +**Parameters:** + +- **filters** (dict\[str, Any\] | None) – Optional Haystack filter dict. + Simple comparison: `{"field": "meta.language", "operator": "==", "value": "en"}` + Logical: `{"operator": "AND", "conditions": [...]}` + +**Returns:** + +- list\[Document\] – List of matching Document objects. + +**Raises:** + +- FilterError – If the filter structure is malformed or uses an unsupported operator. + +#### write_documents + +```python +write_documents( + documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL +) -> int +``` + +Writes documents to the store. + +**Parameters:** + +- **documents** (list\[Document\]) – List of Haystack Document objects to write. +- **policy** (DuplicatePolicy) – How to handle duplicate documents. Defaults to DuplicatePolicy.FAIL. + +**Returns:** + +- int – Number of documents written. + +#### delete_by_filter + +```python +delete_by_filter(filters: dict[str, Any]) -> int +``` + +Deletes documents matching the given filters. + +**Parameters:** + +- **filters** (dict\[str, Any\]) – Filters to select documents for deletion. + +**Returns:** + +- int – Number of documents deleted. + +#### update_by_filter + +```python +update_by_filter(filters: dict[str, Any], meta: dict[str, Any]) -> int +``` + +Updates the metadata of documents matching the given filters. + +Provided meta fields are merged into the existing document metadata. 
+ +**Parameters:** + +- **filters** (dict\[str, Any\]) – Filters to select documents to update. +- **meta** (dict\[str, Any\]) – Metadata fields to set on matching documents. + +**Returns:** + +- int – Number of documents updated. + +#### delete_all_documents + +```python +delete_all_documents() -> None +``` + +Deletes all documents from the store. + +#### delete_documents + +```python +delete_documents(document_ids: list[str]) -> None +``` + +Deletes documents with the given IDs. + +**Parameters:** + +- **document_ids** (list\[str\]) – List of document IDs to delete. + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serializes the component to a dictionary. + +**Returns:** + +- dict\[str, Any\] – Dictionary with serialized data. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> SupabaseGroongaDocumentStore +``` + +Deserializes the component from a dictionary. + +**Parameters:** + +- **data** (dict\[str, Any\]) – Dictionary to deserialize from. + +**Returns:** + +- SupabaseGroongaDocumentStore – Deserialized component. From 41f66cf0957efa694702a818546296b5d495fb7c Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 3 Jun 2026 14:26:14 +0200 Subject: [PATCH 04/15] removing api docs, those are generated --- .../reference/integrations-api/supabase.md | 790 ------------------ 1 file changed, 790 deletions(-) delete mode 100644 docs-website/reference/integrations-api/supabase.md diff --git a/docs-website/reference/integrations-api/supabase.md b/docs-website/reference/integrations-api/supabase.md deleted file mode 100644 index d35f92c8f7..0000000000 --- a/docs-website/reference/integrations-api/supabase.md +++ /dev/null @@ -1,790 +0,0 @@ ---- -title: "Supabase" -id: integrations-supabase -description: "Supabase integration for Haystack" -slug: "/integrations-supabase" ---- - - -## haystack_integrations.components.downloaders.supabase.supabase_bucket_downloader - -### SupabaseBucketDownloader - -Downloads files from a Supabase Storage bucket and returns them as ByteStream objects. - -Files are downloaded in-memory and returned as `ByteStream` objects ready for further -processing in indexing pipelines (e.g. passing to a `DocumentConverter`). - -Example usage: - -```python -from haystack_integrations.components.downloaders.supabase import SupabaseBucketDownloader -from haystack.utils import Secret - -downloader = SupabaseBucketDownloader( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - bucket_name="my-documents", -) -result = downloader.run(sources=["reports/report.pdf", "data/notes.txt"]) -streams = result["streams"] -``` - -#### __init__ - -```python -__init__( - *, - supabase_url: str, - supabase_key: Secret = Secret.from_env_var("SUPABASE_SERVICE_KEY"), - bucket_name: str, - file_extensions: list[str] | None = None -) -> None -``` - -Creates a new SupabaseBucketDownloader instance. - -**Parameters:** - -- **supabase_url** (str) – The URL of your Supabase project, e.g. `https://.supabase.co`. -- **supabase_key** (Secret) – The Supabase API key used to authenticate requests. 
Defaults to the - `SUPABASE_SERVICE_KEY` environment variable. Use the service role key for private buckets. -- **bucket_name** (str) – The name of the Supabase Storage bucket to download files from. -- **file_extensions** (list\[str\] | None) – Optional list of file extensions to filter downloads (e.g. `[".pdf", ".txt"]`). - If `None`, all files are downloaded. Extensions are matched case-insensitively. - -#### warm_up - -```python -warm_up() -> None -``` - -Initializes the Supabase client. - -Called automatically on the first run(), or can be called explicitly in a pipeline. - -#### run - -```python -run(sources: list[str]) -> dict[str, list[ByteStream]] -``` - -Downloads files from the Supabase Storage bucket. - -**Parameters:** - -- **sources** (list\[str\]) – List of file paths within the bucket to download, - e.g. `["folder/file.pdf", "notes.txt"]`. - -**Returns:** - -- dict\[str, list\[ByteStream\]\] – A dictionary with: -- `streams`: list of `ByteStream` objects, one per successfully downloaded file. - Each `ByteStream` has `meta["file_path"]` and `meta["bucket_name"]` set. - -#### to_dict - -```python -to_dict() -> dict[str, Any] -``` - -Serializes the component to a dictionary. - -**Returns:** - -- dict\[str, Any\] – Dictionary with serialized data. - -#### from_dict - -```python -from_dict(data: dict[str, Any]) -> SupabaseBucketDownloader -``` - -Deserializes the component from a dictionary. - -**Parameters:** - -- **data** (dict\[str, Any\]) – Dictionary to deserialize from. - -**Returns:** - -- SupabaseBucketDownloader – Deserialized component. - -## haystack_integrations.components.retrievers.supabase.embedding_retriever - -### SupabasePgvectorEmbeddingRetriever - -Bases: PgvectorEmbeddingRetriever - -Retrieves documents from the `SupabasePgvectorDocumentStore`, based on their dense embeddings. - -This is a thin wrapper around `PgvectorEmbeddingRetriever`, adapted for use with -`SupabasePgvectorDocumentStore`. 
- -Example usage: - -# Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. - -```bash -export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres -``` - -```python -from haystack import Document, Pipeline -from haystack.document_stores.types.policy import DuplicatePolicy -from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder - -from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore -from haystack_integrations.components.retrievers.supabase import SupabasePgvectorEmbeddingRetriever - -document_store = SupabasePgvectorDocumentStore( - embedding_dimension=768, - vector_function="cosine_similarity", - recreate_table=True, -) - -documents = [Document(content="There are over 7,000 languages spoken around the world today."), - Document(content="Elephants have been observed to behave in a way that indicates..."), - Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")] - -document_embedder = SentenceTransformersDocumentEmbedder() -document_embedder.warm_up() -documents_with_embeddings = document_embedder.run(documents) -document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE) - -query_pipeline = Pipeline() -query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) -query_pipeline.add_component("retriever", SupabasePgvectorEmbeddingRetriever(document_store=document_store)) -query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") - -query = "How many languages are there?" - -res = query_pipeline.run({"text_embedder": {"text": query}}) -print(res['retriever']['documents'][0].content) -# >> "There are over 7,000 languages spoken around the world today." 
-``` - -#### __init__ - -```python -__init__( - *, - document_store: SupabasePgvectorDocumentStore, - filters: dict[str, Any] | None = None, - top_k: int = 10, - vector_function: ( - Literal["cosine_similarity", "inner_product", "l2_distance"] | None - ) = None, - filter_policy: str | FilterPolicy = FilterPolicy.REPLACE -) -> None -``` - -Initialize the SupabasePgvectorEmbeddingRetriever. - -**Parameters:** - -- **document_store** (SupabasePgvectorDocumentStore) – An instance of `SupabasePgvectorDocumentStore`. -- **filters** (dict\[str, Any\] | None) – Filters applied to the retrieved Documents. -- **top_k** (int) – Maximum number of Documents to return. -- **vector_function** (Literal['cosine_similarity', 'inner_product', 'l2_distance'] | None) – The similarity function to use when searching for similar embeddings. - Defaults to the one set in the `document_store` instance. - `"cosine_similarity"` and `"inner_product"` are similarity functions and - higher scores indicate greater similarity between the documents. - `"l2_distance"` returns the straight-line distance between vectors, - and the most similar documents are the ones with the smallest score. - **Important**: if the document store is using the `"hnsw"` search strategy, the vector function - should match the one utilized during index creation to take advantage of the index. -- **filter_policy** (str | FilterPolicy) – Policy to determine how filters are applied. - -**Raises:** - -- ValueError – If `document_store` is not an instance of `SupabasePgvectorDocumentStore` or if - `vector_function` is not one of the valid options. - -#### to_dict - -```python -to_dict() -> dict[str, Any] -``` - -Serializes the component to a dictionary. - -**Returns:** - -- dict\[str, Any\] – Dictionary with serialized data. - -#### from_dict - -```python -from_dict(data: dict[str, Any]) -> SupabasePgvectorEmbeddingRetriever -``` - -Deserializes the component from a dictionary. 
- -**Parameters:** - -- **data** (dict\[str, Any\]) – Dictionary to deserialize from. - -**Returns:** - -- SupabasePgvectorEmbeddingRetriever – Deserialized component. - -## haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever - -### SupabaseGroongaBM25Retriever - -Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. - -This retriever works without embeddings — it searches documents using plain text queries. -It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. - -Note: async operations are not supported as the supabase-py sync client does not expose -awaitable query methods. Use the sync run() method instead. - -Example usage: - -```python -from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore -from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever -from haystack.utils import Secret - -document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", -) -document_store.warm_up() - -retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=10) -result = retriever.run(query="python programming") -print(result["documents"]) -``` - -#### __init__ - -```python -__init__( - *, - document_store: SupabaseGroongaDocumentStore, - filters: dict[str, Any] | None = None, - top_k: int = 10, - filter_policy: str | FilterPolicy = FilterPolicy.REPLACE -) -> None -``` - -Initialize the SupabaseGroongaBM25Retriever. - -**Parameters:** - -- **document_store** (SupabaseGroongaDocumentStore) – An instance of SupabaseGroongaDocumentStore. -- **filters** (dict\[str, Any\] | None) – Optional filters applied to retrieved Documents. -- **top_k** (int) – Maximum number of Documents to return. Defaults to 10. 
-- **filter_policy** (str | FilterPolicy) – Policy to determine how filters are applied. - -**Raises:** - -- ValueError – If document_store is not an instance of SupabaseGroongaDocumentStore. - -#### run - -```python -run( - query: str, filters: dict[str, Any] | None = None, top_k: int | None = None -) -> dict[str, list[Document]] -``` - -Runs the retriever on the given query. - -**Parameters:** - -- **query** (str) – The text query to search for. -- **filters** (dict\[str, Any\] | None) – Optional runtime filters. Merged or replaced based on filter_policy. -- **top_k** (int | None) – Optional override for maximum number of documents to return. - -**Returns:** - -- dict\[str, list\[Document\]\] – Dictionary with key "documents" containing list of matching Documents. - -#### run_async - -```python -run_async( - query: str, filters: dict[str, Any] | None = None, top_k: int | None = None -) -> dict[str, list[Document]] -``` - -Async version of run(). - -Note: supabase-py's sync client does not support native async queries. -This method runs the synchronous retrieval and returns the result. -For fully async support, consider using acreate_client() from supabase-py -and refactoring the document store accordingly. - -**Parameters:** - -- **query** (str) – The text query to search for. -- **filters** (dict\[str, Any\] | None) – Optional runtime filters. Merged or replaced based on filter_policy. -- **top_k** (int | None) – Optional override for maximum number of documents to return. - -**Returns:** - -- dict\[str, list\[Document\]\] – Dictionary with key "documents" containing list of matching Documents. - -#### to_dict - -```python -to_dict() -> dict[str, Any] -``` - -Serializes the component to a dictionary. - -**Returns:** - -- dict\[str, Any\] – Dictionary with serialized data. - -#### from_dict - -```python -from_dict(data: dict[str, Any]) -> SupabaseGroongaBM25Retriever -``` - -Deserializes the component from a dictionary. 
- -**Parameters:** - -- **data** (dict\[str, Any\]) – Dictionary to deserialize from. - -**Returns:** - -- SupabaseGroongaBM25Retriever – Deserialized component. - -## haystack_integrations.components.retrievers.supabase.keyword_retriever - -### SupabasePgvectorKeywordRetriever - -Bases: PgvectorKeywordRetriever - -Retrieves documents from the `SupabasePgvectorDocumentStore`, based on keywords. - -This is a thin wrapper around `PgvectorKeywordRetriever`, adapted for use with -`SupabasePgvectorDocumentStore`. - -To rank the documents, the `ts_rank_cd` function of PostgreSQL is used. -It considers how often the query terms appear in the document, how close together the terms are in the document, -and how important is the part of the document where they occur. - -Example usage: - -# Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. - -```bash -export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres -``` - -```python -from haystack import Document, Pipeline -from haystack.document_stores.types.policy import DuplicatePolicy - -from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore -from haystack_integrations.components.retrievers.supabase import SupabasePgvectorKeywordRetriever - -document_store = SupabasePgvectorDocumentStore( - embedding_dimension=768, - recreate_table=True, -) - -documents = [Document(content="There are over 7,000 languages spoken around the world today."), - Document(content="Elephants have been observed to behave in a way that indicates..."), - Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")] - -document_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) -retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) -result = retriever.run(query="languages") - -print(result['documents'][0].content) -# >> "There are over 7,000 languages spoken around the world today." 
-``` - -#### __init__ - -```python -__init__( - *, - document_store: SupabasePgvectorDocumentStore, - filters: dict[str, Any] | None = None, - top_k: int = 10, - filter_policy: str | FilterPolicy = FilterPolicy.REPLACE -) -> None -``` - -Initialize the SupabasePgvectorKeywordRetriever. - -**Parameters:** - -- **document_store** (SupabasePgvectorDocumentStore) – An instance of `SupabasePgvectorDocumentStore`. -- **filters** (dict\[str, Any\] | None) – Filters applied to the retrieved Documents. -- **top_k** (int) – Maximum number of Documents to return. -- **filter_policy** (str | FilterPolicy) – Policy to determine how filters are applied. - -**Raises:** - -- ValueError – If `document_store` is not an instance of `SupabasePgvectorDocumentStore`. - -#### to_dict - -```python -to_dict() -> dict[str, Any] -``` - -Serializes the component to a dictionary. - -**Returns:** - -- dict\[str, Any\] – Dictionary with serialized data. - -#### from_dict - -```python -from_dict(data: dict[str, Any]) -> SupabasePgvectorKeywordRetriever -``` - -Deserializes the component from a dictionary. - -**Parameters:** - -- **data** (dict\[str, Any\]) – Dictionary to deserialize from. - -**Returns:** - -- SupabasePgvectorKeywordRetriever – Deserialized component. - -## haystack_integrations.document_stores.supabase.document_store - -### SupabasePgvectorDocumentStore - -Bases: PgvectorDocumentStore - -A Document Store for Supabase, using PostgreSQL with the pgvector extension. - -It should be used with Supabase installed. - -This is a thin wrapper around `PgvectorDocumentStore` with Supabase-specific defaults: - -- Reads the connection string from the `SUPABASE_DB_URL` environment variable. -- Defaults `create_extension` to `False` since pgvector is pre-installed on Supabase. - -**Connection notes:** Supabase offers two pooler ports — transaction mode (6543) and session mode (5432). -For best compatibility with pgvector operations, use session mode (port 5432) or a direct connection. 
- -Example usage: - -# Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. - -```bash -export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres -``` - -```python -from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore - -document_store = SupabasePgvectorDocumentStore( - embedding_dimension=768, - vector_function="cosine_similarity", - recreate_table=True, -) -``` - -#### __init__ - -```python -__init__( - *, - connection_string: Secret = Secret.from_env_var("SUPABASE_DB_URL"), - create_extension: bool = False, - schema_name: str = "public", - table_name: str = "haystack_documents", - language: str = "english", - embedding_dimension: int = 768, - vector_type: Literal["vector", "halfvec"] = "vector", - vector_function: Literal[ - "cosine_similarity", "inner_product", "l2_distance" - ] = "cosine_similarity", - recreate_table: bool = False, - search_strategy: Literal[ - "exact_nearest_neighbor", "hnsw" - ] = "exact_nearest_neighbor", - hnsw_recreate_index_if_exists: bool = False, - hnsw_index_creation_kwargs: dict[str, int] | None = None, - hnsw_index_name: str = "haystack_hnsw_index", - hnsw_ef_search: int | None = None, - keyword_index_name: str = "haystack_keyword_index" -) -> None -``` - -Creates a new SupabasePgvectorDocumentStore instance. - -**Parameters:** - -- **connection_string** (Secret) – The connection string for the Supabase PostgreSQL database, defined as an - environment variable. Default: `SUPABASE_DB_URL`. Format: - `postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:5432/postgres` -- **create_extension** (bool) – Whether to create the pgvector extension if it doesn't exist. - Defaults to `False` since Supabase has pgvector pre-installed. -- **schema_name** (str) – The name of the schema the table is created in. -- **table_name** (str) – The name of the table to use to store Haystack documents. 
-- **language** (str) – The language to be used to parse query and document content in keyword retrieval. -- **embedding_dimension** (int) – The dimension of the embedding. -- **vector_type** (Literal['vector', 'halfvec']) – The type of vector used for embedding storage. `"vector"` or `"halfvec"`. -- **vector_function** (Literal['cosine_similarity', 'inner_product', 'l2_distance']) – The similarity function to use when searching for similar embeddings. -- **recreate_table** (bool) – Whether to recreate the table if it already exists. -- **search_strategy** (Literal['exact_nearest_neighbor', 'hnsw']) – The search strategy to use: `"exact_nearest_neighbor"` or `"hnsw"`. -- **hnsw_recreate_index_if_exists** (bool) – Whether to recreate the HNSW index if it already exists. -- **hnsw_index_creation_kwargs** (dict\[str, int\] | None) – Additional keyword arguments for HNSW index creation. -- **hnsw_index_name** (str) – Index name for the HNSW index. -- **hnsw_ef_search** (int | None) – The `ef_search` parameter to use at query time for HNSW. -- **keyword_index_name** (str) – Index name for the Keyword index. - -#### to_dict - -```python -to_dict() -> dict[str, Any] -``` - -Serializes the component to a dictionary. - -**Returns:** - -- dict\[str, Any\] – Dictionary with serialized data. - -#### from_dict - -```python -from_dict(data: dict[str, Any]) -> SupabasePgvectorDocumentStore -``` - -Deserializes the component from a dictionary. - -**Parameters:** - -- **data** (dict\[str, Any\]) – Dictionary to deserialize from. - -**Returns:** - -- SupabasePgvectorDocumentStore – Deserialized component. - -## haystack_integrations.document_stores.supabase.groonga_document_store - -### SupabaseGroongaDocumentStore - -Bases: DocumentStore - -A Document Store for Supabase using PGroonga for full-text search. - -PGroonga is a PostgreSQL extension for fast, multilingual full-text search. -Unlike vector search, this store works with plain text queries — no embeddings needed. 
- -Prerequisites: - -- A Supabase project with PGroonga extension enabled. -- Enable PGroonga in your Supabase project by running: - `CREATE EXTENSION IF NOT EXISTS pgroonga;` - -Example usage: - -```python -from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore -from haystack.utils import Secret - -document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", -) -document_store.warm_up() -``` - -#### __init__ - -```python -__init__( - *, - supabase_url: str, - supabase_key: Secret = Secret.from_env_var( - "SUPABASE_SERVICE_KEY", strict=False - ), - table_name: str = "haystack_groonga_documents", - recreate_table: bool = False -) -> None -``` - -Creates a new SupabaseGroongaDocumentStore instance. - -Note: Call warm_up() before using the store to initialize the client and table. - -**Parameters:** - -- **supabase_url** (str) – The URL of your Supabase project. - Format: `https://.supabase.co` -- **supabase_key** (Secret) – The service role key for your Supabase project. - Defaults to reading from the `SUPABASE_SERVICE_KEY` environment variable. -- **table_name** (str) – The name of the table to store documents in. - Defaults to `haystack_groonga_documents`. -- **recreate_table** (bool) – Whether to drop and recreate the table on startup. - Defaults to `False`. - -#### warm_up - -```python -warm_up() -> None -``` - -Initializes the Supabase client and sets up the table. - -Must be called before using the document store. - -#### count_documents - -```python -count_documents() -> int -``` - -Returns the number of documents in the store. - -**Returns:** - -- int – Number of documents. - -#### filter_documents - -```python -filter_documents(filters: dict[str, Any] | None = None) -> list[Document] -``` - -Returns documents matching the given filters. 
- -Supports the standard Haystack filter syntax with the following operators: - -- Comparison: `==`, `!=`, `>`, `>=`, `<`, `<=`, `in`, `not in` -- Logical: `AND`, `OR`, `NOT` (`OR` and `NOT` support simple conditions - only — no nested logical operators inside them) - -**Known limitation:** For `!=` and `not in` on `meta.*` fields, documents -where the field is absent are included in the result (matching Python `None != value` -semantics). For `>` / `>=` / `<` / `<=`, documents where the field is absent -are excluded (SQL `NULL` comparison semantics). - -**Parameters:** - -- **filters** (dict\[str, Any\] | None) – Optional Haystack filter dict. - Simple comparison: `{"field": "meta.language", "operator": "==", "value": "en"}` - Logical: `{"operator": "AND", "conditions": [...]}` - -**Returns:** - -- list\[Document\] – List of matching Document objects. - -**Raises:** - -- FilterError – If the filter structure is malformed or uses an unsupported operator. - -#### write_documents - -```python -write_documents( - documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL -) -> int -``` - -Writes documents to the store. - -**Parameters:** - -- **documents** (list\[Document\]) – List of Haystack Document objects to write. -- **policy** (DuplicatePolicy) – How to handle duplicate documents. Defaults to DuplicatePolicy.FAIL. - -**Returns:** - -- int – Number of documents written. - -#### delete_by_filter - -```python -delete_by_filter(filters: dict[str, Any]) -> int -``` - -Deletes documents matching the given filters. - -**Parameters:** - -- **filters** (dict\[str, Any\]) – Filters to select documents for deletion. - -**Returns:** - -- int – Number of documents deleted. - -#### update_by_filter - -```python -update_by_filter(filters: dict[str, Any], meta: dict[str, Any]) -> int -``` - -Updates the metadata of documents matching the given filters. - -Provided meta fields are merged into the existing document metadata. 
- -**Parameters:** - -- **filters** (dict\[str, Any\]) – Filters to select documents to update. -- **meta** (dict\[str, Any\]) – Metadata fields to set on matching documents. - -**Returns:** - -- int – Number of documents updated. - -#### delete_all_documents - -```python -delete_all_documents() -> None -``` - -Deletes all documents from the store. - -#### delete_documents - -```python -delete_documents(document_ids: list[str]) -> None -``` - -Deletes documents with the given IDs. - -**Parameters:** - -- **document_ids** (list\[str\]) – List of document IDs to delete. - -#### to_dict - -```python -to_dict() -> dict[str, Any] -``` - -Serializes the component to a dictionary. - -**Returns:** - -- dict\[str, Any\] – Dictionary with serialized data. - -#### from_dict - -```python -from_dict(data: dict[str, Any]) -> SupabaseGroongaDocumentStore -``` - -Deserializes the component from a dictionary. - -**Parameters:** - -- **data** (dict\[str, Any\]) – Dictionary to deserialize from. - -**Returns:** - -- SupabaseGroongaDocumentStore – Deserialized component. From d087bc15a28e769b08081bec4c6c581c5428b701 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 3 Jun 2026 14:28:38 +0200 Subject: [PATCH 05/15] removing warm_up() --- docs-website/docs/document-stores/supabasedocumentstore.mdx | 1 - 1 file changed, 1 deletion(-) diff --git a/docs-website/docs/document-stores/supabasedocumentstore.mdx b/docs-website/docs/document-stores/supabasedocumentstore.mdx index 15fafc00a8..23b73ee39d 100644 --- a/docs-website/docs/document-stores/supabasedocumentstore.mdx +++ b/docs-website/docs/document-stores/supabasedocumentstore.mdx @@ -101,7 +101,6 @@ documents = [ ), ] embedder = SentenceTransformersDocumentEmbedder() -embedder.warm_up() documents_with_embeddings = embedder.run(documents) document_store.write_documents( documents_with_embeddings["documents"], From ad9e625b192db0e1613e41a8f1b8592a6c8b3882 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 3 Jun 2026 14:48:48 +0200 Subject: [PATCH 06/15] adding retrievers --- .../docs/pipeline-components/retrievers.mdx | 3 + .../supabasegroongabm25retriever.mdx | 151 ++++++++++++++++++ .../supabasepgvectorembeddingretriever.mdx | 115 +++++++++++++ .../supabasepgvectorkeywordretriever.mdx | 135 ++++++++++++++++ docs-website/sidebars.js | 3 + 5 files changed, 407 insertions(+) create mode 100644 docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx create mode 100644 docs-website/docs/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx create mode 100644 docs-website/docs/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx diff --git a/docs-website/docs/pipeline-components/retrievers.mdx b/docs-website/docs/pipeline-components/retrievers.mdx index 16cf831f91..933940e172 100644 --- a/docs-website/docs/pipeline-components/retrievers.mdx +++ b/docs-website/docs/pipeline-components/retrievers.mdx @@ -185,6 +185,9 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [QdrantHybridRetriever](retrievers/qdranthybridretriever.mdx) | A Retriever based both on dense and sparse embeddings, compatible with the Qdrant Document Store. | | [SentenceWindowRetriever](retrievers/sentencewindowretrieval.mdx) | Retrieves neighboring sentences around relevant sentences to get the full context. | | [SnowflakeTableRetriever](retrievers/snowflaketableretriever.mdx) | Connects to a Snowflake database to execute an SQL query. | +| [SupabaseGroongaBM25Retriever](retrievers/supabasegroongabm25retriever.mdx) | A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search. | +| [SupabasePgvectorEmbeddingRetriever](retrievers/supabasepgvectorembeddingretriever.mdx) | An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore. 
| +| [SupabasePgvectorKeywordRetriever](retrievers/supabasepgvectorkeywordretriever.mdx) | A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore. | | [TextEmbeddingRetriever](retrievers/textembeddingretriever.mdx) | Wraps an embedding-based retriever with a text embedder into a single component that accepts a text query. | | [VespaEmbeddingRetriever](retrievers/vespaembeddingretriever.mdx) | An embedding-based Retriever compatible with the Vespa Document Store. | | [VespaKeywordRetriever](retrievers/vespakeywordretriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from the Vespa Document Store. | diff --git a/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx b/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx new file mode 100644 index 0000000000..c5447165c6 --- /dev/null +++ b/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx @@ -0,0 +1,151 @@ +--- +title: "SupabaseGroongaBM25Retriever" +id: supabasegroongabm25retriever +slug: "/supabasegroongabm25retriever" +description: "A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search." +--- + +# SupabaseGroongaBM25Retriever + +A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the full-text search pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabaseGroongaDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents (matching the query) | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabaseGroongaBM25Retriever` retrieves Documents from the `SupabaseGroongaDocumentStore` using [PGroonga](https://pgroonga.github.io/), a PostgreSQL extension for fast, multilingual full-text search. + +Unlike embedding-based retrievers, this Retriever works with plain text queries and requires no embeddings. It supports a wide range of languages out of the box through PGroonga's multilingual indexing capabilities. + +The Retriever can be combined with `SupabasePgvectorEmbeddingRetriever` and a [`DocumentJoiner`](../joiners/documentjoiner.mdx) for hybrid search pipelines that take advantage of both keyword and semantic retrieval. + +In addition to `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. + +## Prerequisites + +PGroonga must be enabled in your Supabase project. Run the following SQL in the Supabase SQL editor: + +```sql +CREATE EXTENSION IF NOT EXISTS pgroonga; +``` + +You also need to create a SQL function that PGroonga uses for search. See the [integration README](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/) for the required function definition. + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabaseGroongaDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_SERVICE_KEY` environment variable to your Supabase service role key, and pass your Supabase project URL as `supabase_url`.
+ +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabaseGroongaBM25Retriever, +) +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store) + +retriever.run(query="my nice query") +``` + +### In a RAG pipeline + +The prerequisites for running this code are: + +- Set an environment variable `OPENAI_API_KEY` with your OpenAI API key. +- Set an environment variable `SUPABASE_SERVICE_KEY` with your Supabase service role key. + +```python +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabaseGroongaBM25Retriever, +) + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent
waves.", + ), +] + +document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +prompt_template = [ + ChatMessage.from_user( + "Given these documents, answer the question.\nDocuments:\n" + "{% for doc in documents %}{{ doc.content }}{% endfor %}\n" + "Question: {{question}}\nAnswer:", + ), +] + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store) +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder( + template=prompt_template, + required_variables={"question", "documents"}, + ), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") +rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + +question = "languages spoken around the world today" +result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + }, +) +print(result["answer_builder"]) +``` diff --git a/docs-website/docs/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx b/docs-website/docs/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx new file mode 100644 index 0000000000..80c6cc870b --- /dev/null +++ b/docs-website/docs/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx @@ -0,0 +1,115 @@ +--- +title: "SupabasePgvectorEmbeddingRetriever" +id: supabasepgvectorembeddingretriever +slug: "/supabasepgvectorembeddingretriever" +description: "An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore." 
+--- + +# SupabasePgvectorEmbeddingRetriever + +An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. After a Text Embedder and before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the semantic search pipeline 3. After a Text Embedder and before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabasePgvectorDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query_embedding`: A vector representing the query (a list of floats) | +| **Output variables** | `documents`: A list of documents | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabasePgvectorEmbeddingRetriever` is a thin wrapper around [`PgvectorEmbeddingRetriever`](pgvectorembeddingretriever.mdx), adapted for use with `SupabasePgvectorDocumentStore`. It compares the query and Document embeddings and fetches the Documents most relevant to the query based on vector similarity. + +When using this Retriever in your pipeline, make sure embeddings are available. Add a Document Embedder to your indexing pipeline and a Text Embedder to your query pipeline. + +In addition to `query_embedding`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve), `filters` to narrow down the search space, and `vector_function` to override the similarity function set on the Document Store. + +Some relevant parameters that impact embedding retrieval must be defined when the `SupabasePgvectorDocumentStore` is initialized: `embedding_dimension`, `vector_function`, and `search_strategy` (`"exact_nearest_neighbor"` or `"hnsw"`). + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabasePgvectorDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore(embedding_dimension=768) +retriever = SupabasePgvectorEmbeddingRetriever(document_store=document_store) + +# using a fake vector to keep the example simple +retriever.run(query_embedding=[0.1] * 768) +``` + +### In a Pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersTextEmbedder, + SentenceTransformersDocumentEmbedder, +) + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +document_embedder = SentenceTransformersDocumentEmbedder() +documents_with_embeddings = document_embedder.run(documents) + +document_store.write_documents( + documents_with_embeddings.get("documents"), + policy=DuplicatePolicy.OVERWRITE, +) + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component( + "retriever", + SupabasePgvectorEmbeddingRetriever(document_store=document_store), +) 
+query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + +query = "How many languages are there?" + +result = query_pipeline.run({"text_embedder": {"text": query}}) + +print(result["retriever"]["documents"][0]) +``` diff --git a/docs-website/docs/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx b/docs-website/docs/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx new file mode 100644 index 0000000000..c29e0066ba --- /dev/null +++ b/docs-website/docs/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx @@ -0,0 +1,135 @@ +--- +title: "SupabasePgvectorKeywordRetriever" +id: supabasepgvectorkeywordretriever +slug: "/supabasepgvectorkeywordretriever" +description: "A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore." +--- + +# SupabasePgvectorKeywordRetriever + +A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the keyword search pipeline 3. Before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabasePgvectorDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents (matching the query) | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabasePgvectorKeywordRetriever` is a thin wrapper around [`PgvectorKeywordRetriever`](pgvectorkeywordretriever.mdx), adapted for use with `SupabasePgvectorDocumentStore`. + +It uses PostgreSQL full-text search (`to_tsvector` / `plainto_tsquery`) to find Documents and ranks them with the `ts_rank_cd` function. The ranking considers how often the query terms appear in the Document, how close together the terms are, and how important the part of the Document is where they occur. For more details, see the [PostgreSQL documentation](https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING). + +Keep in mind that, unlike similar components such as `ElasticsearchBM25Retriever`, this Retriever does not apply fuzzy search out of the box, so it's necessary to carefully formulate the query in order to avoid getting zero results. + +The language used to parse query and Document content for keyword retrieval is set via the `language` parameter on the `SupabasePgvectorDocumentStore` (defaults to `"english"`). + +In addition to the `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabasePgvectorDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorKeywordRetriever, +) + +document_store = SupabasePgvectorDocumentStore() +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) + +retriever.run(query="my nice query") +``` + +### In a RAG pipeline + +The prerequisites for running this code are: + +- Set an environment variable `OPENAI_API_KEY` with your OpenAI API key. +- Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + +```python +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorKeywordRetriever, +) + +prompt_template = [ + ChatMessage.from_user( + "Given these documents, answer the question.\nDocuments:\n" + "{% for doc in documents %}{{ doc.content }}{% endfor %}\n" + "Question: {{question}}\nAnswer:", + ), +] + +document_store = SupabasePgvectorDocumentStore( + language="english", + recreate_table=True, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + 
+document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder( + template=prompt_template, + required_variables={"question", "documents"}, + ), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") +rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + +question = "languages spoken around the world today" +result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + }, +) +print(result["answer_builder"]) +``` diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index c834b513e6..984e77d010 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -579,6 +579,9 @@ export default { 'pipeline-components/retrievers/qdrantsparseembeddingretriever', 'pipeline-components/retrievers/sentencewindowretrieval', 'pipeline-components/retrievers/snowflaketableretriever', + 'pipeline-components/retrievers/supabasegroongabm25retriever', + 'pipeline-components/retrievers/supabasepgvectorembeddingretriever', + 'pipeline-components/retrievers/supabasepgvectorkeywordretriever', 'pipeline-components/retrievers/textembeddingretriever', 'pipeline-components/retrievers/valkeyembeddingretriever', 'pipeline-components/retrievers/vespaembeddingretriever', From ac553464eeb3774f6c990186d22ccb67339efd29 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 3 Jun 2026 15:14:02 +0200 Subject: [PATCH 07/15] Revert "removing api docs, those are generated" This reverts commit 41f66cf0957efa694702a818546296b5d495fb7c. --- .../reference/integrations-api/supabase.md | 790 ++++++++++++++++++ 1 file changed, 790 insertions(+) create mode 100644 docs-website/reference/integrations-api/supabase.md diff --git a/docs-website/reference/integrations-api/supabase.md b/docs-website/reference/integrations-api/supabase.md new file mode 100644 index 0000000000..d35f92c8f7 --- /dev/null +++ b/docs-website/reference/integrations-api/supabase.md @@ -0,0 +1,790 @@ +--- +title: "Supabase" +id: integrations-supabase +description: "Supabase integration for Haystack" +slug: "/integrations-supabase" +--- + + +## haystack_integrations.components.downloaders.supabase.supabase_bucket_downloader + +### SupabaseBucketDownloader + +Downloads files from a Supabase Storage bucket and returns them as ByteStream objects. + +Files are downloaded in-memory and returned as `ByteStream` objects ready for further +processing in indexing pipelines (e.g. passing to a `DocumentConverter`). + +Example usage: + +```python +from haystack_integrations.components.downloaders.supabase import SupabaseBucketDownloader +from haystack.utils import Secret + +downloader = SupabaseBucketDownloader( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + bucket_name="my-documents", +) +result = downloader.run(sources=["reports/report.pdf", "data/notes.txt"]) +streams = result["streams"] +``` + +#### __init__ + +```python +__init__( + *, + supabase_url: str, + supabase_key: Secret = Secret.from_env_var("SUPABASE_SERVICE_KEY"), + bucket_name: str, + file_extensions: list[str] | None = None +) -> None +``` + +Creates a new SupabaseBucketDownloader instance. + +**Parameters:** + +- **supabase_url** (str) – The URL of your Supabase project, e.g. `https://.supabase.co`. 
+- **supabase_key** (Secret) – The Supabase API key used to authenticate requests. Defaults to the + `SUPABASE_SERVICE_KEY` environment variable. Use the service role key for private buckets. +- **bucket_name** (str) – The name of the Supabase Storage bucket to download files from. +- **file_extensions** (list\[str\] | None) – Optional list of file extensions to filter downloads (e.g. `[".pdf", ".txt"]`). + If `None`, all files are downloaded. Extensions are matched case-insensitively. + +#### warm_up + +```python +warm_up() -> None +``` + +Initializes the Supabase client. + +Called automatically on the first run(), or can be called explicitly in a pipeline. + +#### run + +```python +run(sources: list[str]) -> dict[str, list[ByteStream]] +``` + +Downloads files from the Supabase Storage bucket. + +**Parameters:** + +- **sources** (list\[str\]) – List of file paths within the bucket to download, + e.g. `["folder/file.pdf", "notes.txt"]`. + +**Returns:** + +- dict\[str, list\[ByteStream\]\] – A dictionary with: +- `streams`: list of `ByteStream` objects, one per successfully downloaded file. + Each `ByteStream` has `meta["file_path"]` and `meta["bucket_name"]` set. + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serializes the component to a dictionary. + +**Returns:** + +- dict\[str, Any\] – Dictionary with serialized data. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> SupabaseBucketDownloader +``` + +Deserializes the component from a dictionary. + +**Parameters:** + +- **data** (dict\[str, Any\]) – Dictionary to deserialize from. + +**Returns:** + +- SupabaseBucketDownloader – Deserialized component. + +## haystack_integrations.components.retrievers.supabase.embedding_retriever + +### SupabasePgvectorEmbeddingRetriever + +Bases: PgvectorEmbeddingRetriever + +Retrieves documents from the `SupabasePgvectorDocumentStore`, based on their dense embeddings. 
+ +This is a thin wrapper around `PgvectorEmbeddingRetriever`, adapted for use with +`SupabasePgvectorDocumentStore`. + +Example usage: + +Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + +```bash +export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres +``` + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types.policy import DuplicatePolicy +from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import SupabasePgvectorEmbeddingRetriever + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +documents = [Document(content="There are over 7,000 languages spoken around the world today."), + Document(content="Elephants have been observed to behave in a way that indicates..."), + Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")] + +document_embedder = SentenceTransformersDocumentEmbedder() +document_embedder.warm_up() +documents_with_embeddings = document_embedder.run(documents) +document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE) + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component("retriever", SupabasePgvectorEmbeddingRetriever(document_store=document_store)) +query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + +query = "How many languages are there?" + +res = query_pipeline.run({"text_embedder": {"text": query}}) +print(res['retriever']['documents'][0].content) +# >> "There are over 7,000 languages spoken around the world today." 
+``` + +#### __init__ + +```python +__init__( + *, + document_store: SupabasePgvectorDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + vector_function: ( + Literal["cosine_similarity", "inner_product", "l2_distance"] | None + ) = None, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE +) -> None +``` + +Initialize the SupabasePgvectorEmbeddingRetriever. + +**Parameters:** + +- **document_store** (SupabasePgvectorDocumentStore) – An instance of `SupabasePgvectorDocumentStore`. +- **filters** (dict\[str, Any\] | None) – Filters applied to the retrieved Documents. +- **top_k** (int) – Maximum number of Documents to return. +- **vector_function** (Literal['cosine_similarity', 'inner_product', 'l2_distance'] | None) – The similarity function to use when searching for similar embeddings. + Defaults to the one set in the `document_store` instance. + `"cosine_similarity"` and `"inner_product"` are similarity functions and + higher scores indicate greater similarity between the documents. + `"l2_distance"` returns the straight-line distance between vectors, + and the most similar documents are the ones with the smallest score. + **Important**: if the document store is using the `"hnsw"` search strategy, the vector function + should match the one utilized during index creation to take advantage of the index. +- **filter_policy** (str | FilterPolicy) – Policy to determine how filters are applied. + +**Raises:** + +- ValueError – If `document_store` is not an instance of `SupabasePgvectorDocumentStore` or if + `vector_function` is not one of the valid options. + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serializes the component to a dictionary. + +**Returns:** + +- dict\[str, Any\] – Dictionary with serialized data. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> SupabasePgvectorEmbeddingRetriever +``` + +Deserializes the component from a dictionary. 
+ +**Parameters:** + +- **data** (dict\[str, Any\]) – Dictionary to deserialize from. + +**Returns:** + +- SupabasePgvectorEmbeddingRetriever – Deserialized component. + +## haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever + +### SupabaseGroongaBM25Retriever + +Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. + +This retriever works without embeddings — it searches documents using plain text queries. +It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. + +Note: async operations are not supported as the supabase-py sync client does not expose +awaitable query methods. Use the sync run() method instead. + +Example usage: + +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", +) +document_store.warm_up() + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=10) +result = retriever.run(query="python programming") +print(result["documents"]) +``` + +#### __init__ + +```python +__init__( + *, + document_store: SupabaseGroongaDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE +) -> None +``` + +Initialize the SupabaseGroongaBM25Retriever. + +**Parameters:** + +- **document_store** (SupabaseGroongaDocumentStore) – An instance of SupabaseGroongaDocumentStore. +- **filters** (dict\[str, Any\] | None) – Optional filters applied to retrieved Documents. +- **top_k** (int) – Maximum number of Documents to return. Defaults to 10. 
+- **filter_policy** (str | FilterPolicy) – Policy to determine how filters are applied. + +**Raises:** + +- ValueError – If document_store is not an instance of SupabaseGroongaDocumentStore. + +#### run + +```python +run( + query: str, filters: dict[str, Any] | None = None, top_k: int | None = None +) -> dict[str, list[Document]] +``` + +Runs the retriever on the given query. + +**Parameters:** + +- **query** (str) – The text query to search for. +- **filters** (dict\[str, Any\] | None) – Optional runtime filters. Merged or replaced based on filter_policy. +- **top_k** (int | None) – Optional override for maximum number of documents to return. + +**Returns:** + +- dict\[str, list\[Document\]\] – Dictionary with key "documents" containing list of matching Documents. + +#### run_async + +```python +run_async( + query: str, filters: dict[str, Any] | None = None, top_k: int | None = None +) -> dict[str, list[Document]] +``` + +Async version of run(). + +Note: supabase-py's sync client does not support native async queries. +This method runs the synchronous retrieval and returns the result. +For fully async support, consider using acreate_client() from supabase-py +and refactoring the document store accordingly. + +**Parameters:** + +- **query** (str) – The text query to search for. +- **filters** (dict\[str, Any\] | None) – Optional runtime filters. Merged or replaced based on filter_policy. +- **top_k** (int | None) – Optional override for maximum number of documents to return. + +**Returns:** + +- dict\[str, list\[Document\]\] – Dictionary with key "documents" containing list of matching Documents. + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serializes the component to a dictionary. + +**Returns:** + +- dict\[str, Any\] – Dictionary with serialized data. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> SupabaseGroongaBM25Retriever +``` + +Deserializes the component from a dictionary. 
+ +**Parameters:** + +- **data** (dict\[str, Any\]) – Dictionary to deserialize from. + +**Returns:** + +- SupabaseGroongaBM25Retriever – Deserialized component. + +## haystack_integrations.components.retrievers.supabase.keyword_retriever + +### SupabasePgvectorKeywordRetriever + +Bases: PgvectorKeywordRetriever + +Retrieves documents from the `SupabasePgvectorDocumentStore`, based on keywords. + +This is a thin wrapper around `PgvectorKeywordRetriever`, adapted for use with +`SupabasePgvectorDocumentStore`. + +To rank the documents, the `ts_rank_cd` function of PostgreSQL is used. +It considers how often the query terms appear in the document, how close together the terms are in the document, +and how important the part of the document where they occur is. + +Example usage: + +Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + +```bash +export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres +``` + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types.policy import DuplicatePolicy + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import SupabasePgvectorKeywordRetriever + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + recreate_table=True, +) + +documents = [Document(content="There are over 7,000 languages spoken around the world today."), + Document(content="Elephants have been observed to behave in a way that indicates..."), + Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")] + +document_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) +result = retriever.run(query="languages") + +print(result['documents'][0].content) +# >> "There are over 7,000 languages spoken around the world today." 
+``` + +#### __init__ + +```python +__init__( + *, + document_store: SupabasePgvectorDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE +) -> None +``` + +Initialize the SupabasePgvectorKeywordRetriever. + +**Parameters:** + +- **document_store** (SupabasePgvectorDocumentStore) – An instance of `SupabasePgvectorDocumentStore`. +- **filters** (dict\[str, Any\] | None) – Filters applied to the retrieved Documents. +- **top_k** (int) – Maximum number of Documents to return. +- **filter_policy** (str | FilterPolicy) – Policy to determine how filters are applied. + +**Raises:** + +- ValueError – If `document_store` is not an instance of `SupabasePgvectorDocumentStore`. + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serializes the component to a dictionary. + +**Returns:** + +- dict\[str, Any\] – Dictionary with serialized data. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> SupabasePgvectorKeywordRetriever +``` + +Deserializes the component from a dictionary. + +**Parameters:** + +- **data** (dict\[str, Any\]) – Dictionary to deserialize from. + +**Returns:** + +- SupabasePgvectorKeywordRetriever – Deserialized component. + +## haystack_integrations.document_stores.supabase.document_store + +### SupabasePgvectorDocumentStore + +Bases: PgvectorDocumentStore + +A Document Store for Supabase, using PostgreSQL with the pgvector extension. + +It should be used with Supabase installed. + +This is a thin wrapper around `PgvectorDocumentStore` with Supabase-specific defaults: + +- Reads the connection string from the `SUPABASE_DB_URL` environment variable. +- Defaults `create_extension` to `False` since pgvector is pre-installed on Supabase. + +**Connection notes:** Supabase offers two pooler ports — transaction mode (6543) and session mode (5432). +For best compatibility with pgvector operations, use session mode (port 5432) or a direct connection. 
+ +Example usage: + +Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + +```bash +export SUPABASE_DB_URL=postgresql://postgres:postgres@localhost:5432/postgres +``` + +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) +``` + +#### __init__ + +```python +__init__( + *, + connection_string: Secret = Secret.from_env_var("SUPABASE_DB_URL"), + create_extension: bool = False, + schema_name: str = "public", + table_name: str = "haystack_documents", + language: str = "english", + embedding_dimension: int = 768, + vector_type: Literal["vector", "halfvec"] = "vector", + vector_function: Literal[ + "cosine_similarity", "inner_product", "l2_distance" + ] = "cosine_similarity", + recreate_table: bool = False, + search_strategy: Literal[ + "exact_nearest_neighbor", "hnsw" + ] = "exact_nearest_neighbor", + hnsw_recreate_index_if_exists: bool = False, + hnsw_index_creation_kwargs: dict[str, int] | None = None, + hnsw_index_name: str = "haystack_hnsw_index", + hnsw_ef_search: int | None = None, + keyword_index_name: str = "haystack_keyword_index" +) -> None +``` + +Creates a new SupabasePgvectorDocumentStore instance. + +**Parameters:** + +- **connection_string** (Secret) – The connection string for the Supabase PostgreSQL database, defined as an + environment variable. Default: `SUPABASE_DB_URL`. Format: + `postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:5432/postgres` +- **create_extension** (bool) – Whether to create the pgvector extension if it doesn't exist. + Defaults to `False` since Supabase has pgvector pre-installed. +- **schema_name** (str) – The name of the schema the table is created in. +- **table_name** (str) – The name of the table to use to store Haystack documents. 
+- **language** (str) – The language to be used to parse query and document content in keyword retrieval. +- **embedding_dimension** (int) – The dimension of the embedding. +- **vector_type** (Literal['vector', 'halfvec']) – The type of vector used for embedding storage. `"vector"` or `"halfvec"`. +- **vector_function** (Literal['cosine_similarity', 'inner_product', 'l2_distance']) – The similarity function to use when searching for similar embeddings. +- **recreate_table** (bool) – Whether to recreate the table if it already exists. +- **search_strategy** (Literal['exact_nearest_neighbor', 'hnsw']) – The search strategy to use: `"exact_nearest_neighbor"` or `"hnsw"`. +- **hnsw_recreate_index_if_exists** (bool) – Whether to recreate the HNSW index if it already exists. +- **hnsw_index_creation_kwargs** (dict\[str, int\] | None) – Additional keyword arguments for HNSW index creation. +- **hnsw_index_name** (str) – Index name for the HNSW index. +- **hnsw_ef_search** (int | None) – The `ef_search` parameter to use at query time for HNSW. +- **keyword_index_name** (str) – Index name for the Keyword index. + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serializes the component to a dictionary. + +**Returns:** + +- dict\[str, Any\] – Dictionary with serialized data. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> SupabasePgvectorDocumentStore +``` + +Deserializes the component from a dictionary. + +**Parameters:** + +- **data** (dict\[str, Any\]) – Dictionary to deserialize from. + +**Returns:** + +- SupabasePgvectorDocumentStore – Deserialized component. + +## haystack_integrations.document_stores.supabase.groonga_document_store + +### SupabaseGroongaDocumentStore + +Bases: DocumentStore + +A Document Store for Supabase using PGroonga for full-text search. + +PGroonga is a PostgreSQL extension for fast, multilingual full-text search. +Unlike vector search, this store works with plain text queries — no embeddings needed. 
+ +Prerequisites: + +- A Supabase project with PGroonga extension enabled. +- Enable PGroonga in your Supabase project by running: + `CREATE EXTENSION IF NOT EXISTS pgroonga;` + +Example usage: + +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_fts_documents", +) +document_store.warm_up() +``` + +#### __init__ + +```python +__init__( + *, + supabase_url: str, + supabase_key: Secret = Secret.from_env_var( + "SUPABASE_SERVICE_KEY", strict=False + ), + table_name: str = "haystack_groonga_documents", + recreate_table: bool = False +) -> None +``` + +Creates a new SupabaseGroongaDocumentStore instance. + +Note: Call warm_up() before using the store to initialize the client and table. + +**Parameters:** + +- **supabase_url** (str) – The URL of your Supabase project. + Format: `https://.supabase.co` +- **supabase_key** (Secret) – The service role key for your Supabase project. + Defaults to reading from the `SUPABASE_SERVICE_KEY` environment variable. +- **table_name** (str) – The name of the table to store documents in. + Defaults to `haystack_groonga_documents`. +- **recreate_table** (bool) – Whether to drop and recreate the table on startup. + Defaults to `False`. + +#### warm_up + +```python +warm_up() -> None +``` + +Initializes the Supabase client and sets up the table. + +Must be called before using the document store. + +#### count_documents + +```python +count_documents() -> int +``` + +Returns the number of documents in the store. + +**Returns:** + +- int – Number of documents. + +#### filter_documents + +```python +filter_documents(filters: dict[str, Any] | None = None) -> list[Document] +``` + +Returns documents matching the given filters. 
+ +Supports the standard Haystack filter syntax with the following operators: + +- Comparison: `==`, `!=`, `>`, `>=`, `<`, `<=`, `in`, `not in` +- Logical: `AND`, `OR`, `NOT` (`OR` and `NOT` support simple conditions + only — no nested logical operators inside them) + +**Known limitation:** For `!=` and `not in` on `meta.*` fields, documents +where the field is absent are included in the result (matching Python `None != value` +semantics). For `>` / `>=` / `<` / `<=`, documents where the field is absent +are excluded (SQL `NULL` comparison semantics). + +**Parameters:** + +- **filters** (dict\[str, Any\] | None) – Optional Haystack filter dict. + Simple comparison: `{"field": "meta.language", "operator": "==", "value": "en"}` + Logical: `{"operator": "AND", "conditions": [...]}` + +**Returns:** + +- list\[Document\] – List of matching Document objects. + +**Raises:** + +- FilterError – If the filter structure is malformed or uses an unsupported operator. + +#### write_documents + +```python +write_documents( + documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL +) -> int +``` + +Writes documents to the store. + +**Parameters:** + +- **documents** (list\[Document\]) – List of Haystack Document objects to write. +- **policy** (DuplicatePolicy) – How to handle duplicate documents. Defaults to DuplicatePolicy.FAIL. + +**Returns:** + +- int – Number of documents written. + +#### delete_by_filter + +```python +delete_by_filter(filters: dict[str, Any]) -> int +``` + +Deletes documents matching the given filters. + +**Parameters:** + +- **filters** (dict\[str, Any\]) – Filters to select documents for deletion. + +**Returns:** + +- int – Number of documents deleted. + +#### update_by_filter + +```python +update_by_filter(filters: dict[str, Any], meta: dict[str, Any]) -> int +``` + +Updates the metadata of documents matching the given filters. + +Provided meta fields are merged into the existing document metadata. 
+ +**Parameters:** + +- **filters** (dict\[str, Any\]) – Filters to select documents to update. +- **meta** (dict\[str, Any\]) – Metadata fields to set on matching documents. + +**Returns:** + +- int – Number of documents updated. + +#### delete_all_documents + +```python +delete_all_documents() -> None +``` + +Deletes all documents from the store. + +#### delete_documents + +```python +delete_documents(document_ids: list[str]) -> None +``` + +Deletes documents with the given IDs. + +**Parameters:** + +- **document_ids** (list\[str\]) – List of document IDs to delete. + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serializes the component to a dictionary. + +**Returns:** + +- dict\[str, Any\] – Dictionary with serialized data. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> SupabaseGroongaDocumentStore +``` + +Deserializes the component from a dictionary. + +**Parameters:** + +- **data** (dict\[str, Any\]) – Dictionary to deserialize from. + +**Returns:** + +- SupabaseGroongaDocumentStore – Deserialized component. From 725bec8f2d33a76922efcbd5fde4dc3df51bb8ca Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 3 Jun 2026 17:03:16 +0200 Subject: [PATCH 08/15] =?UTF-8?q?Rely=20on=20existing=20supabase.md=20API?= =?UTF-8?q?=20reference=20=E2=80=94=20reset=20to=20main?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reference file is auto-generated by CI. Remove branch-local modifications so the PR does not alter it. --- .../reference/integrations-api/supabase.md | 346 ------------------ 1 file changed, 346 deletions(-) diff --git a/docs-website/reference/integrations-api/supabase.md b/docs-website/reference/integrations-api/supabase.md index d35f92c8f7..5044982839 100644 --- a/docs-website/reference/integrations-api/supabase.md +++ b/docs-website/reference/integrations-api/supabase.md @@ -229,135 +229,6 @@ Deserializes the component from a dictionary. 
- SupabasePgvectorEmbeddingRetriever – Deserialized component. -## haystack_integrations.components.retrievers.supabase.groonga_bm25_retriever - -### SupabaseGroongaBM25Retriever - -Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. - -This retriever works without embeddings — it searches documents using plain text queries. -It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. - -Note: async operations are not supported as the supabase-py sync client does not expose -awaitable query methods. Use the sync run() method instead. - -Example usage: - -```python -from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore -from haystack_integrations.components.retrievers.supabase import SupabaseGroongaBM25Retriever -from haystack.utils import Secret - -document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", -) -document_store.warm_up() - -retriever = SupabaseGroongaBM25Retriever(document_store=document_store, top_k=10) -result = retriever.run(query="python programming") -print(result["documents"]) -``` - -#### __init__ - -```python -__init__( - *, - document_store: SupabaseGroongaDocumentStore, - filters: dict[str, Any] | None = None, - top_k: int = 10, - filter_policy: str | FilterPolicy = FilterPolicy.REPLACE -) -> None -``` - -Initialize the SupabaseGroongaBM25Retriever. - -**Parameters:** - -- **document_store** (SupabaseGroongaDocumentStore) – An instance of SupabaseGroongaDocumentStore. -- **filters** (dict\[str, Any\] | None) – Optional filters applied to retrieved Documents. -- **top_k** (int) – Maximum number of Documents to return. Defaults to 10. -- **filter_policy** (str | FilterPolicy) – Policy to determine how filters are applied. - -**Raises:** - -- ValueError – If document_store is not an instance of SupabaseGroongaDocumentStore. 
- -#### run - -```python -run( - query: str, filters: dict[str, Any] | None = None, top_k: int | None = None -) -> dict[str, list[Document]] -``` - -Runs the retriever on the given query. - -**Parameters:** - -- **query** (str) – The text query to search for. -- **filters** (dict\[str, Any\] | None) – Optional runtime filters. Merged or replaced based on filter_policy. -- **top_k** (int | None) – Optional override for maximum number of documents to return. - -**Returns:** - -- dict\[str, list\[Document\]\] – Dictionary with key "documents" containing list of matching Documents. - -#### run_async - -```python -run_async( - query: str, filters: dict[str, Any] | None = None, top_k: int | None = None -) -> dict[str, list[Document]] -``` - -Async version of run(). - -Note: supabase-py's sync client does not support native async queries. -This method runs the synchronous retrieval and returns the result. -For fully async support, consider using acreate_client() from supabase-py -and refactoring the document store accordingly. - -**Parameters:** - -- **query** (str) – The text query to search for. -- **filters** (dict\[str, Any\] | None) – Optional runtime filters. Merged or replaced based on filter_policy. -- **top_k** (int | None) – Optional override for maximum number of documents to return. - -**Returns:** - -- dict\[str, list\[Document\]\] – Dictionary with key "documents" containing list of matching Documents. - -#### to_dict - -```python -to_dict() -> dict[str, Any] -``` - -Serializes the component to a dictionary. - -**Returns:** - -- dict\[str, Any\] – Dictionary with serialized data. - -#### from_dict - -```python -from_dict(data: dict[str, Any]) -> SupabaseGroongaBM25Retriever -``` - -Deserializes the component from a dictionary. - -**Parameters:** - -- **data** (dict\[str, Any\]) – Dictionary to deserialize from. - -**Returns:** - -- SupabaseGroongaBM25Retriever – Deserialized component. 
- ## haystack_integrations.components.retrievers.supabase.keyword_retriever ### SupabasePgvectorKeywordRetriever @@ -571,220 +442,3 @@ Deserializes the component from a dictionary. **Returns:** - SupabasePgvectorDocumentStore – Deserialized component. - -## haystack_integrations.document_stores.supabase.groonga_document_store - -### SupabaseGroongaDocumentStore - -Bases: DocumentStore - -A Document Store for Supabase using PGroonga for full-text search. - -PGroonga is a PostgreSQL extension for fast, multilingual full-text search. -Unlike vector search, this store works with plain text queries — no embeddings needed. - -Prerequisites: - -- A Supabase project with PGroonga extension enabled. -- Enable PGroonga in your Supabase project by running: - `CREATE EXTENSION IF NOT EXISTS pgroonga;` - -Example usage: - -```python -from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore -from haystack.utils import Secret - -document_store = SupabaseGroongaDocumentStore( - supabase_url="https://.supabase.co", - supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), - table_name="haystack_fts_documents", -) -document_store.warm_up() -``` - -#### __init__ - -```python -__init__( - *, - supabase_url: str, - supabase_key: Secret = Secret.from_env_var( - "SUPABASE_SERVICE_KEY", strict=False - ), - table_name: str = "haystack_groonga_documents", - recreate_table: bool = False -) -> None -``` - -Creates a new SupabaseGroongaDocumentStore instance. - -Note: Call warm_up() before using the store to initialize the client and table. - -**Parameters:** - -- **supabase_url** (str) – The URL of your Supabase project. - Format: `https://.supabase.co` -- **supabase_key** (Secret) – The service role key for your Supabase project. - Defaults to reading from the `SUPABASE_SERVICE_KEY` environment variable. -- **table_name** (str) – The name of the table to store documents in. - Defaults to `haystack_groonga_documents`. 
-- **recreate_table** (bool) – Whether to drop and recreate the table on startup. - Defaults to `False`. - -#### warm_up - -```python -warm_up() -> None -``` - -Initializes the Supabase client and sets up the table. - -Must be called before using the document store. - -#### count_documents - -```python -count_documents() -> int -``` - -Returns the number of documents in the store. - -**Returns:** - -- int – Number of documents. - -#### filter_documents - -```python -filter_documents(filters: dict[str, Any] | None = None) -> list[Document] -``` - -Returns documents matching the given filters. - -Supports the standard Haystack filter syntax with the following operators: - -- Comparison: `==`, `!=`, `>`, `>=`, `<`, `<=`, `in`, `not in` -- Logical: `AND`, `OR`, `NOT` (`OR` and `NOT` support simple conditions - only — no nested logical operators inside them) - -**Known limitation:** For `!=` and `not in` on `meta.*` fields, documents -where the field is absent are included in the result (matching Python `None != value` -semantics). For `>` / `>=` / `<` / `<=`, documents where the field is absent -are excluded (SQL `NULL` comparison semantics). - -**Parameters:** - -- **filters** (dict\[str, Any\] | None) – Optional Haystack filter dict. - Simple comparison: `{"field": "meta.language", "operator": "==", "value": "en"}` - Logical: `{"operator": "AND", "conditions": [...]}` - -**Returns:** - -- list\[Document\] – List of matching Document objects. - -**Raises:** - -- FilterError – If the filter structure is malformed or uses an unsupported operator. - -#### write_documents - -```python -write_documents( - documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL -) -> int -``` - -Writes documents to the store. - -**Parameters:** - -- **documents** (list\[Document\]) – List of Haystack Document objects to write. -- **policy** (DuplicatePolicy) – How to handle duplicate documents. Defaults to DuplicatePolicy.FAIL. 
- -**Returns:** - -- int – Number of documents written. - -#### delete_by_filter - -```python -delete_by_filter(filters: dict[str, Any]) -> int -``` - -Deletes documents matching the given filters. - -**Parameters:** - -- **filters** (dict\[str, Any\]) – Filters to select documents for deletion. - -**Returns:** - -- int – Number of documents deleted. - -#### update_by_filter - -```python -update_by_filter(filters: dict[str, Any], meta: dict[str, Any]) -> int -``` - -Updates the metadata of documents matching the given filters. - -Provided meta fields are merged into the existing document metadata. - -**Parameters:** - -- **filters** (dict\[str, Any\]) – Filters to select documents to update. -- **meta** (dict\[str, Any\]) – Metadata fields to set on matching documents. - -**Returns:** - -- int – Number of documents updated. - -#### delete_all_documents - -```python -delete_all_documents() -> None -``` - -Deletes all documents from the store. - -#### delete_documents - -```python -delete_documents(document_ids: list[str]) -> None -``` - -Deletes documents with the given IDs. - -**Parameters:** - -- **document_ids** (list\[str\]) – List of document IDs to delete. - -#### to_dict - -```python -to_dict() -> dict[str, Any] -``` - -Serializes the component to a dictionary. - -**Returns:** - -- dict\[str, Any\] – Dictionary with serialized data. - -#### from_dict - -```python -from_dict(data: dict[str, Any]) -> SupabaseGroongaDocumentStore -``` - -Deserializes the component from a dictionary. - -**Parameters:** - -- **data** (dict\[str, Any\]) – Dictionary to deserialize from. - -**Returns:** - -- SupabaseGroongaDocumentStore – Deserialized component. From 14dfb4448f49a5cd16243f4830e052905e420ca7 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 3 Jun 2026 17:10:51 +0200 Subject: [PATCH 09/15] fixing broken anchor links --- docs-website/docs/document-stores/supabasedocumentstore.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs-website/docs/document-stores/supabasedocumentstore.mdx b/docs-website/docs/document-stores/supabasedocumentstore.mdx index 23b73ee39d..f249ae97eb 100644 --- a/docs-website/docs/document-stores/supabasedocumentstore.mdx +++ b/docs-website/docs/document-stores/supabasedocumentstore.mdx @@ -175,8 +175,8 @@ document_store.warm_up() `warm_up()` must be called before using the store. It initializes the Supabase client and creates the table and PGroonga index if they don't exist. ::: -To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase#supabasegroongadocumentstore). +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase). ### Supported Retrievers -- [`SupabaseGroongaBM25Retriever`](/reference/integrations-supabase#supabasegroongabm25retriever): Retrieves documents using PGroonga full-text search. Works without embeddings and can be combined with `SupabasePgvectorEmbeddingRetriever` for hybrid search pipelines. +- [`SupabaseGroongaBM25Retriever`](/reference/integrations-supabase): Retrieves documents using PGroonga full-text search. Works without embeddings and can be combined with `SupabasePgvectorEmbeddingRetriever` for hybrid search pipelines. From 9567afe7be65001efd8e354cfa0fa2aef8c7b00e Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Wed, 3 Jun 2026 17:14:25 +0200 Subject: [PATCH 10/15] adding to choosing-a-document-store.mdx --- .../docs/concepts/document-store/choosing-a-document-store.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx b/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx index 24fe226763..a3700324e0 100644 --- a/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx +++ b/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx @@ -80,6 +80,7 @@ Pure vector databases, also known as just “vector databases”, offer efficien This category is relatively small but growing fast and includes well-known relational databases where vector capabilities were added through plugins or extensions. They are not as performant as the previous categories, but the main advantage of these databases is the opportunity to easily combine vectors with structured data, having a one-stop data shop for your application. You should pick a vector-capable SQL database when the performance trade-off is paid off by the lower cost of maintaining a single database instance for your application or when the structured data plays a more fundamental role in your business logic, with vectors being more of a nice-to-have. - [Pgvector](../../document-stores/pgvectordocumentstore.mdx) +- [Supabase](../../document-stores/supabasedocumentstore.mdx) — managed PostgreSQL with pgvector (vector search) and PGroonga (multilingual full-text search) #### Vector-capable NoSQL databases From 00b4700f76a58614ffa67207cc4c2388cdf37a18 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 4 Jun 2026 10:47:34 +0200 Subject: [PATCH 11/15] Update docs-website/docs/concepts/document-store/choosing-a-document-store.mdx Co-authored-by: bogdankostic --- .../docs/concepts/document-store/choosing-a-document-store.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx b/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx index 5898a31f9e..c0c544dae2 100644 --- a/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx +++ b/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx @@ -81,7 +81,7 @@ This category is relatively small but growing fast and includes well-known relat - [Oracle](../../document-stores/oracledocumentstore.mdx) - [Pgvector](../../document-stores/pgvectordocumentstore.mdx) -- [Supabase](../../document-stores/supabasedocumentstore.mdx) — managed PostgreSQL with pgvector (vector search) and PGroonga (multilingual full-text search) +- [Supabase](../../document-stores/supabasedocumentstore.mdx) #### Vector-capable NoSQL databases From 8908f68fa3a45163b0cbd67afd0b54b85e7ef8d0 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 4 Jun 2026 10:51:30 +0200 Subject: [PATCH 12/15] refering Smart Pipeline connections --- .../retrievers/supabasegroongabm25retriever.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx b/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx index c5447165c6..5fd9a7a2df 100644 --- a/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx +++ b/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx @@ -30,6 +30,7 @@ A full-text Retriever that fetches documents from the SupabaseGroongaDocumentSto Unlike embedding-based retrievers, this Retriever works with plain text queries and requires no embeddings. It supports a wide range of languages out of the box through PGroonga's multilingual indexing capabilities. The Retriever can be combined with `SupabasePgvectorEmbeddingRetriever` and a [`DocumentJoiner`](../joiners/documentjoiner.mdx) for hybrid search pipelines that take advantage of both keyword and semantic retrieval. +You can also use [Smart Pipeline Connections](https://docs.haystack.deepset.ai/docs/smart-pipeline-connections) and skip the `DocumentJoiner` if you want to combine the results of both retrievers in a RAG pipeline. In addition to `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. From 1d2549089cd8580e136c0901497b1e49e2a35e47 Mon Sep 17 00:00:00 2001 From: "David S.
Batista" Date: Thu, 4 Jun 2026 11:13:02 +0200 Subject: [PATCH 13/15] adding to 2.30 sidebars + Oracle DocStore and Retrievers were missing --- .../choosing-a-document-store.mdx | 2 + .../document-stores/oracledocumentstore.mdx | 191 ++++++++++++++++++ .../document-stores/supabasedocumentstore.mdx | 182 +++++++++++++++++ .../pipeline-components/retrievers.mdx | 6 + .../retrievers/oracleembeddingretriever.mdx | 142 +++++++++++++ .../retrievers/oraclekeywordretriever.mdx | 150 ++++++++++++++ .../supabasegroongabm25retriever.mdx | 152 ++++++++++++++ .../supabasepgvectorembeddingretriever.mdx | 115 +++++++++++ .../supabasepgvectorkeywordretriever.mdx | 135 +++++++++++++ .../version-2.30-sidebars.json | 9 +- 10 files changed, 1083 insertions(+), 1 deletion(-) create mode 100644 docs-website/versioned_docs/version-2.30/document-stores/oracledocumentstore.mdx create mode 100644 docs-website/versioned_docs/version-2.30/document-stores/supabasedocumentstore.mdx create mode 100644 docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oracleembeddingretriever.mdx create mode 100644 docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oraclekeywordretriever.mdx create mode 100644 docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasegroongabm25retriever.mdx create mode 100644 docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx create mode 100644 docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx diff --git a/docs-website/versioned_docs/version-2.30/concepts/document-store/choosing-a-document-store.mdx b/docs-website/versioned_docs/version-2.30/concepts/document-store/choosing-a-document-store.mdx index 24fe226763..c0c544dae2 100644 --- a/docs-website/versioned_docs/version-2.30/concepts/document-store/choosing-a-document-store.mdx +++ 
b/docs-website/versioned_docs/version-2.30/concepts/document-store/choosing-a-document-store.mdx @@ -79,7 +79,9 @@ Pure vector databases, also known as just “vector databases”, offer efficien This category is relatively small but growing fast and includes well-known relational databases where vector capabilities were added through plugins or extensions. They are not as performant as the previous categories, but the main advantage of these databases is the opportunity to easily combine vectors with structured data, having a one-stop data shop for your application. You should pick a vector-capable SQL database when the performance trade-off is paid off by the lower cost of maintaining a single database instance for your application or when the structured data plays a more fundamental role in your business logic, with vectors being more of a nice-to-have. +- [Oracle](../../document-stores/oracledocumentstore.mdx) - [Pgvector](../../document-stores/pgvectordocumentstore.mdx) +- [Supabase](../../document-stores/supabasedocumentstore.mdx) #### Vector-capable NoSQL databases diff --git a/docs-website/versioned_docs/version-2.30/document-stores/oracledocumentstore.mdx b/docs-website/versioned_docs/version-2.30/document-stores/oracledocumentstore.mdx new file mode 100644 index 0000000000..f5c0728113 --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/document-stores/oracledocumentstore.mdx @@ -0,0 +1,191 @@ +--- +title: "OracleDocumentStore" +id: oracledocumentstore +slug: "/oracledocumentstore" +description: "Use Oracle AI Vector Search as a document store in Haystack, with vector similarity and keyword search powered by Oracle Database 23ai." +--- + +# OracleDocumentStore + +
+ +| | | +| --- | --- | +| API reference | [Oracle](/reference/integrations-oracle) | +| GitHub link | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/oracle | + +
+ +`OracleDocumentStore` is a Document Store backed by [Oracle AI Vector Search](https://www.oracle.com/database/ai-vector-search/), available in Oracle Database 23ai and later. +It stores documents alongside dense vector embeddings in a native `VECTOR` column, and supports both vector similarity search and keyword search via an automatically managed DBMS_SEARCH index. + +## Installation + +```shell +pip install oracle-haystack +``` + +## Connection + +`OracleDocumentStore` connects to Oracle using the `OracleConnectionConfig` dataclass, which supports two connection modes: + +- **Thin mode** (default): connects directly over TCP. No Oracle Instant Client required. +- **Thick mode**: activated automatically when `wallet_location` is provided. Used for Oracle Autonomous Database (ADB-S) connections. + +Set the connection parameters as environment variables: + +```shell +export ORACLE_USER="haystack" +export ORACLE_PASSWORD="secret" +export ORACLE_DSN="localhost:1521/freepdb1" +``` + +## Initialization + +```python +from haystack.utils import Secret +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, +) +``` + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-oracle#oracledocumentstore). + +### Connecting to Oracle Autonomous Database + +For Oracle Autonomous Database (ADB-S), provide a wallet for authentication. 
The store automatically activates thick mode when `wallet_location` is set: + +```python +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + wallet_location="/path/to/wallet", + wallet_password=Secret.from_env_var("WALLET_PASSWORD"), + ), + embedding_dim=1536, +) +``` + +### HNSW Vector Index + +By default, the store performs exact vector search. To enable approximate nearest-neighbor search (faster on large datasets), create an HNSW index: + +```python +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, + distance_metric="COSINE", + create_index=True, # creates the HNSW index on startup + hnsw_neighbors=32, + hnsw_ef_construction=200, + hnsw_accuracy=95, +) +``` + +## Supported Retrievers + +- [`OracleEmbeddingRetriever`](../pipeline-components/retrievers/oracleembeddingretriever.mdx): Retrieves documents from `OracleDocumentStore` based on vector similarity to a query embedding. +- [`OracleKeywordRetriever`](../pipeline-components/retrievers/oraclekeywordretriever.mdx): Retrieves documents matching a keyword query using Oracle's DBMS_SEARCH full-text index. 
+ +## Example: RAG pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersDocumentEmbedder, + SentenceTransformersTextEmbedder, +) +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.utils import Secret + +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleEmbeddingRetriever + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=384, # all-MiniLM-L6-v2 produces 384-dimensional embeddings +) + +# Index documents +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness.", + ), + Document( + content="In certain places, you can witness the phenomenon of bioluminescent waves.", + ), +] + +doc_embedder = SentenceTransformersDocumentEmbedder( + model="sentence-transformers/all-MiniLM-L6-v2", +) +doc_embedder.warm_up() +embedded_docs = doc_embedder.run(documents)["documents"] +document_store.write_documents(embedded_docs, policy=DuplicatePolicy.OVERWRITE) + +# Build a RAG pipeline +template = [ + ChatMessage.from_user( + """ + Given the following context, answer the question.
+ Context: {% for doc in documents %}{{ doc.content }}{% endfor %} + Question: {{ query }} + """, + ), +] + +pipeline = Pipeline() +pipeline.add_component( + "embedder", + SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), +) +pipeline.add_component( + "retriever", + OracleEmbeddingRetriever(document_store=document_store, top_k=3), +) +pipeline.add_component("prompt_builder", ChatPromptBuilder(template=template)) +pipeline.add_component( + "llm", + OpenAIChatGenerator(api_key=Secret.from_env_var("OPENAI_API_KEY")), +) + +pipeline.connect("embedder.embedding", "retriever.query_embedding") +pipeline.connect("retriever.documents", "prompt_builder.documents") +pipeline.connect("prompt_builder.prompt", "llm.messages") + +result = pipeline.run( + { + "embedder": {"text": "How many languages are there?"}, + "prompt_builder": {"query": "How many languages are there?"}, + }, +) + +print(result["llm"]["replies"][0].text) +``` diff --git a/docs-website/versioned_docs/version-2.30/document-stores/supabasedocumentstore.mdx b/docs-website/versioned_docs/version-2.30/document-stores/supabasedocumentstore.mdx new file mode 100644 index 0000000000..f249ae97eb --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/document-stores/supabasedocumentstore.mdx @@ -0,0 +1,182 @@ +--- +title: "SupabaseDocumentStore" +id: supabasedocumentstore +slug: "/supabasedocumentstore" +description: "Use Supabase as a document store in Haystack, with vector search (pgvector) or full-text search (PGroonga)." +--- + +# SupabaseDocumentStore + +
+ +| | | +| --- | --- | +| API reference | [Supabase](/reference/integrations-supabase) | +| GitHub link | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/ | + +
+ +[Supabase](https://supabase.com/) is an open-source backend platform built on PostgreSQL. The Supabase integration for Haystack provides two document stores: + +- **`SupabasePgvectorDocumentStore`** — vector similarity search using the [pgvector](https://github.com/pgvector/pgvector) PostgreSQL extension, which comes pre-installed on Supabase. +- **`SupabaseGroongaDocumentStore`** — multilingual full-text search using the [PGroonga](https://pgroonga.github.io/) PostgreSQL extension. No embeddings required. + +## Installation + +```shell +pip install supabase-haystack +``` + +## SupabasePgvectorDocumentStore + +`SupabasePgvectorDocumentStore` is a thin wrapper around [`PgvectorDocumentStore`](./pgvectordocumentstore.mdx) with Supabase-specific defaults: + +- Reads the connection string from the `SUPABASE_DB_URL` environment variable. +- Defaults `create_extension` to `False` since pgvector is pre-installed on Supabase. + +### Connection + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. + +:::tip[Use session mode (port 5432)] +Supabase offers two pooler ports: transaction mode (port 6543) and session mode (port 5432). For best compatibility with pgvector operations, use session mode or a direct connection. +::: + +```shell +export SUPABASE_DB_URL="postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:5432/postgres" +``` + +### Initialization + +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) +``` + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase#supabasepgvectordocumentstore). 
+ +### Supported Retrievers + +- [`SupabasePgvectorEmbeddingRetriever`](/reference/integrations-supabase#supabasepgvectorembeddingretriever): Fetches documents from the store based on a query embedding. +- [`SupabasePgvectorKeywordRetriever`](/reference/integrations-supabase#supabasepgvectorkeywordretriever): Fetches documents matching a keyword query using PostgreSQL's `ts_rank_cd` ranking. + +### Example: RAG pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types.policy import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersTextEmbedder, + SentenceTransformersDocumentEmbedder, +) +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.utils import Secret + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +# Index documents +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness.", + ), + Document( + content="In certain places, you can witness the phenomenon of bioluminescent waves.", + ), +] +embedder = SentenceTransformersDocumentEmbedder() +documents_with_embeddings = embedder.run(documents) +document_store.write_documents( + documents_with_embeddings["documents"], + policy=DuplicatePolicy.OVERWRITE, +) + +# Query pipeline +prompt_template = [ + ChatMessage.from_system("Answer the question based on the provided context."), + ChatMessage.from_user( + "Query: {{query}}\nDocuments:\n{% for doc in documents 
%}{{ doc.content }}\n{% endfor %}\nAnswer:", + ), +] + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component( + "retriever", + SupabasePgvectorEmbeddingRetriever(document_store=document_store), +) +query_pipeline.add_component( + "prompt_builder", + ChatPromptBuilder( + template=prompt_template, + required_variables=["query", "documents"], + ), +) +query_pipeline.add_component("generator", OpenAIChatGenerator(model="gpt-4o")) +query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") +query_pipeline.connect("retriever.documents", "prompt_builder.documents") +query_pipeline.connect("prompt_builder.prompt", "generator.messages") + +result = query_pipeline.run( + { + "text_embedder": {"text": "How many languages are there?"}, + "prompt_builder": {"query": "How many languages are there?"}, + }, +) +``` + +--- + +## SupabaseGroongaDocumentStore + +`SupabaseGroongaDocumentStore` uses [PGroonga](https://pgroonga.github.io/), a PostgreSQL extension for fast, multilingual full-text search. Unlike the pgvector store, it works with plain text queries and requires no embeddings. + +### Prerequisites + +PGroonga must be enabled in your Supabase project. Run the following SQL in the Supabase SQL editor: + +```sql +CREATE EXTENSION IF NOT EXISTS pgroonga; +``` + +You also need to create a SQL function that PGroonga uses for search. See the [integration README](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/) for the required function definition. 
+ +### Initialization + +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) +document_store.warm_up() +``` + +:::note +`warm_up()` must be called before using the store. It initializes the Supabase client and creates the table and PGroonga index if they don't exist. +::: + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase). + +### Supported Retrievers + +- [`SupabaseGroongaBM25Retriever`](/reference/integrations-supabase): Retrieves documents using PGroonga full-text search. Works without embeddings and can be combined with `SupabasePgvectorEmbeddingRetriever` for hybrid search pipelines. diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers.mdx index 2bac65a42a..45e8002519 100644 --- a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers.mdx +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers.mdx @@ -165,6 +165,7 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [ChromaQueryTextRetriever](retrievers/chromaqueryretriever.mdx) | A Retriever compatible with the Chroma Document Store that uses the Chroma query API. | | [ElasticsearchEmbeddingRetriever](retrievers/elasticsearchembeddingretriever.mdx) | An embedding-based Retriever compatible with the Elasticsearch Document Store. | | [ElasticsearchBM25Retriever](retrievers/elasticsearchbm25retriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from the Elasticsearch Document Store.
| +| [ElasticsearchSQLRetriever](retrievers/elasticsearchsqlretriever.mdx) | Executes raw Elasticsearch SQL queries against an Elasticsearch Document Store and returns the raw JSON response. | | [InMemoryBM25Retriever](retrievers/inmemorybm25retriever.mdx) | A keyword-based Retriever compatible with the InMemoryDocumentStore. | | [InMemoryEmbeddingRetriever](retrievers/inmemoryembeddingretriever.mdx) | An embedding-based Retriever compatible with the InMemoryDocumentStore. | | [FilterRetriever](retrievers/filterretriever.mdx) | A special Retriever to be used with any Document Store to get the Documents that match specific filters. | @@ -176,6 +177,8 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [OpenSearchBM25Retriever](retrievers/opensearchbm25retriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from an OpenSearch Document Store. | | [OpenSearchEmbeddingRetriever](retrievers/opensearchembeddingretriever.mdx) | An embedding-based Retriever compatible with the OpenSearch Document Store. | | [OpenSearchHybridRetriever](retrievers/opensearchhybridretriever.mdx) | A SuperComponent that implements a Hybrid Retriever in a single component, relying on OpenSearch as the backend Document Store. | +| [OracleEmbeddingRetriever](retrievers/oracleembeddingretriever.mdx) | An embedding-based Retriever compatible with the Oracle Document Store. | +| [OracleKeywordRetriever](retrievers/oraclekeywordretriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from the Oracle Document Store. | | [PgvectorEmbeddingRetriever](retrievers/pgvectorembeddingretriever.mdx) | An embedding-based Retriever compatible with the Pgvector Document Store. | | [PgvectorKeywordRetriever](retrievers/pgvectorkeywordretriever.mdx) | A keyword-based Retriever that fetches documents matching a query from the Pgvector Document Store. 
| | [PineconeEmbeddingRetriever](retrievers/pineconedenseretriever.mdx) | An embedding-based Retriever compatible with the Pinecone Document Store. | @@ -184,6 +187,9 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [QdrantHybridRetriever](retrievers/qdranthybridretriever.mdx) | A Retriever based both on dense and sparse embeddings, compatible with the Qdrant Document Store. | | [SentenceWindowRetriever](retrievers/sentencewindowretrieval.mdx) | Retrieves neighboring sentences around relevant sentences to get the full context. | | [SnowflakeTableRetriever](retrievers/snowflaketableretriever.mdx) | Connects to a Snowflake database to execute an SQL query. | +| [SupabaseGroongaBM25Retriever](retrievers/supabasegroongabm25retriever.mdx) | A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search. | +| [SupabasePgvectorEmbeddingRetriever](retrievers/supabasepgvectorembeddingretriever.mdx) | An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore. | +| [SupabasePgvectorKeywordRetriever](retrievers/supabasepgvectorkeywordretriever.mdx) | A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore. | | [TextEmbeddingRetriever](retrievers/textembeddingretriever.mdx) | Wraps an embedding-based retriever with a text embedder into a single component that accepts a text query. | | [VespaEmbeddingRetriever](retrievers/vespaembeddingretriever.mdx) | An embedding-based Retriever compatible with the Vespa Document Store. | | [VespaKeywordRetriever](retrievers/vespakeywordretriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from the Vespa Document Store. 
| diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oracleembeddingretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oracleembeddingretriever.mdx new file mode 100644 index 0000000000..6f63f6f2db --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oracleembeddingretriever.mdx @@ -0,0 +1,142 @@ +--- +title: "OracleEmbeddingRetriever" +id: oracleembeddingretriever +slug: "/oracleembeddingretriever" +description: "An embedding-based Retriever compatible with the Oracle Document Store." +--- + +# OracleEmbeddingRetriever + +An embedding-based Retriever compatible with the Oracle Document Store. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. After a Text Embedder and before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in a semantic search pipeline 3. After a Text Embedder and before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of an [OracleDocumentStore](../../document-stores/oracledocumentstore.mdx) | +| **Mandatory run variables** | `query_embedding`: A vector representing the query (a list of floats) | +| **Output variables** | `documents`: A list of documents | +| **API reference** | [Oracle](/reference/integrations-oracle) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/oracle | +| **Package name** | `oracle-haystack` | + +
+ +## Overview + +The `OracleEmbeddingRetriever` is an embedding-based Retriever compatible with `OracleDocumentStore`. It uses Oracle AI Vector Search to compare query and document embeddings, fetching the most relevant documents based on vector similarity. + +When using `OracleEmbeddingRetriever` in a pipeline, make sure embeddings are available for both documents (at index time) and queries (at query time). Use a Document Embedder in your indexing pipeline and a Text Embedder in your query pipeline. + +The distance metric (COSINE, EUCLIDEAN, or DOT) is configured on the `OracleDocumentStore`. In addition to `query_embedding`, the retriever accepts `top_k` (maximum documents to return) and `filters` to narrow the search space. + +## Installation + +To run Oracle Database 23ai locally with Docker: + +```shell +docker run -d --name oracle23ai \ + -p 1521:1521 \ + -e ORACLE_PASSWORD=oracle \ + container-registry.oracle.com/database/free:latest +``` + +Install the Oracle integration for Haystack: + +```shell +pip install oracle-haystack +``` + +## Usage + +### On its own + +This Retriever needs an `OracleDocumentStore` and indexed documents with embeddings to run. 
+ +```python +from haystack.utils import Secret +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleEmbeddingRetriever + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, +) + +retriever = OracleEmbeddingRetriever(document_store=document_store) + +# using a fake vector to keep the example simple +retriever.run(query_embedding=[0.1] * 768) +``` + +### In a Pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersDocumentEmbedder, + SentenceTransformersTextEmbedder, +) +from haystack.utils import Secret + +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleEmbeddingRetriever + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=384, # all-MiniLM-L6-v2 produces 384-dimensional embeddings +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +document_embedder = SentenceTransformersDocumentEmbedder( + model="sentence-transformers/all-MiniLM-L6-v2", +) +document_embedder.warm_up()
+documents_with_embeddings = document_embedder.run(documents) + +document_store.write_documents( + documents_with_embeddings["documents"], + policy=DuplicatePolicy.OVERWRITE, +) + +query_pipeline = Pipeline() +query_pipeline.add_component( + "text_embedder", + SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), +) +query_pipeline.add_component( + "retriever", + OracleEmbeddingRetriever(document_store=document_store), +) +query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + +query = "How many languages are there?" +result = query_pipeline.run({"text_embedder": {"text": query}}) + +print(result["retriever"]["documents"][0]) +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oraclekeywordretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oraclekeywordretriever.mdx new file mode 100644 index 0000000000..56059cb934 --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oraclekeywordretriever.mdx @@ -0,0 +1,150 @@ +--- +title: "OracleKeywordRetriever" +id: oraclekeywordretriever +slug: "/oraclekeywordretriever" +description: "A keyword-based Retriever that fetches documents matching a query from the Oracle Document Store using Oracle's DBMS_SEARCH full-text index." +--- + +# OracleKeywordRetriever + +A keyword-based Retriever that fetches documents matching a query from the Oracle Document Store. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in a keyword search pipeline 3. Before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of an [OracleDocumentStore](../../document-stores/oracledocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents matching the query | +| **API reference** | [Oracle](/reference/integrations-oracle) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/oracle | +| **Package name** | `oracle-haystack` | + +
+ +## Overview + +The `OracleKeywordRetriever` is a keyword-based Retriever compatible with `OracleDocumentStore`. It uses Oracle's DBMS_SEARCH full-text index — automatically created when the document store is initialized — to search documents by keyword relevance. + +This retriever works without embeddings, making it suitable for keyword-only pipelines or as the keyword branch of a hybrid search pipeline. + +In addition to `query`, the retriever accepts `top_k` (maximum documents to return) and `filters` to narrow the search space. + +## Installation + +To run Oracle Database 23ai locally with Docker: + +```shell +docker run -d --name oracle23ai \ + -p 1521:1521 \ + -e ORACLE_PASSWORD=oracle \ + container-registry.oracle.com/database/free:latest +``` + +Install the Oracle integration for Haystack: + +```shell +pip install oracle-haystack +``` + +## Usage + +### On its own + +This Retriever needs an `OracleDocumentStore` and indexed documents to run. + +```python +from haystack.utils import Secret +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleKeywordRetriever + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, +) + +retriever = OracleKeywordRetriever(document_store=document_store) +retriever.run(query="my keyword query") +``` + +### In a RAG pipeline + +```python +from haystack import Document, Pipeline +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret + +from haystack_integrations.document_stores.oracle import ( + 
OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleKeywordRetriever + +prompt_template = [ + ChatMessage.from_user( + """ + Given these documents, answer the question.\nDocuments: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + + \nQuestion: {{question}} + \nAnswer: + """, + ), +] + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +retriever = OracleKeywordRetriever(document_store=document_store) + +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder(template=prompt_template, required_variables="*"), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") + +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") + +question = "How many languages are there?" 
+result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + }, +) +print(result["llm"]["replies"][0].text) +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasegroongabm25retriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasegroongabm25retriever.mdx new file mode 100644 index 0000000000..5fd9a7a2df --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasegroongabm25retriever.mdx @@ -0,0 +1,152 @@ +--- +title: "SupabaseGroongaBM25Retriever" +id: supabasegroongabm25retriever +slug: "/supabasegroongabm25retriever" +description: "A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search." +--- + +# SupabaseGroongaBM25Retriever + +A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the full-text search pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabaseGroongaDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents (matching the query) | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabaseGroongaBM25Retriever` retrieves Documents from the `SupabaseGroongaDocumentStore` using [PGroonga](https://pgroonga.github.io/), a PostgreSQL extension for fast, multilingual full-text search. + +Unlike embedding-based retrievers, this Retriever works with plain text queries and requires no embeddings. It supports a wide range of languages out of the box through PGroonga's multilingual indexing capabilities. + +The Retriever can be combined with `SupabasePgvectorEmbeddingRetriever` and a [`DocumentJoiner`](../joiners/documentjoiner.mdx) for hybrid search pipelines that take advantage of both keyword and semantic retrieval. +You can also make use of [Smart Pipeline Connections](https://docs.haystack.deepset.ai/docs/smart-pipeline-connections) and skip the `DocumentJoiner` if you want to combine the results of both retrievers in a RAG pipeline. + +In addition to `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. + +## Prerequisites + +PGroonga must be enabled in your Supabase project. Run the following SQL in the Supabase SQL editor: + +```sql +CREATE EXTENSION IF NOT EXISTS pgroonga; +``` + +You also need to create a SQL function that PGroonga uses for search. See the [integration README](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/) for the required function definition. + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabaseGroongaDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` environment variables for your Supabase project.
+ +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabaseGroongaBM25Retriever, +) +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store) + +retriever.run(query="my nice query") +``` + +### In a RAG pipeline + +The prerequisites for running this code are: + +- Set an environment variable `OPENAI_API_KEY` with your OpenAI API key. +- Set an environment variable `SUPABASE_SERVICE_KEY` with your Supabase service role key. + +```python +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabaseGroongaBM25Retriever, +) + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent
waves.", + ), +] + +document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +prompt_template = [ + ChatMessage.from_user( + "Given these documents, answer the question.\nDocuments:\n" + "{% for doc in documents %}{{ doc.content }}{% endfor %}\n" + "Question: {{question}}\nAnswer:", + ), +] + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store) +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder( + template=prompt_template, + required_variables={"question", "documents"}, + ), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") +rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + +question = "languages spoken around the world today" +result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + }, +) +print(result["answer_builder"]) +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx new file mode 100644 index 0000000000..80c6cc870b --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx @@ -0,0 +1,115 @@ +--- +title: "SupabasePgvectorEmbeddingRetriever" +id: supabasepgvectorembeddingretriever +slug: "/supabasepgvectorembeddingretriever" +description: "An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore." 
+--- + +# SupabasePgvectorEmbeddingRetriever + +An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. After a Text Embedder and before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the semantic search pipeline 3. After a Text Embedder and before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabasePgvectorDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query_embedding`: A vector representing the query (a list of floats) | +| **Output variables** | `documents`: A list of documents | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabasePgvectorEmbeddingRetriever` is a thin wrapper around [`PgvectorEmbeddingRetriever`](pgvectorembeddingretriever.mdx), adapted for use with `SupabasePgvectorDocumentStore`. It compares the query and Document embeddings and fetches the Documents most relevant to the query based on vector similarity. + +When using this Retriever in your pipeline, make sure embeddings are available. Add a Document Embedder to your indexing pipeline and a Text Embedder to your query pipeline. + +In addition to `query_embedding`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve), `filters` to narrow down the search space, and `vector_function` to override the similarity function set on the Document Store. + +Some relevant parameters that impact embedding retrieval must be defined when the `SupabasePgvectorDocumentStore` is initialized: `embedding_dimension`, `vector_function`, and `search_strategy` (`"exact_nearest_neighbor"` or `"hnsw"`). + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabasePgvectorDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore(embedding_dimension=768) +retriever = SupabasePgvectorEmbeddingRetriever(document_store=document_store) + +# using a fake vector to keep the example simple +retriever.run(query_embedding=[0.1] * 768) +``` + +### In a Pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersTextEmbedder, + SentenceTransformersDocumentEmbedder, +) + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +document_embedder = SentenceTransformersDocumentEmbedder() +documents_with_embeddings = document_embedder.run(documents) + +document_store.write_documents( + documents_with_embeddings.get("documents"), + policy=DuplicatePolicy.OVERWRITE, +) + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component( + "retriever", + SupabasePgvectorEmbeddingRetriever(document_store=document_store), +) 
+query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + +query = "How many languages are there?" + +result = query_pipeline.run({"text_embedder": {"text": query}}) + +print(result["retriever"]["documents"][0]) +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx new file mode 100644 index 0000000000..c29e0066ba --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx @@ -0,0 +1,135 @@ +--- +title: "SupabasePgvectorKeywordRetriever" +id: supabasepgvectorkeywordretriever +slug: "/supabasepgvectorkeywordretriever" +description: "A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore." +--- + +# SupabasePgvectorKeywordRetriever + +A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the keyword search pipeline 3. Before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabasePgvectorDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents (matching the query) | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabasePgvectorKeywordRetriever` is a thin wrapper around [`PgvectorKeywordRetriever`](pgvectorkeywordretriever.mdx), adapted for use with `SupabasePgvectorDocumentStore`. + +It uses PostgreSQL full-text search (`to_tsvector` / `plainto_tsquery`) to find Documents and ranks them with the `ts_rank_cd` function. The ranking considers how often the query terms appear in the Document, how close together the terms are, and how important the part of the Document is where they occur. For more details, see the [PostgreSQL documentation](https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING). + +Keep in mind that, unlike similar components such as `ElasticsearchBM25Retriever`, this Retriever does not apply fuzzy search out of the box, so it's necessary to carefully formulate the query in order to avoid getting zero results. + +The language used to parse query and Document content for keyword retrieval is set via the `language` parameter on the `SupabasePgvectorDocumentStore` (defaults to `"english"`). + +In addition to the `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabasePgvectorDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorKeywordRetriever, +) + +document_store = SupabasePgvectorDocumentStore() +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) + +retriever.run(query="my nice query") +``` + +### In a RAG pipeline + +The prerequisites for running this code are: + +- Set an environment variable `OPENAI_API_KEY` with your OpenAI API key. +- Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + +```python +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorKeywordRetriever, +) + +prompt_template = [ + ChatMessage.from_user( + "Given these documents, answer the question.\nDocuments:\n" + "{% for doc in documents %}{{ doc.content }}{% endfor %}\n" + "Question: {{question}}\nAnswer:", + ), +] + +document_store = SupabasePgvectorDocumentStore( + language="english", + recreate_table=True, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + 
+document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder( + template=prompt_template, + required_variables={"question", "documents"}, + ), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") +rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + +question = "languages spoken around the world today" +result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + }, +) +print(result["answer_builder"]) +``` diff --git a/docs-website/versioned_sidebars/version-2.30-sidebars.json b/docs-website/versioned_sidebars/version-2.30-sidebars.json index 530eb1e702..04c66ee2b8 100644 --- a/docs-website/versioned_sidebars/version-2.30-sidebars.json +++ b/docs-website/versioned_sidebars/version-2.30-sidebars.json @@ -131,9 +131,11 @@ "href": "https://haystack.deepset.ai/integrations/neo4j-document-store" }, "document-stores/opensearch-document-store", + "document-stores/oracledocumentstore", "document-stores/pgvectordocumentstore", "document-stores/pinecone-document-store", "document-stores/qdrant-document-store", + "document-stores/supabasedocumentstore", "document-stores/valkeydocumentstore", "document-stores/vespadocumentstore", "document-stores/weaviatedocumentstore" @@ -565,6 +567,8 @@ "pipeline-components/retrievers/opensearchbm25retriever", "pipeline-components/retrievers/opensearchembeddingretriever", 
"pipeline-components/retrievers/opensearchhybridretriever", + "pipeline-components/retrievers/oracleembeddingretriever", + "pipeline-components/retrievers/oraclekeywordretriever", "pipeline-components/retrievers/pgvectorembeddingretriever", "pipeline-components/retrievers/pgvectorkeywordretriever", "pipeline-components/retrievers/pineconedenseretriever", @@ -572,6 +576,9 @@ "pipeline-components/retrievers/qdranthybridretriever", "pipeline-components/retrievers/qdrantsparseembeddingretriever", "pipeline-components/retrievers/sentencewindowretrieval", + "pipeline-components/retrievers/supabasegroongabm25retriever", + "pipeline-components/retrievers/supabasepgvectorembeddingretriever", + "pipeline-components/retrievers/supabasepgvectorkeywordretriever", "pipeline-components/retrievers/snowflaketableretriever", "pipeline-components/retrievers/textembeddingretriever", "pipeline-components/retrievers/valkeyembeddingretriever", @@ -740,4 +747,4 @@ ] } ] -} \ No newline at end of file +} From 8c0d89572177e5bcaaa3a98c3a2e4e7cabfd44f4 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 4 Jun 2026 11:29:46 +0200 Subject: [PATCH 14/15] ElasticSearchSQLRetriever was missing in the sidebars --- docs-website/versioned_sidebars/version-2.30-sidebars.json | 1 + 1 file changed, 1 insertion(+) diff --git a/docs-website/versioned_sidebars/version-2.30-sidebars.json b/docs-website/versioned_sidebars/version-2.30-sidebars.json index 04c66ee2b8..fcefb1373d 100644 --- a/docs-website/versioned_sidebars/version-2.30-sidebars.json +++ b/docs-website/versioned_sidebars/version-2.30-sidebars.json @@ -552,6 +552,7 @@ "pipeline-components/retrievers/chromaqueryretriever", "pipeline-components/retrievers/elasticsearchbm25retriever", "pipeline-components/retrievers/elasticsearchembeddingretriever", + "pipeline-components/retrievers/elasticsearchsqlretriever", "pipeline-components/retrievers/faissembeddingretriever", "pipeline-components/retrievers/falkordbcypherretriever", "pipeline-components/retrievers/falkordbembeddingretriever", From 3c76426843019cffbb49e43ea01223a4d61d4542 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Thu, 4 Jun 2026 11:36:46 +0200 Subject: [PATCH 15/15] adding missed file --- .../retrievers/elasticsearchsqlretriever.mdx | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/elasticsearchsqlretriever.mdx diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/elasticsearchsqlretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/elasticsearchsqlretriever.mdx new file mode 100644 index 0000000000..003a69d346 --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/elasticsearchsqlretriever.mdx @@ -0,0 +1,116 @@ +--- +title: ElasticsearchSQLRetriever +id: elasticsearchsqlretriever +slug: /elasticsearchsqlretriever +description: Executes raw Elasticsearch SQL queries against an Elasticsearch Document Store and returns the raw JSON response. +--- + +# ElasticsearchSQLRetriever + +Executes raw Elasticsearch SQL queries against an Elasticsearch Document Store and returns the raw JSON response. 
+ +| | | +| --------------------------------------- | ------------------------------------------------------------------------------------------------ | +| **Most common position in a pipeline** | Standalone, or anywhere you need to fetch metadata, aggregations, or other structured data | +| **Mandatory init variables** | `document_store`: An instance of `ElasticsearchDocumentStore` | +| **Mandatory run variables** | `query`: An Elasticsearch SQL query string | +| **Output variables** | `result`: A dictionary with the raw JSON response from the Elasticsearch SQL API | +| **API reference** | [Elasticsearch](https://docs.haystack.deepset.ai/reference/integrations-elasticsearch) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch | +| **Package name** | `elasticsearch-haystack` | + +## Overview + +`ElasticsearchSQLRetriever` lets you run [Elasticsearch SQL](https://www.elastic.co/guide/en/elasticsearch/reference/current/xpack-sql.html) queries directly against an `ElasticsearchDocumentStore`. Instead of matching a query against documents like the `ElasticsearchBM25Retriever` or `ElasticsearchEmbeddingRetriever`, it executes a SQL statement and returns the **raw JSON response** from the Elasticsearch SQL API. + +This is useful when you need structured access to your index at runtime, for example to fetch specific fields, filter on metadata, or compute aggregations such as counts and averages. + +Unlike the other Elasticsearch retrievers, this component does not return a list of `Document` objects. The output is a single `result` dictionary, where `result["result"]` holds the raw Elasticsearch response. For a typical query, the response contains: + +- `result["result"]["columns"]`: metadata describing each returned column. +- `result["result"]["rows"]`: the data rows. 
+ +The component accepts two optional parameters at initialization: + +- `raise_on_failure`: if `True` (the default), an exception is raised when the SQL API call fails. If `False`, the error is logged as a warning and an empty dictionary is returned. +- `fetch_size`: the number of results to fetch per page. If not set, the default fetch size configured in Elasticsearch is used. + +## Installation + +Install Elasticsearch and then start an instance. Haystack supports Elasticsearch 8. + +If you have Docker set up, we recommend pulling the Docker image and running it. + +```bash +docker pull docker.elastic.co/elasticsearch/elasticsearch:8.11.1 +docker run -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.11.1 +``` + +As an alternative, you can go to the [Elasticsearch integration on GitHub](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch) and start a Docker container running Elasticsearch using the provided `docker-compose.yml`: + +```bash +docker compose up +``` + +Once you have a running Elasticsearch instance, install the `elasticsearch-haystack` integration: + +```bash +pip install elasticsearch-haystack +``` + +## Usage + +### On its own + +Write a few documents to an index, then run a SQL query against it. 
The example below selects the `content` field from the index and reads the returned columns and rows: + +```python +from haystack import Document +from haystack_integrations.components.retrievers.elasticsearch import ( + ElasticsearchSQLRetriever, +) +from haystack_integrations.document_stores.elasticsearch import ( + ElasticsearchDocumentStore, +) +from haystack.document_stores.types import DuplicatePolicy + +document_store = ElasticsearchDocumentStore( + hosts="http://localhost:9200/", + index="my_index", +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +# DuplicatePolicy.SKIP is optional, but useful to run the script multiple times without throwing errors +document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +retriever = ElasticsearchSQLRetriever(document_store=document_store) +output = retriever.run(query='SELECT content FROM "my_index" LIMIT 10') + +result = output["result"] +print(result["columns"]) # column metadata, e.g. [{"name": "content", "type": "text"}] +for row in result["rows"]: + print(row) +``` + +### Running an aggregation query + +Because the component returns the raw SQL response, you can use it for aggregations that the document-based retrievers don't support, such as counting documents: + +```python +retriever = ElasticsearchSQLRetriever(document_store=document_store) +output = retriever.run(query='SELECT COUNT(*) AS doc_count FROM "my_index"') + +result = output["result"] +print(result["rows"]) # e.g. 
[[3]] +``` + +To avoid raising an exception on a malformed or failing query, initialize the component with `raise_on_failure=False`. In that case, a failed query logs a warning and returns an empty dictionary instead.