diff --git a/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx b/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx index 0feff4e9be..c0c544dae2 100644 --- a/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx +++ b/docs-website/docs/concepts/document-store/choosing-a-document-store.mdx @@ -81,6 +81,7 @@ This category is relatively small but growing fast and includes well-known relat - [Oracle](../../document-stores/oracledocumentstore.mdx) - [Pgvector](../../document-stores/pgvectordocumentstore.mdx) +- [Supabase](../../document-stores/supabasedocumentstore.mdx) #### Vector-capable NoSQL databases diff --git a/docs-website/docs/document-stores/supabasedocumentstore.mdx b/docs-website/docs/document-stores/supabasedocumentstore.mdx new file mode 100644 index 0000000000..f249ae97eb --- /dev/null +++ b/docs-website/docs/document-stores/supabasedocumentstore.mdx @@ -0,0 +1,182 @@ +--- +title: "SupabaseDocumentStore" +id: supabasedocumentstore +slug: "/supabasedocumentstore" +description: "Use Supabase as a document store in Haystack, with vector search (pgvector) or full-text search (PGroonga)." +--- + +# SupabaseDocumentStore + +
+ +| | | +| --- | --- | +| API reference | [Supabase](/reference/integrations-supabase) | +| GitHub link | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/ | + +
+ +[Supabase](https://supabase.com/) is an open-source backend platform built on PostgreSQL. The Supabase integration for Haystack provides two document stores: + +- **`SupabasePgvectorDocumentStore`** — vector similarity search using the [pgvector](https://github.com/pgvector/pgvector) PostgreSQL extension, which comes pre-installed on Supabase. +- **`SupabaseGroongaDocumentStore`** — multilingual full-text search using the [PGroonga](https://pgroonga.github.io/) PostgreSQL extension. No embeddings required. + +## Installation + +```shell +pip install supabase-haystack +``` + +## SupabasePgvectorDocumentStore + +`SupabasePgvectorDocumentStore` is a thin wrapper around [`PgvectorDocumentStore`](./pgvectordocumentstore.mdx) with Supabase-specific defaults: + +- Reads the connection string from the `SUPABASE_DB_URL` environment variable. +- Defaults `create_extension` to `False` since pgvector is pre-installed on Supabase. + +### Connection + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. + +:::tip[Use session mode (port 5432)] +Supabase offers two pooler ports: transaction mode (port 6543) and session mode (port 5432). For best compatibility with pgvector operations, use session mode or a direct connection. +::: + +```shell +export SUPABASE_DB_URL="postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:5432/postgres" +``` + +### Initialization + +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) +``` + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase#supabasepgvectordocumentstore). 
+ +### Supported Retrievers + +- [`SupabasePgvectorEmbeddingRetriever`](/reference/integrations-supabase#supabasepgvectorembeddingretriever): Fetches documents from the store based on a query embedding. +- [`SupabasePgvectorKeywordRetriever`](/reference/integrations-supabase#supabasepgvectorkeywordretriever): Fetches documents matching a keyword query using PostgreSQL's `ts_rank_cd` ranking. + +### Example: RAG pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types.policy import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersTextEmbedder, + SentenceTransformersDocumentEmbedder, +) +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.utils import Secret + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +# Index documents +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness.", + ), + Document( + content="In certain places, you can witness the phenomenon of bioluminescent waves.", + ), +] +embedder = SentenceTransformersDocumentEmbedder() +documents_with_embeddings = embedder.run(documents) +document_store.write_documents( + documents_with_embeddings["documents"], + policy=DuplicatePolicy.OVERWRITE, +) + +# Query pipeline +prompt_template = [ + ChatMessage.from_system("Answer the question based on the provided context."), + ChatMessage.from_user( + "Query: {{query}}\nDocuments:\n{% for doc in documents 
%}{{ doc.content }}\n{% endfor %}\nAnswer:", + ), +] + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component( + "retriever", + SupabasePgvectorEmbeddingRetriever(document_store=document_store), +) +query_pipeline.add_component( + "prompt_builder", + ChatPromptBuilder( + template=prompt_template, + required_variables=["query", "documents"], + ), +) +query_pipeline.add_component("generator", OpenAIChatGenerator(model="gpt-4o")) +query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") +query_pipeline.connect("retriever.documents", "prompt_builder.documents") +query_pipeline.connect("prompt_builder.prompt", "generator.messages") + +result = query_pipeline.run( + { + "text_embedder": {"text": "How many languages are there?"}, + "prompt_builder": {"query": "How many languages are there?"}, + }, +) +``` + +--- + +## SupabaseGroongaDocumentStore + +`SupabaseGroongaDocumentStore` uses [PGroonga](https://pgroonga.github.io/), a PostgreSQL extension for fast, multilingual full-text search. Unlike the pgvector store, it works with plain text queries and requires no embeddings. + +### Prerequisites + +PGroonga must be enabled in your Supabase project. Run the following SQL in the Supabase SQL editor: + +```sql +CREATE EXTENSION IF NOT EXISTS pgroonga; +``` + +You also need to create a SQL function that PGroonga uses for search. See the [integration README](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/) for the required function definition. 
+ +### Initialization + +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) +document_store.warm_up() +``` + +:::note +`warm_up()` must be called before using the store. It initializes the Supabase client and creates the table and PGroonga index if they don't exist. +::: + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase). + +### Supported Retrievers + +- [`SupabaseGroongaBM25Retriever`](/reference/integrations-supabase): Retrieves documents using PGroonga full-text search. Works without embeddings and can be combined with `SupabasePgvectorEmbeddingRetriever` for hybrid search pipelines. diff --git a/docs-website/docs/pipeline-components/retrievers.mdx b/docs-website/docs/pipeline-components/retrievers.mdx index 5967e55441..45e8002519 100644 --- a/docs-website/docs/pipeline-components/retrievers.mdx +++ b/docs-website/docs/pipeline-components/retrievers.mdx @@ -187,6 +187,9 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [QdrantHybridRetriever](retrievers/qdranthybridretriever.mdx) | A Retriever based both on dense and sparse embeddings, compatible with the Qdrant Document Store. | | [SentenceWindowRetriever](retrievers/sentencewindowretrieval.mdx) | Retrieves neighboring sentences around relevant sentences to get the full context. | | [SnowflakeTableRetriever](retrievers/snowflaketableretriever.mdx) | Connects to a Snowflake database to execute an SQL query. | +| [SupabaseGroongaBM25Retriever](retrievers/supabasegroongabm25retriever.mdx) | A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search. 
| +| [SupabasePgvectorEmbeddingRetriever](retrievers/supabasepgvectorembeddingretriever.mdx) | An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore. | +| [SupabasePgvectorKeywordRetriever](retrievers/supabasepgvectorkeywordretriever.mdx) | A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore. | | [TextEmbeddingRetriever](retrievers/textembeddingretriever.mdx) | Wraps an embedding-based retriever with a text embedder into a single component that accepts a text query. | | [VespaEmbeddingRetriever](retrievers/vespaembeddingretriever.mdx) | An embedding-based Retriever compatible with the Vespa Document Store. | | [VespaKeywordRetriever](retrievers/vespakeywordretriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from the Vespa Document Store. | diff --git a/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx b/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx new file mode 100644 index 0000000000..5fd9a7a2df --- /dev/null +++ b/docs-website/docs/pipeline-components/retrievers/supabasegroongabm25retriever.mdx @@ -0,0 +1,152 @@ +--- +title: "SupabaseGroongaBM25Retriever" +id: supabasegroongabm25retriever +slug: "/supabasegroongabm25retriever" +description: "A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search." +--- + +# SupabaseGroongaBM25Retriever + +A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the full-text search pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabaseGroongaDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents (matching the query) | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabaseGroongaBM25Retriever` retrieves Documents from the `SupabaseGroongaDocumentStore` using [PGroonga](https://pgroonga.github.io/), a PostgreSQL extension for fast, multilingual full-text search. + +Unlike embedding-based retrievers, this Retriever works with plain text queries and requires no embeddings. It supports a wide range of languages out of the box through PGroonga's multilingual indexing capabilities. + +The Retriever can be combined with `SupabasePgvectorEmbeddingRetriever` and a [`DocumentJoiner`](../joiners/documentjoiner.mdx) for hybrid search pipelines that take advantage of both keyword and semantic retrieval. +You can also use [Smart Pipeline Connections](https://docs.haystack.deepset.ai/docs/smart-pipeline-connections) and skip the `DocumentJoiner` if you want to combine the results of both retrievers in a RAG pipeline. + +In addition to `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. + +## Prerequisites + +PGroonga must be enabled in your Supabase project. Run the following SQL in the Supabase SQL editor: + +```sql +CREATE EXTENSION IF NOT EXISTS pgroonga; +``` + +You also need to create a SQL function that PGroonga uses for search. See the [integration README](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/) for the required function definition. + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabaseGroongaDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` environment variables for your Supabase project. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabaseGroongaBM25Retriever, +) +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store) + +retriever.run(query="my nice query") +``` + +### In a RAG pipeline + +The prerequisites for running this code are: + +- Set an environment variable `OPENAI_API_KEY` with your OpenAI API key. +- Set an environment variable `SUPABASE_SERVICE_KEY` with your Supabase service role key. + +```python +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabaseGroongaBM25Retriever, +) + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent 
waves.", + ), +] + +document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +prompt_template = [ + ChatMessage.from_user( + "Given these documents, answer the question.\nDocuments:\n" + "{% for doc in documents %}{{ doc.content }}{% endfor %}\n" + "Question: {{question}}\nAnswer:", + ), +] + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store) +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder( + template=prompt_template, + required_variables={"question", "documents"}, + ), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") +rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + +question = "languages spoken around the world today" +result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + }, +) +print(result["answer_builder"]) +``` diff --git a/docs-website/docs/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx b/docs-website/docs/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx new file mode 100644 index 0000000000..80c6cc870b --- /dev/null +++ b/docs-website/docs/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx @@ -0,0 +1,115 @@ +--- +title: "SupabasePgvectorEmbeddingRetriever" +id: supabasepgvectorembeddingretriever +slug: "/supabasepgvectorembeddingretriever" +description: "An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore." 
+--- + +# SupabasePgvectorEmbeddingRetriever + +An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. After a Text Embedder and before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the semantic search pipeline 3. After a Text Embedder and before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabasePgvectorDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query_embedding`: A vector representing the query (a list of floats) | +| **Output variables** | `documents`: A list of documents | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabasePgvectorEmbeddingRetriever` is a thin wrapper around [`PgvectorEmbeddingRetriever`](pgvectorembeddingretriever.mdx), adapted for use with `SupabasePgvectorDocumentStore`. It compares the query and Document embeddings and fetches the Documents most relevant to the query based on vector similarity. + +When using this Retriever in your pipeline, make sure embeddings are available. Add a Document Embedder to your indexing pipeline and a Text Embedder to your query pipeline. + +In addition to `query_embedding`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve), `filters` to narrow down the search space, and `vector_function` to override the similarity function set on the Document Store. + +Some relevant parameters that impact embedding retrieval must be defined when the `SupabasePgvectorDocumentStore` is initialized: `embedding_dimension`, `vector_function`, and `search_strategy` (`"exact_nearest_neighbor"` or `"hnsw"`). + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabasePgvectorDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore(embedding_dimension=768) +retriever = SupabasePgvectorEmbeddingRetriever(document_store=document_store) + +# using a fake vector to keep the example simple +retriever.run(query_embedding=[0.1] * 768) +``` + +### In a Pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersTextEmbedder, + SentenceTransformersDocumentEmbedder, +) + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +document_embedder = SentenceTransformersDocumentEmbedder() +documents_with_embeddings = document_embedder.run(documents) + +document_store.write_documents( + documents_with_embeddings.get("documents"), + policy=DuplicatePolicy.OVERWRITE, +) + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component( + "retriever", + SupabasePgvectorEmbeddingRetriever(document_store=document_store), +) 
+query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + +query = "How many languages are there?" + +result = query_pipeline.run({"text_embedder": {"text": query}}) + +print(result["retriever"]["documents"][0]) +``` diff --git a/docs-website/docs/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx b/docs-website/docs/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx new file mode 100644 index 0000000000..c29e0066ba --- /dev/null +++ b/docs-website/docs/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx @@ -0,0 +1,135 @@ +--- +title: "SupabasePgvectorKeywordRetriever" +id: supabasepgvectorkeywordretriever +slug: "/supabasepgvectorkeywordretriever" +description: "A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore." +--- + +# SupabasePgvectorKeywordRetriever + +A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the keyword search pipeline 3. Before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabasePgvectorDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents (matching the query) | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabasePgvectorKeywordRetriever` is a thin wrapper around [`PgvectorKeywordRetriever`](pgvectorkeywordretriever.mdx), adapted for use with `SupabasePgvectorDocumentStore`. + +It uses PostgreSQL full-text search (`to_tsvector` / `plainto_tsquery`) to find Documents and ranks them with the `ts_rank_cd` function. The ranking considers how often the query terms appear in the Document, how close together the terms are, and how important the part of the Document is where they occur. For more details, see the [PostgreSQL documentation](https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING). + +Keep in mind that, unlike similar components such as `ElasticsearchBM25Retriever`, this Retriever does not apply fuzzy search out of the box, so it's necessary to carefully formulate the query in order to avoid getting zero results. + +The language used to parse query and Document content for keyword retrieval is set via the `language` parameter on the `SupabasePgvectorDocumentStore` (defaults to `"english"`). + +In addition to the `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabasePgvectorDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorKeywordRetriever, +) + +document_store = SupabasePgvectorDocumentStore() +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) + +retriever.run(query="my nice query") +``` + +### In a RAG pipeline + +The prerequisites for running this code are: + +- Set an environment variable `OPENAI_API_KEY` with your OpenAI API key. +- Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + +```python +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorKeywordRetriever, +) + +prompt_template = [ + ChatMessage.from_user( + "Given these documents, answer the question.\nDocuments:\n" + "{% for doc in documents %}{{ doc.content }}{% endfor %}\n" + "Question: {{question}}\nAnswer:", + ), +] + +document_store = SupabasePgvectorDocumentStore( + language="english", + recreate_table=True, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + 
+document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder( + template=prompt_template, + required_variables={"question", "documents"}, + ), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") +rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + +question = "languages spoken around the world today" +result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + }, +) +print(result["answer_builder"]) +``` diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index c4257ac308..c9c94e0ad7 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -139,6 +139,7 @@ export default { 'document-stores/pgvectordocumentstore', 'document-stores/pinecone-document-store', 'document-stores/qdrant-document-store', + 'document-stores/supabasedocumentstore', 'document-stores/valkeydocumentstore', 'document-stores/vespadocumentstore', 'document-stores/weaviatedocumentstore', @@ -581,6 +582,9 @@ export default { 'pipeline-components/retrievers/qdrantsparseembeddingretriever', 'pipeline-components/retrievers/sentencewindowretrieval', 'pipeline-components/retrievers/snowflaketableretriever', + 'pipeline-components/retrievers/supabasegroongabm25retriever', + 'pipeline-components/retrievers/supabasepgvectorembeddingretriever', + 'pipeline-components/retrievers/supabasepgvectorkeywordretriever', 
'pipeline-components/retrievers/textembeddingretriever', 'pipeline-components/retrievers/valkeyembeddingretriever', 'pipeline-components/retrievers/vespaembeddingretriever', diff --git a/docs-website/versioned_docs/version-2.30/concepts/document-store/choosing-a-document-store.mdx b/docs-website/versioned_docs/version-2.30/concepts/document-store/choosing-a-document-store.mdx index 24fe226763..c0c544dae2 100644 --- a/docs-website/versioned_docs/version-2.30/concepts/document-store/choosing-a-document-store.mdx +++ b/docs-website/versioned_docs/version-2.30/concepts/document-store/choosing-a-document-store.mdx @@ -79,7 +79,9 @@ Pure vector databases, also known as just “vector databases”, offer efficien This category is relatively small but growing fast and includes well-known relational databases where vector capabilities were added through plugins or extensions. They are not as performant as the previous categories, but the main advantage of these databases is the opportunity to easily combine vectors with structured data, having a one-stop data shop for your application. You should pick a vector-capable SQL database when the performance trade-off is paid off by the lower cost of maintaining a single database instance for your application or when the structured data plays a more fundamental role in your business logic, with vectors being more of a nice-to-have. 
+- [Oracle](../../document-stores/oracledocumentstore.mdx) - [Pgvector](../../document-stores/pgvectordocumentstore.mdx) +- [Supabase](../../document-stores/supabasedocumentstore.mdx) #### Vector-capable NoSQL databases diff --git a/docs-website/versioned_docs/version-2.30/document-stores/oracledocumentstore.mdx b/docs-website/versioned_docs/version-2.30/document-stores/oracledocumentstore.mdx new file mode 100644 index 0000000000..f5c0728113 --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/document-stores/oracledocumentstore.mdx @@ -0,0 +1,191 @@ +--- +title: "OracleDocumentStore" +id: oracledocumentstore +slug: "/oracledocumentstore" +description: "Use Oracle AI Vector Search as a document store in Haystack, with vector similarity and keyword search powered by Oracle Database 23ai." +--- + +# OracleDocumentStore + +
+ +| | | +| --- | --- | +| API reference | [Oracle](/reference/integrations-oracle) | +| GitHub link | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/oracle | + +
+ +`OracleDocumentStore` is a Document Store backed by [Oracle AI Vector Search](https://www.oracle.com/database/ai-vector-search/), available in Oracle Database 23ai and later. +It stores documents alongside dense vector embeddings in a native `VECTOR` column, and supports both vector similarity search and keyword search via an automatically managed DBMS_SEARCH index. + +## Installation + +```shell +pip install oracle-haystack +``` + +## Connection + +`OracleDocumentStore` connects to Oracle using the `OracleConnectionConfig` dataclass, which supports two connection modes: + +- **Thin mode** (default): connects directly over TCP. No Oracle Instant Client required. +- **Thick mode**: activated automatically when `wallet_location` is provided. Used for Oracle Autonomous Database (ADB-S) connections. + +Set the connection parameters as environment variables: + +```shell +export ORACLE_USER="haystack" +export ORACLE_PASSWORD="secret" +export ORACLE_DSN="localhost:1521/freepdb1" +``` + +## Initialization + +```python +from haystack.utils import Secret +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, +) +``` + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-oracle#oracledocumentstore). + +### Connecting to Oracle Autonomous Database + +For Oracle Autonomous Database (ADB-S), provide a wallet for authentication. 
The store automatically activates thick mode when `wallet_location` is set: + +```python +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + wallet_location="/path/to/wallet", + wallet_password=Secret.from_env_var("WALLET_PASSWORD"), + ), + embedding_dim=1536, +) +``` + +### HNSW Vector Index + +By default, the store performs exact vector search. To enable approximate nearest-neighbor search (faster on large datasets), create an HNSW index: + +```python +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, + distance_metric="COSINE", + create_index=True, # creates the HNSW index on startup + hnsw_neighbors=32, + hnsw_ef_construction=200, + hnsw_accuracy=95, +) +``` + +## Supported Retrievers + +- [`OracleEmbeddingRetriever`](../pipeline-components/retrievers/oracleembeddingretriever.mdx): Retrieves documents from `OracleDocumentStore` based on vector similarity to a query embedding. +- [`OracleKeywordRetriever`](../pipeline-components/retrievers/oraclekeywordretriever.mdx): Retrieves documents matching a keyword query using Oracle's DBMS_SEARCH full-text index. 
+ +## Example: RAG pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersDocumentEmbedder, + SentenceTransformersTextEmbedder, +) +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.utils import Secret + +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleEmbeddingRetriever + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=384, # must match the embedder: all-MiniLM-L6-v2 outputs 384-dimensional vectors +) + +# Index documents +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness.", + ), + Document( + content="In certain places, you can witness the phenomenon of bioluminescent waves.", + ), +] + +doc_embedder = SentenceTransformersDocumentEmbedder( + model="sentence-transformers/all-MiniLM-L6-v2", +) +doc_embedder.warm_up() +embedded_docs = doc_embedder.run(documents)["documents"] +document_store.write_documents(embedded_docs, policy=DuplicatePolicy.OVERWRITE) + +# Build a RAG pipeline +template = [ + ChatMessage.from_user( + """ + Given the following context, answer the question. 
+ Context: {% for doc in documents %}{{ doc.content }}{% endfor %} + Question: {{ query }} + """, + ), +] + +pipeline = Pipeline() +pipeline.add_component( + "embedder", + SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), +) +pipeline.add_component( + "retriever", + OracleEmbeddingRetriever(document_store=document_store, top_k=3), +) +pipeline.add_component("prompt_builder", ChatPromptBuilder(template=template)) +pipeline.add_component( + "llm", + OpenAIChatGenerator(api_key=Secret.from_env_var("OPENAI_API_KEY")), +) + +pipeline.connect("embedder.embedding", "retriever.query_embedding") +pipeline.connect("retriever.documents", "prompt_builder.documents") +pipeline.connect("prompt_builder.prompt", "llm.messages") + +result = pipeline.run( + { + "embedder": {"text": "How many languages are there?"}, + "prompt_builder": {"query": "How many languages are there?"}, + }, +) + +print(result["llm"]["replies"][0].text) +``` diff --git a/docs-website/versioned_docs/version-2.30/document-stores/supabasedocumentstore.mdx b/docs-website/versioned_docs/version-2.30/document-stores/supabasedocumentstore.mdx new file mode 100644 index 0000000000..f249ae97eb --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/document-stores/supabasedocumentstore.mdx @@ -0,0 +1,182 @@ +--- +title: "SupabaseDocumentStore" +id: supabasedocumentstore +slug: "/supabasedocumentstore" +description: "Use Supabase as a document store in Haystack, with vector search (pgvector) or full-text search (PGroonga)." +--- + +# SupabaseDocumentStore + +
+ +| | | +| --- | --- | +| API reference | [Supabase](/reference/integrations-supabase) | +| GitHub link | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/ | + +
+ +[Supabase](https://supabase.com/) is an open-source backend platform built on PostgreSQL. The Supabase integration for Haystack provides two document stores: + +- **`SupabasePgvectorDocumentStore`** — vector similarity search using the [pgvector](https://github.com/pgvector/pgvector) PostgreSQL extension, which comes pre-installed on Supabase. +- **`SupabaseGroongaDocumentStore`** — multilingual full-text search using the [PGroonga](https://pgroonga.github.io/) PostgreSQL extension. No embeddings required. + +## Installation + +```shell +pip install supabase-haystack +``` + +## SupabasePgvectorDocumentStore + +`SupabasePgvectorDocumentStore` is a thin wrapper around [`PgvectorDocumentStore`](./pgvectordocumentstore.mdx) with Supabase-specific defaults: + +- Reads the connection string from the `SUPABASE_DB_URL` environment variable. +- Defaults `create_extension` to `False` since pgvector is pre-installed on Supabase. + +### Connection + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. + +:::tip[Use session mode (port 5432)] +Supabase offers two pooler ports: transaction mode (port 6543) and session mode (port 5432). For best compatibility with pgvector operations, use session mode or a direct connection. +::: + +```shell +export SUPABASE_DB_URL="postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:5432/postgres" +``` + +### Initialization + +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) +``` + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase#supabasepgvectordocumentstore). 
+ +### Supported Retrievers + +- [`SupabasePgvectorEmbeddingRetriever`](/reference/integrations-supabase#supabasepgvectorembeddingretriever): Fetches documents from the store based on a query embedding. +- [`SupabasePgvectorKeywordRetriever`](/reference/integrations-supabase#supabasepgvectorkeywordretriever): Fetches documents matching a keyword query using PostgreSQL's `ts_rank_cd` ranking. + +### Example: RAG pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types.policy import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersTextEmbedder, + SentenceTransformersDocumentEmbedder, +) +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.utils import Secret + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +# Index documents +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness.", + ), + Document( + content="In certain places, you can witness the phenomenon of bioluminescent waves.", + ), +] +embedder = SentenceTransformersDocumentEmbedder() +documents_with_embeddings = embedder.run(documents) +document_store.write_documents( + documents_with_embeddings["documents"], + policy=DuplicatePolicy.OVERWRITE, +) + +# Query pipeline +prompt_template = [ + ChatMessage.from_system("Answer the question based on the provided context."), + ChatMessage.from_user( + "Query: {{query}}\nDocuments:\n{% for doc in documents 
%}{{ doc.content }}\n{% endfor %}\nAnswer:", + ), +] + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component( + "retriever", + SupabasePgvectorEmbeddingRetriever(document_store=document_store), +) +query_pipeline.add_component( + "prompt_builder", + ChatPromptBuilder( + template=prompt_template, + required_variables=["query", "documents"], + ), +) +query_pipeline.add_component("generator", OpenAIChatGenerator(model="gpt-4o")) +query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") +query_pipeline.connect("retriever.documents", "prompt_builder.documents") +query_pipeline.connect("prompt_builder.prompt", "generator.messages") + +result = query_pipeline.run( + { + "text_embedder": {"text": "How many languages are there?"}, + "prompt_builder": {"query": "How many languages are there?"}, + }, +) +``` + +--- + +## SupabaseGroongaDocumentStore + +`SupabaseGroongaDocumentStore` uses [PGroonga](https://pgroonga.github.io/), a PostgreSQL extension for fast, multilingual full-text search. Unlike the pgvector store, it works with plain text queries and requires no embeddings. + +### Prerequisites + +PGroonga must be enabled in your Supabase project. Run the following SQL in the Supabase SQL editor: + +```sql +CREATE EXTENSION IF NOT EXISTS pgroonga; +``` + +You also need to create a SQL function that PGroonga uses for search. See the [integration README](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/) for the required function definition. 
+ +### Initialization + +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://[project-ref].supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) +document_store.warm_up() +``` + +:::note +`warm_up()` must be called before using the store. It initializes the Supabase client and creates the table and PGroonga index if they don't exist. +::: + +To learn more about the initialization parameters, see the [API docs](/reference/integrations-supabase). + +### Supported Retrievers + +- [`SupabaseGroongaBM25Retriever`](/reference/integrations-supabase): Retrieves documents using PGroonga full-text search. Works without embeddings and can be combined with `SupabasePgvectorEmbeddingRetriever` for hybrid search pipelines. diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers.mdx index 2bac65a42a..45e8002519 100644 --- a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers.mdx +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers.mdx @@ -165,6 +165,7 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [ChromaQueryTextRetriever](retrievers/chromaqueryretriever.mdx) | A Retriever compatible with the Chroma Document Store that uses the Chroma query API. | | [ElasticsearchEmbeddingRetriever](retrievers/elasticsearchembeddingretriever.mdx) | An embedding-based Retriever compatible with the Elasticsearch Document Store. | | [ElasticsearchBM25Retriever](retrievers/elasticsearchbm25retriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from the Elasticsearch Document Store. 
| +| [ElasticsearchSQLRetriever](retrievers/elasticsearchsqlretriever.mdx) | Executes raw Elasticsearch SQL queries against an Elasticsearch Document Store and returns the raw JSON response. | | [InMemoryBM25Retriever](retrievers/inmemorybm25retriever.mdx) | A keyword-based Retriever compatible with the InMemoryDocumentStore. | | [InMemoryEmbeddingRetriever](retrievers/inmemoryembeddingretriever.mdx) | An embedding-based Retriever compatible with the InMemoryDocumentStore. | | [FilterRetriever](retrievers/filterretriever.mdx) | A special Retriever to be used with any Document Store to get the Documents that match specific filters. | @@ -176,6 +177,8 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [OpenSearchBM25Retriever](retrievers/opensearchbm25retriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from an OpenSearch Document Store. | | [OpenSearchEmbeddingRetriever](retrievers/opensearchembeddingretriever.mdx) | An embedding-based Retriever compatible with the OpenSearch Document Store. | | [OpenSearchHybridRetriever](retrievers/opensearchhybridretriever.mdx) | A SuperComponent that implements a Hybrid Retriever in a single component, relying on OpenSearch as the backend Document Store. | +| [OracleEmbeddingRetriever](retrievers/oracleembeddingretriever.mdx) | An embedding-based Retriever compatible with the Oracle Document Store. | +| [OracleKeywordRetriever](retrievers/oraclekeywordretriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from the Oracle Document Store. | | [PgvectorEmbeddingRetriever](retrievers/pgvectorembeddingretriever.mdx) | An embedding-based Retriever compatible with the Pgvector Document Store. | | [PgvectorKeywordRetriever](retrievers/pgvectorkeywordretriever.mdx) | A keyword-based Retriever that fetches documents matching a query from the Pgvector Document Store. 
| | [PineconeEmbeddingRetriever](retrievers/pineconedenseretriever.mdx) | An embedding-based Retriever compatible with the Pinecone Document Store. | @@ -184,6 +187,9 @@ For details on how to initialize and use a Retriever in a pipeline, see the docu | [QdrantHybridRetriever](retrievers/qdranthybridretriever.mdx) | A Retriever based both on dense and sparse embeddings, compatible with the Qdrant Document Store. | | [SentenceWindowRetriever](retrievers/sentencewindowretrieval.mdx) | Retrieves neighboring sentences around relevant sentences to get the full context. | | [SnowflakeTableRetriever](retrievers/snowflaketableretriever.mdx) | Connects to a Snowflake database to execute an SQL query. | +| [SupabaseGroongaBM25Retriever](retrievers/supabasegroongabm25retriever.mdx) | A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search. | +| [SupabasePgvectorEmbeddingRetriever](retrievers/supabasepgvectorembeddingretriever.mdx) | An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore. | +| [SupabasePgvectorKeywordRetriever](retrievers/supabasepgvectorkeywordretriever.mdx) | A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore. | | [TextEmbeddingRetriever](retrievers/textembeddingretriever.mdx) | Wraps an embedding-based retriever with a text embedder into a single component that accepts a text query. | | [VespaEmbeddingRetriever](retrievers/vespaembeddingretriever.mdx) | An embedding-based Retriever compatible with the Vespa Document Store. | | [VespaKeywordRetriever](retrievers/vespakeywordretriever.mdx) | A keyword-based Retriever that fetches Documents matching a query from the Vespa Document Store. 
| diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/elasticsearchsqlretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/elasticsearchsqlretriever.mdx new file mode 100644 index 0000000000..003a69d346 --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/elasticsearchsqlretriever.mdx @@ -0,0 +1,116 @@ +--- +title: ElasticsearchSQLRetriever +id: elasticsearchsqlretriever +slug: /elasticsearchsqlretriever +description: Executes raw Elasticsearch SQL queries against an Elasticsearch Document Store and returns the raw JSON response. +--- + +# ElasticsearchSQLRetriever + +Executes raw Elasticsearch SQL queries against an Elasticsearch Document Store and returns the raw JSON response. + +| | | +| --------------------------------------- | ------------------------------------------------------------------------------------------------ | +| **Most common position in a pipeline** | Standalone, or anywhere you need to fetch metadata, aggregations, or other structured data | +| **Mandatory init variables** | `document_store`: An instance of `ElasticsearchDocumentStore` | +| **Mandatory run variables** | `query`: An Elasticsearch SQL query string | +| **Output variables** | `result`: A dictionary with the raw JSON response from the Elasticsearch SQL API | +| **API reference** | [Elasticsearch](https://docs.haystack.deepset.ai/reference/integrations-elasticsearch) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch | +| **Package name** | `elasticsearch-haystack` | + +## Overview + +`ElasticsearchSQLRetriever` lets you run [Elasticsearch SQL](https://www.elastic.co/guide/en/elasticsearch/reference/current/xpack-sql.html) queries directly against an `ElasticsearchDocumentStore`. 
Instead of matching a query against documents like the `ElasticsearchBM25Retriever` or `ElasticsearchEmbeddingRetriever`, it executes a SQL statement and returns the **raw JSON response** from the Elasticsearch SQL API. + +This is useful when you need structured access to your index at runtime, for example to fetch specific fields, filter on metadata, or compute aggregations such as counts and averages. + +Unlike the other Elasticsearch retrievers, this component does not return a list of `Document` objects. The output is a single `result` dictionary, where `result["result"]` holds the raw Elasticsearch response. For a typical query, the response contains: + +- `result["result"]["columns"]`: metadata describing each returned column. +- `result["result"]["rows"]`: the data rows. + +The component accepts two optional parameters at initialization: + +- `raise_on_failure`: if `True` (the default), an exception is raised when the SQL API call fails. If `False`, the error is logged as a warning and an empty dictionary is returned. +- `fetch_size`: the number of results to fetch per page. If not set, the default fetch size configured in Elasticsearch is used. + +## Installation + +Install Elasticsearch and then start an instance. Haystack supports Elasticsearch 8. + +If you have Docker set up, we recommend pulling the Docker image and running it. 
+ +```bash +docker pull docker.elastic.co/elasticsearch/elasticsearch:8.11.1 +docker run -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.11.1 +``` + +As an alternative, you can go to [Elasticsearch integration GitHub](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch) and start a Docker container running Elasticsearch using the provided `docker-compose.yml`: + +```bash +docker compose up +``` + +Once you have a running Elasticsearch instance, install the `elasticsearch-haystack` integration: + +```bash +pip install elasticsearch-haystack +``` + +## Usage + +### On its own + +Write a few documents to an index, then run a SQL query against it. The example below selects the `content` field from the index and reads the returned columns and rows: + +```python +from haystack import Document +from haystack_integrations.components.retrievers.elasticsearch import ( + ElasticsearchSQLRetriever, +) +from haystack_integrations.document_stores.elasticsearch import ( + ElasticsearchDocumentStore, +) +from haystack.document_stores.types import DuplicatePolicy + +document_store = ElasticsearchDocumentStore( + hosts="http://localhost:9200/", + index="my_index", +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +# DuplicatePolicy.SKIP is optional, but useful to run the script multiple times without throwing errors +document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +retriever = ElasticsearchSQLRetriever(document_store=document_store) 
+output = retriever.run(query='SELECT content FROM "my_index" LIMIT 10') + +result = output["result"] +print(result["columns"]) # column metadata, e.g. [{"name": "content", "type": "text"}] +for row in result["rows"]: + print(row) +``` + +### Running an aggregation query + +Because the component returns the raw SQL response, you can use it for aggregations that the document-based retrievers don't support, such as counting documents: + +```python +retriever = ElasticsearchSQLRetriever(document_store=document_store) +output = retriever.run(query='SELECT COUNT(*) AS doc_count FROM "my_index"') + +result = output["result"] +print(result["rows"]) # e.g. [[3]] +``` + +To avoid raising an exception on a malformed or failing query, initialize the component with `raise_on_failure=False`. In that case, a failed query logs a warning and returns an empty dictionary instead. diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oracleembeddingretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oracleembeddingretriever.mdx new file mode 100644 index 0000000000..6f63f6f2db --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oracleembeddingretriever.mdx @@ -0,0 +1,142 @@ +--- +title: "OracleEmbeddingRetriever" +id: oracleembeddingretriever +slug: "/oracleembeddingretriever" +description: "An embedding-based Retriever compatible with the Oracle Document Store." +--- + +# OracleEmbeddingRetriever + +An embedding-based Retriever compatible with the Oracle Document Store. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. After a Text Embedder and before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in a semantic search pipeline 3. After a Text Embedder and before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of an [OracleDocumentStore](../../document-stores/oracledocumentstore.mdx) | +| **Mandatory run variables** | `query_embedding`: A vector representing the query (a list of floats) | +| **Output variables** | `documents`: A list of documents | +| **API reference** | [Oracle](/reference/integrations-oracle) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/oracle | +| **Package name** | `oracle-haystack` | + +
+ +## Overview + +The `OracleEmbeddingRetriever` is an embedding-based Retriever compatible with `OracleDocumentStore`. It uses Oracle AI Vector Search to compare query and document embeddings, fetching the most relevant documents based on vector similarity. + +When using `OracleEmbeddingRetriever` in a pipeline, make sure embeddings are available for both documents (at index time) and queries (at query time). Use a Document Embedder in your indexing pipeline and a Text Embedder in your query pipeline. + +The distance metric (COSINE, EUCLIDEAN, or DOT) is configured on the `OracleDocumentStore`. In addition to `query_embedding`, the retriever accepts `top_k` (maximum documents to return) and `filters` to narrow the search space. + +## Installation + +To run Oracle Database 23ai locally with Docker: + +```shell +docker run -d --name oracle23ai \ + -p 1521:1521 \ + -e ORACLE_PASSWORD=oracle \ + container-registry.oracle.com/database/free:latest +``` + +Install the Oracle integration for Haystack: + +```shell +pip install oracle-haystack +``` + +## Usage + +### On its own + +This Retriever needs an `OracleDocumentStore` and indexed documents with embeddings to run. 
+ +```python +from haystack.utils import Secret +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleEmbeddingRetriever + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, +) + +retriever = OracleEmbeddingRetriever(document_store=document_store) + +# using a fake vector to keep the example simple +retriever.run(query_embedding=[0.1] * 768) +``` + +### In a Pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersDocumentEmbedder, + SentenceTransformersTextEmbedder, +) +from haystack.utils import Secret + +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleEmbeddingRetriever + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=384, # must match the embedder: all-MiniLM-L6-v2 outputs 384-dimensional vectors +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +document_embedder = SentenceTransformersDocumentEmbedder( + model="sentence-transformers/all-MiniLM-L6-v2", +) +document_embedder.warm_up() 
+documents_with_embeddings = document_embedder.run(documents) + +document_store.write_documents( + documents_with_embeddings["documents"], + policy=DuplicatePolicy.OVERWRITE, +) + +query_pipeline = Pipeline() +query_pipeline.add_component( + "text_embedder", + SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), +) +query_pipeline.add_component( + "retriever", + OracleEmbeddingRetriever(document_store=document_store), +) +query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + +query = "How many languages are there?" +result = query_pipeline.run({"text_embedder": {"text": query}}) + +print(result["retriever"]["documents"][0]) +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oraclekeywordretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oraclekeywordretriever.mdx new file mode 100644 index 0000000000..56059cb934 --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/oraclekeywordretriever.mdx @@ -0,0 +1,150 @@ +--- +title: "OracleKeywordRetriever" +id: oraclekeywordretriever +slug: "/oraclekeywordretriever" +description: "A keyword-based Retriever that fetches documents matching a query from the Oracle Document Store using Oracle's DBMS_SEARCH full-text index." +--- + +# OracleKeywordRetriever + +A keyword-based Retriever that fetches documents matching a query from the Oracle Document Store. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in a keyword search pipeline 3. Before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of an [OracleDocumentStore](../../document-stores/oracledocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents matching the query | +| **API reference** | [Oracle](/reference/integrations-oracle) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/oracle | +| **Package name** | `oracle-haystack` | + +
+ +## Overview + +The `OracleKeywordRetriever` is a keyword-based Retriever compatible with `OracleDocumentStore`. It uses Oracle's DBMS_SEARCH full-text index — automatically created when the document store is initialized — to search documents by keyword relevance. + +This retriever works without embeddings, making it suitable for keyword-only pipelines or as the keyword branch of a hybrid search pipeline. + +In addition to `query`, the retriever accepts `top_k` (maximum documents to return) and `filters` to narrow the search space. + +## Installation + +To run Oracle Database 23ai locally with Docker: + +```shell +docker run -d --name oracle23ai \ + -p 1521:1521 \ + -e ORACLE_PASSWORD=oracle \ + container-registry.oracle.com/database/free:latest +``` + +Install the Oracle integration for Haystack: + +```shell +pip install oracle-haystack +``` + +## Usage + +### On its own + +This Retriever needs an `OracleDocumentStore` and indexed documents to run. + +```python +from haystack.utils import Secret +from haystack_integrations.document_stores.oracle import ( + OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleKeywordRetriever + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, +) + +retriever = OracleKeywordRetriever(document_store=document_store) +retriever.run(query="my keyword query") +``` + +### In a RAG pipeline + +```python +from haystack import Document, Pipeline +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret + +from haystack_integrations.document_stores.oracle import ( + 
OracleDocumentStore, + OracleConnectionConfig, +) +from haystack_integrations.components.retrievers.oracle import OracleKeywordRetriever + +prompt_template = [ + ChatMessage.from_user( + """ + Given these documents, answer the question.\nDocuments: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + + \nQuestion: {{question}} + \nAnswer: + """, + ), +] + +document_store = OracleDocumentStore( + connection_config=OracleConnectionConfig( + user=Secret.from_env_var("ORACLE_USER"), + password=Secret.from_env_var("ORACLE_PASSWORD"), + dsn=Secret.from_env_var("ORACLE_DSN"), + ), + embedding_dim=768, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +retriever = OracleKeywordRetriever(document_store=document_store) + +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder(template=prompt_template, required_variables="*"), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") + +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") + +question = "How many languages are there?" 
+result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + }, +) +print(result["llm"]["replies"][0].text) +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasegroongabm25retriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasegroongabm25retriever.mdx new file mode 100644 index 0000000000..5fd9a7a2df --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasegroongabm25retriever.mdx @@ -0,0 +1,152 @@ +--- +title: "SupabaseGroongaBM25Retriever" +id: supabasegroongabm25retriever +slug: "/supabasegroongabm25retriever" +description: "A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search." +--- + +# SupabaseGroongaBM25Retriever + +A full-text Retriever that fetches documents from the SupabaseGroongaDocumentStore using PGroonga search. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the full-text search pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabaseGroongaDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents (matching the query) | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabaseGroongaBM25Retriever` retrieves Documents from the `SupabaseGroongaDocumentStore` using [PGroonga](https://pgroonga.github.io/), a PostgreSQL extension for fast, multilingual full-text search. + +Unlike embedding-based retrievers, this Retriever works with plain text queries and requires no embeddings. It supports a wide range of languages out of the box through PGroonga's multilingual indexing capabilities. + +The Retriever can be combined with `SupabasePgvectorEmbeddingRetriever` and a [`DocumentJoiner`](../joiners/documentjoiner.mdx) for hybrid search pipelines that take advantage of both keyword and semantic retrieval. +You can also use [Smart Pipeline Connections](https://docs.haystack.deepset.ai/docs/smart-pipeline-connections) and skip the `DocumentJoiner` if you want to combine the results of both retrievers in a RAG pipeline. + +In addition to `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. + +## Prerequisites + +PGroonga must be enabled in your Supabase project. Run the following SQL in the Supabase SQL editor: + +```sql +CREATE EXTENSION IF NOT EXISTS pgroonga; +``` + +You also need to create a SQL function that PGroonga uses for search. See the [integration README](https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase/) for the required function definition. + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabaseGroongaDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` environment variables for your Supabase project. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabaseGroongaBM25Retriever, +) +from haystack.utils import Secret + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store) + +retriever.run(query="my nice query") +``` + +### In a RAG pipeline + +The prerequisites for running this code are: + +- Set an environment variable `OPENAI_API_KEY` with your OpenAI API key. +- Set an environment variable `SUPABASE_SERVICE_KEY` with your Supabase service role key. + +```python +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret + +from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabaseGroongaBM25Retriever, +) + +document_store = SupabaseGroongaDocumentStore( + supabase_url="https://<project-ref>.supabase.co", + supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), + table_name="haystack_groonga_documents", +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent 
waves.", + ), +] + +document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +prompt_template = [ + ChatMessage.from_user( + "Given these documents, answer the question.\nDocuments:\n" + "{% for doc in documents %}{{ doc.content }}{% endfor %}\n" + "Question: {{question}}\nAnswer:", + ), +] + +retriever = SupabaseGroongaBM25Retriever(document_store=document_store) +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder( + template=prompt_template, + required_variables={"question", "documents"}, + ), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") +rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + +question = "languages spoken around the world today" +result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + }, +) +print(result["answer_builder"]) +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx new file mode 100644 index 0000000000..80c6cc870b --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorembeddingretriever.mdx @@ -0,0 +1,115 @@ +--- +title: "SupabasePgvectorEmbeddingRetriever" +id: supabasepgvectorembeddingretriever +slug: "/supabasepgvectorembeddingretriever" +description: "An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore." 
+--- + +# SupabasePgvectorEmbeddingRetriever + +An embedding-based Retriever compatible with the SupabasePgvectorDocumentStore. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. After a Text Embedder and before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the semantic search pipeline 3. After a Text Embedder and before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabasePgvectorDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query_embedding`: A vector representing the query (a list of floats) | +| **Output variables** | `documents`: A list of documents | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabasePgvectorEmbeddingRetriever` is a thin wrapper around [`PgvectorEmbeddingRetriever`](pgvectorembeddingretriever.mdx), adapted for use with `SupabasePgvectorDocumentStore`. It compares the query and Document embeddings and fetches the Documents most relevant to the query based on vector similarity. + +When using this Retriever in your pipeline, make sure embeddings are available. Add a Document Embedder to your indexing pipeline and a Text Embedder to your query pipeline. + +In addition to `query_embedding`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve), `filters` to narrow down the search space, and `vector_function` to override the similarity function set on the Document Store. + +Some relevant parameters that impact embedding retrieval must be defined when the `SupabasePgvectorDocumentStore` is initialized: `embedding_dimension`, `vector_function`, and `search_strategy` (`"exact_nearest_neighbor"` or `"hnsw"`). + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabasePgvectorDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore(embedding_dimension=768) +retriever = SupabasePgvectorEmbeddingRetriever(document_store=document_store) + +# using a fake vector to keep the example simple +retriever.run(query_embedding=[0.1] * 768) +``` + +### In a Pipeline + +```python +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy +from haystack.components.embedders import ( + SentenceTransformersTextEmbedder, + SentenceTransformersDocumentEmbedder, +) + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorEmbeddingRetriever, +) + +document_store = SupabasePgvectorDocumentStore( + embedding_dimension=768, + vector_function="cosine_similarity", + recreate_table=True, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + +document_embedder = SentenceTransformersDocumentEmbedder() +documents_with_embeddings = document_embedder.run(documents) + +document_store.write_documents( + documents_with_embeddings.get("documents"), + policy=DuplicatePolicy.OVERWRITE, +) + +query_pipeline = Pipeline() +query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) +query_pipeline.add_component( + "retriever", + SupabasePgvectorEmbeddingRetriever(document_store=document_store), +) 
+query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + +query = "How many languages are there?" + +result = query_pipeline.run({"text_embedder": {"text": query}}) + +print(result["retriever"]["documents"][0]) +``` diff --git a/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx new file mode 100644 index 0000000000..c29e0066ba --- /dev/null +++ b/docs-website/versioned_docs/version-2.30/pipeline-components/retrievers/supabasepgvectorkeywordretriever.mdx @@ -0,0 +1,135 @@ +--- +title: "SupabasePgvectorKeywordRetriever" +id: supabasepgvectorkeywordretriever +slug: "/supabasepgvectorkeywordretriever" +description: "A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore." +--- + +# SupabasePgvectorKeywordRetriever + +A keyword-based Retriever that fetches documents matching a query from the SupabasePgvectorDocumentStore. + +
+ +| | | +| --- | --- | +| **Most common position in a pipeline** | 1. Before a [`PromptBuilder`](../builders/promptbuilder.mdx) in a RAG pipeline 2. The last component in the keyword search pipeline 3. Before an [`ExtractiveReader`](../readers/extractivereader.mdx) in an extractive QA pipeline | +| **Mandatory init variables** | `document_store`: An instance of a [SupabasePgvectorDocumentStore](../../document-stores/supabasedocumentstore.mdx) | +| **Mandatory run variables** | `query`: A string | +| **Output variables** | `documents`: A list of documents (matching the query) | +| **API reference** | [Supabase](/reference/integrations-supabase) | +| **GitHub link** | https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/supabase | +| **Package name** | `supabase-haystack` | + +
+ +## Overview + +`SupabasePgvectorKeywordRetriever` is a thin wrapper around [`PgvectorKeywordRetriever`](pgvectorkeywordretriever.mdx), adapted for use with `SupabasePgvectorDocumentStore`. + +It uses PostgreSQL full-text search (`to_tsvector` / `plainto_tsquery`) to find Documents and ranks them with the `ts_rank_cd` function. The ranking considers how often the query terms appear in the Document, how close together the terms are, and how important the part of the Document is where they occur. For more details, see the [PostgreSQL documentation](https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING). + +Keep in mind that, unlike similar components such as `ElasticsearchBM25Retriever`, this Retriever does not apply fuzzy search out of the box, so it's necessary to carefully formulate the query in order to avoid getting zero results. + +The language used to parse query and Document content for keyword retrieval is set via the `language` parameter on the `SupabasePgvectorDocumentStore` (defaults to `"english"`). + +In addition to the `query`, the Retriever accepts optional parameters including `top_k` (the maximum number of Documents to retrieve) and `filters` to narrow the search space. + +## Installation + +```shell +pip install supabase-haystack +``` + +## Usage + +### On its own + +This Retriever needs the `SupabasePgvectorDocumentStore` and indexed Documents to run. + +Set the `SUPABASE_DB_URL` environment variable with your Supabase database connection string. 
+ +```python +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorKeywordRetriever, +) + +document_store = SupabasePgvectorDocumentStore() +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) + +retriever.run(query="my nice query") +``` + +### In a RAG pipeline + +The prerequisites for running this code are: + +- Set an environment variable `OPENAI_API_KEY` with your OpenAI API key. +- Set an environment variable `SUPABASE_DB_URL` with the connection string to your Supabase database. + +```python +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.document_stores.supabase import SupabasePgvectorDocumentStore +from haystack_integrations.components.retrievers.supabase import ( + SupabasePgvectorKeywordRetriever, +) + +prompt_template = [ + ChatMessage.from_user( + "Given these documents, answer the question.\nDocuments:\n" + "{% for doc in documents %}{{ doc.content }}{% endfor %}\n" + "Question: {{question}}\nAnswer:", + ), +] + +document_store = SupabasePgvectorDocumentStore( + language="english", + recreate_table=True, +) + +documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document( + content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors.", + ), + Document( + content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.", + ), +] + 
+document_store.write_documents(documents=documents, policy=DuplicatePolicy.SKIP) + +retriever = SupabasePgvectorKeywordRetriever(document_store=document_store) +rag_pipeline = Pipeline() +rag_pipeline.add_component(name="retriever", instance=retriever) +rag_pipeline.add_component( + instance=ChatPromptBuilder( + template=prompt_template, + required_variables={"question", "documents"}, + ), + name="prompt_builder", +) +rag_pipeline.add_component(instance=OpenAIChatGenerator(), name="llm") +rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") +rag_pipeline.connect("retriever", "prompt_builder.documents") +rag_pipeline.connect("prompt_builder.prompt", "llm.messages") +rag_pipeline.connect("llm.replies", "answer_builder.replies") +rag_pipeline.connect("retriever", "answer_builder.documents") + +question = "languages spoken around the world today" +result = rag_pipeline.run( + { + "retriever": {"query": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + }, +) +print(result["answer_builder"]) +``` diff --git a/docs-website/versioned_sidebars/version-2.30-sidebars.json b/docs-website/versioned_sidebars/version-2.30-sidebars.json index 530eb1e702..fcefb1373d 100644 --- a/docs-website/versioned_sidebars/version-2.30-sidebars.json +++ b/docs-website/versioned_sidebars/version-2.30-sidebars.json @@ -131,9 +131,11 @@ "href": "https://haystack.deepset.ai/integrations/neo4j-document-store" }, "document-stores/opensearch-document-store", + "document-stores/oracledocumentstore", "document-stores/pgvectordocumentstore", "document-stores/pinecone-document-store", "document-stores/qdrant-document-store", + "document-stores/supabasedocumentstore", "document-stores/valkeydocumentstore", "document-stores/vespadocumentstore", "document-stores/weaviatedocumentstore" @@ -550,6 +552,7 @@ "pipeline-components/retrievers/chromaqueryretriever", "pipeline-components/retrievers/elasticsearchbm25retriever", 
"pipeline-components/retrievers/elasticsearchembeddingretriever", + "pipeline-components/retrievers/elasticsearchsqlretriever", "pipeline-components/retrievers/faissembeddingretriever", "pipeline-components/retrievers/falkordbcypherretriever", "pipeline-components/retrievers/falkordbembeddingretriever", @@ -565,6 +568,8 @@ "pipeline-components/retrievers/opensearchbm25retriever", "pipeline-components/retrievers/opensearchembeddingretriever", "pipeline-components/retrievers/opensearchhybridretriever", + "pipeline-components/retrievers/oracleembeddingretriever", + "pipeline-components/retrievers/oraclekeywordretriever", "pipeline-components/retrievers/pgvectorembeddingretriever", "pipeline-components/retrievers/pgvectorkeywordretriever", "pipeline-components/retrievers/pineconedenseretriever", @@ -572,6 +577,9 @@ "pipeline-components/retrievers/qdranthybridretriever", "pipeline-components/retrievers/qdrantsparseembeddingretriever", "pipeline-components/retrievers/sentencewindowretrieval", + "pipeline-components/retrievers/supabasegroongabm25retriever", + "pipeline-components/retrievers/supabasepgvectorembeddingretriever", + "pipeline-components/retrievers/supabasepgvectorkeywordretriever", "pipeline-components/retrievers/snowflaketableretriever", "pipeline-components/retrievers/textembeddingretriever", "pipeline-components/retrievers/valkeyembeddingretriever", @@ -740,4 +748,4 @@ ] } ] -} \ No newline at end of file +}