# Source: integrations-python/langchain/langchain_vectorize/retrievers.py
# at aebdf7988119ead9d9bc83ea700109f4e0d60055 (vectorize-io/integrations-python, GitHub)
"""Vectorize LangChain retrievers."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Literal, Optional

import vectorize_client
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from typing_extensions import override
from vectorize_client import (
    ApiClient,
    Configuration,
    PipelinesApi,
    RetrieveDocumentsRequest,
)

if TYPE_CHECKING:
    from langchain_core.callbacks import CallbackManagerForRetrieverRun
    from langchain_core.runnables import RunnableConfig

_METADATA_FIELDS = {
    "relevancy",
    "chunk_id",
    "total_chunks",
    "origin",
    "origin_id",
    "similarity",
    "source",
    "unique_source",
    "source_display_name",
    "pipeline_id",
    "org_id",
}
_NOT_SET = object()


class VectorizeRetriever(BaseRetriever):
    """Vectorize retriever.

    Sends a query to a Vectorize pipeline through ``PipelinesApi`` and converts
    the returned documents into LangChain ``Document`` objects.

    The ``organization``, ``pipeline_id``, ``num_results``, ``rerank`` and
    ``metadata_filters`` fields act as defaults; each of them can be overridden
    per call through the keyword arguments of :meth:`invoke`.
    """

    api_token: str
    """The Vectorize API token."""
    environment: Literal["prod", "dev", "local", "staging"] = "prod"
    """The Vectorize API environment."""
    organization: Optional[str] = None  # noqa: UP007
    """The Vectorize organization ID."""
    pipeline_id: Optional[str] = None  # noqa: UP007
    """The Vectorize pipeline ID."""
    num_results: int = 5
    """The number of documents to return."""
    rerank: bool = False
    """Whether to rerank the results."""
    metadata_filters: list[dict[str, Any]] = []
    """The metadata filters to apply when retrieving the documents."""

    # API handle built in ``model_post_init`` from the fields above.
    _pipelines: PipelinesApi | None = None

    @override
    def model_post_init(self, /, context: Any) -> None:
        """Build the Vectorize API client for the configured environment.

        Selects the API host from ``environment`` and stores a ``PipelinesApi``
        bound to it on ``self._pipelines``.

        Args:
            context: Unused; accepted to match the overridden hook's signature.
        """
        header_name = None
        header_value = None
        if self.environment == "prod":
            host = "https://api.vectorize.io/v1"
        elif self.environment == "dev":
            host = "https://api-dev.vectorize.io/v1"
        elif self.environment == "local":
            host = "http://localhost:3000/api"
            # Only the local environment sends the token in an extra header,
            # in addition to the access token on the configuration.
            header_name = "x-lambda-api-key"
            header_value = self.api_token
        else:
            # Remaining literal value: "staging".
            host = "https://api-staging.vectorize.io/v1"
        # NOTE(review): ``debug=True`` is hard-coded for every environment,
        # including prod. Presumably this enables verbose client logging —
        # confirm it cannot leak the API token and consider making it opt-in.
        api = ApiClient(
            Configuration(host=host, access_token=self.api_token, debug=True),
            header_name,
            header_value,
        )
        self._pipelines = PipelinesApi(api)

    @staticmethod
    def _convert_document(document: vectorize_client.models.Document) -> Document:
        """Convert a Vectorize document into a LangChain ``Document``.

        Args:
            document: The document returned by the Vectorize API.

        Returns:
            A ``Document`` whose ``id`` and ``page_content`` come from the
            Vectorize document's ``id`` and ``text``, and whose metadata holds
            every attribute named in ``_METADATA_FIELDS``.
        """
        # Sorting gives a stable metadata key order; plain set iteration order
        # can differ from one interpreter run to the next.
        metadata = {
            field: getattr(document, field) for field in sorted(_METADATA_FIELDS)
        }
        return Document(id=document.id, page_content=document.text, metadata=metadata)

    @override
    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        organization: str | None = None,
        pipeline_id: str | None = None,
        num_results: int | None = None,
        rerank: bool | None = None,
        metadata_filters: list[dict[str, Any]] | None = None,
    ) -> list[Document]:
        """Retrieve the documents matching ``query`` from a Vectorize pipeline.

        Every keyword argument left as ``None`` falls back to the value
        configured on the retriever.

        Args:
            query: The query string.
            run_manager: Callback manager for the run (not used here).
            organization: Organization ID override.
            pipeline_id: Pipeline ID override.
            num_results: Override for the number of documents to return.
            rerank: Override for whether to rerank the results.
            metadata_filters: Override for the metadata filters.

        Returns:
            The retrieved documents converted to LangChain ``Document`` objects.

        Raises:
            ValueError: If no organization or no pipeline ID is available from
                either the call or the retriever's configuration.
        """
        organization_ = organization or self.organization
        if not organization_:
            msg = (
                "Organization must be set either at initialization "
                "or in the invoke method."
            )
            raise ValueError(msg)
        pipeline_id_ = pipeline_id or self.pipeline_id
        if not pipeline_id_:
            msg = (
                "Pipeline ID must be set either at initialization "
                "or in the invoke method."
            )
            raise ValueError(msg)

        request = RetrieveDocumentsRequest(
            question=query,
            # A falsy count (None or 0) falls back to the configured value.
            num_results=num_results or self.num_results,
            # ``False`` and ``[]`` are meaningful overrides, so only ``None``
            # falls back; ``or`` would silently discard them.
            rerank=self.rerank if rerank is None else rerank,
            metadata_filters=(
                self.metadata_filters if metadata_filters is None else metadata_filters
            ),
        )
        response = self._pipelines.retrieve_documents(
            organization_, pipeline_id_, request
        )
        return [self._convert_document(doc) for doc in response.documents]

    @override
    def invoke(
        self,
        input: str,
        config: RunnableConfig | None = None,
        *,
        organization: str = "",
        pipeline_id: str = "",
        num_results: int = _NOT_SET,
        rerank: bool = _NOT_SET,
        metadata_filters: list[dict[str, Any]] = _NOT_SET,
    ) -> list[Document]:
        """Invoke the retriever to get relevant documents.

        Main entry point for retriever invocations.

        Args:
            input: The query string.
            config: Configuration for the retriever. Defaults to None.
            organization: The organization to retrieve documents from.
                If set, overrides the organization set at the initialization of the
                retriever.
            pipeline_id: The pipeline ID to retrieve documents from.
                If set, overrides the pipeline ID set at the initialization of the
                retriever.
            num_results: The number of results to retrieve.
                If set, overrides the number of results set at the initialization of
                the retriever.
            rerank: Whether to rerank the retrieved documents.
                If set (including to ``False``), overrides the reranking set at the
                initialization of the retriever.
            metadata_filters: The metadata filters to apply when retrieving documents.
                If set (including to an empty list), overrides the metadata filters
                set at the initialization of the retriever.

        Returns:
            List of relevant documents.

        Examples:

            .. code-block:: python

                retriever.invoke("query")
        """
        # Forward only the arguments the caller actually supplied, so omitted
        # ones fall back to the retriever's configured values downstream.
        kwargs: dict[str, Any] = {}
        if organization:
            kwargs["organization"] = organization
        if pipeline_id:
            kwargs["pipeline_id"] = pipeline_id
        # Identity checks against the sentinel let falsy values through.
        if num_results is not _NOT_SET:
            kwargs["num_results"] = num_results
        if rerank is not _NOT_SET:
            kwargs["rerank"] = rerank
        if metadata_filters is not _NOT_SET:
            kwargs["metadata_filters"] = metadata_filters

        return super().invoke(input, config, **kwargs)