Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 35 additions & 12 deletions src/uipath_langchain/agent/tools/extraction_tool.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
"""Ixp extraction tool."""

from typing import Any
import uuid
from typing import Any, Optional

from langchain.tools import BaseTool
from langchain_core.messages import ToolCall, ToolMessage
from langchain_core.tools import StructuredTool
from langgraph.types import Command, interrupt
from pydantic import BaseModel, Field
from uipath.agent.models.agent import AgentIxpExtractionResourceConfig
from uipath.eval.mocks import mockable
from uipath.platform.attachments import Attachment
from uipath.platform.common import DocumentExtraction
from uipath.platform.documents import ExtractionResponseIXP

Expand All @@ -26,6 +27,34 @@ class StructuredToolWithWrapper(StructuredToolWithOutputType, ToolWrapperMixin):
pass


class ExtractionToolInputSchema(BaseModel):
"""Alias-free mirror of `Attachment` used as the tool's args_schema.

We don't use `Attachment` directly because its fields carry aliases
(`id` -> `ID`, `full_name` -> `FullName`, ...) and LangChain mishandles
aliased fields in two places (see PR #796):

1. `BaseTool._parse_input()` extracts each field with `getattr(model, key)`,
where `key` is the alias. For aliases that collide with built-in model
attributes (e.g. `schema`), this returns the built-in instead of the
field value, so downstream `kwargs.get("id") / kwargs.get("full_name")`
came back as `None`.
2. `tool_call_schema` rebuilds a subset of the model by copying each field
but drops alias and serialization options, so the rebuilt schema no
longer matches what the LLM emits.
Comment on lines +37 to +44
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: these LangChain bugs are not the root cause of the problem. The problem only surfaced because we fixed those bugs in our structured tool subclass.


Until LangChain fixes both, exposing an alias-free schema with field
names matching `Attachment`'s python names sidesteps the issue. Keep the
fields here in sync with `Attachment` — the test
`test_extraction_tool_has_attachment_input_schema` enforces this.
"""

id: uuid.UUID
full_name: str
mime_type: str
metadata: Optional[dict[str, Any]] = Field(None)


def create_ixp_extraction_tool(
resource: AgentIxpExtractionResourceConfig,
) -> StructuredTool:
Expand All @@ -38,27 +67,21 @@ def create_ixp_extraction_tool(
@mockable(
name=resource.name,
description=resource.description,
input_schema=Attachment.model_json_schema(),
input_schema=ExtractionToolInputSchema.model_json_schema(),
output_schema=ExtractionResponseIXP.model_json_schema(),
example_calls=resource.properties.example_calls,
)
async def extraction_tool_fn(**kwargs: Any) -> ExtractionResponseIXP:
from uipath.platform import UiPath

attachment = ExtractionToolInputSchema.model_validate(kwargs)
uipath = UiPath()

attachment_id = kwargs.get("id")
attachment_full_name = kwargs.get("full_name")

# TODO: attachment_mime_type is currently not used anywhere (attachment_full_name will also be obsolete once attachments api is onboarded)
# should we use them somewhere else? otherwise input_schema should only contain the file id
# attachment_mime_type = kwargs.get("mime_type")

# TODO: current workaround. DocumentExtraction model should support attachment_id and use the
# start_ixp_extraction_from_attachment sdk method once support is added

attachment_local_file_path = await uipath.attachments.download_async(
key=attachment_id, destination_path=attachment_full_name
key=attachment.id, destination_path=attachment.full_name
)
document_extraction_response = interrupt(
DocumentExtraction(
Expand Down Expand Up @@ -95,7 +118,7 @@ async def extraction_tool_wrapper(
tool = StructuredToolWithWrapper(
name=tool_name,
description=resource.description,
args_schema=Attachment,
args_schema=ExtractionToolInputSchema,
coroutine=extraction_tool_fn,
output_type=ExtractionResponseIXP,
metadata={
Expand Down
48 changes: 45 additions & 3 deletions tests/agent/tools/test_extraction_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
from uipath.platform.attachments import Attachment
from uipath.platform.documents import ExtractionResponseIXP

from uipath_langchain.agent.tools.extraction_tool import create_ixp_extraction_tool
from uipath_langchain.agent.tools.extraction_tool import (
ExtractionToolInputSchema,
create_ixp_extraction_tool,
)


class TestExtractionToolMetadata:
Expand Down Expand Up @@ -76,10 +79,16 @@ def test_extraction_tool_has_correct_description(self, extraction_resource):
assert tool.description == "Extract data from files"

def test_extraction_tool_has_attachment_input_schema(self, extraction_resource):
"""Test that extraction tool uses Attachment as input schema."""
"""Test that extraction tool's input schema mirrors Attachment fields."""
tool = create_ixp_extraction_tool(extraction_resource)

assert tool.args_schema == Attachment
assert tool.args_schema is ExtractionToolInputSchema
schema_fields = ExtractionToolInputSchema.model_fields
attachment_fields = Attachment.model_fields

assert schema_fields.keys() == attachment_fields.keys()
for name, attachment_field in attachment_fields.items():
assert schema_fields[name].annotation == attachment_field.annotation

def test_extraction_tool_has_extraction_response_output_type(
self, extraction_resource
Expand Down Expand Up @@ -235,6 +244,39 @@ async def test_extraction_tool_propagates_download_exception(

assert "Download failed" in str(exc_info.value)

@pytest.mark.asyncio
@patch("uipath.platform.UiPath")
@patch("uipath_langchain.agent.tools.extraction_tool.interrupt")
async def test_extraction_tool_handles_alias_keyed_input(
self, mock_interrupt, mock_uipath_class, extraction_resource
):
"""Regression test for the alias-handling issue: input keyed by the
alias-free python field names (id/full_name/mime_type/metadata), exactly as
ExtractionToolInputSchema.model_dump() produces it, must still reach
download_async with the populated UUID, not key=None.
"""
mock_client = MagicMock()
mock_uipath_class.return_value = mock_client
mock_client.attachments.download_async = AsyncMock(
return_value="/path/to/document.pdf"
)
mock_interrupt.return_value = {"extracted_data": {"field1": "value1"}}

tool = create_ixp_extraction_tool(extraction_resource)

attachment = ExtractionToolInputSchema(
id=UUID("fa93f4ca-bd3f-473a-93e5-e6e5b5a8f27f"),
full_name="document.pdf",
mime_type="application/pdf",
)
aliased_input = attachment.model_dump()

await tool.ainvoke(aliased_input)

mock_client.attachments.download_async.assert_called_once_with(
key=UUID("fa93f4ca-bd3f-473a-93e5-e6e5b5a8f27f"),
destination_path="document.pdf",
)


class TestExtractionToolNameSanitization:
"""Test that extraction tool names are properly sanitized."""
Expand Down
Loading