Skip to content

Commit 6c77a0c

Browse files
committed
feat(tools): Add LangExtract tool for structured information extraction
Add LangExtractTool to the community tools module, enabling ADK agents to extract structured data (entities, attributes, relationships) from unstructured text using Google's LangExtract library.
- New: src/google/adk_community/tools/langextract_tool.py
- New: src/google/adk_community/tools/__init__.py
- New: tests/unittests/tools/test_langextract_tool.py
- Updated: pyproject.toml with langextract optional dependency
- Updated: adk_community __init__.py to expose tools module
LangExtractToolConfig uses @dataclass for concise, idiomatic config.
1 parent a67b583 commit 6c77a0c

6 files changed

Lines changed: 410 additions & 0 deletions

File tree

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,12 @@ changelog = "https://github.com/google/adk-python-community/blob/main/CHANGELOG.
4141
documentation = "https://google.github.io/adk-docs/"
4242

4343
[project.optional-dependencies]
44+
langextract = [
45+
"langextract>=0.1.0",
46+
]
47+
4448
test = [
49+
"langextract>=0.1.0", # For LangExtractTool tests
4550
"pytest>=8.4.2",
4651
"pytest-asyncio>=1.2.0",
4752
]

src/google/adk_community/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@
1414

1515
from . import memory
1616
from . import sessions
17+
from . import tools
1718
from . import version
1819
__version__ = version.__version__
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from .langextract_tool import LangExtractTool
16+
from .langextract_tool import LangExtractToolConfig
17+
18+
__all__ = [
19+
'LangExtractTool',
20+
'LangExtractToolConfig',
21+
]
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import asyncio
18+
from dataclasses import dataclass
19+
from dataclasses import field
20+
import logging
21+
from typing import Any
22+
from typing import Optional
23+
24+
from google.adk.tools import BaseTool
25+
from google.adk.tools.tool_context import ToolContext
26+
from google.genai import types
27+
from typing_extensions import override
28+
29+
try:
30+
import langextract as lx
31+
except ImportError as e:
32+
raise ImportError(
33+
'LangExtract tools require pip install langextract.'
34+
) from e
35+
36+
logger = logging.getLogger(__name__)
37+
38+
39+
class LangExtractTool(BaseTool):
  """A tool that extracts structured information from text using LangExtract.

  This tool wraps the langextract library to enable LLM agents to extract
  structured data (entities, attributes, relationships) from unstructured
  text. The agent provides the text to extract from and a description of
  what to extract; other parameters are pre-configured at construction time.

  Args:
    name: The name of the tool. Defaults to 'langextract'.
    description: The description of the tool shown to the LLM.
    examples: Optional list of langextract ExampleData for few-shot
      extraction guidance.
    model_id: The model ID for langextract to use internally.
      Defaults to 'gemini-2.5-flash'.
    api_key: Optional API key for langextract. When None, no ``api_key``
      argument is passed and langextract resolves the key itself.
    extraction_passes: Number of extraction passes. Defaults to 1.
    max_workers: Maximum worker threads for langextract. Defaults to 1.
    max_char_buffer: Maximum character buffer size for text chunking.
      Defaults to 4000.

  Examples::

    from google.adk_community.tools import LangExtractTool
    import langextract as lx

    tool = LangExtractTool(
        name='extract_entities',
        description='Extract named entities from text.',
        examples=[
            lx.data.ExampleData(
                text='John is a software engineer at Google.',
                extractions=[
                    lx.data.Extraction(
                        extraction_class='person',
                        extraction_text='John',
                        attributes={
                            'role': 'software engineer',
                            'company': 'Google',
                        },
                    )
                ],
            )
        ],
    )
  """

  def __init__(
      self,
      *,
      name: str = 'langextract',
      description: str = (
          'Extracts structured information from unstructured'
          ' text. Provide the text and a description of what'
          ' to extract.'
      ),
      examples: Optional[list[lx.data.ExampleData]] = None,
      model_id: str = 'gemini-2.5-flash',
      api_key: Optional[str] = None,
      extraction_passes: int = 1,
      max_workers: int = 1,
      max_char_buffer: int = 4000,
  ):
    super().__init__(name=name, description=description)
    self._examples = examples or []
    self._model_id = model_id
    self._api_key = api_key
    self._extraction_passes = extraction_passes
    self._max_workers = max_workers
    self._max_char_buffer = max_char_buffer

  @override
  def _get_declaration(self) -> Optional[types.FunctionDeclaration]:
    """Declare the two string arguments the LLM must supply."""
    return types.FunctionDeclaration(
        name=self.name,
        description=self.description,
        parameters=types.Schema(
            type=types.Type.OBJECT,
            properties={
                'text': types.Schema(
                    type=types.Type.STRING,
                    description=(
                        'The unstructured text to extract information from.'
                    ),
                ),
                'prompt_description': types.Schema(
                    type=types.Type.STRING,
                    description=(
                        'A description of what kind of information to'
                        ' extract from the text.'
                    ),
                ),
            },
            required=['text', 'prompt_description'],
        ),
    )

  @staticmethod
  def _collect_extractions(result: Any) -> list[Any]:
    """Flatten whatever ``lx.extract`` returned into a list of extractions.

    Per the langextract documentation, a single text input yields one
    annotated document whose ``extractions`` attribute holds the results
    (and may be None). Iterating the document itself raises TypeError, so
    the attribute is read explicitly. An iterable of documents, or a plain
    iterable of extraction objects, is flattened as well.
    """
    if result is None:
      return []
    if hasattr(result, 'extractions'):
      return list(result.extractions or [])
    collected: list[Any] = []
    for item in result:
      if hasattr(item, 'extractions'):
        collected.extend(item.extractions or [])
      else:
        collected.append(item)
    return collected

  @staticmethod
  def _serialize_extraction(extraction: Any) -> dict[str, Any]:
    """Convert one extraction object into a plain dict for the LLM."""
    entry: dict[str, Any] = {
        'extraction_class': extraction.extraction_class,
        'extraction_text': extraction.extraction_text,
    }
    # Attributes are optional; omit the key when there is nothing to report.
    if extraction.attributes:
      entry['attributes'] = extraction.attributes
    return entry

  @override
  async def run_async(
      self, *, args: dict[str, Any], tool_context: ToolContext
  ) -> Any:
    """Run LangExtract on the supplied text.

    Args:
      args: Tool arguments; must contain non-empty 'text' and
        'prompt_description' entries.
      tool_context: The ADK tool context (not used by this tool).

    Returns:
      ``{'extractions': [...]}`` on success, where each entry has
      'extraction_class', 'extraction_text' and, when present,
      'attributes'. ``{'error': message}`` when an argument is missing or
      the extraction fails; failures are reported rather than raised.
    """
    text = args.get('text')
    prompt_description = args.get('prompt_description')

    if not text:
      return {'error': 'The "text" parameter is required.'}
    if not prompt_description:
      return {'error': 'The "prompt_description" parameter is required.'}

    extract_kwargs: dict[str, Any] = {
        'text_or_documents': text,
        'prompt_description': prompt_description,
        'examples': self._examples,
        'model_id': self._model_id,
        'extraction_passes': self._extraction_passes,
        'max_workers': self._max_workers,
        'max_char_buffer': self._max_char_buffer,
    }
    # Only forward an explicit key; otherwise let langextract resolve it.
    if self._api_key is not None:
      extract_kwargs['api_key'] = self._api_key

    try:
      # lx.extract() is synchronous; run in a thread to avoid
      # blocking the event loop.
      result = await asyncio.to_thread(lx.extract, **extract_kwargs)
      extractions = [
          self._serialize_extraction(extraction)
          for extraction in self._collect_extractions(result)
      ]
    except Exception as e:  # Tool boundary: report failures to the agent.
      # logger.exception keeps the traceback, which logger.error dropped.
      logger.exception('LangExtract extraction failed: %s', e)
      return {'error': f'Extraction failed: {e}'}

    return {'extractions': extractions}
181+
182+
183+
@dataclass
class LangExtractToolConfig:
  """Configuration for LangExtractTool.

  Each field mirrors the keyword argument of the same name accepted by
  ``LangExtractTool``; call ``build()`` to create the tool.
  """

  name: str = 'langextract'
  description: str = (
      'Extracts structured information from unstructured text.'
  )
  examples: list[lx.data.ExampleData] = field(default_factory=list)
  model_id: str = 'gemini-2.5-flash'
  # Kept out of the generated __repr__ so that logging or printing a config
  # never writes the credential in plaintext.
  api_key: Optional[str] = field(default=None, repr=False)
  extraction_passes: int = 1
  max_workers: int = 1
  max_char_buffer: int = 4000

  def build(self) -> LangExtractTool:
    """Instantiate a LangExtractTool from this config.

    Returns:
      A new LangExtractTool configured with this object's field values.
    """
    return LangExtractTool(
        name=self.name,
        description=self.description,
        # Pass a copy so later edits to this config's list do not silently
        # change the examples of a tool that was already built.
        examples=list(self.examples),
        model_id=self.model_id,
        api_key=self.api_key,
        extraction_passes=self.extraction_passes,
        max_workers=self.max_workers,
        max_char_buffer=self.max_char_buffer,
    )

tests/unittests/tools/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)