Skip to content

Commit 3eba930

Browse files
committed
Add transcription support for single-agent live scenarios
1 parent 955632c commit 3eba930

2 files changed

Lines changed: 103 additions & 4 deletions

File tree

src/google/adk/runners.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,12 +1104,12 @@ def _new_invocation_context_for_live(
11041104
live_request_queue: Optional[LiveRequestQueue] = None,
11051105
run_config: Optional[RunConfig] = None,
11061106
) -> InvocationContext:
1107-
"""Creates a new invocation context for live multi-agent."""
1107+
"""Creates a new invocation context for live single and multi-agent scenarios."""
11081108
run_config = run_config or RunConfig()
11091109

11101110
# For live multi-agent, we need model's text transcription as context for
1111-
# next agent.
1112-
if self.agent.sub_agents and live_request_queue:
1111+
# next agent. For single-agent, we need general transcription support.
1112+
if live_request_queue:
11131113
if not run_config.response_modalities:
11141114
# default
11151115
run_config.response_modalities = ['AUDIO']
@@ -1123,7 +1123,8 @@ def _new_invocation_context_for_live(
11231123
types.AudioTranscriptionConfig()
11241124
)
11251125
if not run_config.input_audio_transcription:
1126-
# need this input transcription for agent transferring in live mode.
1126+
# need this input transcription for agent transferring in multi-agent live
1127+
# mode and for general transcription support in single agent live mode.
11271128
run_config.input_audio_transcription = types.AudioTranscriptionConfig()
11281129
return self._new_invocation_context(
11291130
session,

tests/unittests/streaming/test_live_streaming_configs.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,3 +642,101 @@ def test_streaming_with_context_window_compression_config():
642642
llm_request_sent_to_mock.live_connect_config.context_window_compression.sliding_window.target_tokens
643643
== 500
644644
)
645+
646+
def test_single_agent_live_streaming_with_transcription():
  """Checks default transcription configs for a live agent without sub-agents.

  The live run is started without a RunConfig; the request captured by the
  mock model must carry both an output and an input audio transcription
  config.
  """
  model = testing_utils.MockModel.create([LlmResponse(turn_complete=True)])
  agent = Agent(name='single_agent', model=model, tools=[])
  runner = testing_utils.InMemoryRunner(root_agent=agent)

  # Queue a two-byte PCM audio blob before starting the live run.
  queue = LiveRequestQueue()
  audio_blob = types.Blob(data=b'\x00\xFF', mime_type='audio/pcm')
  queue.send_realtime(blob=audio_blob)

  events = runner.run_live(queue)

  assert events is not None, 'Expected a list of events, got None.'
  assert len(events) > 0, (
      'Expected at least one response, but got an empty list.'
  )
  assert len(model.requests) == 1

  # Inspect the live connect config of the single captured request.
  live_config = model.requests[0].live_connect_config
  assert live_config is not None
  assert live_config.output_audio_transcription is not None
  assert live_config.input_audio_transcription is not None
688+
689+
690+
def test_single_agent_live_streaming_respects_explicit_transcription():
  """Checks that caller-supplied transcription configs are passed through.

  The RunConfig handed to the live run already carries both transcription
  configs; the request captured by the mock model must reference those very
  same objects (identity, not just equality).
  """
  model = testing_utils.MockModel.create([LlmResponse(turn_complete=True)])

  # An agent with no sub_agents, i.e. the single-agent scenario.
  agent = Agent(name='single_agent', model=model, tools=[])
  runner = testing_utils.InMemoryRunner(root_agent=agent)

  # Build the run config around two distinct, explicitly created configs.
  output_cfg = types.AudioTranscriptionConfig()
  input_cfg = types.AudioTranscriptionConfig()
  run_config = RunConfig(
      output_audio_transcription=output_cfg,
      input_audio_transcription=input_cfg,
  )

  # Queue a two-byte PCM audio blob before starting the live run.
  queue = LiveRequestQueue()
  audio_blob = types.Blob(data=b'\x00\xFF', mime_type='audio/pcm')
  queue.send_realtime(blob=audio_blob)

  events = runner.run_live(queue, run_config)

  assert events is not None, 'Expected a list of events, got None.'
  assert len(events) > 0, (
      'Expected at least one response, but got an empty list.'
  )
  assert len(model.requests) == 1

  # The captured request must hold the exact config objects supplied above.
  live_config = model.requests[0].live_connect_config
  assert live_config is not None
  assert live_config.output_audio_transcription is output_cfg
  assert live_config.input_audio_transcription is input_cfg
742+

0 commit comments

Comments
 (0)