Skip to content

Commit 6b69841

Browse files
authored
add modality-aware Instructions with audio/text variants (#4987)
1 parent 9003a95 commit 6b69841

10 files changed

Lines changed: 509 additions & 75 deletions

File tree

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import logging
2+
from datetime import datetime
3+
4+
from dotenv import load_dotenv
5+
6+
from livekit.agents import (
7+
Agent,
8+
AgentServer,
9+
AgentSession,
10+
JobContext,
11+
JobProcess,
12+
cli,
13+
function_tool,
14+
inference,
15+
)
16+
from livekit.agents.llm import Instructions
17+
from livekit.plugins import silero
18+
19+
# Module-level logger named after this example.
logger = logging.getLogger("instructions-per-modality")

# NOTE(review): presumably pulls settings (e.g. API credentials) from a local
# .env file into the process environment — confirm against python-dotenv docs.
load_dotenv()
22+
23+
# Shared prompt template. `{modality_specific}` is replaced with AUDIO_SPECIFIC
# or TEXT_SPECIFIC and `{current_date}` with the formatted date; both are
# filled in through str.format in SchedulingAgent.__init__.
# The single backslash right after the opening quotes is a line continuation,
# so the prompt starts directly with "You are ..." instead of a stray
# backslash or an empty first line.
BASE_INSTRUCTIONS = """\
You are a scheduling assistant named Alex that helps users book appointments.
{modality_specific}
Call `book_appointment` to finalise the booking.
Never invent or assume details the user did not provide — ask for them instead.
The current date is {current_date}.
"""

# Voice users speak in approximate, self-correcting natural language.
# The LLM needs guidance on how to parse what was said, not how to say things back.
AUDIO_SPECIFIC = """
The user is speaking — their input arrives as voice transcription and may be imperfect.
When interpreting what the user said:
- Resolve relative spoken expressions to a concrete date/time: 'next Tuesday', 'tomorrow afternoon', 'the week after next around 3'.
- Spoken numbers may be ambiguous: 'three thirty' could mean 3:30 PM or the 30th of March — ask for clarification when context does not make it obvious.
- Honor verbal self-corrections: if the user says 'wait, I meant Thursday not Tuesday', update your understanding to Thursday and discard Tuesday.
- Ignore filler words and hesitations ('um', 'uh', 'like', 'I guess').
- Always confirm the resolved date and time out loud before booking, since spoken input is inherently ambiguous.
"""

# Text users type precise values — no need to normalise spoken patterns.
TEXT_SPECIFIC = """
The user is typing — take their input literally.
When interpreting what the user wrote:
- Accept exact dates and times in any common format (ISO, natural language, 12-hour or 24-hour clock).
- If the user provides a complete and unambiguous date and time, you may book immediately without asking for confirmation.
- Only ask follow-up questions for genuinely missing information.
"""
51+
52+
53+
class SchedulingAgent(Agent):
    """Scheduling agent that carries separate prompts for audio and text input.

    Both prompts are rendered from BASE_INSTRUCTIONS; only the
    modality-specific section (AUDIO_SPECIFIC vs. TEXT_SPECIFIC) differs.
    """

    def __init__(self) -> None:
        # Rendered once per agent instance, e.g. "2025-01-31 Friday"; the same
        # value is substituted into both prompt variants.
        current_date = datetime.now().strftime("%Y-%m-%d %A")
        super().__init__(
            instructions=Instructions(
                audio=BASE_INSTRUCTIONS.format(
                    modality_specific=AUDIO_SPECIFIC, current_date=current_date
                ),
                text=BASE_INSTRUCTIONS.format(
                    modality_specific=TEXT_SPECIFIC, current_date=current_date
                ),
            )
        )

    async def on_enter(self) -> None:
        # Ask the session for a reply immediately; no user input or extra
        # instructions are passed here.
        self.session.generate_reply()

    # NOTE(review): function_tool presumably exposes this docstring to the LLM
    # as the tool description, so its wording is left untouched — confirm.
    # The return annotation is `str` because the method returns the
    # confirmation message (it was previously annotated `None`).
    @function_tool
    async def book_appointment(self, date: str, time: str) -> str:
        """Book an appointment.

        Args:
            date: The date of the appointment in the format YYYY-MM-DD
            time: The time of the appointment in the format HH:MM
        """
        logger.info(f"booking appointment for {date} at {time}")
        return f"Appointment booked for {date} at {time}"
80+
81+
82+
server = AgentServer()


def prewarm(proc: JobProcess) -> None:
    """Load the Silero VAD and store it in ``proc.userdata`` under ``"vad"``."""
    vad_model = silero.VAD.load()
    proc.userdata["vad"] = vad_model


# Register prewarm as the server's setup function.
server.setup_fnc = prewarm
90+
91+
92+
@server.rtc_session()
async def entrypoint(ctx: JobContext) -> None:
    """Build an AgentSession and start a SchedulingAgent in the job's room.

    The VAD instance is read from ``ctx.proc.userdata["vad"]``, which
    ``prewarm`` populates.
    """
    # Components are created in the same order as they are passed below.
    speech_to_text = inference.STT("deepgram/nova-3")
    language_model = inference.LLM("openai/gpt-4.1-mini")
    text_to_speech = inference.TTS("cartesia/sonic-3")
    voice_activity = ctx.proc.userdata["vad"]

    session = AgentSession(
        stt=speech_to_text,
        llm=language_model,
        tts=text_to_speech,
        vad=voice_activity,
    )

    scheduling_agent = SchedulingAgent()
    await session.start(agent=scheduling_agent, room=ctx.room)
102+
103+
104+
if __name__ == "__main__":
    # When executed as a script, pass the configured server to the CLI runner.
    cli.run_app(server)

livekit-agents/livekit/agents/beta/workflows/address.py

Lines changed: 62 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
from ... import llm, stt, tts, vad
7+
from ...llm import Instructions
78
from ...llm.tool_context import ToolError, ToolFlag, function_tool
89
from ...types import NOT_GIVEN, NotGivenOr
910
from ...utils import is_given
@@ -15,6 +16,47 @@
1516
from ...voice.audio_recognition import TurnDetectionMode
1617

1718

19+
# Prompt template shared by the audio and text variants. The placeholders
# `{modality_specific}`, `{confirmation_instructions}` and
# `{extra_instructions}` are filled in with str.format by the task constructor.
# The trailing backslash after "explicitly called." is a line continuation:
# `{extra_instructions}` is appended directly to that sentence with no newline
# in between.
_BASE_INSTRUCTIONS = """
You are only a single step in a broader system, responsible solely for capturing an address.
You will be handling addresses from any country.
{modality_specific}
Call `update_address` at the first opportunity whenever you form a new hypothesis about the address. (before asking any questions or providing any answers.)
Don't invent new addresses, stick strictly to what the user said.
{confirmation_instructions}
If the address is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts in this order: street address, unit number if applicable, locality, and country.
Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary.
Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called.\
{extra_instructions}
"""

# Guidance for spoken input: example address shapes, silent normalisation of
# spoken patterns, and rules for reading addresses back aloud.
_AUDIO_SPECIFIC = """
Expect that users will say address in different formats with fields filled like:
- 'street_address': '450 SOUTH MAIN ST', 'unit_number': 'FLOOR 2', 'locality': 'SALT LAKE CITY UT 84101', 'country': 'UNITED STATES',
- 'street_address': '123 MAPLE STREET', 'unit_number': 'APARTMENT 10', 'locality': 'OTTAWA ON K1A 0B1', 'country': 'CANADA',
- 'street_address': 'GUOMAO JIE 3 HAO, CHAOYANG QU', 'unit_number': 'GUOMAO DA SHA 18 LOU 101 SHI', 'locality': 'BEIJING SHI 100000', 'country': 'CHINA',
- 'street_address': '5 RUE DE L'ANCIENNE COMÉDIE', 'unit_number': 'APP C4', 'locality': '75006 PARIS', 'country': 'FRANCE',
- 'street_address': 'PLOT 10, NEHRU ROAD', 'unit_number': 'OFFICE 403, 4TH FLOOR', 'locality': 'VILE PARLE (E), MUMBAI MAHARASHTRA 400099', 'country': 'INDIA',
Normalize common spoken patterns silently:
- Convert words like 'dash' and 'apostrophe' into symbols: `-`, `'`.
- Convert spelled out numbers like 'six' and 'seven' into numerals: `6`, `7`.
- Recognize patterns where users speak their address field followed by spelling: e.g., 'guomao g u o m a o'.
- Filter out filler words or hesitations.
- Recognize when there may be accents on certain letters if explicitly said or common in the location specified. Be sure to verify the correct accents if existent.
Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.
When reading a numerical ordinal suffix (st, nd, rd, th), the number must be verbally expanded into its full, correctly pronounced word form.
Do not read the number and the suffix letters separately.
Confirm postal codes by reading them out digit-by-digit as a sequence of single numbers. Do not read them as cardinal numbers.
For example, read 90210 as 'nine zero two one zero.'
Avoid using bullet points and parentheses in any responses.
Spell out the address letter-by-letter when applicable, such as street names and provinces, especially when the user spells it out initially.
"""

# Guidance for typed input: take the address as written and only ask about
# small gaps.
_TEXT_SPECIFIC = """
Expect users to type their address directly.
If the address looks almost correct but has minor issues (e.g. missing country or postal code), prompt for clarification.
"""
58+
59+
1860
@dataclass
1961
class GetAddressResult:
2062
address: str
@@ -34,40 +76,27 @@ def __init__(
3476
allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
3577
require_confirmation: NotGivenOr[bool] = NOT_GIVEN,
3678
) -> None:
79+
confirmation_instructions = (
80+
"Call `confirm_address` after the user confirmed the address is correct."
81+
)
82+
extra = extra_instructions if extra_instructions else ""
83+
3784
super().__init__(
38-
instructions=(
39-
"You are only a single step in a broader system, responsible solely for capturing an address.\n"
40-
"You will be handling addresses from any country. Expect that users will say address in different formats with fields filled like:\n"
41-
"- 'street_address': '450 SOUTH MAIN ST', 'unit_number': 'FLOOR 2', 'locality': 'SALT LAKE CITY UT 84101', 'country': 'UNITED STATES',\n"
42-
"- 'street_address': '123 MAPLE STREET', 'unit_number': 'APARTMENT 10', 'locality': 'OTTAWA ON K1A 0B1', 'country': 'CANADA',\n"
43-
"- 'street_address': 'GUOMAO JIE 3 HAO, CHAOYANG QU', 'unit_number': 'GUOMAO DA SHA 18 LOU 101 SHI', 'locality': 'BEIJING SHI 100000', 'country': 'CHINA',\n"
44-
"- 'street_address': '5 RUE DE L'ANCIENNE COMÉDIE', 'unit_number': 'APP C4', 'locality': '75006 PARIS', 'country': 'FRANCE',\n"
45-
"- 'street_address': 'PLOT 10, NEHRU ROAD', 'unit_number': 'OFFICE 403, 4TH FLOOR', 'locality': 'VILE PARLE (E), MUMBAI MAHARASHTRA 400099', 'country': 'INDIA',\n"
46-
"Normalize common spoken patterns silently:\n"
47-
"- Convert words like 'dash' and 'apostrophe' into symbols: `-`, `'`.\n"
48-
"- Convert spelled out numbers like 'six' and 'seven' into numerals: `6`, `7`.\n"
49-
"- Recognize patterns where users speak their address field followed by spelling: e.g., 'guomao g u o m a o'.\n"
50-
"- Filter out filler words or hesitations.\n"
51-
"- Recognize when there may be accents on certain letters if explicitly said or common in the location specified. Be sure to verify the correct accents if existent.\n"
52-
"Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.\n"
53-
"Call `update_address` at the first opportunity whenever you form a new hypothesis about the address. "
54-
"(before asking any questions or providing any answers.) \n"
55-
"Don't invent new addresses, stick strictly to what the user said. \n"
56-
+ (
57-
"Call `confirm_address` after the user confirmed the address is correct. \n"
58-
if require_confirmation is not False
59-
else ""
60-
)
61-
+ "When reading a numerical ordinal suffix (st, nd, rd, th), the number must be verbally expanded into its full, correctly pronounced word form.\n"
62-
"Do not read the number and the suffix letters separately.\n"
63-
"Confirm postal codes by reading them out digit-by-digit as a sequence of single numbers. Do not read them as cardinal numbers.\n"
64-
"For example, read 90210 as 'nine zero two one zero.'\n"
65-
"Avoid using bullet points and parenthese in any responses.\n"
66-
"Spell out the address letter-by-letter when applicable, such as street names and provinces, especially when the user spells it out initially. \n"
67-
"If the address is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts in this order: street address, unit number if applicable, locality, and country. \n"
68-
"Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary. \n"
69-
"Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called."
70-
+ extra_instructions
85+
instructions=Instructions(
86+
_BASE_INSTRUCTIONS.format(
87+
modality_specific=_AUDIO_SPECIFIC,
88+
confirmation_instructions=(
89+
confirmation_instructions if require_confirmation is not False else ""
90+
),
91+
extra_instructions=extra,
92+
),
93+
text=_BASE_INSTRUCTIONS.format(
94+
modality_specific=_TEXT_SPECIFIC,
95+
confirmation_instructions=(
96+
confirmation_instructions if require_confirmation is True else ""
97+
),
98+
extra_instructions=extra,
99+
),
71100
),
72101
chat_ctx=chat_ctx,
73102
turn_detection=turn_detection,

livekit-agents/livekit/agents/beta/workflows/email_address.py

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import TYPE_CHECKING
66

77
from ... import llm, stt, tts, vad
8+
from ...llm import Instructions
89
from ...llm.tool_context import ToolError, ToolFlag, function_tool
910
from ...types import NOT_GIVEN, NotGivenOr
1011
from ...utils import is_given
@@ -19,6 +20,39 @@
1920
r"^[A-Za-z0-9][A-Za-z0-9._%+\-]*@(?:[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?\.)+[A-Za-z]{2,}$"
2021
)
2122

23+
# Prompt template shared by the audio and text variants. The placeholders
# `{modality_specific}`, `{confirmation_instructions}` and
# `{extra_instructions}` are filled in with str.format by the task constructor.
# The trailing backslash after "explicitly called." is a line continuation:
# `{extra_instructions}` is appended directly to that sentence with no newline
# in between.
_BASE_INSTRUCTIONS = """
You are only a single step in a broader system, responsible solely for capturing an email address.
{modality_specific}
Call `update_email_address` at the first opportunity whenever you form a new hypothesis about the email. (before asking any questions or providing any answers.)
Don't invent new email addresses, stick strictly to what the user said.
{confirmation_instructions}
If the email is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts: first the part before the '@', then the domain—only if needed.
Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary.
Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called.\
{extra_instructions}
"""

# Guidance for spoken input: examples of how emails are said aloud and which
# spoken patterns to normalise silently.
_AUDIO_SPECIFIC = """
Handle input as noisy voice transcription. Expect that users will say emails aloud with formats like:
- 'john dot doe at gmail dot com'
- 'susan underscore smith at yahoo dot co dot uk'
- 'dave dash b at protonmail dot com'
- 'jane at example' (partial—prompt for the domain)
- 'theo t h e o at livekit dot io' (name followed by spelling)
Normalize common spoken patterns silently:
- Convert words like 'dot', 'underscore', 'dash', 'plus' into symbols: `.`, `_`, `-`, `+`.
- Convert 'at' to `@`.
- Recognize patterns where users speak their name or a word, followed by spelling: e.g., 'john j o h n'.
- Filter out filler words or hesitations.
- Assume some spelling if contextually obvious (e.g. 'mike b two two' → mikeb22).
Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.
"""

# Guidance for typed input: expect a standard-format address and only ask
# about small mistakes.
_TEXT_SPECIFIC = """
Handle input as typed text. Expect users to type their email address directly in standard format.
If the address looks almost correct but has minor typos (e.g. missing '@' or domain), prompt for clarification.
"""
55+
2256

2357
@dataclass
2458
class GetEmailResult:
@@ -39,34 +73,27 @@ def __init__(
3973
allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
4074
require_confirmation: NotGivenOr[bool] = NOT_GIVEN,
4175
) -> None:
76+
confirmation_instructions = (
77+
"Call `confirm_email_address` after the user confirmed the email address is correct."
78+
)
79+
extra = extra_instructions if extra_instructions else ""
80+
4281
super().__init__(
43-
instructions=(
44-
"You are only a single step in a broader system, responsible solely for capturing an email address.\n"
45-
"Handle input as noisy voice transcription. Expect that users will say emails aloud with formats like:\n"
46-
"- 'john dot doe at gmail dot com'\n"
47-
"- 'susan underscore smith at yahoo dot co dot uk'\n"
48-
"- 'dave dash b at protonmail dot com'\n"
49-
"- 'jane at example' (partial—prompt for the domain)\n"
50-
"- 'theo t h e o at livekit dot io' (name followed by spelling)\n"
51-
"Normalize common spoken patterns silently:\n"
52-
"- Convert words like 'dot', 'underscore', 'dash', 'plus' into symbols: `.`, `_`, `-`, `+`.\n"
53-
"- Convert 'at' to `@`.\n"
54-
"- Recognize patterns where users speak their name or a word, followed by spelling: e.g., 'john j o h n'.\n"
55-
"- Filter out filler words or hesitations.\n"
56-
"- Assume some spelling if contextually obvious (e.g. 'mike b two two' → mikeb22).\n"
57-
"Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.\n"
58-
"Call `update_email_address` at the first opportunity whenever you form a new hypothesis about the email. "
59-
"(before asking any questions or providing any answers.) \n"
60-
"Don't invent new email addresses, stick strictly to what the user said. \n"
61-
+ (
62-
"Call `confirm_email_address` after the user confirmed the email address is correct. \n"
63-
if require_confirmation is not False
64-
else ""
65-
)
66-
+ "If the email is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts: first the part before the '@', then the domain—only if needed. \n"
67-
"Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary. \n"
68-
"Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called."
69-
+ extra_instructions
82+
instructions=Instructions(
83+
_BASE_INSTRUCTIONS.format(
84+
modality_specific=_AUDIO_SPECIFIC,
85+
confirmation_instructions=(
86+
confirmation_instructions if require_confirmation is not False else ""
87+
),
88+
extra_instructions=extra,
89+
),
90+
text=_BASE_INSTRUCTIONS.format(
91+
modality_specific=_TEXT_SPECIFIC,
92+
confirmation_instructions=(
93+
confirmation_instructions if require_confirmation is True else ""
94+
),
95+
extra_instructions=extra,
96+
),
7097
),
7198
chat_ctx=chat_ctx,
7299
turn_detection=turn_detection,

livekit-agents/livekit/agents/llm/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
FunctionCall,
1212
FunctionCallOutput,
1313
ImageContent,
14+
Instructions,
1415
MetricsReport,
1516
)
1617
from .fallback_adapter import AvailabilityChangedEvent, FallbackAdapter
@@ -71,6 +72,7 @@
7172
"AgentConfigUpdate",
7273
"AgentHandoff",
7374
"MetricsReport",
75+
"Instructions",
7476
"ChatItem",
7577
"ChoiceDelta",
7678
"ChatChunk",

0 commit comments

Comments
 (0)