diff --git a/sdk/ai/azure-ai-voicelive/CHANGELOG.md b/sdk/ai/azure-ai-voicelive/CHANGELOG.md index 483d70489edb..b44c4fffc9dc 100644 --- a/sdk/ai/azure-ai-voicelive/CHANGELOG.md +++ b/sdk/ai/azure-ai-voicelive/CHANGELOG.md @@ -5,13 +5,33 @@ ### Features Added - **Support for Explicit Null Values**: Enhanced `RequestSession` to properly serialize explicitly set `None` values (e.g., `turn_detection=None` now correctly sends `"turn_detection": null` in the WebSocket message) +- **Filler Response Configuration**: Added support for filler response generation during latency or tool calls: + - `BasicFillerResponseConfig` for static filler texts that are randomly selected + - `LlmFillerResponseConfig` for LLM-generated context-aware filler responses + - `FillerTrigger` enum with `latency` and `tool` triggers + - `filler_response` field in `RequestSession` and `ResponseSession` +- **Foundry Agent Integration**: Added support for Azure AI Foundry agents: + - `FoundryAgentTool` for defining Foundry agent configurations + - `ResponseFoundryAgentCallItem` for Foundry agent call responses + - `FoundryAgentContextType` enum for context management (`no_context`, `agent_context`) + - Server events for the Foundry agent call lifecycle: `ServerEventResponseFoundryAgentCallArgumentsDelta`, `ServerEventResponseFoundryAgentCallArgumentsDone`, `ServerEventResponseFoundryAgentCallInProgress`, `ServerEventResponseFoundryAgentCallCompleted`, `ServerEventResponseFoundryAgentCallFailed` +- **Reasoning Effort Control**: Added `reasoning_effort` field to `RequestSession`, `ResponseSession`, and `ResponseCreateParams` for controlling reasoning model effort levels with the `ReasoningEffort` enum (`none`, `minimal`, `low`, `medium`, `high`, `xhigh`) +- **Response Metadata**: Added `metadata` field to `Response` and `ResponseCreateParams` for attaching up to 16 key-value pairs (max 64 chars for keys, 512 chars for values) +- **Array Encoding Support**: Enhanced serialization to support pipe-, space-, comma-, and newline-delimited array encoding formats +- **Custom Text Normalization**: Added `custom_text_normalization_url` field to `AzureStandardVoice`, `AzureCustomVoice`, and `AzurePersonalVoice` for custom text normalization configurations +- **Avatar Scene Configuration**: Added `Scene` model for controlling the avatar's zoom level, position (x/y), rotation (x/y/z pitch/yaw/roll), and movement amplitude in the video frame +- **Enhanced Avatar Configuration**: Added `scene` and `output_audit_audio` fields to `AvatarConfig` for scene control and audit audio forwarding via WebSocket ### Other Changes -- **Dependency Update**: Updated minimum `azure-core` version from 1.35.0 to 1.36.0 +- **Dependency Update**: Updated minimum `azure-core` version from 1.36.0 to 1.37.0 +- **Security Enhancement**: Removed `eval()` usage from serialization utilities and replaced it with explicit type checking for improved security +- **Serialization Improvements**: Enhanced model_base deserialization for mutable types and array-encoded strings ### Bug Fixes +- **Audio Format Values**: Fixed `OutputAudioFormat` enum values to use the underscore format (`pcm16_8000hz`, `pcm16_16000hz`) instead of the hyphenated format for consistency with the wire protocol and backward compatibility + ## 1.2.0b2 (2025-11-20) ### Features Added diff --git a/sdk/ai/azure-ai-voicelive/apiview-properties.json b/sdk/ai/azure-ai-voicelive/apiview-properties.json index 11a239dd8cfa..1bde12b720f5 100644 --- a/sdk/ai/azure-ai-voicelive/apiview-properties.json +++
b/sdk/ai/azure-ai-voicelive/apiview-properties.json @@ -23,6 +23,8 @@ "azure.ai.voicelive.models.AzureSemanticVadMultilingual": "VoiceLive.AzureSemanticVadMultilingual", "azure.ai.voicelive.models.AzureStandardVoice": "VoiceLive.AzureStandardVoice", "azure.ai.voicelive.models.Background": "VoiceLive.Background", + "azure.ai.voicelive.models.FillerResponseConfigBase": "VoiceLive.FillerResponseConfigBase", + "azure.ai.voicelive.models.BasicFillerResponseConfig": "VoiceLive.BasicFillerResponseConfig", "azure.ai.voicelive.models.CachedTokenDetails": "VoiceLive.CachedTokenDetails", "azure.ai.voicelive.models.ClientEvent": "VoiceLive.ClientEvent", "azure.ai.voicelive.models.ClientEventConversationItemCreate": "VoiceLive.ClientEventConversationItemCreate", @@ -44,15 +46,17 @@ "azure.ai.voicelive.models.ContentPart": "VoiceLive.ContentPart", "azure.ai.voicelive.models.ConversationItemBase": "VoiceLive.ConversationItemBase", "azure.ai.voicelive.models.ErrorResponse": "VoiceLive.ErrorResponse", + "azure.ai.voicelive.models.Tool": "VoiceLive.Tool", + "azure.ai.voicelive.models.FoundryAgentTool": "VoiceLive.FoundryAgentTool", "azure.ai.voicelive.models.FunctionCallItem": "VoiceLive.FunctionCallItem", "azure.ai.voicelive.models.FunctionCallOutputItem": "VoiceLive.FunctionCallOutputItem", - "azure.ai.voicelive.models.Tool": "VoiceLive.Tool", "azure.ai.voicelive.models.FunctionTool": "VoiceLive.FunctionTool", "azure.ai.voicelive.models.IceServer": "VoiceLive.IceServer", "azure.ai.voicelive.models.MessageContentPart": "VoiceLive.MessageContentPart", "azure.ai.voicelive.models.InputAudioContentPart": "VoiceLive.InputAudioContentPart", "azure.ai.voicelive.models.InputTextContentPart": "VoiceLive.InputTextContentPart", "azure.ai.voicelive.models.InputTokenDetails": "VoiceLive.InputTokenDetails", + "azure.ai.voicelive.models.LlmFillerResponseConfig": "VoiceLive.LlmFillerResponseConfig", "azure.ai.voicelive.models.LogProbProperties": "VoiceLive.LogProbProperties", "azure.ai.voicelive.models.MCPApprovalResponseRequestItem": "VoiceLive.MCPApprovalResponseRequestItem", "azure.ai.voicelive.models.MCPServer": "VoiceLive.MCPServer", @@ -71,6 +75,7 @@ "azure.ai.voicelive.models.ResponseCreateParams": "VoiceLive.ResponseCreateParams", "azure.ai.voicelive.models.ResponseFailedDetails": "VoiceLive.ResponseFailedDetails", "azure.ai.voicelive.models.ResponseItem": "VoiceLive.ResponseItem", + "azure.ai.voicelive.models.ResponseFoundryAgentCallItem": "VoiceLive.ResponseFoundryAgentCallItem", "azure.ai.voicelive.models.ResponseFunctionCallItem": "VoiceLive.ResponseFunctionCallItem", "azure.ai.voicelive.models.ResponseFunctionCallOutputItem": "VoiceLive.ResponseFunctionCallOutputItem", "azure.ai.voicelive.models.ResponseIncompleteDetails": "VoiceLive.ResponseIncompleteDetails", @@ -81,6 +86,7 @@ "azure.ai.voicelive.models.ResponseMessageItem": "VoiceLive.ResponseMessageItem", "azure.ai.voicelive.models.ResponseSession": "VoiceLive.ResponseSession", "azure.ai.voicelive.models.ResponseTextContentPart": "VoiceLive.ResponseTextContentPart", + "azure.ai.voicelive.models.Scene": "VoiceLive.Scene", "azure.ai.voicelive.models.ServerEvent": "VoiceLive.ServerEvent", "azure.ai.voicelive.models.ServerEventConversationItemCreated": "VoiceLive.ServerEventConversationItemCreated", "azure.ai.voicelive.models.ServerEventConversationItemDeleted": "VoiceLive.ServerEventConversationItemDeleted", @@ -112,6 +118,11 @@ "azure.ai.voicelive.models.ServerEventResponseContentPartDone": "VoiceLive.ServerEventResponseContentPartDone", 
"azure.ai.voicelive.models.ServerEventResponseCreated": "VoiceLive.ServerEventResponseCreated", "azure.ai.voicelive.models.ServerEventResponseDone": "VoiceLive.ServerEventResponseDone", + "azure.ai.voicelive.models.ServerEventResponseFoundryAgentCallArgumentsDelta": "VoiceLive.ServerEventResponseFoundryAgentCallArgumentsDelta", + "azure.ai.voicelive.models.ServerEventResponseFoundryAgentCallArgumentsDone": "VoiceLive.ServerEventResponseFoundryAgentCallArgumentsDone", + "azure.ai.voicelive.models.ServerEventResponseFoundryAgentCallCompleted": "VoiceLive.ServerEventResponseFoundryAgentCallCompleted", + "azure.ai.voicelive.models.ServerEventResponseFoundryAgentCallFailed": "VoiceLive.ServerEventResponseFoundryAgentCallFailed", + "azure.ai.voicelive.models.ServerEventResponseFoundryAgentCallInProgress": "VoiceLive.ServerEventResponseFoundryAgentCallInProgress", "azure.ai.voicelive.models.ServerEventResponseFunctionCallArgumentsDelta": "VoiceLive.ServerEventResponseFunctionCallArgumentsDelta", "azure.ai.voicelive.models.ServerEventResponseFunctionCallArgumentsDone": "VoiceLive.ServerEventResponseFunctionCallArgumentsDone", "azure.ai.voicelive.models.ServerEventResponseMcpCallArgumentsDelta": "VoiceLive.ServerEventResponseMcpCallArgumentsDelta", @@ -148,7 +159,9 @@ "azure.ai.voicelive.models.PersonalVoiceModels": "VoiceLive.PersonalVoiceModels", "azure.ai.voicelive.models.OutputAudioFormat": "VoiceLive.OutputAudioFormat", "azure.ai.voicelive.models.ToolType": "VoiceLive.ToolType", + "azure.ai.voicelive.models.FoundryAgentContextType": "VoiceLive.FoundryAgentContextType", "azure.ai.voicelive.models.MCPApprovalType": "VoiceLive.MCPApprovalType", + "azure.ai.voicelive.models.ReasoningEffort": "VoiceLive.ReasoningEffort", "azure.ai.voicelive.models.AnimationOutputType": "VoiceLive.AnimationOutputType", "azure.ai.voicelive.models.InputAudioFormat": "VoiceLive.InputAudioFormat", "azure.ai.voicelive.models.TurnDetectionType": "VoiceLive.TurnDetectionType", @@ -158,6 +171,8 @@ "azure.ai.voicelive.models.AvatarOutputProtocol": "VoiceLive.AvatarOutputProtocol", "azure.ai.voicelive.models.AudioTimestampType": "VoiceLive.AudioTimestampType", "azure.ai.voicelive.models.ToolChoiceLiteral": "VoiceLive.ToolChoiceLiteral", + "azure.ai.voicelive.models.FillerResponseConfigType": "VoiceLive.FillerResponseConfigType", + "azure.ai.voicelive.models.FillerTrigger": "VoiceLive.FillerTrigger", "azure.ai.voicelive.models.ResponseStatus": "VoiceLive.ResponseStatus", "azure.ai.voicelive.models.ResponseItemStatus": "VoiceLive.ResponseItemStatus", "azure.ai.voicelive.models.RequestImageContentPartDetail": "VoiceLive.RequestImageContentPartDetail", diff --git a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_types.py b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_types.py index af1ff22f45b7..86801624e71f 100644 --- a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_types.py +++ b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_types.py @@ -12,3 +12,4 @@ from . 
import models as _models Voice = Union[str, "_models.OpenAIVoiceName", "_models.OpenAIVoice", "_models.AzureVoice"] ToolChoice = Union[str, "_models.ToolChoiceLiteral", "_models.ToolChoiceSelection"] +FillerResponseConfig = Union["_models.BasicFillerResponseConfig", "_models.LlmFillerResponseConfig"] diff --git a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_utils/model_base.py b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_utils/model_base.py index 12926fa98dcf..d962e0e8fceb 100644 --- a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_utils/model_base.py +++ b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_utils/model_base.py @@ -37,6 +37,7 @@ TZ_UTC = timezone.utc _T = typing.TypeVar("_T") +_NONE_TYPE = type(None) def _timedelta_as_isostr(td: timedelta) -> str: @@ -171,6 +172,21 @@ def default(self, o): # pylint: disable=too-many-return-statements r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{4}\s\d{2}:\d{2}:\d{2}\sGMT" ) +_ARRAY_ENCODE_MAPPING = { + "pipeDelimited": "|", + "spaceDelimited": " ", + "commaDelimited": ",", + "newlineDelimited": "\n", +} + + +def _deserialize_array_encoded(delimit: str, attr): + if isinstance(attr, str): + if attr == "": + return [] + return attr.split(delimit) + return attr + def _deserialize_datetime(attr: typing.Union[str, datetime]) -> datetime: """Deserialize ISO-8601 formatted string into Datetime object. @@ -202,7 +218,7 @@ def _deserialize_datetime(attr: typing.Union[str, datetime]) -> datetime: test_utc = date_obj.utctimetuple() if test_utc.tm_year > 9999 or test_utc.tm_year < 1: raise OverflowError("Hit max or min date") - return date_obj + return date_obj # type: ignore[no-any-return] def _deserialize_datetime_rfc7231(attr: typing.Union[str, datetime]) -> datetime: @@ -256,7 +272,7 @@ def _deserialize_time(attr: typing.Union[str, time]) -> time: """ if isinstance(attr, time): return attr - return isodate.parse_time(attr) + return isodate.parse_time(attr) # type: ignore[no-any-return] def _deserialize_bytes(attr): @@ -315,6 +331,8 @@ def _deserialize_int_as_str(attr): def get_deserializer(annotation: typing.Any, rf: typing.Optional["_RestField"] = None): if annotation is int and rf and rf._format == "str": return _deserialize_int_as_str + if annotation is str and rf and rf._format in _ARRAY_ENCODE_MAPPING: + return functools.partial(_deserialize_array_encoded, _ARRAY_ENCODE_MAPPING[rf._format]) if rf and rf._format: return _DESERIALIZE_MAPPING_WITHFORMAT.get(rf._format) return _DESERIALIZE_MAPPING.get(annotation) # pyright: ignore @@ -353,9 +371,39 @@ def __contains__(self, key: typing.Any) -> bool: return key in self._data def __getitem__(self, key: str) -> typing.Any: + # If this key has been deserialized (for mutable types), we need to handle serialization + if hasattr(self, "_attr_to_rest_field"): + cache_attr = f"_deserialized_{key}" + if hasattr(self, cache_attr): + rf = _get_rest_field(getattr(self, "_attr_to_rest_field"), key) + if rf: + value = self._data.get(key) + if isinstance(value, (dict, list, set)): + # For mutable types, serialize and return + # But also update _data with serialized form and clear flag + # so mutations via this returned value affect _data + serialized = _serialize(value, rf._format) + # If serialized form is same type (no transformation needed), + # return _data directly so mutations work + if isinstance(serialized, type(value)) and serialized == value: + return self._data.get(key) + # Otherwise return serialized copy and clear flag + try: + object.__delattr__(self, cache_attr) + except AttributeError: + 
pass + # Store serialized form back + self._data[key] = serialized + return serialized return self._data.__getitem__(key) def __setitem__(self, key: str, value: typing.Any) -> None: + # Clear any cached deserialized value when setting through dictionary access + cache_attr = f"_deserialized_{key}" + try: + object.__delattr__(self, cache_attr) + except AttributeError: + pass self._data.__setitem__(key, value) def __delitem__(self, key: str) -> None: @@ -483,6 +531,8 @@ def _is_model(obj: typing.Any) -> bool: def _serialize(o, format: typing.Optional[str] = None): # pylint: disable=too-many-return-statements if isinstance(o, list): + if format in _ARRAY_ENCODE_MAPPING and all(isinstance(x, str) for x in o): + return _ARRAY_ENCODE_MAPPING[format].join(o) return [_serialize(x, format) for x in o] if isinstance(o, dict): return {k: _serialize(v, format) for k, v in o.items()} @@ -767,6 +817,17 @@ def _deserialize_sequence( return obj if isinstance(obj, ET.Element): obj = list(obj) + try: + if ( + isinstance(obj, str) + and isinstance(deserializer, functools.partial) + and isinstance(deserializer.args[0], functools.partial) + and deserializer.args[0].func == _deserialize_array_encoded # pylint: disable=comparison-with-callable + ): + # encoded string may be deserialized to sequence + return deserializer(obj) + except: # pylint: disable=bare-except + pass return type(obj)(_deserialize(deserializer, entry, module) for entry in obj) @@ -817,16 +878,16 @@ def _get_deserialize_callable_from_annotation( # pylint: disable=too-many-retur # is it optional? try: - if any(a for a in annotation.__args__ if a == type(None)): # pyright: ignore + if any(a is _NONE_TYPE for a in annotation.__args__): # pyright: ignore if len(annotation.__args__) <= 2: # pyright: ignore if_obj_deserializer = _get_deserialize_callable_from_annotation( - next(a for a in annotation.__args__ if a != type(None)), module, rf # pyright: ignore + next(a for a in annotation.__args__ if a is not _NONE_TYPE), module, rf # pyright: ignore ) return functools.partial(_deserialize_with_optional, if_obj_deserializer) # the type is Optional[Union[...]], we need to remove the None type from the Union annotation_copy = copy.copy(annotation) - annotation_copy.__args__ = [a for a in annotation_copy.__args__ if a != type(None)] # pyright: ignore + annotation_copy.__args__ = [a for a in annotation_copy.__args__ if a is not _NONE_TYPE] # pyright: ignore return _get_deserialize_callable_from_annotation(annotation_copy, module, rf) except AttributeError: pass @@ -998,7 +1059,11 @@ def __init__( @property def _class_type(self) -> typing.Any: - return getattr(self._type, "args", [None])[0] + result = getattr(self._type, "args", [None])[0] + # type may be wrapped by nested functools.partial so we need to check for that + if isinstance(result, functools.partial): + return getattr(result, "args", [None])[0] + return result @property def _rest_name(self) -> str: @@ -1009,14 +1074,37 @@ def _rest_name(self) -> str: def __get__(self, obj: Model, type=None): # pylint: disable=redefined-builtin # by this point, type and rest_name will have a value bc we default # them in __new__ of the Model class - item = obj.get(self._rest_name) + # Use _data.get() directly to avoid triggering __getitem__ which clears the cache + item = obj._data.get(self._rest_name) if item is None: return item if self._is_model: return item - return _deserialize(self._type, _serialize(item, self._format), rf=self) + + # For mutable types, we want mutations to directly affect _data + # Check if 
we've already deserialized this value + cache_attr = f"_deserialized_{self._rest_name}" + if hasattr(obj, cache_attr): + # Return the value from _data directly (it's been deserialized in place) + return obj._data.get(self._rest_name) + + deserialized = _deserialize(self._type, _serialize(item, self._format), rf=self) + + # For mutable types, store the deserialized value back in _data + # so mutations directly affect _data + if isinstance(deserialized, (dict, list, set)): + obj._data[self._rest_name] = deserialized + object.__setattr__(obj, cache_attr, True) # Mark as deserialized + return deserialized + + return deserialized def __set__(self, obj: Model, value) -> None: + # Clear the cached deserialized object when setting a new value + cache_attr = f"_deserialized_{self._rest_name}" + if hasattr(obj, cache_attr): + object.__delattr__(obj, cache_attr) + if value is None: # we want to wipe out entries if users set attr to None try: @@ -1184,7 +1272,7 @@ def _get_wrapped_element( _get_element(v, exclude_readonly, meta, wrapped_element) else: wrapped_element.text = _get_primitive_type_value(v) - return wrapped_element + return wrapped_element # type: ignore[no-any-return] def _get_primitive_type_value(v) -> str: @@ -1197,7 +1285,9 @@ def _get_primitive_type_value(v) -> str: return str(v) -def _create_xml_element(tag, prefix=None, ns=None): +def _create_xml_element( + tag: typing.Any, prefix: typing.Optional[str] = None, ns: typing.Optional[str] = None +) -> ET.Element: if prefix and ns: ET.register_namespace(prefix, ns) if ns: diff --git a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_utils/serialization.py b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_utils/serialization.py index 45a3e44e45cb..81ec1de5922b 100644 --- a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_utils/serialization.py +++ b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/_utils/serialization.py @@ -821,13 +821,20 @@ def serialize_basic(cls, data, data_type, **kwargs): :param str data_type: Type of object in the iterable. :rtype: str, int, float, bool :return: serialized object + :raises TypeError: raised if data_type is not one of str, int, float, bool. """ custom_serializer = cls._get_custom_serializers(data_type, **kwargs) if custom_serializer: return custom_serializer(data) if data_type == "str": return cls.serialize_unicode(data) - return eval(data_type)(data) # nosec # pylint: disable=eval-used + if data_type == "int": + return int(data) + if data_type == "float": + return float(data) + if data_type == "bool": + return bool(data) + raise TypeError("Unknown basic data type: {}".format(data_type)) @classmethod def serialize_unicode(cls, data): @@ -1757,7 +1764,7 @@ def deserialize_basic(self, attr, data_type): # pylint: disable=too-many-return :param str data_type: deserialization data type. :return: Deserialized basic type. :rtype: str, int, float or bool - :raises TypeError: if string format is not valid or data_type is not one of str, int, float, bool. + :raises TypeError: if string format is not valid or data_type is not one of str, int, float, bool. """ # If we're here, data is supposed to be a basic type.
# If it's still an XML node, take the text @@ -1783,7 +1790,11 @@ def deserialize_basic(self, attr, data_type): # pylint: disable=too-many-return if data_type == "str": return self.deserialize_unicode(attr) - return eval(data_type)(attr) # nosec # pylint: disable=eval-used + if data_type == "int": + return int(attr) + if data_type == "float": + return float(attr) + raise TypeError("Unknown basic data type: {}".format(data_type)) @staticmethod def deserialize_unicode(data): diff --git a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py index 0b0327e00d38..b38881c5313c 100644 --- a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py +++ b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/__init__.py @@ -32,6 +32,7 @@ AzureStandardVoice, AzureVoice, Background, + BasicFillerResponseConfig, CachedTokenDetails, ClientEvent, ClientEventConversationItemCreate, @@ -55,6 +56,8 @@ ConversationRequestItem, EouDetection, ErrorResponse, + FillerResponseConfigBase, + FoundryAgentTool, FunctionCallItem, FunctionCallOutputItem, FunctionTool, @@ -62,6 +65,7 @@ InputAudioContentPart, InputTextContentPart, InputTokenDetails, + LlmFillerResponseConfig, LogProbProperties, MCPApprovalResponseRequestItem, MCPServer, @@ -80,6 +84,7 @@ ResponseCancelledDetails, ResponseCreateParams, ResponseFailedDetails, + ResponseFoundryAgentCallItem, ResponseFunctionCallItem, ResponseFunctionCallOutputItem, ResponseIncompleteDetails, @@ -92,6 +97,7 @@ ResponseSession, ResponseStatusDetails, ResponseTextContentPart, + Scene, ServerEvent, ServerEventConversationItemCreated, ServerEventConversationItemDeleted, @@ -123,6 +129,11 @@ ServerEventResponseContentPartDone, ServerEventResponseCreated, ServerEventResponseDone, + ServerEventResponseFoundryAgentCallArgumentsDelta, + ServerEventResponseFoundryAgentCallArgumentsDone, + ServerEventResponseFoundryAgentCallCompleted, + ServerEventResponseFoundryAgentCallFailed, + ServerEventResponseFoundryAgentCallInProgress, ServerEventResponseFunctionCallArgumentsDelta, ServerEventResponseFunctionCallArgumentsDone, ServerEventResponseMcpCallArgumentsDelta, @@ -161,6 +172,9 @@ ClientEventType, ContentPartType, EouThresholdLevel, + FillerResponseConfigType, + FillerTrigger, + FoundryAgentContextType, InputAudioFormat, ItemParamStatus, ItemType, @@ -171,6 +185,7 @@ OutputAudioFormat, PersonalVoiceModels, PhotoAvatarBaseModes, + ReasoningEffort, RequestImageContentPartDetail, ResponseItemStatus, ResponseStatus, @@ -202,6 +217,7 @@ "AzureStandardVoice", "AzureVoice", "Background", + "BasicFillerResponseConfig", "CachedTokenDetails", "ClientEvent", "ClientEventConversationItemCreate", @@ -225,6 +241,8 @@ "ConversationRequestItem", "EouDetection", "ErrorResponse", + "FillerResponseConfigBase", + "FoundryAgentTool", "FunctionCallItem", "FunctionCallOutputItem", "FunctionTool", @@ -232,6 +250,7 @@ "InputAudioContentPart", "InputTextContentPart", "InputTokenDetails", + "LlmFillerResponseConfig", "LogProbProperties", "MCPApprovalResponseRequestItem", "MCPServer", @@ -250,6 +269,7 @@ "ResponseCancelledDetails", "ResponseCreateParams", "ResponseFailedDetails", + "ResponseFoundryAgentCallItem", "ResponseFunctionCallItem", "ResponseFunctionCallOutputItem", "ResponseIncompleteDetails", @@ -262,6 +282,7 @@ "ResponseSession", "ResponseStatusDetails", "ResponseTextContentPart", + "Scene", "ServerEvent", "ServerEventConversationItemCreated", "ServerEventConversationItemDeleted", @@ -293,6 +314,11 @@ 
"ServerEventResponseContentPartDone", "ServerEventResponseCreated", "ServerEventResponseDone", + "ServerEventResponseFoundryAgentCallArgumentsDelta", + "ServerEventResponseFoundryAgentCallArgumentsDone", + "ServerEventResponseFoundryAgentCallCompleted", + "ServerEventResponseFoundryAgentCallFailed", + "ServerEventResponseFoundryAgentCallInProgress", "ServerEventResponseFunctionCallArgumentsDelta", "ServerEventResponseFunctionCallArgumentsDone", "ServerEventResponseMcpCallArgumentsDelta", @@ -328,6 +354,9 @@ "ClientEventType", "ContentPartType", "EouThresholdLevel", + "FillerResponseConfigType", + "FillerTrigger", + "FoundryAgentContextType", "InputAudioFormat", "ItemParamStatus", "ItemType", @@ -338,6 +367,7 @@ "OutputAudioFormat", "PersonalVoiceModels", "PhotoAvatarBaseModes", + "ReasoningEffort", "RequestImageContentPartDetail", "ResponseItemStatus", "ResponseStatus", diff --git a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py index 7b700bbdf126..24d291e8c06b 100644 --- a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py +++ b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_enums.py @@ -100,6 +100,33 @@ class EouThresholdLevel(str, Enum, metaclass=CaseInsensitiveEnumMeta): """Default sensitivity threshold level.""" +class FillerResponseConfigType(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """Filler response configuration types.""" + + STATIC_FILLER = "static_filler" + """Static filler configuration type.""" + LLM_FILLER = "llm_filler" + """LLM-based filler configuration type.""" + + +class FillerTrigger(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """Triggers that can activate filler response generation.""" + + LATENCY = "latency" + """Trigger filler when response latency exceeds threshold.""" + TOOL = "tool" + """Trigger filler when a tool call is being executed.""" + + +class FoundryAgentContextType(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """The available set of Foundry agent context types.""" + + NO_CONTEXT = "no_context" + """Only the current user input is sent, no context maintained.""" + AGENT_CONTEXT = "agent_context" + """Agent maintains its own context (thread), only current input sent per call.""" + + class InputAudioFormat(str, Enum, metaclass=CaseInsensitiveEnumMeta): """Input audio format types supported.""" @@ -130,6 +157,7 @@ class ItemType(str, Enum, metaclass=CaseInsensitiveEnumMeta): MCP_CALL = "mcp_call" MCP_APPROVAL_REQUEST = "mcp_approval_request" MCP_APPROVAL_RESPONSE = "mcp_approval_response" + FOUNDRY_AGENT_CALL = "foundry_agent_call" class MCPApprovalType(str, Enum, metaclass=CaseInsensitiveEnumMeta): @@ -192,9 +220,9 @@ class OutputAudioFormat(str, Enum, metaclass=CaseInsensitiveEnumMeta): PCM16 = "pcm16" """16-bit PCM audio format at default sampling rate (24kHz)""" - PCM16_8000_HZ = "pcm16-8000hz" + PCM16_8000_HZ = "pcm16_8000hz" """16-bit PCM audio format at 8kHz sampling rate""" - PCM16_16000_HZ = "pcm16-16000hz" + PCM16_16000_HZ = "pcm16_16000hz" """16-bit PCM audio format at 16kHz sampling rate""" G711_ULAW = "g711_ulaw" """G.711 μ-law (mu-law) audio format at 8kHz sampling rate""" @@ -220,6 +248,27 @@ class PhotoAvatarBaseModes(str, Enum, metaclass=CaseInsensitiveEnumMeta): """VASA-1 model""" +class ReasoningEffort(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """Constrains effort on reasoning for reasoning models. Check model documentation for supported + values for each model. 
+ Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in + a response. + """ + + NONE = "none" + """No reasoning effort.""" + MINIMAL = "minimal" + """Minimal reasoning effort.""" + LOW = "low" + """Low reasoning effort - faster responses with less reasoning.""" + MEDIUM = "medium" + """Medium reasoning effort - balanced between speed and reasoning depth.""" + HIGH = "high" + """High reasoning effort - more thorough reasoning, may take longer.""" + XHIGH = "xhigh" + """Extra high reasoning effort - maximum reasoning depth.""" + + class RequestImageContentPartDetail(str, Enum, metaclass=CaseInsensitiveEnumMeta): """Specifies an image's detail level. Can be 'auto', 'low', 'high', or an unknown future value.""" @@ -300,6 +349,11 @@ class ServerEventType(str, Enum, metaclass=CaseInsensitiveEnumMeta): RESPONSE_MCP_CALL_IN_PROGRESS = "response.mcp_call.in_progress" RESPONSE_MCP_CALL_COMPLETED = "response.mcp_call.completed" RESPONSE_MCP_CALL_FAILED = "response.mcp_call.failed" + RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DELTA = "response.foundry_agent_call_arguments.delta" + RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE = "response.foundry_agent_call_arguments.done" + RESPONSE_FOUNDRY_AGENT_CALL_IN_PROGRESS = "response.foundry_agent_call.in_progress" + RESPONSE_FOUNDRY_AGENT_CALL_COMPLETED = "response.foundry_agent_call.completed" + RESPONSE_FOUNDRY_AGENT_CALL_FAILED = "response.foundry_agent_call.failed" class ToolChoiceLiteral(str, Enum, metaclass=CaseInsensitiveEnumMeta): @@ -320,6 +374,7 @@ class ToolType(str, Enum, metaclass=CaseInsensitiveEnumMeta): FUNCTION = "function" MCP = "mcp" + FOUNDRY_AGENT = "foundry_agent" class TurnDetectionType(str, Enum, metaclass=CaseInsensitiveEnumMeta): diff --git a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_models.py b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_models.py index 87ddc5547a8a..bef8a9eb7fa8 100644 --- a/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_models.py +++ b/sdk/ai/azure-ai-voicelive/azure/ai/voicelive/models/_models.py @@ -15,6 +15,7 @@ AzureVoiceType, ClientEventType, ContentPartType, + FillerResponseConfigType, ItemType, MessageRole, ResponseStatus, @@ -117,7 +118,8 @@ class ConversationRequestItem(_Model): FunctionCallItem, FunctionCallOutputItem, MCPApprovalResponseRequestItem, MessageItem :ivar type: Required. Known values are: "message", "function_call", "function_call_output", - "mcp_list_tools", "mcp_call", "mcp_approval_request", and "mcp_approval_response". + "mcp_list_tools", "mcp_call", "mcp_approval_request", "mcp_approval_response", and + "foundry_agent_call". :vartype type: str or ~azure.ai.voicelive.models.ItemType :ivar id: :vartype id: str @@ -126,7 +128,8 @@ class ConversationRequestItem(_Model): __mapping__: dict[str, _Model] = {} type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) """Required. Known values are: \"message\", \"function_call\", \"function_call_output\", - \"mcp_list_tools\", \"mcp_call\", \"mcp_approval_request\", and \"mcp_approval_response\".""" + \"mcp_list_tools\", \"mcp_call\", \"mcp_approval_request\", \"mcp_approval_response\", and + \"foundry_agent_call\".""" id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @overload @@ -374,9 +377,15 @@ class AvatarConfig(_Model): :vartype customized: bool :ivar video: Optional video configuration including resolution, bitrate, and codec. 
:vartype video: ~azure.ai.voicelive.models.VideoParams + :ivar scene: Configuration for the avatar's zoom level, position, rotation and movement + amplitude in the video frame. + :vartype scene: ~azure.ai.voicelive.models.Scene :ivar output_protocol: Output protocol for avatar streaming. Default is 'webrtc'. Known values are: "webrtc" and "websocket". :vartype output_protocol: str or ~azure.ai.voicelive.models.AvatarOutputProtocol + :ivar output_audit_audio: When enabled, forwards audit audio via WebSocket for review/debugging + purposes, even when avatar output is delivered via WebRTC. + :vartype output_audit_audio: bool """ type: Optional[Union[str, "_models.AvatarConfigTypes"]] = rest_field( @@ -399,11 +408,17 @@ class AvatarConfig(_Model): """Indicates whether the avatar is customized or not. Required.""" video: Optional["_models.VideoParams"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Optional video configuration including resolution, bitrate, and codec.""" + scene: Optional["_models.Scene"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Configuration for the avatar's zoom level, position, rotation and movement amplitude in the + video frame.""" output_protocol: Optional[Union[str, "_models.AvatarOutputProtocol"]] = rest_field( visibility=["read", "create", "update", "delete", "query"] ) """Output protocol for avatar streaming. Default is 'webrtc'. Known values are: \"webrtc\" and \"websocket\".""" + output_audit_audio: Optional[bool] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """When enabled, forwards audit audio via WebSocket for review/debugging purposes, even when + avatar output is delivered via WebRTC.""" @overload def __init__( @@ -416,7 +431,9 @@ def __init__( style: Optional[str] = None, model: Optional[Union[str, "_models.PhotoAvatarBaseModes"]] = None, video: Optional["_models.VideoParams"] = None, + scene: Optional["_models.Scene"] = None, output_protocol: Optional[Union[str, "_models.AvatarOutputProtocol"]] = None, + output_audit_audio: Optional[bool] = None, ) -> None: ... 
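# --- Editor's usage sketch (illustrative, not part of this patch) ---------------
# Shows the new `scene` and `output_audit_audio` fields on AvatarConfig. The
# `Scene` keyword arguments are assumptions inferred from the CHANGELOG entry
# (zoom level, position, rotation, movement amplitude) rather than confirmed
# against the generated Scene model, and "lisa" is a hypothetical character name.
from azure.ai.voicelive.models import AvatarConfig, Scene

avatar = AvatarConfig(
    character="lisa",  # hypothetical avatar character
    customized=False,
    scene=Scene(),  # zoom/position/rotation/movement-amplitude settings live here
    output_audit_audio=True,  # also forward audit audio over WebSocket for review
)
# --------------------------------------------------------------------------------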
@overload @@ -477,6 +494,8 @@ class AzureCustomVoice(AzureVoice, discriminator="azure-custom"): :vartype temperature: float :ivar custom_lexicon_url: :vartype custom_lexicon_url: str + :ivar custom_text_normalization_url: + :vartype custom_text_normalization_url: str :ivar prefer_locales: :vartype prefer_locales: list[str] :ivar locale: @@ -500,6 +519,9 @@ class AzureCustomVoice(AzureVoice, discriminator="azure-custom"): temperature: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Temperature must be between 0.0 and 1.0.""" custom_lexicon_url: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + custom_text_normalization_url: Optional[str] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) prefer_locales: Optional[list[str]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) locale: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) style: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -515,6 +537,7 @@ def __init__( endpoint_id: str, temperature: Optional[float] = None, custom_lexicon_url: Optional[str] = None, + custom_text_normalization_url: Optional[str] = None, prefer_locales: Optional[list[str]] = None, locale: Optional[str] = None, style: Optional[str] = None, @@ -549,6 +572,8 @@ class AzurePersonalVoice(AzureVoice, discriminator="azure-personal"): :vartype model: str or ~azure.ai.voicelive.models.PersonalVoiceModels :ivar custom_lexicon_url: :vartype custom_lexicon_url: str + :ivar custom_text_normalization_url: + :vartype custom_text_normalization_url: str :ivar prefer_locales: :vartype prefer_locales: list[str] :ivar locale: @@ -575,6 +600,9 @@ class AzurePersonalVoice(AzureVoice, discriminator="azure-personal"): """Underlying neural model to use for personal voice. Required. Known values are: \"DragonLatestNeural\", \"PhoenixLatestNeural\", and \"PhoenixV2Neural\".""" custom_lexicon_url: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + custom_text_normalization_url: Optional[str] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) prefer_locales: Optional[list[str]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) locale: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) style: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -590,6 +618,7 @@ def __init__( model: Union[str, "_models.PersonalVoiceModels"], temperature: Optional[float] = None, custom_lexicon_url: Optional[str] = None, + custom_text_normalization_url: Optional[str] = None, prefer_locales: Optional[list[str]] = None, locale: Optional[str] = None, style: Optional[str] = None, @@ -618,7 +647,7 @@ class EouDetection(_Model): :ivar model: Required. 
Is one of the following types: Literal["semantic_detection_v1"], Literal["semantic_detection_v1_en"], Literal["semantic_detection_v1_multilingual"], str - :vartype model: str or str or str or str + :vartype model: str """ __mapping__: dict[str, _Model] = {} @@ -1024,6 +1053,8 @@ class AzureStandardVoice(AzureVoice, discriminator="azure-standard"): :vartype temperature: float :ivar custom_lexicon_url: :vartype custom_lexicon_url: str + :ivar custom_text_normalization_url: + :vartype custom_text_normalization_url: str :ivar prefer_locales: :vartype prefer_locales: list[str] :ivar locale: @@ -1045,6 +1076,9 @@ class AzureStandardVoice(AzureVoice, discriminator="azure-standard"): temperature: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Temperature must be between 0.0 and 1.0.""" custom_lexicon_url: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + custom_text_normalization_url: Optional[str] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) prefer_locales: Optional[list[str]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) locale: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) style: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -1059,6 +1093,7 @@ def __init__( name: str, temperature: Optional[float] = None, custom_lexicon_url: Optional[str] = None, + custom_text_normalization_url: Optional[str] = None, prefer_locales: Optional[list[str]] = None, locale: Optional[str] = None, style: Optional[str] = None, @@ -1114,6 +1149,99 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) +class FillerResponseConfigBase(_Model): + """Base model for filler response configuration. + + You probably want to use the sub-classes and not this class directly. Known sub-classes are: + LlmFillerResponseConfig, BasicFillerResponseConfig + + :ivar type: The type of filler response configuration. Required. Known values are: + "static_filler" and "llm_filler". + :vartype type: str or ~azure.ai.voicelive.models.FillerResponseConfigType + :ivar triggers: List of triggers that can fire the filler. Any trigger can activate the filler + (OR logic). + Supported: 'latency', 'tool'. + :vartype triggers: list[str or ~azure.ai.voicelive.models.FillerTrigger] + :ivar latency_threshold_ms: Latency threshold in milliseconds before triggering filler + response. Default is 2000ms. + :vartype latency_threshold_ms: int + """ + + __mapping__: dict[str, _Model] = {} + type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) + """The type of filler response configuration. Required. Known values are: \"static_filler\" and + \"llm_filler\".""" + triggers: Optional[list[Union[str, "_models.FillerTrigger"]]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """List of triggers that can fire the filler. Any trigger can activate the filler (OR logic). + Supported: 'latency', 'tool'.""" + latency_threshold_ms: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Latency threshold in milliseconds before triggering filler response. Default is 2000ms.""" + + @overload + def __init__( + self, + *, + type: str, + triggers: Optional[list[Union[str, "_models.FillerTrigger"]]] = None, + latency_threshold_ms: Optional[int] = None, + ) -> None: ... 
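# --- Editor's usage sketch (illustrative, not part of this patch) ---------------
# Minimal construction of the filler response configuration defined in this file.
# The field names (`triggers`, `latency_threshold_ms`, `texts`) are taken from the
# models in this diff; wiring the config into a session via
# `RequestSession(filler_response=...)` mirrors the `filler_response` field added
# to RequestSession later in this file.
from azure.ai.voicelive.models import BasicFillerResponseConfig, FillerTrigger

filler = BasicFillerResponseConfig(
    triggers=[FillerTrigger.LATENCY, FillerTrigger.TOOL],  # any trigger fires (OR logic)
    latency_threshold_ms=1500,  # fire after 1.5 s instead of the 2000 ms default
    texts=["One moment...", "Let me check that."],  # chosen at random when triggered
)
# session = RequestSession(..., filler_response=filler)
# --------------------------------------------------------------------------------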
+ + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + +class BasicFillerResponseConfig(FillerResponseConfigBase, discriminator="static_filler"): + """Configuration for basic/static filler response generation. + Randomly selects from configured texts when any trigger condition is met. + + :ivar triggers: List of triggers that can fire the filler. Any trigger can activate the filler + (OR logic). + Supported: 'latency', 'tool'. + :vartype triggers: list[str or ~azure.ai.voicelive.models.FillerTrigger] + :ivar latency_threshold_ms: Latency threshold in milliseconds before triggering filler + response. Default is 2000ms. + :vartype latency_threshold_ms: int + :ivar type: Required. Static filler configuration type. + :vartype type: str or ~azure.ai.voicelive.models.STATIC_FILLER + :ivar texts: List of filler text options to randomly select from. + :vartype texts: list[str] + """ + + type: Literal[FillerResponseConfigType.STATIC_FILLER] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """Required. Static filler configuration type.""" + texts: Optional[list[str]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """List of filler text options to randomly select from.""" + + @overload + def __init__( + self, + *, + triggers: Optional[list[Union[str, "_models.FillerTrigger"]]] = None, + latency_threshold_ms: Optional[int] = None, + texts: Optional[list[str]] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = FillerResponseConfigType.STATIC_FILLER # type: ignore + + class CachedTokenDetails(_Model): """Details of output token usage. @@ -1942,6 +2070,117 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) +class Tool(_Model): + """The base representation of a voicelive tool definition. + + You probably want to use the sub-classes and not this class directly. Known sub-classes are: + FoundryAgentTool, FunctionTool, MCPServer + + :ivar type: Required. Known values are: "function", "mcp", and "foundry_agent". + :vartype type: str or ~azure.ai.voicelive.models.ToolType + """ + + __mapping__: dict[str, _Model] = {} + type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) + """Required. Known values are: \"function\", \"mcp\", and \"foundry_agent\".""" + + @overload + def __init__( + self, + *, + type: str, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + +class FoundryAgentTool(Tool, discriminator="foundry_agent"): + """The definition of a Foundry agent tool as used by the voicelive endpoint. + + :ivar type: Required. + :vartype type: str or ~azure.ai.voicelive.models.FOUNDRY_AGENT + :ivar agent_name: The name of the Foundry agent to call. Required. 
+ :vartype agent_name: str + :ivar agent_version: The version of the Foundry agent to call. + :vartype agent_version: str + :ivar project_name: The name of the Foundry project containing the agent. Required. + :vartype project_name: str + :ivar client_id: The client ID associated with the Foundry agent. + :vartype client_id: str + :ivar description: An optional description for the Foundry agent tool. If this is provided, it + will be used instead of the agent's description in the Foundry portal. + :vartype description: str + :ivar foundry_resource_override: An optional override for the Foundry resource used to execute + the agent. + :vartype foundry_resource_override: str + :ivar agent_context_type: The context type to use when invoking the Foundry agent. Defaults to + 'agent_context'. Known values are: "no_context" and "agent_context". + :vartype agent_context_type: str or ~azure.ai.voicelive.models.FoundryAgentContextType + :ivar return_agent_response_directly: Whether to return the agent's response directly in the + VoiceLive response. When set to false, VoiceLive rewrites the agent's response. + :vartype return_agent_response_directly: bool + """ + + type: Literal[ToolType.FOUNDRY_AGENT] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """Required.""" + agent_name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The name of the Foundry agent to call. Required.""" + agent_version: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The version of the Foundry agent to call.""" + project_name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The name of the Foundry project containing the agent. Required.""" + client_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The client ID associated with the Foundry agent.""" + description: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """An optional description for the Foundry agent tool. If this is provided, it will be used + instead of the agent's description in the Foundry portal.""" + foundry_resource_override: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """An optional override for the Foundry resource used to execute the agent.""" + agent_context_type: Optional[Union[str, "_models.FoundryAgentContextType"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """The context type to use when invoking the Foundry agent. Defaults to 'agent_context'. Known + values are: \"no_context\" and \"agent_context\".""" + return_agent_response_directly: Optional[bool] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Whether to return the agent's response directly in the VoiceLive response. When set to false, + VoiceLive rewrites the agent's response.""" + + @overload + def __init__( + self, + *, + agent_name: str, + project_name: str, + agent_version: Optional[str] = None, + client_id: Optional[str] = None, + description: Optional[str] = None, + foundry_resource_override: Optional[str] = None, + agent_context_type: Optional[Union[str, "_models.FoundryAgentContextType"]] = None, + return_agent_response_directly: Optional[bool] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model.
+ :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ToolType.FOUNDRY_AGENT # type: ignore + + class FunctionCallItem(ConversationRequestItem, discriminator="function_call"): """A function call item within a conversation. @@ -2043,38 +2282,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = ItemType.FUNCTION_CALL_OUTPUT # type: ignore -class Tool(_Model): - """The base representation of a voicelive tool definition. - - You probably want to use the sub-classes and not this class directly. Known sub-classes are: - FunctionTool, MCPServer - - :ivar type: Required. Known values are: "function" and "mcp". - :vartype type: str or ~azure.ai.voicelive.models.ToolType - """ - - __mapping__: dict[str, _Model] = {} - type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) - """Required. Known values are: \"function\" and \"mcp\".""" - - @overload - def __init__( - self, - *, - type: str, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - - class FunctionTool(Tool, discriminator="function"): """The definition of a function tool as used by the voicelive endpoint. @@ -2308,6 +2515,60 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) +class LlmFillerResponseConfig(FillerResponseConfigBase, discriminator="llm_filler"): + """Configuration for LLM-based filler response generation. + Uses LLM to generate context-aware filler responses when any trigger condition is met. + + :ivar triggers: List of triggers that can fire the filler. Any trigger can activate the filler + (OR logic). + Supported: 'latency', 'tool'. + :vartype triggers: list[str or ~azure.ai.voicelive.models.FillerTrigger] + :ivar latency_threshold_ms: Latency threshold in milliseconds before triggering filler + response. Default is 2000ms. + :vartype latency_threshold_ms: int + :ivar type: Required. LLM-based filler configuration type. + :vartype type: str or ~azure.ai.voicelive.models.LLM_FILLER + :ivar model: The model to use for LLM-based filler generation. Default is gpt-4.1-mini. + :vartype model: str + :ivar instructions: Custom instructions for generating filler responses. If not provided, a + default prompt is used. + :vartype instructions: str + :ivar max_completion_tokens: Maximum number of tokens to generate for the filler response. + :vartype max_completion_tokens: int + """ + + type: Literal[FillerResponseConfigType.LLM_FILLER] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """Required. LLM-based filler configuration type.""" + model: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The model to use for LLM-based filler generation. Default is gpt-4.1-mini.""" + instructions: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Custom instructions for generating filler responses. 
If not provided, a default prompt is used.""" + max_completion_tokens: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Maximum number of tokens to generate for the filler response.""" + + @overload + def __init__( + self, + *, + triggers: Optional[list[Union[str, "_models.FillerTrigger"]]] = None, + latency_threshold_ms: Optional[int] = None, + model: Optional[str] = None, + instructions: Optional[str] = None, + max_completion_tokens: Optional[int] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = FillerResponseConfigType.LLM_FILLER # type: ignore + + class LogProbProperties(_Model): """A single log probability entry for a token. @@ -2681,8 +2942,6 @@ class RequestSession(_Model): :vartype instructions: str :ivar input_audio_sampling_rate: Input audio sampling rate in Hz. Available values: - - * For pcm16: 8000, 16000, 24000 * For g711_alaw/g711_ulaw: 8000. @@ -2691,7 +2950,7 @@ class RequestSession(_Model): "g711_ulaw", and "g711_alaw". :vartype input_audio_format: str or ~azure.ai.voicelive.models.InputAudioFormat :ivar output_audio_format: Output audio format. Default is 'pcm16'. Known values are: "pcm16", - "pcm16-8000hz", "pcm16-16000hz", "g711_ulaw", and "g711_alaw". + "pcm16_8000hz", "pcm16_16000hz", "g711_ulaw", and "g711_alaw". :vartype output_audio_format: str or ~azure.ai.voicelive.models.OutputAudioFormat :ivar turn_detection: Type of turn detection to use. :vartype turn_detection: ~azure.ai.voicelive.models.TurnDetection @@ -2719,6 +2978,15 @@ class RequestSession(_Model): :ivar max_response_output_tokens: Maximum number of tokens to generate in the response. Default is unlimited. Is either a int type or a Literal["inf"] type. :vartype max_response_output_tokens: int or str + :ivar reasoning_effort: Constrains effort on reasoning for reasoning models. Check model + documentation for supported values for each model. + Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in + a response. Known values are: "none", "minimal", "low", "medium", "high", and "xhigh". + :vartype reasoning_effort: str or ~azure.ai.voicelive.models.ReasoningEffort + :ivar filler_response: Configuration for filler response generation during latency or tool + calls. Is either a BasicFillerResponseConfig type or a LlmFillerResponseConfig type. + :vartype filler_response: ~azure.ai.voicelive.models.BasicFillerResponseConfig or + ~azure.ai.voicelive.models.LlmFillerResponseConfig """ model: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -2737,8 +3005,6 @@ class RequestSession(_Model): input_audio_sampling_rate: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Input audio sampling rate in Hz. Available values: - - * For pcm16: 8000, 16000, 24000 * For g711_alaw/g711_ulaw: 8000.""" @@ -2750,8 +3016,8 @@ class RequestSession(_Model): output_audio_format: Optional[Union[str, "_models.OutputAudioFormat"]] = rest_field( visibility=["read", "create", "update", "delete", "query"] ) - """Output audio format. Default is 'pcm16'. Known values are: \"pcm16\", \"pcm16-8000hz\", - \"pcm16-16000hz\", \"g711_ulaw\", and \"g711_alaw\".""" + """Output audio format. Default is 'pcm16'. 
Known values are: \"pcm16\", \"pcm16_8000hz\", + \"pcm16_16000hz\", \"g711_ulaw\", and \"g711_alaw\".""" turn_detection: Optional["_models.TurnDetection"] = rest_field( visibility=["read", "create", "update", "delete", "query"] ) @@ -2786,6 +3052,19 @@ class RequestSession(_Model): ) """Maximum number of tokens to generate in the response. Default is unlimited. Is either a int type or a Literal[\"inf\"] type.""" + reasoning_effort: Optional[Union[str, "_models.ReasoningEffort"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Constrains effort on reasoning for reasoning models. Check model documentation for supported + values for each model. + Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in + a response. Known values are: \"none\", \"minimal\", \"low\", \"medium\", \"high\", and + \"xhigh\".""" + filler_response: Optional["_types.FillerResponseConfig"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Configuration for filler response generation during latency or tool calls. Is either a + BasicFillerResponseConfig type or a LlmFillerResponseConfig type.""" @overload def __init__( @@ -2809,6 +3088,8 @@ def __init__( tool_choice: Optional["_types.ToolChoice"] = None, temperature: Optional[float] = None, max_response_output_tokens: Optional[Union[int, Literal["inf"]]] = None, + reasoning_effort: Optional[Union[str, "_models.ReasoningEffort"]] = None, + filler_response: Optional["_types.FillerResponseConfig"] = None, ) -> None: ... @overload @@ -2895,7 +3176,7 @@ class Response(_Model): could be responding in either text or audio. :vartype modalities: list[str or ~azure.ai.voicelive.models.Modality] :ivar output_audio_format: The format of output audio. Options are ``pcm16``, ``g711_ulaw``, or - ``g711_alaw``. Known values are: "pcm16", "pcm16-8000hz", "pcm16-16000hz", "g711_ulaw", and + ``g711_alaw``. Known values are: "pcm16", "pcm16_8000hz", "pcm16_16000hz", "g711_ulaw", and "g711_alaw". :vartype output_audio_format: str or ~azure.ai.voicelive.models.OutputAudioFormat :ivar temperature: Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. @@ -2904,6 +3185,10 @@ class Response(_Model): inclusive of tool calls, that was used in this response. Is either a int type or a Literal["inf"] type. :vartype max_output_tokens: int or str + :ivar metadata: Set of up to 16 key-value pairs that can be attached to an object. + This can be useful for storing additional information about the object in a structured format. + Keys can be a maximum of 64 characters long and values can be a maximum of 512 characters long. + :vartype metadata: dict[str, str] """ id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -2953,7 +3238,7 @@ class Response(_Model): visibility=["read", "create", "update", "delete", "query"] ) """The format of output audio. Options are ``pcm16``, ``g711_ulaw``, or ``g711_alaw``. Known - values are: \"pcm16\", \"pcm16-8000hz\", \"pcm16-16000hz\", \"g711_ulaw\", and \"g711_alaw\".""" + values are: \"pcm16\", \"pcm16_8000hz\", \"pcm16_16000hz\", \"g711_ulaw\", and \"g711_alaw\".""" temperature: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Sampling temperature for the model, limited to [0.6, 1.2]. 
Defaults to 0.8.""" max_output_tokens: Optional[Union[int, Literal["inf"]]] = rest_field( @@ -2962,6 +3247,10 @@ class Response(_Model): """Maximum number of output tokens for a single assistant response, inclusive of tool calls, that was used in this response. Is either a int type or a Literal[\"inf\"] type.""" + metadata: Optional[dict[str, str]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Set of up to 16 key-value pairs that can be attached to an object. + This can be useful for storing additional information about the object in a structured format. + Keys can be a maximum of 64 characters long and values can be a maximum of 512 characters long.""" @overload def __init__( @@ -2979,6 +3268,7 @@ def __init__( output_audio_format: Optional[Union[str, "_models.OutputAudioFormat"]] = None, temperature: Optional[float] = None, max_output_tokens: Optional[Union[int, Literal["inf"]]] = None, + metadata: Optional[dict[str, str]] = None, ) -> None: ... @overload @@ -3130,7 +3420,7 @@ class ResponseCreateParams(_Model): :vartype voice: str or ~azure.ai.voicelive.models.OpenAIVoiceName or ~azure.ai.voicelive.models.OpenAIVoice or ~azure.ai.voicelive.models.AzureVoice :ivar output_audio_format: The format of output audio. Options are ``pcm16``, ``g711_ulaw``, or - ``g711_alaw``. Known values are: "pcm16", "pcm16-8000hz", "pcm16-16000hz", "g711_ulaw", and + ``g711_alaw``. Known values are: "pcm16", "pcm16_8000hz", "pcm16_16000hz", "g711_ulaw", and "g711_alaw". :vartype output_audio_format: str or ~azure.ai.voicelive.models.OutputAudioFormat :ivar tools: Tools (functions) available to the model. @@ -3151,6 +3441,15 @@ class ResponseCreateParams(_Model): added into the conversation history and returned with synthesized audio output in the created response. :vartype pre_generated_assistant_message: ~azure.ai.voicelive.models.AssistantMessageItem + :ivar reasoning_effort: Constrains effort on reasoning for reasoning models. Check model + documentation for supported values for each model. + Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in + a response. Known values are: "none", "minimal", "low", "medium", "high", and "xhigh". + :vartype reasoning_effort: str or ~azure.ai.voicelive.models.ReasoningEffort + :ivar metadata: Set of up to 16 key-value pairs that can be attached to an object. + This can be useful for storing additional information about the object in a structured format. + Keys can be a maximum of 64 characters long and values can be a maximum of 512 characters long. + :vartype metadata: dict[str, str] """ commit: Optional[bool] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -3191,7 +3490,7 @@ class ResponseCreateParams(_Model): visibility=["read", "create", "update", "delete", "query"] ) """The format of output audio. Options are ``pcm16``, ``g711_ulaw``, or ``g711_alaw``. Known - values are: \"pcm16\", \"pcm16-8000hz\", \"pcm16-16000hz\", \"g711_ulaw\", and \"g711_alaw\".""" + values are: \"pcm16\", \"pcm16_8000hz\", \"pcm16_16000hz\", \"g711_ulaw\", and \"g711_alaw\".""" tools: Optional[list["_models.Tool"]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Tools (functions) available to the model.""" tool_choice: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -3213,6 +3512,18 @@ class ResponseCreateParams(_Model): """Create the response with pre-generated assistant message. 
The message item would be added into the conversation history and returned with synthesized audio output in the created response.""" + reasoning_effort: Optional[Union[str, "_models.ReasoningEffort"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Constrains effort on reasoning for reasoning models. Check model documentation for supported + values for each model. + Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in + a response. Known values are: \"none\", \"minimal\", \"low\", \"medium\", \"high\", and + \"xhigh\".""" + metadata: Optional[dict[str, str]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Set of up to 16 key-value pairs that can be attached to an object. + This can be useful for storing additional information about the object in a structured format. + Keys can be a maximum of 64 characters long and values can be a maximum of 512 characters long.""" @overload def __init__( @@ -3231,6 +3542,8 @@ def __init__( temperature: Optional[float] = None, max_output_tokens: Optional[Union[int, Literal["inf"]]] = None, pre_generated_assistant_message: Optional["_models.AssistantMessageItem"] = None, + reasoning_effort: Optional[Union[str, "_models.ReasoningEffort"]] = None, + metadata: Optional[dict[str, str]] = None, ) -> None: ... @overload @@ -3281,12 +3594,13 @@ class ResponseItem(_Model): """Base for any response item; discriminated by ``type``. You probably want to use the sub-classes and not this class directly. Known sub-classes are: - ResponseFunctionCallItem, ResponseFunctionCallOutputItem, ResponseMCPApprovalRequestItem, - ResponseMCPApprovalResponseItem, ResponseMCPCallItem, ResponseMCPListToolItem, - ResponseMessageItem + ResponseFoundryAgentCallItem, ResponseFunctionCallItem, ResponseFunctionCallOutputItem, + ResponseMCPApprovalRequestItem, ResponseMCPApprovalResponseItem, ResponseMCPCallItem, + ResponseMCPListToolItem, ResponseMessageItem :ivar type: Required. Known values are: "message", "function_call", "function_call_output", - "mcp_list_tools", "mcp_call", "mcp_approval_request", and "mcp_approval_response". + "mcp_list_tools", "mcp_call", "mcp_approval_request", "mcp_approval_response", and + "foundry_agent_call". :vartype type: str or ~azure.ai.voicelive.models.ItemType :ivar id: :vartype id: str @@ -3297,7 +3611,8 @@ class ResponseItem(_Model): __mapping__: dict[str, _Model] = {} type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) """Required. Known values are: \"message\", \"function_call\", \"function_call_output\", - \"mcp_list_tools\", \"mcp_call\", \"mcp_approval_request\", and \"mcp_approval_response\".""" + \"mcp_list_tools\", \"mcp_call\", \"mcp_approval_request\", \"mcp_approval_response\", and + \"foundry_agent_call\".""" id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) object: Optional[Literal["realtime.item"]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Default value is \"realtime.item\".""" @@ -3322,6 +3637,70 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) +class ResponseFoundryAgentCallItem(ResponseItem, discriminator="foundry_agent_call"): + """A response item that represents a call to a Foundry agent. + + :ivar id: + :vartype id: str + :ivar object: Default value is "realtime.item". + :vartype object: str + :ivar type: The type of the item. Required. 
+ :vartype type: str or ~azure.ai.voicelive.models.FOUNDRY_AGENT_CALL + :ivar name: The name of the Foundry agent. Required. + :vartype name: str + :ivar call_id: The ID of the call. Required. + :vartype call_id: str + :ivar arguments: The arguments for the agent call. Required. + :vartype arguments: str + :ivar agent_response_id: The ID of the agent response, if any. + :vartype agent_response_id: str + :ivar output: The output of the agent call. + :vartype output: str + :ivar error: The error, if any, from the agent call. + :vartype error: any + """ + + type: Literal[ItemType.FOUNDRY_AGENT_CALL] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """The type of the item. Required.""" + name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The name of the Foundry agent. Required.""" + call_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the call. Required.""" + arguments: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The arguments for the agent call. Required.""" + agent_response_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the agent response, if any.""" + output: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The output of the agent call.""" + error: Optional[Any] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The error, if any, from the agent call.""" + + @overload + def __init__( + self, + *, + name: str, + call_id: str, + arguments: str, + id: Optional[str] = None, # pylint: disable=redefined-builtin + object: Optional[Literal["realtime.item"]] = None, + agent_response_id: Optional[str] = None, + output: Optional[str] = None, + error: Optional[Any] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ItemType.FOUNDRY_AGENT_CALL # type: ignore + + class ResponseFunctionCallItem(ResponseItem, discriminator="function_call"): """A function call item within a conversation. @@ -3733,8 +4112,6 @@ class ResponseSession(_Model): :vartype instructions: str :ivar input_audio_sampling_rate: Input audio sampling rate in Hz. Available values: - - * For pcm16: 8000, 16000, 24000 * For g711_alaw/g711_ulaw: 8000. @@ -3743,7 +4120,7 @@ class ResponseSession(_Model): "g711_ulaw", and "g711_alaw". :vartype input_audio_format: str or ~azure.ai.voicelive.models.InputAudioFormat :ivar output_audio_format: Output audio format. Default is 'pcm16'. Known values are: "pcm16", - "pcm16-8000hz", "pcm16-16000hz", "g711_ulaw", and "g711_alaw". + "pcm16_8000hz", "pcm16_16000hz", "g711_ulaw", and "g711_alaw". :vartype output_audio_format: str or ~azure.ai.voicelive.models.OutputAudioFormat :ivar turn_detection: Type of turn detection to use. :vartype turn_detection: ~azure.ai.voicelive.models.TurnDetection @@ -3771,6 +4148,15 @@ class ResponseSession(_Model): :ivar max_response_output_tokens: Maximum number of tokens to generate in the response. Default is unlimited. Is either a int type or a Literal["inf"] type. :vartype max_response_output_tokens: int or str + :ivar reasoning_effort: Constrains effort on reasoning for reasoning models. 
Check model + documentation for supported values for each model. + Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in + a response. Known values are: "none", "minimal", "low", "medium", "high", and "xhigh". + :vartype reasoning_effort: str or ~azure.ai.voicelive.models.ReasoningEffort + :ivar filler_response: Configuration for filler response generation during latency or tool + calls. Is either a BasicFillerResponseConfig type or a LlmFillerResponseConfig type. + :vartype filler_response: ~azure.ai.voicelive.models.BasicFillerResponseConfig or + ~azure.ai.voicelive.models.LlmFillerResponseConfig :ivar agent: The agent configuration for the session, if applicable. :vartype agent: ~azure.ai.voicelive.models.AgentConfig :ivar id: The unique identifier for the session. @@ -3793,8 +4179,6 @@ class ResponseSession(_Model): input_audio_sampling_rate: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Input audio sampling rate in Hz. Available values: - - * For pcm16: 8000, 16000, 24000 * For g711_alaw/g711_ulaw: 8000.""" @@ -3806,8 +4190,8 @@ class ResponseSession(_Model): output_audio_format: Optional[Union[str, "_models.OutputAudioFormat"]] = rest_field( visibility=["read", "create", "update", "delete", "query"] ) - """Output audio format. Default is 'pcm16'. Known values are: \"pcm16\", \"pcm16-8000hz\", - \"pcm16-16000hz\", \"g711_ulaw\", and \"g711_alaw\".""" + """Output audio format. Default is 'pcm16'. Known values are: \"pcm16\", \"pcm16_8000hz\", + \"pcm16_16000hz\", \"g711_ulaw\", and \"g711_alaw\".""" turn_detection: Optional["_models.TurnDetection"] = rest_field( visibility=["read", "create", "update", "delete", "query"] ) @@ -3842,6 +4226,19 @@ class ResponseSession(_Model): ) """Maximum number of tokens to generate in the response. Default is unlimited. Is either a int type or a Literal[\"inf\"] type.""" + reasoning_effort: Optional[Union[str, "_models.ReasoningEffort"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Constrains effort on reasoning for reasoning models. Check model documentation for supported + values for each model. + Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in + a response. Known values are: \"none\", \"minimal\", \"low\", \"medium\", \"high\", and + \"xhigh\".""" + filler_response: Optional["_types.FillerResponseConfig"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Configuration for filler response generation during latency or tool calls. Is either a + BasicFillerResponseConfig type or a LlmFillerResponseConfig type.""" agent: Optional["_models.AgentConfig"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The agent configuration for the session, if applicable.""" id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @@ -3869,6 +4266,8 @@ def __init__( tool_choice: Optional["_types.ToolChoice"] = None, temperature: Optional[float] = None, max_response_output_tokens: Optional[Union[int, Literal["inf"]]] = None, + reasoning_effort: Optional[Union[str, "_models.ReasoningEffort"]] = None, + filler_response: Optional["_types.FillerResponseConfig"] = None, agent: Optional["_models.AgentConfig"] = None, id: Optional[str] = None, # pylint: disable=redefined-builtin ) -> None: ... 
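The `reasoning_effort` and `filler_response` fields added to the session models above compose as in the following sketch. This is illustrative only and not part of this diff; the model name and filler texts are placeholders, while the types and keyword arguments follow the field definitions and unit tests in this PR:

```python
from azure.ai.voicelive.models import (
    BasicFillerResponseConfig,
    FillerTrigger,
    ReasoningEffort,
    RequestSession,
)

session = RequestSession(
    model="gpt-4o-realtime-preview",  # placeholder model name
    # Lower effort trades reasoning depth for faster, cheaper responses.
    reasoning_effort=ReasoningEffort.LOW,
    # Speak one of the configured filler lines when the response is slow
    # or while a tool call is running.
    filler_response=BasicFillerResponseConfig(
        triggers=[FillerTrigger.LATENCY, FillerTrigger.TOOL],
        latency_threshold_ms=2000,  # trigger the latency filler after ~2s
        texts=["One moment...", "Let me check that..."],
    ),
)
```

As in the samples touched by this PR, the configured session is then sent with `await connection.session.update(session=session)`; `LlmFillerResponseConfig` can be substituted for `BasicFillerResponseConfig` where generated rather than static filler text is wanted.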
@@ -3916,6 +4315,79 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = ContentPartType.TEXT # type: ignore +class Scene(_Model): + """Configuration for avatar's zoom level, position, rotation and movement amplitude in the video + frame. + + :ivar zoom: Zoom level of the avatar. Range is (0, +āˆž). Values less than 1 zoom out, values + greater than 1 zoom in. + :vartype zoom: float + :ivar position_x: Horizontal position of the avatar. Range is [-1, 1], as a proportion of frame + width. Negative values move left, positive values move right. + :vartype position_x: float + :ivar position_y: Vertical position of the avatar. Range is [-1, 1], as a proportion of frame + height. Negative values move up, positive values move down. + :vartype position_y: float + :ivar rotation_x: Rotation around the X-axis (pitch). Range is [-Ļ€, Ļ€] in radians. Negative + values rotate up, positive values rotate down. + :vartype rotation_x: float + :ivar rotation_y: Rotation around the Y-axis (yaw). Range is [-Ļ€, Ļ€] in radians. Negative + values rotate left, positive values rotate right. + :vartype rotation_y: float + :ivar rotation_z: Rotation around the Z-axis (roll). Range is [-Ļ€, Ļ€] in radians. Negative + values rotate anticlockwise, positive values rotate clockwise. + :vartype rotation_z: float + :ivar amplitude: Amplitude of the avatar movement. Range is (0, 1]. Values in (0, 1) mean + reduced amplitude, 1 means full amplitude. + :vartype amplitude: float + """ + + zoom: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Zoom level of the avatar. Range is (0, +āˆž). Values less than 1 zoom out, values greater than 1 + zoom in.""" + position_x: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Horizontal position of the avatar. Range is [-1, 1], as a proportion of frame width. Negative + values move left, positive values move right.""" + position_y: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Vertical position of the avatar. Range is [-1, 1], as a proportion of frame height. Negative + values move up, positive values move down.""" + rotation_x: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Rotation around the X-axis (pitch). Range is [-Ļ€, Ļ€] in radians. Negative values rotate up, + positive values rotate down.""" + rotation_y: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Rotation around the Y-axis (yaw). Range is [-Ļ€, Ļ€] in radians. Negative values rotate left, + positive values rotate right.""" + rotation_z: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Rotation around the Z-axis (roll). Range is [-Ļ€, Ļ€] in radians. Negative values rotate + anticlockwise, positive values rotate clockwise.""" + amplitude: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Amplitude of the avatar movement. Range is (0, 1]. Values in (0, 1) mean reduced amplitude, 1 + means full amplitude.""" + + @overload + def __init__( + self, + *, + zoom: Optional[float] = None, + position_x: Optional[float] = None, + position_y: Optional[float] = None, + rotation_x: Optional[float] = None, + rotation_y: Optional[float] = None, + rotation_z: Optional[float] = None, + amplitude: Optional[float] = None, + ) -> None: ... 
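The `Scene` model defined above is a bag of independent optional floats, so a short sketch with illustrative values is enough to show how the documented ranges are used (not part of this diff):

```python
import math

from azure.ai.voicelive.models import Scene

# Illustrative values, each inside the range documented on the field.
scene = Scene(
    zoom=1.2,  # values > 1 zoom in; range is (0, +inf)
    position_x=-0.1,  # move left by 10% of the frame width; range [-1, 1]
    position_y=0.05,  # move down by 5% of the frame height; range [-1, 1]
    rotation_y=math.pi / 12,  # ~15 degrees of yaw to the right; range [-pi, pi]
    amplitude=0.8,  # slightly damped movement; range (0, 1]
)
```

Note that the generated model declares plain optional floats and does not appear to enforce these ranges client-side, so out-of-range values would be sent to the service as provided.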
+ + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + class ServerEvent(_Model): """A voicelive server event. @@ -3935,6 +4407,10 @@ class ServerEvent(_Model): ServerEventResponseAudioTranscriptDelta, ServerEventResponseAudioTranscriptDone, ServerEventResponseContentPartAdded, ServerEventResponseContentPartDone, ServerEventResponseCreated, ServerEventResponseDone, + ServerEventResponseFoundryAgentCallCompleted, ServerEventResponseFoundryAgentCallFailed, + ServerEventResponseFoundryAgentCallInProgress, + ServerEventResponseFoundryAgentCallArgumentsDelta, + ServerEventResponseFoundryAgentCallArgumentsDone, ServerEventResponseFunctionCallArgumentsDelta, ServerEventResponseFunctionCallArgumentsDone, ServerEventResponseMcpCallCompleted, ServerEventResponseMcpCallFailed, ServerEventResponseMcpCallInProgress, ServerEventResponseMcpCallArgumentsDelta, @@ -3960,7 +4436,10 @@ class ServerEvent(_Model): "response.function_call_arguments.done", "mcp_list_tools.in_progress", "mcp_list_tools.completed", "mcp_list_tools.failed", "response.mcp_call_arguments.delta", "response.mcp_call_arguments.done", "mcp_approval_request", "mcp_approval_response", - "response.mcp_call.in_progress", "response.mcp_call.completed", and "response.mcp_call.failed". + "response.mcp_call.in_progress", "response.mcp_call.completed", "response.mcp_call.failed", + "response.foundry_agent_call_arguments.delta", "response.foundry_agent_call_arguments.done", + "response.foundry_agent_call.in_progress", "response.foundry_agent_call.completed", and + "response.foundry_agent_call.failed". :vartype type: str or ~azure.ai.voicelive.models.ServerEventType :ivar event_id: :vartype event_id: str @@ -3968,8 +4447,8 @@ class ServerEvent(_Model): __mapping__: dict[str, _Model] = {} type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) - """The type of event. Required. Known values are: \"error\", \"session.avatar.connecting\", - \"session.created\", \"session.updated\", + """The type of event. Required. 
Known values are: \"error\", + \"session.avatar.connecting\", \"session.created\", \"session.updated\", \"conversation.item.input_audio_transcription.completed\", \"conversation.item.input_audio_transcription.delta\", \"conversation.item.input_audio_transcription.failed\", \"conversation.item.created\", @@ -3987,8 +4466,10 @@ class ServerEvent(_Model): \"response.function_call_arguments.done\", \"mcp_list_tools.in_progress\", \"mcp_list_tools.completed\", \"mcp_list_tools.failed\", \"response.mcp_call_arguments.delta\", \"response.mcp_call_arguments.done\", \"mcp_approval_request\", \"mcp_approval_response\", - \"response.mcp_call.in_progress\", \"response.mcp_call.completed\", and - \"response.mcp_call.failed\".""" + \"response.mcp_call.in_progress\", \"response.mcp_call.completed\", + \"response.mcp_call.failed\", \"response.foundry_agent_call_arguments.delta\", + \"response.foundry_agent_call_arguments.done\", \"response.foundry_agent_call.in_progress\", + \"response.foundry_agent_call.completed\", and \"response.foundry_agent_call.failed\".""" event_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) @overload @@ -5491,6 +5972,246 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.type = ServerEventType.RESPONSE_DONE # type: ignore +class ServerEventResponseFoundryAgentCallArgumentsDelta( + ServerEvent, discriminator="response.foundry_agent_call_arguments.delta" +): # pylint: disable=name-too-long + """Represents a delta update of the arguments for a Foundry agent call. + + :ivar event_id: + :vartype event_id: str + :ivar type: Required. + :vartype type: str or ~azure.ai.voicelive.models.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DELTA + :ivar delta: The delta of the arguments. Required. + :vartype delta: str + :ivar item_id: The ID of the item associated with the event. Required. + :vartype item_id: str + :ivar response_id: The ID of the response associated with the event. Required. + :vartype response_id: str + :ivar output_index: The index of the output associated with the event. Required. + :vartype output_index: int + """ + + type: Literal[ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DELTA] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """Required.""" + delta: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The delta of the arguments. Required.""" + item_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the item associated with the event. Required.""" + response_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the response associated with the event. Required.""" + output_index: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The index of the output associated with the event. Required.""" + + @overload + def __init__( + self, + *, + delta: str, + item_id: str, + response_id: str, + output_index: int, + event_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. 
+ :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DELTA # type: ignore + + +class ServerEventResponseFoundryAgentCallArgumentsDone( + ServerEvent, discriminator="response.foundry_agent_call_arguments.done" +): # pylint: disable=name-too-long + """Indicates the completion of the arguments for a Foundry agent call. + + :ivar event_id: + :vartype event_id: str + :ivar type: Required. + :vartype type: str or ~azure.ai.voicelive.models.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE + :ivar item_id: The ID of the item associated with the event. Required. + :vartype item_id: str + :ivar response_id: The ID of the response associated with the event. Required. + :vartype response_id: str + :ivar output_index: The index of the output associated with the event. Required. + :vartype output_index: int + :ivar arguments: The full arguments for the agent call. + :vartype arguments: str + """ + + type: Literal[ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """Required.""" + item_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the item associated with the event. Required.""" + response_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the response associated with the event. Required.""" + output_index: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The index of the output associated with the event. Required.""" + arguments: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The full arguments for the agent call.""" + + @overload + def __init__( + self, + *, + item_id: str, + response_id: str, + output_index: int, + event_id: Optional[str] = None, + arguments: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE # type: ignore + + +class ServerEventResponseFoundryAgentCallCompleted( + ServerEvent, discriminator="response.foundry_agent_call.completed" +): # pylint: disable=name-too-long + """Indicates the Foundry agent call has completed. + + :ivar event_id: + :vartype event_id: str + :ivar type: Required. + :vartype type: str or ~azure.ai.voicelive.models.RESPONSE_FOUNDRY_AGENT_CALL_COMPLETED + :ivar item_id: The ID of the item associated with the event. Required. + :vartype item_id: str + :ivar output_index: The index of the output associated with the event. Required. + :vartype output_index: int + """ + + type: Literal[ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_COMPLETED] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """Required.""" + item_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the item associated with the event. Required.""" + output_index: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The index of the output associated with the event. 
Required.""" + + @overload + def __init__( + self, + *, + item_id: str, + output_index: int, + event_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_COMPLETED # type: ignore + + +class ServerEventResponseFoundryAgentCallFailed( + ServerEvent, discriminator="response.foundry_agent_call.failed" +): # pylint: disable=name-too-long + """Indicates the Foundry agent call has failed. + + :ivar event_id: + :vartype event_id: str + :ivar type: Required. + :vartype type: str or ~azure.ai.voicelive.models.RESPONSE_FOUNDRY_AGENT_CALL_FAILED + :ivar item_id: The ID of the item associated with the event. Required. + :vartype item_id: str + :ivar output_index: The index of the output associated with the event. Required. + :vartype output_index: int + """ + + type: Literal[ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_FAILED] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """Required.""" + item_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the item associated with the event. Required.""" + output_index: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The index of the output associated with the event. Required.""" + + @overload + def __init__( + self, + *, + item_id: str, + output_index: int, + event_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_FAILED # type: ignore + + +class ServerEventResponseFoundryAgentCallInProgress( + ServerEvent, discriminator="response.foundry_agent_call.in_progress" +): # pylint: disable=name-too-long + """Indicates the Foundry agent call is in progress. + + :ivar event_id: + :vartype event_id: str + :ivar type: Required. + :vartype type: str or ~azure.ai.voicelive.models.RESPONSE_FOUNDRY_AGENT_CALL_IN_PROGRESS + :ivar item_id: The ID of the item associated with the event. Required. + :vartype item_id: str + :ivar output_index: The index of the output associated with the event. Required. + :vartype output_index: int + :ivar agent_response_id: The ID of the agent response, if any. + :vartype agent_response_id: str + """ + + type: Literal[ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_IN_PROGRESS] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """Required.""" + item_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the item associated with the event. Required.""" + output_index: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The index of the output associated with the event. 
Required.""" + agent_response_id: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The ID of the agent response, if any.""" + + @overload + def __init__( + self, + *, + item_id: str, + output_index: int, + event_id: Optional[str] = None, + agent_response_id: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.type = ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_IN_PROGRESS # type: ignore + + class ServerEventResponseFunctionCallArgumentsDelta( ServerEvent, discriminator="response.function_call_arguments.delta" ): # pylint: disable=name-too-long @@ -6320,13 +7041,13 @@ class ToolChoiceSelection(_Model): You probably want to use the sub-classes and not this class directly. Known sub-classes are: ToolChoiceFunctionSelection - :ivar type: Required. Known values are: "function" and "mcp". + :ivar type: Required. Known values are: "function", "mcp", and "foundry_agent". :vartype type: str or ~azure.ai.voicelive.models.ToolType """ __mapping__: dict[str, _Model] = {} type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) - """Required. Known values are: \"function\" and \"mcp\".""" + """Required. Known values are: \"function\", \"mcp\", and \"foundry_agent\".""" @overload def __init__( diff --git a/sdk/ai/azure-ai-voicelive/pyproject.toml b/sdk/ai/azure-ai-voicelive/pyproject.toml index a9e8a901bc41..5fdee2a8b6d1 100644 --- a/sdk/ai/azure-ai-voicelive/pyproject.toml +++ b/sdk/ai/azure-ai-voicelive/pyproject.toml @@ -32,7 +32,7 @@ keywords = ["azure", "azure sdk"] dependencies = [ "isodate>=0.6.1", - "azure-core>=1.36.0", + "azure-core>=1.37.0", "typing-extensions>=4.6.0", ] dynamic = [ diff --git a/sdk/ai/azure-ai-voicelive/samples/async_function_calling_sample.py b/sdk/ai/azure-ai-voicelive/samples/async_function_calling_sample.py index 4faf55a22a3b..4b804ef4e447 100644 --- a/sdk/ai/azure-ai-voicelive/samples/async_function_calling_sample.py +++ b/sdk/ai/azure-ai-voicelive/samples/async_function_calling_sample.py @@ -702,7 +702,7 @@ async def main(): """Main async function.""" # Get credentials from environment variables api_key = os.environ.get("AZURE_VOICELIVE_API_KEY") - endpoint = os.environ.get("AZURE_VOICELIVE_ENDPOINT", "wss://api.voicelive.com/v1") + endpoint = os.environ.get("AZURE_VOICELIVE_ENDPOINT", "https://test.voicelive.com/") if not api_key: print("āŒ Error: No API key provided") diff --git a/sdk/ai/azure-ai-voicelive/samples/async_mcp_sample.py b/sdk/ai/azure-ai-voicelive/samples/async_mcp_sample.py index 775bc41e62d9..a9f120caf6d6 100644 --- a/sdk/ai/azure-ai-voicelive/samples/async_mcp_sample.py +++ b/sdk/ai/azure-ai-voicelive/samples/async_mcp_sample.py @@ -363,17 +363,18 @@ def __init__( self.audio_processor: Optional[AudioProcessor] = None self.session_ready: bool = False - async def run(self): """Run the async MCP call client with audio input.""" try: logger.info(f"Connecting to VoiceLive API with model {self.model}") # Connect to VoiceLive WebSocket API asynchronously + # Using 2026-01-01-preview API version for MCP support async with connect( endpoint=self.endpoint, credential=self.credential, model=self.model, + api_version="2026-01-01-preview", ) as connection: # Initialize audio processor self.audio_processor = 
AudioProcessor(connection) @@ -422,17 +423,14 @@ async def _setup_session(self, connection): MCPServer( server_label="deepwiki", server_url="https://mcp.deepwiki.com/mcp", - allowed_tools= [ - "read_wiki_structure", - "ask_question" - ], + allowed_tools=["read_wiki_structure", "ask_question"], require_approval="never", ), MCPServer( server_label="azure_doc", server_url="https://learn.microsoft.com/api/mcp", require_approval="always", - ) + ), ] # Create session configuration with MCP tools @@ -557,7 +555,9 @@ async def _handle_mcp_approval_request(self, conversation_created_event, connect logger.error("MCP approval item missing ID") return - logger.info(f"MCP Approval Request received: id={approval_id}, server_label={server_label}, function_name={function_name}, arguments={arguments}") + logger.info( + f"MCP Approval Request received: id={approval_id}, server_label={server_label}, function_name={function_name}, arguments={arguments}" + ) # wait for user input to approve or deny approval_response = False @@ -574,8 +574,7 @@ async def _handle_mcp_approval_request(self, conversation_created_event, connect # Send approval response approval_response_item = MCPApprovalResponseRequestItem( - approval_request_id=approval_id, - approve=approval_response + approval_request_id=approval_id, approve=approval_response ) await connection.conversation.item.create(item=approval_response_item) @@ -642,12 +641,13 @@ async def _handle_mcp_call_arguments(self, conversation_created_event, connectio logger.error(f"Error waiting for MCP call arguments done: {e}") return + async def main(): """Main async function.""" # Get credentials from environment variables api_key = os.environ.get("AZURE_VOICELIVE_API_KEY") - # important, PLEASE SET the features=mcp_preview:true in query params to enable mcp features - endpoint = os.environ.get("AZURE_VOICELIVE_ENDPOINT", "wss://api.voicelive.com/v1?features=mcp_preview:true") + # Note: Use api_version="2026-01-01-preview" to enable MCP features + endpoint = os.environ.get("AZURE_VOICELIVE_ENDPOINT", "https://test.voicelive.com/") if not api_key: print("āŒ Error: No API key provided") @@ -668,7 +668,7 @@ async def main(): credential=credential, model="gpt-4o-realtime-preview", voice="en-US-AvaNeural", - instructions="You are a helpful AI assistant with access to some mcp server. " + instructions="You are a helpful AI assistant with access to some mcp server. ", ) # Setup signal handlers for graceful shutdown diff --git a/sdk/ai/azure-ai-voicelive/samples/supervisor_agent_sample.py b/sdk/ai/azure-ai-voicelive/samples/supervisor_agent_sample.py new file mode 100644 index 000000000000..9514cf835e63 --- /dev/null +++ b/sdk/ai/azure-ai-voicelive/samples/supervisor_agent_sample.py @@ -0,0 +1,637 @@ +# pylint: disable=line-too-long,useless-suppression +#!/usr/bin/env python + +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +""" +FILE: supervisor_agent_sample.py + +DESCRIPTION: + This sample demonstrates how to use a supervisor agent powered by Azure AI Foundry in a Voice Live session. + The Supervisor pattern is a common multi-agent architecture in which a central AI agent acts as a coordinator, managing and delegating tasks to specialized supervisor agents.
+""" + +import os +import sys +import asyncio +import logging +import base64 +import signal +import threading +import queue +from typing import Union, Optional, cast +from concurrent.futures import ThreadPoolExecutor + +# Audio processing imports +try: + import pyaudio +except ImportError: + print("This sample requires pyaudio. Install with: pip install pyaudio") + sys.exit(1) + +# Environment variable loading +try: + from dotenv import load_dotenv + + load_dotenv() +except ImportError: + print("Note: python-dotenv not installed. Using existing environment variables.") + +# Azure VoiceLive SDK imports +from azure.core.credentials import AzureKeyCredential +from azure.core.credentials_async import AsyncTokenCredential +from azure.ai.voicelive.aio import connect +from azure.ai.voicelive.models import ( + RequestSession, + ServerEventType, + ServerVad, + AudioEchoCancellation, + AzureStandardVoice, + Modality, + InputAudioFormat, + OutputAudioFormat, + FoundryAgentTool, + ToolChoiceLiteral, + AudioInputTranscriptionOptions, + ResponseFoundryAgentCallItem, +) + +# Set up logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +def _get_required_env(name: str) -> str: + """Get a required environment variable or exit with error.""" + value = os.environ.get(name) + if not value: + print(f"āŒ Error: No {name} provided") + print(f"Please set the {name} environment variable.") + sys.exit(1) + return value + + +supervisor_agent_name = _get_required_env("SUPERVISOR_AGENT_NAME") +supervisor_agent_version = _get_required_env("SUPERVISOR_AGENT_VERSION") +supervisor_agent_project_name = _get_required_env("SUPERVISOR_AGENT_PROJECT_NAME") + +supervisor_agent_description = "You are a supervisor agent that determines the next response whenever the agent faces a non-trivial decision" + +chat_agent_instructions = f""" +You are a helpful agent. Your task is to maintain a natural conversation flow with the user. +By default, you must always use the {supervisor_agent_name} tool to get your next response, except when handling greetings (e.g., "hello", "hi there") or engaging in basic chitchat (e.g., "how are you?", "thank you"). +Before calling {supervisor_agent_name}, you MUST ALWAYS say something to the user (e.g., "Let me look into that.", "Just a second."). Never call {supervisor_agent_name} without first saying something to the user. +""" + +chat_agent_model = os.environ.get("CHAT_AGENT_MODEL", "gpt-realtime") + +chat_agent_voice = os.environ.get("CHAT_AGENT_VOICE", "en-US-AvaMultilingualNeural") + +class AudioProcessor: + """ + Handles real-time audio capture and playback for the voice assistant. 
+ + This class manages bidirectional audio streaming: + - Captures audio input from the microphone using PyAudio + - Plays back audio output through speakers using PyAudio + - Uses separate threads for capture, sending, and playback to avoid blocking + - Uses queues to buffer audio data between threads for thread-safe communication + + Audio format: PCM16, 24kHz, mono + """ + + def __init__(self, connection): + self.connection = connection + self.audio = pyaudio.PyAudio() + + # Audio configuration - PCM16, 24kHz, mono as specified + self.format = pyaudio.paInt16 + self.channels = 1 + self.rate = 24000 + self.chunk_size = 1024 + + # Capture and playback state + self.is_capturing = False + self.is_playing = False + self.input_stream = None + self.output_stream = None + + # Audio queues and threading + self.audio_queue: "queue.Queue[bytes]" = queue.Queue() + self.audio_send_queue: "queue.Queue[str]" = queue.Queue() # base64 audio to send + self.executor = ThreadPoolExecutor(max_workers=3) + self.capture_thread: Optional[threading.Thread] = None + self.playback_thread: Optional[threading.Thread] = None + self.send_thread: Optional[threading.Thread] = None + self.loop: Optional[asyncio.AbstractEventLoop] = None # Store the event loop + + logger.info("AudioProcessor initialized with 24kHz PCM16 mono audio") + + async def start_capture(self): + """Start capturing audio from microphone.""" + if self.is_capturing: + return + + # Store the current event loop for use in threads + self.loop = asyncio.get_event_loop() + + self.is_capturing = True + + try: + self.input_stream = self.audio.open( + format=self.format, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.chunk_size, + stream_callback=None, + ) + + self.input_stream.start_stream() + + # Start capture thread + self.capture_thread = threading.Thread(target=self._capture_audio_thread) + self.capture_thread.daemon = True + self.capture_thread.start() + + # Start audio send thread + self.send_thread = threading.Thread(target=self._send_audio_thread) + self.send_thread.daemon = True + self.send_thread.start() + + logger.info("Started audio capture") + + except Exception as e: + logger.error(f"Failed to start audio capture: {e}") + self.is_capturing = False + raise + + def _capture_audio_thread(self): + """Audio capture thread - runs in background.""" + while self.is_capturing and self.input_stream: + try: + # Read audio data + audio_data = self.input_stream.read(self.chunk_size, exception_on_overflow=False) + + if audio_data and self.is_capturing: + # Convert to base64 and queue for sending + audio_base64 = base64.b64encode(audio_data).decode("utf-8") + self.audio_send_queue.put(audio_base64) + + except Exception as e: + if self.is_capturing: + logger.error(f"Error in audio capture: {e}") + break + + def _send_audio_thread(self): + """Audio send thread - handles async operations from sync thread.""" + while self.is_capturing: + try: + # Get audio data from queue (blocking with timeout) + audio_base64 = self.audio_send_queue.get(timeout=0.1) + + if audio_base64 and self.is_capturing and self.loop: + # Schedule the async send operation in the main event loop + try: + future = asyncio.run_coroutine_threadsafe( + self.connection.input_audio_buffer.append(audio=audio_base64), self.loop + ) + # Wait briefly to catch any immediate errors + try: + future.result(timeout=0.05) + except TimeoutError: + pass # Expected - operation still running + except Exception as e: + if self.is_capturing: + logger.error(f"Error scheduling 
audio send: {e}") + + except queue.Empty: + continue + except Exception as e: + if self.is_capturing: + logger.error(f"Error in send audio thread: {e}") + break + logger.debug("Send audio thread exiting") + + async def stop_capture(self): + """Stop capturing audio.""" + if not self.is_capturing: + return + + self.is_capturing = False + + if self.input_stream: + self.input_stream.stop_stream() + self.input_stream.close() + self.input_stream = None + + if self.capture_thread: + self.capture_thread.join(timeout=1.0) + + if self.send_thread: + self.send_thread.join(timeout=1.0) + + # Clear the send queue + while not self.audio_send_queue.empty(): + try: + self.audio_send_queue.get_nowait() + except queue.Empty: + break + + logger.info("Stopped audio capture") + + async def start_playback(self): + """Initialize audio playback system.""" + if self.is_playing: + return + + self.is_playing = True + + try: + self.output_stream = self.audio.open( + format=self.format, + channels=self.channels, + rate=self.rate, + output=True, + frames_per_buffer=self.chunk_size, + ) + + # Start playback thread + self.playback_thread = threading.Thread(target=self._playback_audio_thread) + self.playback_thread.daemon = True + self.playback_thread.start() + + logger.info("Audio playback system ready") + + except Exception as e: + logger.error(f"Failed to initialize audio playback: {e}") + self.is_playing = False + raise + + def _playback_audio_thread(self): + """Audio playback thread - runs in background.""" + while self.is_playing: + try: + # Get audio data from queue (blocking with timeout) + audio_data = self.audio_queue.get(timeout=0.1) + + # Double-check state before writing to prevent race condition + if audio_data and self.is_playing: + stream = self.output_stream + if stream: + try: + stream.write(audio_data) + except OSError as e: + # Stream was closed, exit gracefully + if self.is_playing: + logger.debug(f"Stream write interrupted: {e}") + break + + except queue.Empty: + continue + except Exception as e: + if self.is_playing: + logger.error(f"Error in audio playback: {e}") + break + logger.debug("Playback thread exiting") + + async def queue_audio(self, audio_data: bytes): + """Queue audio data for playback.""" + if self.is_playing: + self.audio_queue.put(audio_data) + + async def stop_playback(self): + """Stop audio playback and clear queue.""" + if not self.is_playing: + return + + self.is_playing = False + + # Clear the queue to help thread exit faster + while not self.audio_queue.empty(): + try: + self.audio_queue.get_nowait() + except queue.Empty: + break + + # IMPORTANT: Join thread BEFORE closing stream to prevent race condition + # where thread tries to write after stream is closed + if self.playback_thread: + self.playback_thread.join(timeout=2.0) + self.playback_thread = None + + if self.output_stream: + try: + self.output_stream.stop_stream() + self.output_stream.close() + except Exception as e: + logger.warning(f"Error closing output stream: {e}") + finally: + self.output_stream = None + + logger.info("Stopped audio playback") + + async def cleanup(self): + """Clean up audio resources.""" + await self.stop_capture() + await self.stop_playback() + + if self.audio: + self.audio.terminate() + + self.executor.shutdown(wait=True) + logger.info("Audio processor cleaned up") + + +class AsyncSupervisorAgentClient: + """Async client for Azure Voice Live API with supervisor agent capabilities and audio input.""" + + def __init__( + self, + endpoint: str, + credential: Union[AzureKeyCredential, 
AsyncTokenCredential], + model: str, + voice: str, + instructions: str, + ): + self.endpoint = endpoint + self.credential = credential + self.model = model + self.voice = voice + self.instructions = instructions + self.session_id: Optional[str] = None + self.audio_processor: Optional[AudioProcessor] = None + self.session_ready: bool = False + + + async def run(self) -> None: + """Run the voice assistant with supervisor agent capabilities. + + Establishes WebSocket connection, configures the session, + and processes events until interrupted. + """ + try: + logger.info(f"Connecting to VoiceLive API with model {self.model}") + + # Connect to VoiceLive WebSocket API asynchronously + async with connect( + endpoint=self.endpoint, + credential=self.credential, + model=self.model, + api_version="2026-01-01-preview", + ) as connection: + # Initialize audio processor + self.audio_processor = AudioProcessor(connection) + + # Configure the session with the supervisor agent tool + await self._setup_session(connection) + + # Start audio playback system + await self.audio_processor.start_playback() + + logger.info("Voice assistant with agent tools ready! Start speaking...") + print("\n" + "=" * 70) + print("šŸŽ¤ VOICE ASSISTANT WITH SUPERVISOR AGENT READY") + print("Try saying:") + print(" • 'What products do you have?'") + print(" • 'What colors do you have?'") + print("Press Ctrl+C to exit") + print("=" * 70 + "\n") + + # Process events asynchronously + await self._process_events(connection) + + except KeyboardInterrupt: + logger.info("Received interrupt signal, shutting down...") + except Exception as e: + logger.error(f"Connection error: {e}") + raise + finally: + # Cleanup audio processor + if self.audio_processor: + await self.audio_processor.cleanup() + + async def _setup_session(self, connection) -> None: + """Configure the VoiceLive session with supervisor agent capabilities.""" + logger.info("Setting up voice conversation session...") + + # Create voice configuration + voice_config = AzureStandardVoice(name=self.voice) + + # Create turn detection configuration + turn_detection_config = ServerVad(threshold=0.5, prefix_padding_ms=300, silence_duration_ms=500) + + # Define available foundry agent tools + foundry_agent_tools: list[FoundryAgentTool] = [ + FoundryAgentTool( + agent_name=supervisor_agent_name, + agent_version=supervisor_agent_version, + project_name=supervisor_agent_project_name, + description=supervisor_agent_description, + ) + ] + + # Create session configuration with foundry agent tools + session_config = RequestSession( + modalities=[Modality.TEXT, Modality.AUDIO], + instructions=self.instructions, + voice=voice_config, + input_audio_format=InputAudioFormat.PCM16, + output_audio_format=OutputAudioFormat.PCM16, + input_audio_echo_cancellation=AudioEchoCancellation(), + turn_detection=turn_detection_config, + tools=foundry_agent_tools, + tool_choice=ToolChoiceLiteral.AUTO, # Let the model decide when to call the agent + input_audio_transcription=AudioInputTranscriptionOptions(model="whisper-1"), + ) + + # Send session configuration asynchronously + await connection.session.update(session=session_config) + logger.info("Session configuration with agent tools sent") + + async def _process_events(self, connection) -> None: + """Process events from the VoiceLive connection.""" + try: + async for event in connection: + await self._handle_event(event, connection) + except KeyboardInterrupt: + logger.info("Event processing interrupted") + except Exception as e: + logger.error(f"Error processing events: {e}") + raise + + async def _handle_event(self, event, connection) -> None: + """Handle different types of server events from VoiceLive.""" + ap = self.audio_processor + assert ap is not None, "AudioProcessor must be initialized" + + if event.type == ServerEventType.SESSION_UPDATED: + self.session_id = event.session.id + logger.info(f"Session ready: {self.session_id}") + self.session_ready = True + + # Start audio capture once session is ready + await ap.start_capture() + print("šŸŽ¤ Ready for voice input! Try asking about Zava products...") + + elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED: + logger.info("šŸŽ¤ User started speaking - stopping playback") + print("šŸŽ¤ Listening...") + + # Stop current assistant audio playback (interruption handling) + try: + await ap.stop_playback() + except Exception as e: + logger.error(f"Error stopping playback: {e}") + + elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED: + logger.info("šŸŽ¤ User stopped speaking") + print("šŸ¤” Processing...") + + # Restart playback system for response + try: + await ap.start_playback() + except Exception as e: + logger.error(f"Error starting playback: {e}") + + elif event.type == ServerEventType.RESPONSE_CREATED: + logger.info("šŸ¤– Assistant response created") + + elif event.type == ServerEventType.RESPONSE_TEXT_DELTA: + logger.info(f"Text response: {event.delta}") + + elif event.type == ServerEventType.RESPONSE_AUDIO_DELTA: + # Stream audio response to speakers + logger.debug("Received audio delta") + await ap.queue_audio(event.delta) + + elif event.type == ServerEventType.RESPONSE_AUDIO_DONE: + logger.info("šŸ¤– Assistant finished speaking") + print("šŸŽ¤ Ready for next input...") + + elif event.type == ServerEventType.RESPONSE_DONE: + logger.info("āœ… Response complete") + + elif event.type == ServerEventType.CONVERSATION_ITEM_CREATED: + if isinstance(event.item, ResponseFoundryAgentCallItem): + logger.info(f"šŸ› ļø Foundry Agent Call initiated with tool: {event.item.name}") + + elif event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE: + logger.info(f"šŸ› ļø Foundry Agent Call arguments: {event.arguments}") + + elif event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_IN_PROGRESS: + if event.agent_response_id: + logger.info(f"šŸ› ļø Foundry Agent Call in progress with response ID: {event.agent_response_id}") + + elif event.type == ServerEventType.RESPONSE_OUTPUT_ITEM_DONE: + if isinstance(event.item, ResponseFoundryAgentCallItem): + logger.info(f"šŸ› ļø Foundry Agent Call completed with output: {event.item.output}") + + elif event.type == ServerEventType.ERROR: + logger.error(f"āŒ VoiceLive error: {event.error.message}") + print(f"Error: {event.error.message}") + + +async def main() -> None: + """Main entry point for the supervisor agent sample.""" + # Get credentials from environment variables + api_key = os.environ.get("AZURE_VOICELIVE_API_KEY") + endpoint = os.environ.get("AZURE_VOICELIVE_ENDPOINT", "wss://api.voicelive.com/v1") + + if not api_key: + print("āŒ Error: No AZURE_VOICELIVE_API_KEY provided") + print("Please set the AZURE_VOICELIVE_API_KEY environment variable.") + sys.exit(1) + + # Option 1: API key authentication (simple, recommended for quick start) + credential: Union[AzureKeyCredential, AsyncTokenCredential] = AzureKeyCredential(api_key) + + # Option 2: Async AAD authentication (requires azure-identity) + # Uncomment the lines below to use AAD authentication instead: + # from azure.identity.aio import
AzureCliCredential, DefaultAzureCredential + # credential = AzureCliCredential() + + # Create and run the supervisor agent client + client = AsyncSupervisorAgentClient( + endpoint=endpoint, + credential=credential, + model=chat_agent_model, + voice=chat_agent_voice, + instructions=chat_agent_instructions, + ) + + # Setup signal handlers for graceful shutdown + def signal_handler(sig, frame): + logger.info("Received shutdown signal") + raise KeyboardInterrupt() + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + await client.run() + except KeyboardInterrupt: + print("\nšŸ‘‹ Supervisor Agent shut down.") + except Exception as e: + logger.error(f"Error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + # Check for required dependencies + dependencies = { + "pyaudio": "Audio processing", + "azure.ai.voicelive": "Azure VoiceLive SDK", + "azure.core": "Azure Core libraries", + } + + missing_deps = [] + for dep, description in dependencies.items(): + try: + __import__(dep.replace("-", "_")) + except ImportError: + missing_deps.append(f"{dep} ({description})") + + if missing_deps: + print("āŒ Missing required dependencies:") + for dep in missing_deps: + print(f" - {dep}") + print("\nInstall with: pip install azure-ai-voicelive pyaudio python-dotenv") + sys.exit(1) + + # Check audio system + try: + p = pyaudio.PyAudio() + # Check for input devices + input_devices = [ + i + for i in range(p.get_device_count()) + if cast(Union[int, float], p.get_device_info_by_index(i).get("maxInputChannels", 0) or 0) > 0 + ] + # Check for output devices + output_devices = [ + i + for i in range(p.get_device_count()) + if cast(Union[int, float], p.get_device_info_by_index(i).get("maxOutputChannels", 0) or 0) > 0 + ] + p.terminate() + + if not input_devices: + print("āŒ No audio input devices found. Please check your microphone.") + sys.exit(1) + if not output_devices: + print("āŒ No audio output devices found. 
Please check your speakers.") + sys.exit(1) + + except Exception as e: + print(f"āŒ Audio system check failed: {e}") + sys.exit(1) + + print("šŸŽ™ļø Voice Assistant with Supervisor Agent - Azure VoiceLive SDK") + print("=" * 65) + + # Run the async main function + asyncio.run(main()) diff --git a/sdk/ai/azure-ai-voicelive/tests/test_live_realtime_service.py b/sdk/ai/azure-ai-voicelive/tests/test_live_realtime_service.py index e768a53e10a8..ff9ee1e40031 100644 --- a/sdk/ai/azure-ai-voicelive/tests/test_live_realtime_service.py +++ b/sdk/ai/azure-ai-voicelive/tests/test_live_realtime_service.py @@ -410,6 +410,7 @@ async def test_realtime_service_tool_call(self, test_data_dir: Path, model: str, await conn.session.update(session=session) await conn.input_audio_buffer.append(audio=_load_audio_b64(audio_file)) + await conn.response.create() timeout_s = 10 conversation_created_events = [] function_call_results = [] diff --git a/sdk/ai/azure-ai-voicelive/tests/test_unit_enums.py b/sdk/ai/azure-ai-voicelive/tests/test_unit_enums.py index a3f71871d674..d3341a18a3e7 100644 --- a/sdk/ai/azure-ai-voicelive/tests/test_unit_enums.py +++ b/sdk/ai/azure-ai-voicelive/tests/test_unit_enums.py @@ -199,8 +199,8 @@ class TestOutputAudioFormat: def test_pcm_formats(self): """Test PCM format values.""" assert OutputAudioFormat.PCM16 == "pcm16" - assert OutputAudioFormat.PCM16_8000_HZ == "pcm16-8000hz" - assert OutputAudioFormat.PCM16_16000_HZ == "pcm16-16000hz" + assert OutputAudioFormat.PCM16_8000_HZ == "pcm16_8000hz" + assert OutputAudioFormat.PCM16_16000_HZ == "pcm16_16000hz" def test_g711_formats(self): """Test G.711 format values.""" diff --git a/sdk/ai/azure-ai-voicelive/tests/test_unit_models_filler_foundry.py b/sdk/ai/azure-ai-voicelive/tests/test_unit_models_filler_foundry.py new file mode 100644 index 000000000000..7667ef28fd20 --- /dev/null +++ b/sdk/ai/azure-ai-voicelive/tests/test_unit_models_filler_foundry.py @@ -0,0 +1,669 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +""" +Unit tests for FillerResponseConfig and FoundryAgentTool models. +Tests the filler response feature and Foundry agent integration. 
+""" + +import pytest +from azure.ai.voicelive.models import ( + BasicFillerResponseConfig, + FillerResponseConfigType, + FillerTrigger, + FoundryAgentContextType, + FoundryAgentTool, + ItemType, + LlmFillerResponseConfig, + ReasoningEffort, + RequestSession, + Response, + ResponseCreateParams, + ResponseFoundryAgentCallItem, + ResponseSession, + ServerEventResponseFoundryAgentCallArgumentsDelta, + ServerEventResponseFoundryAgentCallArgumentsDone, + ServerEventResponseFoundryAgentCallCompleted, + ServerEventResponseFoundryAgentCallFailed, + ServerEventResponseFoundryAgentCallInProgress, + ServerEventType, + ToolType, +) + + +class TestBasicFillerResponseConfig: + """Test BasicFillerResponseConfig model.""" + + def test_basic_filler_minimal(self): + """Test BasicFillerResponseConfig with minimal parameters.""" + config = BasicFillerResponseConfig() + + assert config.type == FillerResponseConfigType.STATIC_FILLER + assert config.triggers is None + assert config.latency_threshold_ms is None + assert config.texts is None + + def test_basic_filler_with_texts(self): + """Test BasicFillerResponseConfig with filler texts.""" + texts = ["Hmm...", "Let me think...", "One moment..."] + config = BasicFillerResponseConfig(texts=texts) + + assert config.type == FillerResponseConfigType.STATIC_FILLER + assert config.texts == texts + assert len(config.texts) == 3 + + def test_basic_filler_with_triggers(self): + """Test BasicFillerResponseConfig with triggers.""" + config = BasicFillerResponseConfig( + triggers=[FillerTrigger.LATENCY, FillerTrigger.TOOL], latency_threshold_ms=2000, texts=["Please wait..."] + ) + + assert config.type == FillerResponseConfigType.STATIC_FILLER + assert FillerTrigger.LATENCY in config.triggers + assert FillerTrigger.TOOL in config.triggers + assert config.latency_threshold_ms == 2000 + + def test_basic_filler_string_triggers(self): + """Test BasicFillerResponseConfig with string triggers.""" + config = BasicFillerResponseConfig(triggers=["latency", "tool"]) + + assert config.triggers == ["latency", "tool"] + + +class TestLlmFillerResponseConfig: + """Test LlmFillerResponseConfig model.""" + + def test_llm_filler_minimal(self): + """Test LlmFillerResponseConfig with minimal parameters.""" + config = LlmFillerResponseConfig() + + assert config.type == FillerResponseConfigType.LLM_FILLER + assert config.model is None + assert config.instructions is None + assert config.max_completion_tokens is None + + def test_llm_filler_full(self): + """Test LlmFillerResponseConfig with all parameters.""" + config = LlmFillerResponseConfig( + triggers=[FillerTrigger.LATENCY], + latency_threshold_ms=1500, + model="gpt-4o-mini", + instructions="Generate brief filler responses.", + max_completion_tokens=50, + ) + + assert config.type == FillerResponseConfigType.LLM_FILLER + assert FillerTrigger.LATENCY in config.triggers + assert config.latency_threshold_ms == 1500 + assert config.model == "gpt-4o-mini" + assert config.max_completion_tokens == 50 + + def test_filler_config_type_discrimination(self): + """Test that filler config types are properly discriminated.""" + basic = BasicFillerResponseConfig(texts=["Wait..."]) + llm = LlmFillerResponseConfig(model="gpt-4o") + + assert basic.type != llm.type + assert basic.type == FillerResponseConfigType.STATIC_FILLER + assert llm.type == FillerResponseConfigType.LLM_FILLER + + +class TestFoundryAgentTool: + """Test FoundryAgentTool model.""" + + def test_foundry_agent_minimal(self): + """Test FoundryAgentTool with required parameters only.""" + tool = 
FoundryAgentTool(agent_name="my-agent", project_name="my-project") + + assert tool.type == ToolType.FOUNDRY_AGENT + assert tool.agent_name == "my-agent" + assert tool.project_name == "my-project" + assert tool.agent_version is None + + def test_foundry_agent_full(self): + """Test FoundryAgentTool with all parameters.""" + tool = FoundryAgentTool( + agent_name="my-agent", + project_name="my-project", + agent_version="v1.0", + client_id="client-123", + description="A helpful agent", + foundry_resource_override="https://custom.azure.com", + agent_context_type=FoundryAgentContextType.AGENT_CONTEXT, + return_agent_response_directly=True, + ) + + assert tool.agent_name == "my-agent" + assert tool.agent_version == "v1.0" + assert tool.client_id == "client-123" + assert tool.description == "A helpful agent" + assert tool.agent_context_type == FoundryAgentContextType.AGENT_CONTEXT + assert tool.return_agent_response_directly is True + + def test_foundry_agent_context_types(self): + """Test FoundryAgentTool with different context types.""" + no_context = FoundryAgentTool( + agent_name="a", project_name="p", agent_context_type=FoundryAgentContextType.NO_CONTEXT + ) + agent_context = FoundryAgentTool( + agent_name="a", project_name="p", agent_context_type=FoundryAgentContextType.AGENT_CONTEXT + ) + + assert no_context.agent_context_type == FoundryAgentContextType.NO_CONTEXT + assert agent_context.agent_context_type == FoundryAgentContextType.AGENT_CONTEXT + + +class TestResponseFoundryAgentCallItem: + """Test ResponseFoundryAgentCallItem model.""" + + def test_foundry_call_item_minimal(self): + """Test ResponseFoundryAgentCallItem with required fields.""" + item = ResponseFoundryAgentCallItem(name="my-agent", call_id="call-123", arguments='{"param": "value"}') + + assert item.type == ItemType.FOUNDRY_AGENT_CALL + assert item.name == "my-agent" + assert item.call_id == "call-123" + assert item.arguments == '{"param": "value"}' + + def test_foundry_call_item_with_output(self): + """Test ResponseFoundryAgentCallItem with output.""" + item = ResponseFoundryAgentCallItem( + name="my-agent", + call_id="call-123", + arguments="{}", + agent_response_id="resp-456", + output='{"result": "success"}', + ) + + assert item.agent_response_id == "resp-456" + assert item.output == '{"result": "success"}' + + def test_foundry_call_item_with_error(self): + """Test ResponseFoundryAgentCallItem with error.""" + error = {"code": "error", "message": "Failed"} + item = ResponseFoundryAgentCallItem(name="agent", call_id="call-1", arguments="{}", error=error) + + assert item.error == error + + +class TestFoundryAgentServerEvents: + """Test server events for Foundry agent calls.""" + + def test_arguments_delta_event(self): + """Test ServerEventResponseFoundryAgentCallArgumentsDelta.""" + event = ServerEventResponseFoundryAgentCallArgumentsDelta( + delta='{"p":', item_id="item-1", response_id="resp-1", output_index=0 + ) + + assert event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DELTA + assert event.delta == '{"p":' + assert event.item_id == "item-1" + + def test_arguments_done_event(self): + """Test ServerEventResponseFoundryAgentCallArgumentsDone.""" + event = ServerEventResponseFoundryAgentCallArgumentsDone( + item_id="item-1", response_id="resp-1", output_index=0, arguments='{"param": "value"}' + ) + + assert event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE + assert event.arguments == '{"param": "value"}' + + def test_in_progress_event(self): + """Test 
ServerEventResponseFoundryAgentCallInProgress.""" + event = ServerEventResponseFoundryAgentCallInProgress( + item_id="item-1", output_index=0, agent_response_id="agent-resp-1" + ) + + assert event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_IN_PROGRESS + assert event.agent_response_id == "agent-resp-1" + + def test_completed_event(self): + """Test ServerEventResponseFoundryAgentCallCompleted.""" + event = ServerEventResponseFoundryAgentCallCompleted(item_id="item-1", output_index=0) + + assert event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_COMPLETED + + def test_failed_event(self): + """Test ServerEventResponseFoundryAgentCallFailed.""" + event = ServerEventResponseFoundryAgentCallFailed(item_id="item-1", output_index=0) + + assert event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_FAILED + + +class TestReasoningEffort: + """Test ReasoningEffort enum and usage.""" + + def test_reasoning_effort_values(self): + """Test all ReasoningEffort enum values.""" + assert ReasoningEffort.NONE == "none" + assert ReasoningEffort.MINIMAL == "minimal" + assert ReasoningEffort.LOW == "low" + assert ReasoningEffort.MEDIUM == "medium" + assert ReasoningEffort.HIGH == "high" + assert ReasoningEffort.XHIGH == "xhigh" + + def test_reasoning_effort_in_request_session(self): + """Test using reasoning_effort in RequestSession.""" + session = RequestSession(model="o1-preview", reasoning_effort=ReasoningEffort.MEDIUM) + + assert session.reasoning_effort == ReasoningEffort.MEDIUM + + def test_reasoning_effort_in_response_params(self): + """Test using reasoning_effort in ResponseCreateParams.""" + params = ResponseCreateParams(reasoning_effort=ReasoningEffort.LOW) + + assert params.reasoning_effort == ReasoningEffort.LOW + + +class TestResponseMetadata: + """Test metadata field in Response models.""" + + def test_response_with_metadata(self): + """Test Response with metadata.""" + metadata = {"session": "abc", "user": "123"} + response = Response(id="resp-1", metadata=metadata) + + assert response.metadata == metadata + assert response.metadata["session"] == "abc" + + def test_response_create_params_with_metadata(self): + """Test ResponseCreateParams with metadata.""" + metadata = {"test": "value"} + params = ResponseCreateParams(metadata=metadata) + + assert params.metadata == metadata + + +class TestSessionWithFillerResponse: + """Test session models with filler_response field.""" + + def test_request_session_with_basic_filler(self): + """Test RequestSession with BasicFillerResponseConfig.""" + filler = BasicFillerResponseConfig(texts=["Hmm..."]) + session = RequestSession(model="gpt-4o-realtime-preview", filler_response=filler) + + assert session.filler_response is not None + assert session.filler_response.type == FillerResponseConfigType.STATIC_FILLER + + def test_request_session_with_llm_filler(self): + """Test RequestSession with LlmFillerResponseConfig.""" + filler = LlmFillerResponseConfig(model="gpt-4o-mini") + session = RequestSession(model="gpt-4o-realtime-preview", filler_response=filler) + + assert session.filler_response is not None + assert session.filler_response.type == FillerResponseConfigType.LLM_FILLER + + def test_response_session_with_filler(self): + """Test ResponseSession with filler_response.""" + filler = BasicFillerResponseConfig(texts=["Wait..."]) + session = ResponseSession(model="gpt-4o-realtime-preview", filler_response=filler) + + assert session.filler_response is not None + + +class TestNewEnums: + """Test new enum types.""" + + def 
test_filler_trigger_enum(self): + """Test FillerTrigger enum.""" + assert FillerTrigger.LATENCY == "latency" + assert FillerTrigger.TOOL == "tool" + + def test_filler_config_type_enum(self): + """Test FillerResponseConfigType enum.""" + assert FillerResponseConfigType.STATIC_FILLER == "static_filler" + assert FillerResponseConfigType.LLM_FILLER == "llm_filler" + + def test_foundry_context_type_enum(self): + """Test FoundryAgentContextType enum.""" + assert FoundryAgentContextType.NO_CONTEXT == "no_context" + assert FoundryAgentContextType.AGENT_CONTEXT == "agent_context" + + def test_tool_type_includes_foundry(self): + """Test ToolType includes FOUNDRY_AGENT.""" + assert ToolType.FOUNDRY_AGENT == "foundry_agent" + + def test_item_type_includes_foundry_call(self): + """Test ItemType includes FOUNDRY_AGENT_CALL.""" + assert ItemType.FOUNDRY_AGENT_CALL == "foundry_agent_call" + + +class TestIntegrationScenarios: + """Test integration scenarios with new features.""" + + def test_session_with_foundry_and_filler(self): + """Test complete session with Foundry agent and filler.""" + agent = FoundryAgentTool( + agent_name="support-agent", + project_name="support", + agent_context_type=FoundryAgentContextType.AGENT_CONTEXT, + ) + filler = LlmFillerResponseConfig(triggers=[FillerTrigger.LATENCY, FillerTrigger.TOOL], model="gpt-4o-mini") + session = RequestSession( + model="gpt-4o-realtime-preview", + tools=[agent], + reasoning_effort=ReasoningEffort.MEDIUM, + filler_response=filler, + ) + + assert len(session.tools) == 1 + assert session.tools[0].type == ToolType.FOUNDRY_AGENT + assert session.reasoning_effort == ReasoningEffort.MEDIUM + assert session.filler_response.type == FillerResponseConfigType.LLM_FILLER + + def test_mixed_tool_types(self): + """Test session with mixed tool types.""" + from azure.ai.voicelive.models import FunctionTool + + func = FunctionTool(name="func", description="test", parameters={}) + foundry = FoundryAgentTool(agent_name="agent", project_name="project") + session = RequestSession(model="gpt-4o-realtime-preview", tools=[func, foundry]) + + assert len(session.tools) == 2 + assert session.tools[0].type == ToolType.FUNCTION + assert session.tools[1].type == ToolType.FOUNDRY_AGENT + + +class TestSerialization: + """Test serialization and deserialization of new models.""" + + def test_basic_filler_serialization(self): + """Test BasicFillerResponseConfig serialization.""" + config = BasicFillerResponseConfig( + triggers=[FillerTrigger.LATENCY], latency_threshold_ms=2000, texts=["Wait...", "One moment..."] + ) + + # Serialize to dict + data = dict(config) + + assert data["type"] == "static_filler" + assert data["triggers"] == ["latency"] + assert data["latency_threshold_ms"] == 2000 + assert data["texts"] == ["Wait...", "One moment..."] + + def test_llm_filler_serialization(self): + """Test LlmFillerResponseConfig serialization.""" + config = LlmFillerResponseConfig( + triggers=[FillerTrigger.TOOL], model="gpt-4o-mini", instructions="Be brief", max_completion_tokens=50 + ) + + data = dict(config) + + assert data["type"] == "llm_filler" + assert data["model"] == "gpt-4o-mini" + assert data["instructions"] == "Be brief" + assert data["max_completion_tokens"] == 50 + + def test_foundry_tool_serialization(self): + """Test FoundryAgentTool serialization.""" + tool = FoundryAgentTool( + agent_name="my-agent", + project_name="my-project", + agent_version="v1.0", + client_id="client-123", + description="Test agent", + agent_context_type=FoundryAgentContextType.AGENT_CONTEXT, + 
return_agent_response_directly=True, + ) + + data = dict(tool) + + assert data["type"] == "foundry_agent" + assert data["agent_name"] == "my-agent" + assert data["project_name"] == "my-project" + assert data["agent_version"] == "v1.0" + assert data["agent_context_type"] == "agent_context" + assert data["return_agent_response_directly"] is True + + def test_foundry_call_item_serialization(self): + """Test ResponseFoundryAgentCallItem serialization.""" + item = ResponseFoundryAgentCallItem( + name="agent-1", + call_id="call-123", + arguments='{"param": "value"}', + output='{"result": "success"}', + agent_response_id="resp-456", + ) + + data = dict(item) + + assert data["type"] == "foundry_agent_call" + assert data["name"] == "agent-1" + assert data["call_id"] == "call-123" + assert data["arguments"] == '{"param": "value"}' + assert data["output"] == '{"result": "success"}' + + def test_foundry_event_serialization(self): + """Test Foundry server event serialization.""" + event = ServerEventResponseFoundryAgentCallArgumentsDelta( + delta='{"p":', item_id="item-1", response_id="resp-1", output_index=0, event_id="evt-123" + ) + + data = dict(event) + + assert data["type"] == "response.foundry_agent_call_arguments.delta" + assert data["delta"] == '{"p":' + assert data["item_id"] == "item-1" + + +class TestEdgeCases: + """Test edge cases and boundary conditions.""" + + def test_basic_filler_empty_texts(self): + """Test BasicFillerResponseConfig with empty texts list.""" + config = BasicFillerResponseConfig(texts=[]) + assert config.texts == [] + + def test_basic_filler_single_text(self): + """Test BasicFillerResponseConfig with single text.""" + config = BasicFillerResponseConfig(texts=["Only one"]) + assert len(config.texts) == 1 + + def test_llm_filler_zero_tokens(self): + """Test LlmFillerResponseConfig with zero max tokens.""" + config = LlmFillerResponseConfig(max_completion_tokens=0) + assert config.max_completion_tokens == 0 + + def test_foundry_tool_minimal_required_only(self): + """Test FoundryAgentTool with only required fields.""" + tool = FoundryAgentTool(agent_name="a", project_name="p") + + assert tool.agent_name == "a" + assert tool.project_name == "p" + assert tool.agent_version is None + assert tool.client_id is None + assert tool.description is None + + def test_foundry_call_item_empty_arguments(self): + """Test ResponseFoundryAgentCallItem with empty arguments.""" + item = ResponseFoundryAgentCallItem(name="agent", call_id="call-1", arguments="{}") + assert item.arguments == "{}" + + def test_multiple_filler_triggers(self): + """Test filler config with all trigger types.""" + config = BasicFillerResponseConfig(triggers=[FillerTrigger.LATENCY, FillerTrigger.TOOL]) + assert len(config.triggers) == 2 + assert FillerTrigger.LATENCY in config.triggers + assert FillerTrigger.TOOL in config.triggers + + +class TestValidation: + """Test model validation and constraints.""" + + def test_reasoning_effort_all_values(self): + """Test all ReasoningEffort enum values work in models.""" + efforts = [ + ReasoningEffort.NONE, + ReasoningEffort.MINIMAL, + ReasoningEffort.LOW, + ReasoningEffort.MEDIUM, + ReasoningEffort.HIGH, + ReasoningEffort.XHIGH, + ] + + for effort in efforts: + session = RequestSession(reasoning_effort=effort) + assert session.reasoning_effort == effort + + def test_metadata_max_keys(self): + """Test Response metadata with multiple keys.""" + metadata = {f"key{i}": f"value{i}" for i in range(16)} + response = Response(metadata=metadata) + + assert len(response.metadata) == 16 
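+        # Editorial note: 16 pairs matches the documented metadata limit
+        # (up to 16 key-value pairs); the next test exercises the documented
+        # 512-character cap on metadata values.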
+ + def test_metadata_long_values(self): + """Test Response metadata with long values.""" + long_value = "x" * 512 + metadata = {"key": long_value} + response = Response(metadata=metadata) + + assert response.metadata["key"] == long_value + + def test_foundry_context_type_string_values(self): + """Test FoundryAgentContextType with string values.""" + tool1 = FoundryAgentTool(agent_name="a", project_name="p", agent_context_type="no_context") + tool2 = FoundryAgentTool(agent_name="a", project_name="p", agent_context_type="agent_context") + + assert tool1.agent_context_type == "no_context" + assert tool2.agent_context_type == "agent_context" + + +class TestTypeUnions: + """Test union type handling for filler configs.""" + + def test_session_accepts_basic_filler(self): + """Test RequestSession accepts BasicFillerResponseConfig.""" + filler = BasicFillerResponseConfig(texts=["Wait"]) + session = RequestSession(filler_response=filler) + + assert isinstance(session.filler_response, BasicFillerResponseConfig) + + def test_session_accepts_llm_filler(self): + """Test RequestSession accepts LlmFillerResponseConfig.""" + filler = LlmFillerResponseConfig(model="gpt-4o") + session = RequestSession(filler_response=filler) + + assert isinstance(session.filler_response, LlmFillerResponseConfig) + + def test_response_session_filler_types(self): + """Test ResponseSession with different filler types.""" + basic = BasicFillerResponseConfig(texts=["Hmm"]) + llm = LlmFillerResponseConfig() + + session1 = ResponseSession(filler_response=basic) + session2 = ResponseSession(filler_response=llm) + + assert session1.filler_response.type == FillerResponseConfigType.STATIC_FILLER + assert session2.filler_response.type == FillerResponseConfigType.LLM_FILLER + + +class TestServerEventTypes: + """Test all Foundry agent server event types.""" + + def test_all_foundry_event_types_exist(self): + """Test that all Foundry agent event types are defined.""" + expected_types = [ + ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DELTA, + ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE, + ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_IN_PROGRESS, + ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_COMPLETED, + ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_FAILED, + ] + + for event_type in expected_types: + assert event_type is not None + assert isinstance(event_type, str) + + def test_event_type_string_values(self): + """Test Foundry agent event type string values.""" + assert ( + ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DELTA == "response.foundry_agent_call_arguments.delta" + ) + assert ( + ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE == "response.foundry_agent_call_arguments.done" + ) + assert ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_IN_PROGRESS == "response.foundry_agent_call.in_progress" + assert ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_COMPLETED == "response.foundry_agent_call.completed" + assert ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_FAILED == "response.foundry_agent_call.failed" + + +class TestComplexScenarios: + """Test complex real-world scenarios.""" + + def test_complete_foundry_workflow(self): + """Test complete Foundry agent workflow with all components.""" + # Create tool + tool = FoundryAgentTool( + agent_name="support-bot", + project_name="customer-support", + agent_version="v2.0", + description="Customer support agent", + agent_context_type=FoundryAgentContextType.AGENT_CONTEXT, + return_agent_response_directly=False, + ) + + # Create session with tool + session = 
RequestSession(model="gpt-4o-realtime-preview", tools=[tool], reasoning_effort=ReasoningEffort.MEDIUM) + + # Create call item + call_item = ResponseFoundryAgentCallItem( + name="support-bot", + call_id="call-abc123", + arguments='{"query": "help with billing"}', + agent_response_id="resp-def456", + output='{"answer": "Here is billing help..."}', + ) + + # Create events + delta_event = ServerEventResponseFoundryAgentCallArgumentsDelta( + delta='{"query":', item_id="item-1", response_id="resp-1", output_index=0 + ) + + done_event = ServerEventResponseFoundryAgentCallArgumentsDone( + item_id="item-1", response_id="resp-1", output_index=0, arguments='{"query": "help with billing"}' + ) + + completed_event = ServerEventResponseFoundryAgentCallCompleted(item_id="item-1", output_index=0) + + # Verify all components + assert tool.type == ToolType.FOUNDRY_AGENT + assert session.tools[0].agent_name == "support-bot" + assert call_item.type == ItemType.FOUNDRY_AGENT_CALL + assert delta_event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DELTA + assert done_event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_ARGUMENTS_DONE + assert completed_event.type == ServerEventType.RESPONSE_FOUNDRY_AGENT_CALL_COMPLETED + + def test_session_with_all_new_features(self): + """Test session combining all new features.""" + # Foundry agent + agent = FoundryAgentTool(agent_name="multi-agent", project_name="enterprise") + + # Filler config + filler = LlmFillerResponseConfig( + triggers=[FillerTrigger.LATENCY, FillerTrigger.TOOL], + latency_threshold_ms=1500, + model="gpt-4o-mini", + max_completion_tokens=30, + ) + + # Create session with everything + session = RequestSession( + model="gpt-4o-realtime-preview", + tools=[agent], + reasoning_effort=ReasoningEffort.HIGH, + filler_response=filler, + ) + + # Verify all features present + assert session.tools[0].type == ToolType.FOUNDRY_AGENT + assert session.reasoning_effort == ReasoningEffort.HIGH + assert session.filler_response.type == FillerResponseConfigType.LLM_FILLER + assert session.filler_response.model == "gpt-4o-mini" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/sdk/ai/azure-ai-voicelive/tests/test_unit_serialization.py b/sdk/ai/azure-ai-voicelive/tests/test_unit_serialization.py index 7ca27ecf45bf..cca0e0fdd415 100644 --- a/sdk/ai/azure-ai-voicelive/tests/test_unit_serialization.py +++ b/sdk/ai/azure-ai-voicelive/tests/test_unit_serialization.py @@ -335,3 +335,62 @@ def test_serializer_with_real_world_data(self): assert modalities_result == ["text", "audio"] assert formats_result == ["pcm16", "g711_ulaw"] + + +class TestSerializationSecurity: + """Test security improvements in serialization (removed eval usage).""" + + def test_serialize_basic_no_eval(self): + """Test that basic type serialization doesn't use eval().""" + from azure.ai.voicelive._utils.serialization import Serializer + + serializer = Serializer() + + # These should work without eval + assert serializer.serialize_basic("test", "str") == "test" + assert serializer.serialize_basic(42, "int") == 42 + assert serializer.serialize_basic(3.14, "float") == 3.14 + assert serializer.serialize_basic(True, "bool") is True + + def test_serialize_basic_invalid_type_raises_error(self): + """Test that invalid data types raise TypeError instead of eval error.""" + from azure.ai.voicelive._utils.serialization import Serializer + + serializer = Serializer() + + # Should raise TypeError for unsupported types, not execute arbitrary code + with 
pytest.raises(TypeError, match="Unknown basic data type"): + serializer.serialize_basic("test", "malicious_code") + + def test_deserialize_basic_no_eval(self): + """Test that basic type deserialization doesn't use eval().""" + from azure.ai.voicelive._utils.serialization import Deserializer + + deserializer = Deserializer() + + # These should work without eval + assert deserializer.deserialize_basic("test", "str") == "test" + assert deserializer.deserialize_basic("42", "int") == 42 + assert deserializer.deserialize_basic("3.14", "float") == 3.14 + + def test_deserialize_basic_invalid_type_raises_error(self): + """Test that invalid data types raise TypeError in deserialization.""" + from azure.ai.voicelive._utils.serialization import Deserializer + + deserializer = Deserializer() + + # Should raise TypeError for unsupported types + with pytest.raises(TypeError, match="Unknown basic data type"): + deserializer.deserialize_basic("test", "unknown_type") + + def test_security_bool_serialization(self): + """Test that bool serialization works correctly without eval.""" + from azure.ai.voicelive._utils.serialization import Serializer + + serializer = Serializer() + + # Test bool specifically as it was changed from eval(data_type)(data) + assert serializer.serialize_basic(True, "bool") is True + assert serializer.serialize_basic(False, "bool") is False + assert serializer.serialize_basic(1, "bool") is True # Truthy value + assert serializer.serialize_basic(0, "bool") is False # Falsy value
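+
+    def test_deserialize_basic_bool_strings(self):
+        """Editor's sketch (not part of the original patch): bool deserialization.
+
+        Assumes the vendored Deserializer.deserialize_basic accepts the "bool"
+        data type and parses the conventional "true"/"false" string spellings,
+        as the standard Azure SDK serialization utilities do.
+        """
+        from azure.ai.voicelive._utils.serialization import Deserializer
+
+        deserializer = Deserializer()
+
+        # Native bools should pass through unchanged.
+        assert deserializer.deserialize_basic(True, "bool") is True
+
+        # String spellings accepted by the standard Azure SDK deserializer.
+        assert deserializer.deserialize_basic("true", "bool") is True
+        assert deserializer.deserialize_basic("false", "bool") is False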