Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,37 @@
# Include any files or directories that you don't want to be copied to your
# container here (e.g., local build artifacts, temporary files, etc.).
#
# For more help, visit the .dockerignore file reference guide at
# https://docs.docker.com/engine/reference/builder/#dockerignore-file

**/.DS_Store
**/__pycache__
**/.venv
**/.classpath
**/.dockerignore
**/.env
**/.git
**/.gitignore
**/.project
**/.settings
**/.toolstarget
**/.vs
**/.vscode
**/*.*proj.user
**/*.dbmdl
**/*.jfm
**/bin
**/charts
**/docker-compose*
**/compose*
**/Dockerfile*
**/node_modules
**/npm-debug.log
**/obj
**/secrets.dev.yaml
**/values.dev.yaml
LICENSE
README.md
# UV and Python cache directories
**/__pycache__/
**/*.py[cod]
Expand Down
9 changes: 9 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"chat.tools.terminal.autoApprove": {
"/^cd H:\\\\Works\\\\Code-Migration\\\\Container-Migration-Solution-Accelerator\\\\src\\\\backend-api ; python -m ruff check src/ --fix 2>&1$/": {
"approve": true,
"matchCommandLine": true
},
"npx eslint": true
}
}
38 changes: 38 additions & 0 deletions infra/main.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,16 @@ param aiModelVersion string = '2025-04-16'
@description('Optional. AI model deployment token capacity, in thousands of tokens per minute. Lower this to improve regional success rate if initial provisioning fails due to capacity. Defaults to 500.')
param aiModelCapacity int = 500

@minLength(1)
@description('Optional. Name of the embedding model to deploy. Defaults to text-embedding-3-large.')
param aiEmbeddingModelName string = 'text-embedding-3-large'

@description('Optional. Version of the embedding model. Defaults to 1.')
param aiEmbeddingModelVersion string = '1'

@description('Optional. Embedding model deployment token capacity. Defaults to 500.')
param aiEmbeddingModelCapacity int = 500

@description('Optional. The tags to apply to all deployed Azure resources.')
param tags resourceInput<'Microsoft.Resources/resourceGroups@2025-04-01'>.tags = {}

Expand Down Expand Up @@ -761,6 +771,18 @@ module existingAiFoundryAiServicesDeployments 'modules/ai-services-deployments.b
capacity: aiModelCapacity
}
}
{
name: aiEmbeddingModelName
model: {
format: 'OpenAI'
name: aiEmbeddingModelName
version: aiEmbeddingModelVersion
}
sku: {
name: 'Standard'
capacity: aiEmbeddingModelCapacity
}
}
]
roleAssignments: [
// Service Principal permissions
Expand Down Expand Up @@ -857,6 +879,18 @@ module aiFoundry 'br/public:avm/ptn/ai-ml/ai-foundry:0.4.0' = if(!useExistingAiF
capacity: aiModelCapacity
}
}
{
name: aiEmbeddingModelName
model: {
format: 'OpenAI'
name: aiEmbeddingModelName
version: aiEmbeddingModelVersion
}
sku: {
name: 'Standard'
capacity: aiEmbeddingModelCapacity
}
}
]
tags: allTags
enableTelemetry: enableTelemetry
Expand Down Expand Up @@ -905,6 +939,10 @@ module appConfiguration 'br/public:avm/res/app-configuration/configuration-store
name: 'AZURE_OPENAI_CHAT_DEPLOYMENT_NAME'
value: aiModelDeploymentName
}
{
name: 'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
value: aiEmbeddingModelName
}
{
name: 'AZURE_OPENAI_ENDPOINT'
value: 'https://${aiServicesName}.cognitiveservices.azure.com/'
Expand Down
10 changes: 5 additions & 5 deletions src/frontend/src/pages/batchView.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -713,14 +713,14 @@ const BatchStoryPage = () => {
<Text size={500} weight="semibold" style={{ marginBottom: '12px', display: 'block' }}>Step Timeline</Text>
<div style={{ display: 'flex', flexDirection: 'column', gap: '8px' }}>
{(() => {
const stepOrder = ['analysis', 'design', 'yaml', 'yaml_conversion', 'documentation'];
const stepOrder = ['analysis', 'design', 'yaml', 'documentation'];
const stepLabels: Record<string, string> = {
'analysis': 'Analysis', 'design': 'Design', 'yaml': 'YAML Conversion',
'yaml_conversion': 'YAML Conversion', 'documentation': 'Documentation'
'documentation': 'Documentation'
};
const stepIcons: Record<string, string> = {
'analysis': '🔍', 'design': '📐', 'yaml': '📄',
'yaml_conversion': '📄', 'documentation': '📝'
'documentation': '📝'
};
const timings = telemetryData.step_timings;
const totalElapsed = Object.values(timings).reduce((sum: number, t: any) => sum + (t?.elapsed_seconds || 0), 0);
Expand All @@ -747,7 +747,7 @@ const BatchStoryPage = () => {
const r = Array.isArray(stepResult.result) ? stepResult.result[0] : stepResult.result;
if (key === 'analysis') {
summary = `${r?.output?.platform_detected || ''} detected (${r?.output?.confidence_score || ''})`;
} else if (key === 'yaml' || key === 'yaml_conversion') {
} else if (key === 'yaml') {
const metrics = r?.termination_output?.overall_conversion_metrics;
if (metrics) summary = `${metrics.successful_conversions}/${metrics.total_files} files converted (${metrics.overall_accuracy})`;
} else if (key === 'design') {
Expand Down Expand Up @@ -800,7 +800,7 @@ const BatchStoryPage = () => {
};
const stepLabels: Record<string, string> = {
'analysis': 'Analysis', 'design': 'Design', 'yaml': 'YAML',
'yaml_conversion': 'YAML', 'documentation': 'Docs'
'documentation': 'Docs'
};

return Object.entries(agents)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def _format_exc_brief(exc: BaseException) -> str:

@dataclass(frozen=True)
class RateLimitRetryConfig:
max_retries: int = 5
base_delay_seconds: float = 2.0
max_delay_seconds: float = 30.0
max_retries: int = 8
base_delay_seconds: float = 5.0
max_delay_seconds: float = 120.0

@staticmethod
def from_env(
Expand All @@ -54,9 +54,9 @@ def _float(name: str, default: float) -> float:
return default

return RateLimitRetryConfig(
max_retries=max(0, _int(max_retries_env, 5)),
base_delay_seconds=max(0.0, _float(base_delay_env, 2.0)),
max_delay_seconds=max(0.0, _float(max_delay_env, 30.0)),
max_retries=max(0, _int(max_retries_env, 8)),
base_delay_seconds=max(0.0, _float(base_delay_env, 5.0)),
max_delay_seconds=max(0.0, _float(max_delay_env, 120.0)),
)


Expand All @@ -69,6 +69,15 @@ def _looks_like_rate_limit(error: BaseException) -> bool:
if status == 429:
return True

# Treat empty error messages as transient (likely connection reset or
# incomplete response from Azure front-end) — worth retrying.
if not msg or msg == str(type(error).__name__).lower():
return True

# Server errors (5xx) are transient and should be retried.
if isinstance(status, int) and 500 <= status < 600:
return True

cause = getattr(error, "__cause__", None)
if cause and cause is not error:
return _looks_like_rate_limit(cause)
Expand Down Expand Up @@ -246,14 +255,14 @@ class ContextTrimConfig:
"""

enabled: bool = True
# GPT-5.1 supports 272K input tokens (~800K chars). These defaults stay well
# within that budget while guarding against accidental large blob injection.
# Progressive trimming on retry will reduce these further if needed.
max_total_chars: int = 600_000
max_message_chars: int = 40_000
keep_last_messages: int = 50
keep_head_chars: int = 15_000
keep_tail_chars: int = 5_000
# GPT-5.1 supports 272K input tokens (~800K chars). With workspace context
# injected into system instructions (never trimmed) and Qdrant shared memory
# providing cross-step context, we can keep fewer conversation messages.
max_total_chars: int = 400_000
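# 0 disables per-message truncation; with keep_last_messages=15 it is unnecessary.
# NOTE(review): from_env() still falls back to 20_000 for this field, so env-built
# configs do truncate per message — confirm which default is intended.
max_message_chars: int = 0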
keep_last_messages: int = 15
keep_head_chars: int = 12_000
keep_tail_chars: int = 4_000
keep_system_messages: bool = True
retry_on_context_error: bool = True

Expand Down Expand Up @@ -284,7 +293,7 @@ def _bool(name: str, default: bool) -> bool:
enabled=_bool(enabled_env, True),
max_total_chars=max(0, _int(max_total_chars_env, 240_000)),
max_message_chars=max(0, _int(max_message_chars_env, 20_000)),
keep_last_messages=max(1, _int(keep_last_messages_env, 40)),
keep_last_messages=max(1, _int(keep_last_messages_env, 15)),
keep_head_chars=max(0, _int(keep_head_chars_env, 10_000)),
keep_tail_chars=max(0, _int(keep_tail_chars_env, 3_000)),
keep_system_messages=_bool(keep_system_messages_env, True),
Expand All @@ -299,42 +308,18 @@ def _trim_messages(
return list(messages)

# ──────────────────────────────────────────────────────────────────────
# Phase 0: Smart tool-result compression.
# Tool outputs (read_blob_content, save_content_to_blob, etc.) are the
# largest context consumers. Once an agent has responded after a tool
# call, the raw output is redundant — the agent's response is the
# distilled intelligence. We compress old tool results aggressively
# while keeping the most recent ones intact for the current agent turn.
# Phase 0: Summarize large save_content_to_blob calls.
# Write payloads are redundant once persisted — replace with a short
# summary. Read tool results are never truncated so the model always
# has the full file content to reason about.
# ──────────────────────────────────────────────────────────────────────
KEEP_RECENT_TOOL_RESULTS = 4 # Keep the N most recent tool results in full
TOOL_RESULT_MAX_CHARS = 500 # Truncate older tool results to this size
SAVE_ARG_MAX_CHARS = 200 # Truncate save_content_to_blob arguments

tool_result_indices: list[int] = []
for i, m in enumerate(messages):
role = _get_message_role(m)
text = _estimate_message_text(m)
if role == "tool" or (role is None and _looks_like_tool_result(text)):
tool_result_indices.append(i)
# Also detect save_content_to_blob in assistant/function messages
elif _looks_like_save_blob_call(text):
if len(text) > SAVE_ARG_MAX_CHARS:
# Extract just the blob name and byte count
summary = _summarize_save_blob(text, SAVE_ARG_MAX_CHARS)
messages[i] = _set_message_text(m, summary)

# Compress older tool results, keep recent ones in full
if len(tool_result_indices) > KEEP_RECENT_TOOL_RESULTS:
old_indices = tool_result_indices[:-KEEP_RECENT_TOOL_RESULTS]
for idx in old_indices:
m = messages[idx]
text = _estimate_message_text(m)
if len(text) > TOOL_RESULT_MAX_CHARS:
truncated = (
text[:TOOL_RESULT_MAX_CHARS]
+ f"\n[... tool output truncated from {len(text)} chars ...]"
)
messages[idx] = _set_message_text(m, truncated)
if _looks_like_save_blob_call(text) and len(text) > SAVE_ARG_MAX_CHARS:
summary = _summarize_save_blob(text, SAVE_ARG_MAX_CHARS)
messages[i] = _set_message_text(m, summary)

# Keep last N messages; optionally keep system messages from the head.
system_messages: list[Any] = []
Expand All @@ -354,14 +339,21 @@ def _trim_messages(
seen_fingerprints: set[tuple[str, str]] = set()
cleaned: list[Any] = []

for m in tail:
for idx, m in enumerate(tail):
text = _estimate_message_text(m)
fp = (text[:200], text[-200:])
if fp in seen_fingerprints:
continue
seen_fingerprints.add(fp)

if cfg.max_message_chars > 0 and len(text) > cfg.max_message_chars:
# Never truncate the last message — the agent needs it in full
# to reason about the most recent tool result or instruction.
is_last = idx == len(tail) - 1
if (
not is_last
and cfg.max_message_chars > 0
and len(text) > cfg.max_message_chars
):
text = _truncate_text(
text,
max_chars=cfg.max_message_chars,
Expand Down Expand Up @@ -584,6 +576,14 @@ async def _inner_get_response(
len(messages),
len(trimmed),
)
# Cool down before retrying to avoid triggering 429s immediately.
trim_delay = self._retry_config.base_delay_seconds
trim_delay = min(trim_delay, self._retry_config.max_delay_seconds)
logger.info(
"[AOAI_CTX_TRIM] sleeping %ss before retry",
round(trim_delay, 1),
)
await asyncio.sleep(trim_delay)
return await _retry_call(
lambda: parent_inner_get_response(
messages=trimmed, chat_options=chat_options, **kwargs
Expand Down Expand Up @@ -690,6 +690,18 @@ async def _tail():
if attempt_index >= attempts - 1:
# No more retries available.
raise

# Cool down before retrying — immediate retries after trimming
# tend to trigger 429s because the API hasn't recovered yet.
trim_delay = self._retry_config.base_delay_seconds * (
2**attempt_index
)
trim_delay = min(trim_delay, self._retry_config.max_delay_seconds)
logger.info(
"[AOAI_CTX_TRIM_STREAM] sleeping %ss before retry",
round(trim_delay, 1),
)
await asyncio.sleep(trim_delay)
continue

if not _looks_like_rate_limit(e) or attempt_index >= attempts - 1:
Expand Down
30 changes: 18 additions & 12 deletions src/processor/src/libs/agent_framework/mem0_async_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

"""Lazy-initialized async wrapper around the Mem0 vector-store memory backend."""

import os

from mem0 import AsyncMemory


Expand All @@ -17,6 +19,13 @@ async def get_memory(self):
return self._memory_instance

async def _create_memory(self):
endpoint = os.getenv("AZURE_OPENAI_ENDPOINT", "")
chat_deployment = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME", "gpt-5.1")
embedding_deployment = os.getenv(
"AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME", "text-embedding-3-large"
)
api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

config = {
"vector_store": {
"provider": "redis",
Expand All @@ -29,27 +38,24 @@ async def _create_memory(self):
"llm": {
"provider": "azure_openai",
"config": {
"model": "gpt-5.1",
"model": chat_deployment,
"temperature": 0.1,
"max_tokens": 100000,
"max_tokens": 4000,
"azure_kwargs": {
"azure_deployment": "gpt-5.1",
"api_version": "2024-12-01-preview",
"azure_endpoint": "https://aifappframework.cognitiveservices.azure.com/",
"azure_deployment": chat_deployment,
"api_version": api_version,
"azure_endpoint": endpoint,
},
},
},
"embedder": {
"provider": "azure_openai",
"config": {
"model": "text-embedding-3-large",
"model": embedding_deployment,
"azure_kwargs": {
"api_version": "2024-02-01",
"azure_deployment": "text-embedding-3-large",
"azure_endpoint": "https://aifappframework.openai.azure.com/",
"default_headers": {
"CustomHeader": "container migration",
},
"api_version": api_version,
"azure_deployment": embedding_deployment,
"azure_endpoint": endpoint,
},
},
},
Expand Down
Loading
Loading