microsoft · Dongbumlee · Mar 19, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026
@@ -1,3 +1,37 @@
+# Include any files or directories that you don't want to be copied to your
+# container here (e.g., local build artifacts, temporary files, etc.).
+#
+# For more help, visit the .dockerignore file reference guide at
+# https://docs.docker.com/engine/reference/builder/#dockerignore-file
+
+**/.DS_Store
+**/__pycache__
+**/.venv
+**/.classpath
+**/.dockerignore
+**/.env
+**/.git
+**/.gitignore
+**/.project
+**/.settings
+**/.toolstarget
+**/.vs
+**/.vscode
+**/*.*proj.user
+**/*.dbmdl
+**/*.jfm
+**/bin
+**/charts
+**/docker-compose*
+**/compose*
+**/Dockerfile*
+**/node_modules
+**/npm-debug.log
+**/obj
+**/secrets.dev.yaml
+**/values.dev.yaml
+LICENSE
+README.md
 # UV and Python cache directories
 **/__pycache__/
 **/*.py[cod]

@@ -0,0 +1,9 @@
+{
+    "chat.tools.terminal.autoApprove": {
+        "/^cd H:\\\\Works\\\\Code-Migration\\\\Container-Migration-Solution-Accelerator\\\\src\\\\backend-api ; python -m ruff check src/ --fix 2>&1$/": {
+            "approve": true,
+            "matchCommandLine": true
+        },
+        "npx eslint": true
+    }
+}
@@ -78,6 +78,16 @@ param aiModelVersion string = '2025-04-16'
 @description('Optional. AI model deployment token capacity. Lower this if initial provisioning fails due to capacity; smaller values improve regional success rate. Defaults to 500.')
 param aiModelCapacity int = 500
 
+@minLength(1)
+@description('Optional. Name of the embedding model to deploy; also used as the deployment name. Defaults to text-embedding-3-large.')
+param aiEmbeddingModelName string = 'text-embedding-3-large'
+
+@description('Optional. Version of the embedding model. Defaults to 1.')
+param aiEmbeddingModelVersion string = '1'
+
+@description('Optional. Embedding model deployment token capacity. Defaults to 500.')
+param aiEmbeddingModelCapacity int = 500
+
 @description('Optional. The tags to apply to all deployed Azure resources.')
 param tags resourceInput<'Microsoft.Resources/resourceGroups@2025-04-01'>.tags = {}
 
@@ -761,6 +771,18 @@ module existingAiFoundryAiServicesDeployments 'modules/ai-services-deployments.b
           capacity: aiModelCapacity
         }
       }
+      {
+        name: aiEmbeddingModelName
+        model: {
+          format: 'OpenAI'
+          name: aiEmbeddingModelName
+          version: aiEmbeddingModelVersion
+        }
+        sku: {
+          name: 'Standard'
+          capacity: aiEmbeddingModelCapacity
+        }
+      }
     ]
     roleAssignments: [
       // Service Principal permissions
@@ -857,6 +879,18 @@ module aiFoundry 'br/public:avm/ptn/ai-ml/ai-foundry:0.4.0' = if(!useExistingAiF
           capacity: aiModelCapacity
         }
       }
+      {
+        name: aiEmbeddingModelName
+        model: {
+          format: 'OpenAI'
+          name: aiEmbeddingModelName
+          version: aiEmbeddingModelVersion
+        }
+        sku: {
+          name: 'Standard'
+          capacity: aiEmbeddingModelCapacity
+        }
+      }
     ]
     tags: allTags
     enableTelemetry: enableTelemetry
@@ -905,6 +939,10 @@ module appConfiguration 'br/public:avm/res/app-configuration/configuration-store
         name: 'AZURE_OPENAI_CHAT_DEPLOYMENT_NAME'
         value: aiModelDeploymentName
       }
+      {
+        name: 'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME'
+        value: aiEmbeddingModelName
+      }
       {
         name: 'AZURE_OPENAI_ENDPOINT'
         value: 'https://${aiServicesName}.cognitiveservices.azure.com/'

@@ -713,14 +713,14 @@ const BatchStoryPage = () => {
                       <Text size={500} weight="semibold" style={{ marginBottom: '12px', display: 'block' }}>Step Timeline</Text>
                       <div style={{ display: 'flex', flexDirection: 'column', gap: '8px' }}>
                         {(() => {
-                          const stepOrder = ['analysis', 'design', 'yaml', 'yaml_conversion', 'documentation'];
+                          const stepOrder = ['analysis', 'design', 'yaml', 'documentation'];
                           const stepLabels: Record<string, string> = {
                             'analysis': 'Analysis', 'design': 'Design', 'yaml': 'YAML Conversion',
-                            'yaml_conversion': 'YAML Conversion', 'documentation': 'Documentation'
+                            'documentation': 'Documentation'
                           };
                           const stepIcons: Record<string, string> = {
                             'analysis': '🔍', 'design': '📐', 'yaml': '📄',
-                            'yaml_conversion': '📄', 'documentation': '📝'
+                            'documentation': '📝'
                           };
                           const timings = telemetryData.step_timings;
                           const totalElapsed = Object.values(timings).reduce((sum: number, t: any) => sum + (t?.elapsed_seconds || 0), 0);
@@ -747,7 +747,7 @@ const BatchStoryPage = () => {
                                 const r = Array.isArray(stepResult.result) ? stepResult.result[0] : stepResult.result;
                                 if (key === 'analysis') {
                                   summary = `${r?.output?.platform_detected || ''} detected (${r?.output?.confidence_score || ''})`;
-                                } else if (key === 'yaml' || key === 'yaml_conversion') {
+                                } else if (key === 'yaml') {
                                   const metrics = r?.termination_output?.overall_conversion_metrics;
                                   if (metrics) summary = `${metrics.successful_conversions}/${metrics.total_files} files converted (${metrics.overall_accuracy})`;
                                 } else if (key === 'design') {
@@ -800,7 +800,7 @@ const BatchStoryPage = () => {
                           };
                           const stepLabels: Record<string, string> = {
                             'analysis': 'Analysis', 'design': 'Design', 'yaml': 'YAML',
-                            'yaml_conversion': 'YAML', 'documentation': 'Docs'
+                            'documentation': 'Docs'
                           };
 
                           return Object.entries(agents)

@@ -31,9 +31,9 @@ def _format_exc_brief(exc: BaseException) -> str:
 
 @dataclass(frozen=True)
 class RateLimitRetryConfig:
-    max_retries: int = 5
-    base_delay_seconds: float = 2.0
-    max_delay_seconds: float = 30.0
+    max_retries: int = 8
+    base_delay_seconds: float = 5.0
+    max_delay_seconds: float = 120.0
 
     @staticmethod
     def from_env(
@@ -54,9 +54,9 @@ def _float(name: str, default: float) -> float:
                 return default
 
         return RateLimitRetryConfig(
-            max_retries=max(0, _int(max_retries_env, 5)),
-            base_delay_seconds=max(0.0, _float(base_delay_env, 2.0)),
-            max_delay_seconds=max(0.0, _float(max_delay_env, 30.0)),
+            max_retries=max(0, _int(max_retries_env, 8)),
+            base_delay_seconds=max(0.0, _float(base_delay_env, 5.0)),
+            max_delay_seconds=max(0.0, _float(max_delay_env, 120.0)),
         )
 
 
@@ -69,6 +69,15 @@ def _looks_like_rate_limit(error: BaseException) -> bool:
     if status == 429:
         return True
 
+    # Empty message, or just the exception class name: treat as transient
+    # (likely connection reset / incomplete Azure front-end response) — retry.
+    if not msg or msg == str(type(error).__name__).lower():
+        return True
+
+    # Server errors (5xx) are transient and should be retried.
+    if isinstance(status, int) and 500 <= status < 600:
+        return True
+
     cause = getattr(error, "__cause__", None)
     if cause and cause is not error:
         return _looks_like_rate_limit(cause)
@@ -246,14 +255,14 @@ class ContextTrimConfig:
     """
 
     enabled: bool = True
-    # GPT-5.1 supports 272K input tokens (~800K chars). These defaults stay well
-    # within that budget while guarding against accidental large blob injection.
-    # Progressive trimming on retry will reduce these further if needed.
-    max_total_chars: int = 600_000
-    max_message_chars: int = 40_000
-    keep_last_messages: int = 50
-    keep_head_chars: int = 15_000
-    keep_tail_chars: int = 5_000
+    # GPT-5.1 supports 272K input tokens (~800K chars). With workspace context
+    # injected into system instructions (never trimmed) and Qdrant shared memory
+    # providing cross-step context, we can keep fewer conversation messages.
+    max_total_chars: int = 400_000
+    max_message_chars: int = 0  # Disabled — with keep_last_messages=15, per-message truncation is unnecessary
+    keep_last_messages: int = 15
+    keep_head_chars: int = 12_000
+    keep_tail_chars: int = 4_000
     keep_system_messages: bool = True
     retry_on_context_error: bool = True
 
@@ -284,7 +293,7 @@ def _bool(name: str, default: bool) -> bool:
             enabled=_bool(enabled_env, True),
             max_total_chars=max(0, _int(max_total_chars_env, 240_000)),
             max_message_chars=max(0, _int(max_message_chars_env, 20_000)),
-            keep_last_messages=max(1, _int(keep_last_messages_env, 40)),
+            keep_last_messages=max(1, _int(keep_last_messages_env, 15)),
             keep_head_chars=max(0, _int(keep_head_chars_env, 10_000)),
             keep_tail_chars=max(0, _int(keep_tail_chars_env, 3_000)),
             keep_system_messages=_bool(keep_system_messages_env, True),
@@ -299,42 +308,18 @@ def _trim_messages(
         return list(messages)
 
     # ──────────────────────────────────────────────────────────────────────
-    # Phase 0: Smart tool-result compression.
-    # Tool outputs (read_blob_content, save_content_to_blob, etc.) are the
-    # largest context consumers. Once an agent has responded after a tool
-    # call, the raw output is redundant — the agent's response is the
-    # distilled intelligence. We compress old tool results aggressively
-    # while keeping the most recent ones intact for the current agent turn.
+    # Phase 0: Summarize large save_content_to_blob calls.
+    # Write payloads are redundant once persisted — replace with a short
+    # summary. Read tool results are not touched in this phase (per-message
+    # truncation may still apply later when cfg.max_message_chars > 0).
     # ──────────────────────────────────────────────────────────────────────
-    KEEP_RECENT_TOOL_RESULTS = 4  # Keep the N most recent tool results in full
-    TOOL_RESULT_MAX_CHARS = 500  # Truncate older tool results to this size
     SAVE_ARG_MAX_CHARS = 200  # Truncate save_content_to_blob arguments
 
-    tool_result_indices: list[int] = []
     for i, m in enumerate(messages):
-        role = _get_message_role(m)
         text = _estimate_message_text(m)
-        if role == "tool" or (role is None and _looks_like_tool_result(text)):
-            tool_result_indices.append(i)
-        # Also detect save_content_to_blob in assistant/function messages
-        elif _looks_like_save_blob_call(text):
-            if len(text) > SAVE_ARG_MAX_CHARS:
-                # Extract just the blob name and byte count
-                summary = _summarize_save_blob(text, SAVE_ARG_MAX_CHARS)
-                messages[i] = _set_message_text(m, summary)
-
-    # Compress older tool results, keep recent ones in full
-    if len(tool_result_indices) > KEEP_RECENT_TOOL_RESULTS:
-        old_indices = tool_result_indices[:-KEEP_RECENT_TOOL_RESULTS]
-        for idx in old_indices:
-            m = messages[idx]
-            text = _estimate_message_text(m)
-            if len(text) > TOOL_RESULT_MAX_CHARS:
-                truncated = (
-                    text[:TOOL_RESULT_MAX_CHARS]
-                    + f"\n[... tool output truncated from {len(text)} chars ...]"
-                )
-                messages[idx] = _set_message_text(m, truncated)
+        if _looks_like_save_blob_call(text) and len(text) > SAVE_ARG_MAX_CHARS:
+            summary = _summarize_save_blob(text, SAVE_ARG_MAX_CHARS)
+            messages[i] = _set_message_text(m, summary)
 
     # Keep last N messages; optionally keep system messages from the head.
     system_messages: list[Any] = []
@@ -354,14 +339,21 @@ def _trim_messages(
     seen_fingerprints: set[tuple[str, str]] = set()
     cleaned: list[Any] = []
 
-    for m in tail:
+    for idx, m in enumerate(tail):
         text = _estimate_message_text(m)
         fp = (text[:200], text[-200:])
         if fp in seen_fingerprints:
             continue
         seen_fingerprints.add(fp)
 
-        if cfg.max_message_chars > 0 and len(text) > cfg.max_message_chars:
+        # Never truncate the last message — the agent needs it in full
+        # to reason about the most recent tool result or instruction.
+        is_last = idx == len(tail) - 1
+        if (
+            not is_last
+            and cfg.max_message_chars > 0
+            and len(text) > cfg.max_message_chars
+        ):
             text = _truncate_text(
                 text,
                 max_chars=cfg.max_message_chars,
@@ -584,6 +576,14 @@ async def _inner_get_response(
                 len(messages),
                 len(trimmed),
             )
+            # Cool down before retrying to avoid triggering 429s immediately.
+            trim_delay = self._retry_config.base_delay_seconds
+            trim_delay = min(trim_delay, self._retry_config.max_delay_seconds)
+            logger.info(
+                "[AOAI_CTX_TRIM] sleeping %ss before retry",
+                round(trim_delay, 1),
+            )
+            await asyncio.sleep(trim_delay)
             return await _retry_call(
                 lambda: parent_inner_get_response(
                     messages=trimmed, chat_options=chat_options, **kwargs
@@ -690,6 +690,18 @@ async def _tail():
                     if attempt_index >= attempts - 1:
                         # No more retries available.
                         raise
+
+                    # Cool down before retrying — immediate retries after trimming
+                    # tend to trigger 429s because the API hasn't recovered yet.
+                    trim_delay = self._retry_config.base_delay_seconds * (
+                        2**attempt_index
+                    )
+                    trim_delay = min(trim_delay, self._retry_config.max_delay_seconds)
+                    logger.info(
+                        "[AOAI_CTX_TRIM_STREAM] sleeping %ss before retry",
+                        round(trim_delay, 1),
+                    )
+                    await asyncio.sleep(trim_delay)
                     continue
 
                 if not _looks_like_rate_limit(e) or attempt_index >= attempts - 1:

@@ -3,6 +3,8 @@
 
 """Lazy-initialized async wrapper around the Mem0 vector-store memory backend."""
 
+import os
+
 from mem0 import AsyncMemory
 
 
@@ -17,6 +19,13 @@ async def get_memory(self):
         return self._memory_instance
 
     async def _create_memory(self):
+        endpoint = os.getenv("AZURE_OPENAI_ENDPOINT", "")
+        chat_deployment = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME", "gpt-5.1")
+        embedding_deployment = os.getenv(
+            "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME", "text-embedding-3-large"
+        )
+        api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")
+
         config = {
             "vector_store": {
                 "provider": "redis",
@@ -29,27 +38,24 @@ async def _create_memory(self):
             "llm": {
                 "provider": "azure_openai",
                 "config": {
-                    "model": "gpt-5.1",
+                    "model": chat_deployment,
                     "temperature": 0.1,
-                    "max_tokens": 100000,
+                    "max_tokens": 4000,
                     "azure_kwargs": {
-                        "azure_deployment": "gpt-5.1",
-                        "api_version": "2024-12-01-preview",
-                        "azure_endpoint": "https://aifappframework.cognitiveservices.azure.com/",
+                        "azure_deployment": chat_deployment,
+                        "api_version": api_version,
+                        "azure_endpoint": endpoint,
                     },
                 },
             },
             "embedder": {
                 "provider": "azure_openai",
                 "config": {
-                    "model": "text-embedding-3-large",
+                    "model": embedding_deployment,
                     "azure_kwargs": {
-                        "api_version": "2024-02-01",
-                        "azure_deployment": "text-embedding-3-large",
-                        "azure_endpoint": "https://aifappframework.openai.azure.com/",
-                        "default_headers": {
-                            "CustomHeader": "container migration",
-                        },
+                        "api_version": api_version,
+                        "azure_deployment": embedding_deployment,
+                        "azure_endpoint": endpoint,
                     },
                 },
             },