From d6278f82823fd79cda9840fd512a16fea766be1b Mon Sep 17 00:00:00 2001 From: moizpgedge Date: Thu, 18 Jun 2026 22:30:50 +0500 Subject: [PATCH 1/2] fix: restart MCP container on KB config change KB config changes (path, provider, model, key) had no effect after an update: config.yaml is bind-mounted so SIGHUP-only reload kept the old KB path, causing "file not found" / SQLite errors. Embed a PGEDGE_CONFIG_VERSION hash in the MCP container env (like RAG) so a config change forces a restart and re-initializes the KB. Also refresh the service instance's UpdatedAt before an update redeploy so the monitor doesn't briefly mark it failed during the restart. --- .../orchestrator/swarm/service_instance.go | 9 ++++ .../orchestrator/swarm/service_spec.go | 22 ++++++-- .../orchestrator/swarm/service_spec_test.go | 51 ++++++++++++++++--- 3 files changed, 71 insertions(+), 11 deletions(-) diff --git a/server/internal/orchestrator/swarm/service_instance.go b/server/internal/orchestrator/swarm/service_instance.go index 25786566..22a72c95 100644 --- a/server/internal/orchestrator/swarm/service_instance.go +++ b/server/internal/orchestrator/swarm/service_instance.go @@ -144,6 +144,15 @@ func (s *ServiceInstanceResource) Update(ctx context.Context, rc *resource.Conte } } + // Refresh UpdatedAt before redeploying so the monitor's running-state grace + // period covers the restart window. An update that changes the container + // (e.g. a config change) restarts it, during which the container has no + // status; without bumping UpdatedAt the monitor would briefly mark the + // instance failed mid-restart even though it recovers seconds later. + if dbSvc, err := do.Invoke[*database.Service](rc.Injector); err == nil { + _ = dbSvc.SetServiceInstanceState(ctx, s.DatabaseID, s.ServiceInstanceID, database.ServiceInstanceStateRunning) + } + return s.deploy(ctx, rc) } diff --git a/server/internal/orchestrator/swarm/service_spec.go b/server/internal/orchestrator/swarm/service_spec.go index 7535d376..0a8ebe38 100644 --- a/server/internal/orchestrator/swarm/service_spec.go +++ b/server/internal/orchestrator/swarm/service_spec.go @@ -42,11 +42,14 @@ func buildPostgRESTEnvVars() []string { } } -// ragConfigHash returns a short hex digest of the RAG service configuration. +// serviceConfigHash returns a short hex digest of a service's configuration. // It is embedded in the container spec as PGEDGE_CONFIG_VERSION so that Docker // Swarm detects a TaskTemplate change and restarts the container whenever the -// pipeline configuration or API keys change. -func ragConfigHash(config map[string]any) string { +// configuration or API keys change. This is required for services whose config +// lives in a bind-mounted file (config.yaml / pipelines): editing the file is +// invisible to Swarm, so without a TaskTemplate change the container is never +// restarted and the new config is never re-read. +func serviceConfigHash(config map[string]any) string { b, _ := json.Marshal(config) sum := sha256.Sum256(b) return fmt.Sprintf("%x", sum[:8]) @@ -182,6 +185,17 @@ func ServiceContainerSpec(opts *ServiceContainerSpecOptions) (swarm.ServiceSpec, // Override the default container entrypoint to specify config path on bind mount. command = []string{"/app/pgedge-postgres-mcp"} args = []string{"-config", "/app/data/config.yaml"} + // Embed a hash of the service config so that Docker Swarm detects a + // TaskTemplate change and restarts the container when the config changes. + // The MCP config (config.yaml) lives on a bind mount, so edits to it are + // invisible to Swarm. SIGHUP reloads the database client connections but + // does NOT re-initialize the knowledgebase, so KB config changes (path, + // provider, model, key) only take effect on a restart. Without this, a + // changed kb_database_host_path silently keeps using the old KB file. + // Connection details (hosts, target_session_attrs) are intentionally not + // part of the config map, so failover-driven reconnects still use SIGHUP + // without forcing a restart. + env = []string{"PGEDGE_CONFIG_VERSION=" + serviceConfigHash(opts.ServiceSpec.Config)} healthcheck = &container.HealthConfig{ Test: []string{"CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"}, StartPeriod: serviceHealthCheckStartPeriod, @@ -202,7 +216,7 @@ func ServiceContainerSpec(opts *ServiceContainerSpecOptions) (swarm.ServiceSpec, // Embed a hash of the service config so that Docker Swarm detects a // TaskTemplate change and restarts the container when pipelines or API // keys change. Without this, bind-mount updates are invisible to Swarm. - env = []string{"PGEDGE_CONFIG_VERSION=" + ragConfigHash(opts.ServiceSpec.Config)} + env = []string{"PGEDGE_CONFIG_VERSION=" + serviceConfigHash(opts.ServiceSpec.Config)} // No curl in the RHEL minimal image — use a TCP probe instead. healthcheck = &container.HealthConfig{ Test: []string{"CMD-SHELL", "exec 3<>/dev/tcp/127.0.0.1/8080"}, diff --git a/server/internal/orchestrator/swarm/service_spec_test.go b/server/internal/orchestrator/swarm/service_spec_test.go index 4a66e1b0..e7957f93 100644 --- a/server/internal/orchestrator/swarm/service_spec_test.go +++ b/server/internal/orchestrator/swarm/service_spec_test.go @@ -106,9 +106,11 @@ func TestServiceContainerSpec(t *testing.T) { if m.Target != "/app/data" { t.Errorf("mount target = %v, want /app/data", m.Target) } - // No env vars for config (config is via file) - if len(spec.Env) > 0 { - t.Errorf("expected no env vars, got %d: %v", len(spec.Env), spec.Env) + // Config is delivered via a bind-mounted file, but a config-version + // env var is set so Swarm restarts the container when the config + // changes (the file edit alone is invisible to Swarm). + if len(spec.Env) != 1 || !strings.HasPrefix(spec.Env[0], "PGEDGE_CONFIG_VERSION=") { + t.Errorf("expected only PGEDGE_CONFIG_VERSION env, got %d: %v", len(spec.Env), spec.Env) } // Healthcheck should be set if spec.Healthcheck == nil { @@ -361,8 +363,8 @@ func TestRagConfigHash(t *testing.T) { }, } - h1 := ragConfigHash(cfg) - h2 := ragConfigHash(cfg) + h1 := serviceConfigHash(cfg) + h2 := serviceConfigHash(cfg) assert.Equal(t, h1, h2, "same config must produce the same hash") assert.Len(t, h1, 16, "hash should be 16 hex chars (8 bytes)") @@ -371,7 +373,7 @@ func TestRagConfigHash(t *testing.T) { map[string]any{"name": "default", "api_key": "sk-xyz"}, }, } - assert.NotEqual(t, h1, ragConfigHash(changed), "different config must produce a different hash") + assert.NotEqual(t, h1, serviceConfigHash(changed), "different config must produce a different hash") } func TestServiceContainerSpec_RAGHasConfigVersionEnv(t *testing.T) { @@ -412,7 +414,7 @@ func TestServiceContainerSpec_RAGHasConfigVersionEnv(t *testing.T) { } } require.NotEmpty(t, configVersion, "RAG container spec must have PGEDGE_CONFIG_VERSION env var") - assert.Equal(t, ragConfigHash(config), configVersion) + assert.Equal(t, serviceConfigHash(config), configVersion) // Changing the config must produce a different env var value. opts.ServiceSpec.Config = map[string]any{ @@ -633,3 +635,38 @@ func TestServiceContainerSpec_MCP_KBMount(t *testing.T) { t.Error("KB mount must be read-only") } } + +func TestServiceContainerSpec_MCPHasConfigVersionEnv(t *testing.T) { + // The MCP config lives in a bind-mounted config.yaml. Editing it is invisible + // to Swarm, and SIGHUP does not re-initialize the knowledgebase, so a KB config + // change must alter the TaskTemplate to force a restart. We embed a config hash + // in PGEDGE_CONFIG_VERSION exactly as RAG does. + opts := makeMCPSpecOpts() + opts.ServiceSpec.Config = map[string]any{ + "kb_enabled": true, + "kb_embedding_provider": "openai", + "kb_embedding_model": "text-embedding-3-small", + "kb_database_host_path": "/var/lib/pgedge/kb/nla-kb.db", + } + + configVersion := func(o *ServiceContainerSpecOptions) string { + spec, err := ServiceContainerSpec(o) + require.NoError(t, err) + for _, e := range spec.TaskTemplate.ContainerSpec.Env { + if strings.HasPrefix(e, "PGEDGE_CONFIG_VERSION=") { + return strings.TrimPrefix(e, "PGEDGE_CONFIG_VERSION=") + } + } + return "" + } + + v1 := configVersion(opts) + require.NotEmpty(t, v1, "MCP container spec must have PGEDGE_CONFIG_VERSION env var") + assert.Equal(t, serviceConfigHash(opts.ServiceSpec.Config), v1) + + // Renaming the KB file (changing kb_database_host_path) must change the env + // var so Swarm restarts the container and the new KB path takes effect. + opts.ServiceSpec.Config["kb_database_host_path"] = "/var/lib/pgedge/kb/nla-kb-openai.db" + v2 := configVersion(opts) + assert.NotEqual(t, v1, v2, "changing the KB path must change PGEDGE_CONFIG_VERSION so the container restarts") +} From 57264534b7a240621fd60fc7e3e782c2de83babb Mon Sep 17 00:00:00 2001 From: moizpgedge Date: Thu, 18 Jun 2026 22:46:06 +0500 Subject: [PATCH 2/2] small fix --- .../internal/orchestrator/swarm/service_instance.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/server/internal/orchestrator/swarm/service_instance.go b/server/internal/orchestrator/swarm/service_instance.go index 22a72c95..b94a978b 100644 --- a/server/internal/orchestrator/swarm/service_instance.go +++ b/server/internal/orchestrator/swarm/service_instance.go @@ -149,8 +149,17 @@ func (s *ServiceInstanceResource) Update(ctx context.Context, rc *resource.Conte // (e.g. a config change) restarts it, during which the container has no // status; without bumping UpdatedAt the monitor would briefly mark the // instance failed mid-restart even though it recovers seconds later. - if dbSvc, err := do.Invoke[*database.Service](rc.Injector); err == nil { - _ = dbSvc.SetServiceInstanceState(ctx, s.DatabaseID, s.ServiceInstanceID, database.ServiceInstanceStateRunning) + // Best-effort: a failure here only risks that transient false "failed" + // state, so we log and continue rather than block the redeploy. + logger, logErr := do.Invoke[zerolog.Logger](rc.Injector) + dbSvc, refreshErr := do.Invoke[*database.Service](rc.Injector) + if refreshErr == nil { + refreshErr = dbSvc.SetServiceInstanceState(ctx, s.DatabaseID, s.ServiceInstanceID, database.ServiceInstanceStateRunning) + } + if refreshErr != nil && logErr == nil { + logger.Warn().Err(refreshErr). + Str("service_instance_id", s.ServiceInstanceID). + Msg("failed to refresh service instance UpdatedAt before redeploy; monitor may briefly mark it failed during the restart") } return s.deploy(ctx, rc)