Skip to content

Commit 5aab006

Browse files
barckcode and claude committed
fix: set OLLAMA_KEEP_ALIVE=-1 to prevent model eviction from RAM
On CPU-only servers, Ollama unloads models after 5 minutes of inactivity. Combined with ~9 min inference times, this means the model gets evicted between requests, causing repeated cold starts. Setting KEEP_ALIVE=-1 keeps models loaded indefinitely. Also clears the status_message after warm-up completes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ffbb7a8 commit 5aab006

3 files changed

Lines changed: 8 additions & 1 deletion

File tree

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
-0.4.5
+0.4.6

internal/api/handlers_teams.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ func (s *Server) deployTeamAsync(team models.Team) {
 		slog.Warn("ollama model warm-up failed (non-fatal)", "team", team.Name, "model", ollamaModel, "error", err)
 		// Non-fatal: the model will load on first request (slower).
 	}
+	s.db.Model(&team).Update("status_message", "Ollama model ready: "+ollamaModel)
 }

 // Track SharedInfra with thread-safe ref counting.

internal/runtime/ollama.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,12 @@ func (d *DockerRuntime) EnsureOllama(ctx context.Context) (string, error) {
 	resp, err := d.client.ContainerCreate(ctx,
 		&container.Config{
 			Image: OllamaImage,
+			Env: []string{
+				// Keep models loaded in RAM indefinitely. Without this, Ollama
+				// unloads models after 5 min of inactivity, causing multi-minute
+				// cold starts on CPU-only servers for the next request.
+				"OLLAMA_KEEP_ALIVE=-1",
+			},
 			Labels: map[string]string{
 				LabelInfra: "ollama",
 			},

0 commit comments

Comments (0)