Skip to content

Commit 5aab006

Browse files
barckcode and claude committed
fix: set OLLAMA_KEEP_ALIVE=-1 to prevent model eviction from RAM
On CPU-only servers, Ollama unloads models after 5 minutes of inactivity. Combined with ~9 min inference times, this means the model gets evicted between requests, causing repeated cold starts. Setting KEEP_ALIVE=-1 keeps models loaded indefinitely. Also clears the status_message after warm-up completes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ffbb7a8 commit 5aab006

3 files changed

Lines changed: 8 additions & 1 deletion

File tree

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
-0.4.5
+0.4.6

internal/api/handlers_teams.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ func (s *Server) deployTeamAsync(team models.Team) {
 		slog.Warn("ollama model warm-up failed (non-fatal)", "team", team.Name, "model", ollamaModel, "error", err)
 		// Non-fatal: the model will load on first request (slower).
 	}
+	s.db.Model(&team).Update("status_message", "Ollama model ready: "+ollamaModel)
 }

 // Track SharedInfra with thread-safe ref counting.

internal/runtime/ollama.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,12 @@ func (d *DockerRuntime) EnsureOllama(ctx context.Context) (string, error) {
 	resp, err := d.client.ContainerCreate(ctx,
 		&container.Config{
 			Image: OllamaImage,
+			Env: []string{
+				// Keep models loaded in RAM indefinitely. Without this, Ollama
+				// unloads models after 5 min of inactivity, causing multi-minute
+				// cold starts on CPU-only servers for the next request.
+				"OLLAMA_KEEP_ALIVE=-1",
+			},
 			Labels: map[string]string{
 				LabelInfra: "ollama",
 			},

0 commit comments

Comments (0)