Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ RUN --mount=type=cache,target=/go/pkg/mod \
CGO_ENABLED=1 GOOS=linux go build -tags=novllm -ldflags="-s -w -X main.Version=${VERSION}" -o model-runner .

# --- Get llama.cpp binary ---
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
FROM docker/model-runner:llamacpp-${LLAMA_SERVER_VARIANT}-${LLAMA_SERVER_VERSION} AS llama-server

# --- Final image ---
FROM docker.io/${BASE_IMAGE} AS llamacpp
Expand Down
11 changes: 11 additions & 0 deletions cmd/cli/commands/install-runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,17 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error
return nil
}

// On macOS/Windows, the llama.cpp backend uses deferred installation.
// Trigger installation (and binary update) via the running model runner.
if opts.backend == llamacpp.Name && llamacpp.NeedsDeferredInstall() {
cmd.Println("Installing llama.cpp backend...")
if err := desktopClient.InstallBackend(llamacpp.Name); err != nil {
return fmt.Errorf("failed to install llama.cpp backend: %w", err)
}
cmd.Println("llama.cpp backend installed successfully")
return nil
}

// The diffusers backend uses deferred installation: it pulls a Docker
// image, extracts a self-contained Python environment, and installs it
// to a well-known local folder. Trigger installation via the running
Expand Down
34 changes: 14 additions & 20 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"net/http"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
Expand Down Expand Up @@ -59,7 +58,10 @@ func main() {
llamacpp.SetDesiredServerVersion(v)
}

llamaServerPath := envconfig.LlamaServerPath()
llamaServerPath, err := envconfig.LlamaServerPath()
if err != nil {
log.Error("Failed to get llama.cpp server path", "error", err)
}
Comment on lines +62 to +64
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The program continues to run even if envconfig.LlamaServerPath() returns an error. If this happens, llamaServerPath will be an empty string, which will cause the model runner to attempt to write to the current working directory instead of the intended installation directory. This can lead to unexpected behavior and potential permission issues. This error should be treated as fatal, and the program should exit.

Suggested change
if err != nil {
log.Error("Failed to get llama.cpp server path", "error", err)
}
if err != nil {
log.Error("Failed to get llama.cpp server path", "error", err)
exitFunc(1)
}

vllmServerPath := envconfig.VLLMServerPath()
sglangServerPath := envconfig.SGLangServerPath()
mlxServerPath := envconfig.MLXServerPath()
Expand Down Expand Up @@ -101,13 +103,6 @@ func main() {
return
}

updatedServerPath := func() string {
wd, _ := os.Getwd()
d := filepath.Join(wd, "updated-inference", "bin")
_ = os.MkdirAll(d, 0o755)
return d
}()

svc, err := routing.NewService(routing.ServiceConfig{
Log: log,
ClientConfig: models.ClientConfig{
Expand All @@ -117,17 +112,16 @@ func main() {
},
Backends: append(
routing.DefaultBackendDefs(routing.BackendsConfig{
Log: log,
LlamaCppVendoredPath: llamaServerPath,
LlamaCppUpdatedPath: updatedServerPath,
LlamaCppConfig: llamaCppConfig,
IncludeMLX: true,
MLXPath: mlxServerPath,
IncludeVLLM: includeVLLM,
VLLMPath: vllmServerPath,
VLLMMetalPath: vllmMetalServerPath,
IncludeDiffusers: true,
DiffusersPath: diffusersServerPath,
Log: log,
LlamaCppPath: llamaServerPath,
LlamaCppConfig: llamaCppConfig,
IncludeMLX: true,
MLXPath: mlxServerPath,
IncludeVLLM: includeVLLM,
VLLMPath: vllmServerPath,
VLLMMetalPath: vllmMetalServerPath,
IncludeDiffusers: true,
DiffusersPath: diffusersServerPath,
}),
routing.BackendDef{Name: sglang.Name, Init: func(mm *models.Manager) (inference.Backend, error) {
return sglang.New(log, mm, log.With("component", sglang.Name), nil, sglangServerPath)
Expand Down
14 changes: 9 additions & 5 deletions pkg/envconfig/envconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,17 @@ func TCPPort() string {
return Var("MODEL_RUNNER_PORT")
}

// LlamaServerPath returns the path to the llama.cpp server binary.
// Configured via LLAMA_SERVER_PATH; defaults to the Docker Desktop bundle location.
func LlamaServerPath() string {
// LlamaServerPath returns the path to the directory containing the llama.cpp server binary.
// Configured via LLAMA_SERVER_PATH; defaults to ~/.docker/model-runner/llama.cpp/bin.
func LlamaServerPath() (string, error) {
if s := Var("LLAMA_SERVER_PATH"); s != "" {
return s
return s, nil
}
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
return "/Applications/Docker.app/Contents/Resources/model-runner/bin"
return filepath.Join(home, ".docker", "model-runner", "llama.cpp", "bin"), nil
}

// LlamaArgs returns custom arguments to pass to the llama.cpp server.
Expand Down
6 changes: 6 additions & 0 deletions pkg/inference/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,3 +319,9 @@ type Backend interface {
// GetDiskUsage returns the disk usage of the backend.
GetDiskUsage() (int64, error)
}

// BackendUpdater is an optional interface for backends that support
// downloading updated binaries after Install() has succeeded. Backends that
// do not implement it are assumed to have no post-install update mechanism.
type BackendUpdater interface {
	// UpdateBinary downloads and installs the latest binary for this
	// backend, using httpClient for any network requests and honoring ctx
	// for cancellation.
	UpdateBinary(ctx context.Context, httpClient *http.Client) error
}
68 changes: 33 additions & 35 deletions pkg/inference/backends/llamacpp/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@ import (
"github.com/docker/model-runner/pkg/logging"
)

// NeedsDeferredInstall reports whether the llama.cpp binary is downloaded
// on-demand for the current platform rather than vendored in a container
// image. This is the case on macOS and Windows.
func NeedsDeferredInstall() bool {
	switch runtime.GOOS {
	case "darwin", "windows":
		return true
	default:
		return false
	}
}

//nolint:unused // Used in platform-specific files (download_darwin.go, download_windows.go)
const (
hubNamespace = "docker"
hubRepo = "docker-model-backend-llamacpp"
hubRepo = "model-runner"
)

var (
Expand Down Expand Up @@ -51,7 +57,7 @@ func SetDesiredServerVersion(version string) {

//nolint:unused // Used in platform-specific files (download_darwin.go, download_windows.go)
func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
llamaCppPath, vendoredServerStoragePath, desiredVersion, desiredVariant string,
desiredVersion, desiredVariant string,
) error {
ShouldUpdateServerLock.Lock()
shouldUpdateServer := ShouldUpdateServer
Expand All @@ -61,8 +67,11 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge
return errLlamaCppUpdateDisabled
}

log.Info("downloadLatestLlamaCpp", "desiredVersion", desiredVersion, "desiredVariant", desiredVariant, "vendoredServerStoragePath", vendoredServerStoragePath, "llamaCppPath", llamaCppPath)
desiredTag := desiredVersion + "-" + desiredVariant
llamaCppPath := filepath.Join(l.installDir, l.binaryName())
versionFile := filepath.Join(l.installDir, ".llamacpp_version")

log.Info("downloadLatestLlamaCpp", "desiredVersion", desiredVersion, "desiredVariant", desiredVariant, "installDir", l.installDir)
desiredTag := "llamacpp-" + desiredVariant + "-" + desiredVersion
url := fmt.Sprintf("https://hub.docker.com/v2/namespaces/%s/repositories/%s/tags/%s", hubNamespace, hubRepo, desiredTag)
resp, err := httpClient.Get(url)
if err != nil {
Expand Down Expand Up @@ -94,30 +103,18 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge
return fmt.Errorf("could not find the %s tag", desiredTag)
}

bundledVersionFile := filepath.Join(vendoredServerStoragePath, "com.docker.llama-server.digest")
currentVersionFile := filepath.Join(filepath.Dir(llamaCppPath), ".llamacpp_version")

data, err := os.ReadFile(bundledVersionFile)
if err != nil {
return fmt.Errorf("failed to read bundled llama.cpp version: %w", err)
} else if strings.TrimSpace(string(data)) == latest {
l.setRunningStatus(log, filepath.Join(vendoredServerStoragePath, "com.docker.llama-server"), desiredTag, latest)
return errLlamaCppUpToDate
}

data, err = os.ReadFile(currentVersionFile)
if err != nil {
log.Warn("failed to read current llama.cpp version", "error", err)
log.Warn("proceeding to update llama.cpp binary")
} else if strings.TrimSpace(string(data)) == latest {
log.Info("current llama.cpp version is already up to date")
if _, statErr := os.Stat(llamaCppPath); statErr == nil {
l.setRunningStatus(log, llamaCppPath, desiredTag, latest)
return nil
data, err := os.ReadFile(versionFile)
if err == nil {
if strings.TrimSpace(string(data)) == latest {
log.Info("current llama.cpp version is already up to date")
if _, statErr := os.Stat(llamaCppPath); statErr == nil {
l.setRunningStatus(log, llamaCppPath, desiredTag, latest)
return errLlamaCppUpToDate
}
log.Info("llama.cpp binary missing despite version match, proceeding to download")
} else {
log.Info("current llama.cpp version is outdated, proceeding to update", "current", strings.TrimSpace(string(data)), "latest", latest)
}
log.Info("llama.cpp binary must be updated, proceeding to update it")
} else {
log.Info("current llama.cpp version is outdated, proceeding to update it", "current", strings.TrimSpace(string(data)), "latest", latest)
}

image := fmt.Sprintf("registry-1.docker.io/%s/%s@%s", hubNamespace, hubRepo, latest)
Expand All @@ -132,32 +129,33 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge
return fmt.Errorf("could not extract image: %w", extractErr)
}

if err := os.RemoveAll(filepath.Dir(llamaCppPath)); err != nil && !errors.Is(err, os.ErrNotExist) {
libDir := filepath.Join(filepath.Dir(l.installDir), "lib")
if err := os.RemoveAll(l.installDir); err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("failed to clear inference binary dir: %w", err)
}
if err := os.RemoveAll(filepath.Join(filepath.Dir(filepath.Dir(llamaCppPath)), "lib")); err != nil && !errors.Is(err, os.ErrNotExist) {
if err := os.RemoveAll(libDir); err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("failed to clear inference library dir: %w", err)
}

if err := os.MkdirAll(filepath.Dir(filepath.Dir(llamaCppPath)), 0o755); err != nil {
if err := os.MkdirAll(filepath.Dir(l.installDir), 0o755); err != nil {
return fmt.Errorf("could not create directory for llama.cpp artifacts: %w", err)
}

rootDir := fmt.Sprintf("com.docker.llama-server.native.%s.%s.%s", runtime.GOOS, desiredVariant, runtime.GOARCH)
if err := os.Rename(filepath.Join(downloadDir, rootDir, "bin"), filepath.Dir(llamaCppPath)); err != nil {
if err := os.Rename(filepath.Join(downloadDir, rootDir, "bin"), l.installDir); err != nil {
return fmt.Errorf("could not move llama.cpp binary: %w", err)
}
if err := os.Chmod(llamaCppPath, 0o755); err != nil {
return fmt.Errorf("could not chmod llama.cpp binary: %w", err)
}

libDir := filepath.Join(downloadDir, rootDir, "lib")
fi, err := os.Stat(libDir)
srcLibDir := filepath.Join(downloadDir, rootDir, "lib")
fi, err := os.Stat(srcLibDir)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("failed to stat llama.cpp lib dir: %w", err)
}
if err == nil && fi.IsDir() {
if err := os.Rename(libDir, filepath.Join(filepath.Dir(filepath.Dir(llamaCppPath)), "lib")); err != nil {
if err := os.Rename(srcLibDir, libDir); err != nil {
return fmt.Errorf("could not move llama.cpp libs: %w", err)
}
}
Expand All @@ -166,7 +164,7 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge
l.setRunningStatus(log, llamaCppPath, desiredTag, latest)
log.Info(l.status)

if err := os.WriteFile(currentVersionFile, []byte(latest), 0o644); err != nil {
if err := os.WriteFile(versionFile, []byte(latest), 0o644); err != nil {
log.Warn("failed to save llama.cpp version", "error", err)
}

Expand Down
7 changes: 2 additions & 5 deletions pkg/inference/backends/llamacpp/download_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@ import (
"github.com/docker/model-runner/pkg/logging"
)

func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
llamaCppPath, vendoredServerStoragePath string,
) error {
func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client) error {
desiredVersion := GetDesiredServerVersion()
desiredVariant := "metal"
return l.downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion,
desiredVariant)
return l.downloadLatestLlamaCpp(ctx, log, httpClient, desiredVersion, desiredVariant)
}
6 changes: 2 additions & 4 deletions pkg/inference/backends/llamacpp/download_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@ import (
"github.com/docker/model-runner/pkg/logging"
)

func (l *llamaCpp) ensureLatestLlamaCpp(_ context.Context, log logging.Logger, _ *http.Client,
_, vendoredServerStoragePath string,
) error {
l.setRunningStatus(log, filepath.Join(vendoredServerStoragePath, "com.docker.llama-server"), "", "")
func (l *llamaCpp) ensureLatestLlamaCpp(_ context.Context, log logging.Logger, _ *http.Client) error {
l.setRunningStatus(log, filepath.Join(l.installDir, "com.docker.llama-server"), "", "")
return errLlamaCppUpdateDisabled
}
9 changes: 3 additions & 6 deletions pkg/inference/backends/llamacpp/download_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@ import (
"github.com/docker/model-runner/pkg/logging"
)

func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
llamaCppPath, vendoredServerStoragePath string,
) error {
nvGPUInfoBin := filepath.Join(vendoredServerStoragePath, "com.docker.nv-gpu-info.exe")
func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client) error {
nvGPUInfoBin := filepath.Join(l.installDir, "com.docker.nv-gpu-info.exe")
var canUseCUDA11, canUseOpenCL bool
var err error
ShouldUseGPUVariantLock.Lock()
Expand Down Expand Up @@ -43,6 +41,5 @@ func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger,
desiredVariant = "opencl"
}
l.status = inference.FormatInstalling(fmt.Sprintf("%s llama.cpp %s", inference.DetailCheckingForUpdates, desiredVariant))
return l.downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion,
desiredVariant)
return l.downloadLatestLlamaCpp(ctx, log, httpClient, desiredVersion, desiredVariant)
}
Loading