From a2db150733469e180249b0f8f4e1f59183ff628f Mon Sep 17 00:00:00 2001 From: Dorin Geman Date: Thu, 5 Mar 2026 18:47:19 +0200 Subject: [PATCH] feat: make llama.cpp a deferred backend on macOS/Windows Signed-off-by: Dorin Geman --- Dockerfile | 2 +- cmd/cli/commands/install-runner.go | 11 ++ main.go | 34 +++--- pkg/envconfig/envconfig.go | 14 ++- pkg/inference/backend.go | 6 + pkg/inference/backends/llamacpp/download.go | 68 +++++------ .../backends/llamacpp/download_darwin.go | 7 +- .../backends/llamacpp/download_linux.go | 6 +- .../backends/llamacpp/download_windows.go | 9 +- pkg/inference/backends/llamacpp/llamacpp.go | 115 ++++++++++-------- pkg/inference/scheduling/http_handler.go | 2 + pkg/inference/scheduling/installer.go | 18 +++ pkg/routing/backends.go | 9 +- 13 files changed, 171 insertions(+), 130 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0c62726c4..0adb70cd4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ RUN --mount=type=cache,target=/go/pkg/mod \ CGO_ENABLED=1 GOOS=linux go build -tags=novllm -ldflags="-s -w -X main.Version=${VERSION}" -o model-runner . # --- Get llama.cpp binary --- -FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server +FROM docker/model-runner:llamacpp-${LLAMA_SERVER_VARIANT}-${LLAMA_SERVER_VERSION} AS llama-server # --- Final image --- FROM docker.io/${BASE_IMAGE} AS llamacpp diff --git a/cmd/cli/commands/install-runner.go b/cmd/cli/commands/install-runner.go index 0e5eee45f..ff9e87aec 100644 --- a/cmd/cli/commands/install-runner.go +++ b/cmd/cli/commands/install-runner.go @@ -253,6 +253,17 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error return nil } + // On macOS/Windows, the llama.cpp backend uses deferred installation. + // Trigger installation (and binary update) via the running model runner. + if opts.backend == llamacpp.Name && llamacpp.NeedsDeferredInstall() { + cmd.Println("Installing llama.cpp backend...") + if err := desktopClient.InstallBackend(llamacpp.Name); err != nil { + return fmt.Errorf("failed to install llama.cpp backend: %w", err) + } + cmd.Println("llama.cpp backend installed successfully") + return nil + } + // The diffusers backend uses deferred installation: it pulls a Docker // image, extracts a self-contained Python environment, and installs it // to a well-known local folder. Trigger installation via the running diff --git a/main.go b/main.go index 8034d9553..f7ab23cbf 100644 --- a/main.go +++ b/main.go @@ -10,7 +10,6 @@ import ( "net/http" "os" "os/signal" - "path/filepath" "strings" "syscall" "time" @@ -59,7 +58,10 @@ func main() { llamacpp.SetDesiredServerVersion(v) } - llamaServerPath := envconfig.LlamaServerPath() + llamaServerPath, err := envconfig.LlamaServerPath() + if err != nil { + log.Error("Failed to get llama.cpp server path", "error", err) + } vllmServerPath := envconfig.VLLMServerPath() sglangServerPath := envconfig.SGLangServerPath() mlxServerPath := envconfig.MLXServerPath() @@ -101,13 +103,6 @@ func main() { return } - updatedServerPath := func() string { - wd, _ := os.Getwd() - d := filepath.Join(wd, "updated-inference", "bin") - _ = os.MkdirAll(d, 0o755) - return d - }() - svc, err := routing.NewService(routing.ServiceConfig{ Log: log, ClientConfig: models.ClientConfig{ @@ -117,17 +112,16 @@ func main() { }, Backends: append( routing.DefaultBackendDefs(routing.BackendsConfig{ - Log: log, - LlamaCppVendoredPath: llamaServerPath, - LlamaCppUpdatedPath: updatedServerPath, - LlamaCppConfig: llamaCppConfig, - IncludeMLX: true, - MLXPath: mlxServerPath, - IncludeVLLM: includeVLLM, - VLLMPath: vllmServerPath, - VLLMMetalPath: vllmMetalServerPath, - IncludeDiffusers: true, - DiffusersPath: diffusersServerPath, + Log: log, + LlamaCppPath: llamaServerPath, + LlamaCppConfig: llamaCppConfig, + IncludeMLX: true, + MLXPath: mlxServerPath, + IncludeVLLM: includeVLLM, + VLLMPath: vllmServerPath, + VLLMMetalPath: vllmMetalServerPath, + IncludeDiffusers: true, + DiffusersPath: diffusersServerPath, }), routing.BackendDef{Name: sglang.Name, Init: func(mm *models.Manager) (inference.Backend, error) { return sglang.New(log, mm, log.With("component", sglang.Name), nil, sglangServerPath) diff --git a/pkg/envconfig/envconfig.go b/pkg/envconfig/envconfig.go index 3592b9139..4d8c74d29 100644 --- a/pkg/envconfig/envconfig.go +++ b/pkg/envconfig/envconfig.go @@ -105,13 +105,17 @@ func TCPPort() string { return Var("MODEL_RUNNER_PORT") } -// LlamaServerPath returns the path to the llama.cpp server binary. -// Configured via LLAMA_SERVER_PATH; defaults to the Docker Desktop bundle location. -func LlamaServerPath() string { +// LlamaServerPath returns the path to the directory containing the llama.cpp server binary. +// Configured via LLAMA_SERVER_PATH; defaults to ~/.docker/model-runner/llama.cpp/bin. +func LlamaServerPath() (string, error) { if s := Var("LLAMA_SERVER_PATH"); s != "" { - return s + return s, nil + } + home, err := os.UserHomeDir() + if err != nil { + return "", err } - return "/Applications/Docker.app/Contents/Resources/model-runner/bin" + return filepath.Join(home, ".docker", "model-runner", "llama.cpp", "bin"), nil } // LlamaArgs returns custom arguments to pass to the llama.cpp server. diff --git a/pkg/inference/backend.go b/pkg/inference/backend.go index e7c678a3d..c788f81fa 100644 --- a/pkg/inference/backend.go +++ b/pkg/inference/backend.go @@ -319,3 +319,9 @@ type Backend interface { // GetDiskUsage returns the disk usage of the backend. GetDiskUsage() (int64, error) } + +// BackendUpdater is an optional interface for backends that support +// downloading updated binaries after Install() has succeeded. +type BackendUpdater interface { + UpdateBinary(ctx context.Context, httpClient *http.Client) error +} diff --git a/pkg/inference/backends/llamacpp/download.go b/pkg/inference/backends/llamacpp/download.go index 1c339d66b..00b613024 100644 --- a/pkg/inference/backends/llamacpp/download.go +++ b/pkg/inference/backends/llamacpp/download.go @@ -20,10 +20,16 @@ import ( "github.com/docker/model-runner/pkg/logging" ) +// NeedsDeferredInstall returns true on platforms where the llama.cpp binary +// is downloaded on-demand rather than vendored in a container image. +func NeedsDeferredInstall() bool { + return runtime.GOOS == "darwin" || runtime.GOOS == "windows" +} + //nolint:unused // Used in platform-specific files (download_darwin.go, download_windows.go) const ( hubNamespace = "docker" - hubRepo = "docker-model-backend-llamacpp" + hubRepo = "model-runner" ) var ( @@ -51,7 +57,7 @@ func SetDesiredServerVersion(version string) { //nolint:unused // Used in platform-specific files (download_darwin.go, download_windows.go) func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client, - llamaCppPath, vendoredServerStoragePath, desiredVersion, desiredVariant string, + desiredVersion, desiredVariant string, ) error { ShouldUpdateServerLock.Lock() shouldUpdateServer := ShouldUpdateServer @@ -61,8 +67,11 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge return errLlamaCppUpdateDisabled } - log.Info("downloadLatestLlamaCpp", "desiredVersion", desiredVersion, "desiredVariant", desiredVariant, "vendoredServerStoragePath", vendoredServerStoragePath, "llamaCppPath", llamaCppPath) - desiredTag := desiredVersion + "-" + desiredVariant + llamaCppPath := filepath.Join(l.installDir, l.binaryName()) + versionFile := filepath.Join(l.installDir, ".llamacpp_version") + + log.Info("downloadLatestLlamaCpp", "desiredVersion", desiredVersion, "desiredVariant", desiredVariant, "installDir", l.installDir) + desiredTag := "llamacpp-" + desiredVariant + "-" + desiredVersion url := fmt.Sprintf("https://hub.docker.com/v2/namespaces/%s/repositories/%s/tags/%s", hubNamespace, hubRepo, desiredTag) resp, err := httpClient.Get(url) if err != nil { @@ -94,30 +103,18 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge return fmt.Errorf("could not find the %s tag", desiredTag) } - bundledVersionFile := filepath.Join(vendoredServerStoragePath, "com.docker.llama-server.digest") - currentVersionFile := filepath.Join(filepath.Dir(llamaCppPath), ".llamacpp_version") - - data, err := os.ReadFile(bundledVersionFile) - if err != nil { - return fmt.Errorf("failed to read bundled llama.cpp version: %w", err) - } else if strings.TrimSpace(string(data)) == latest { - l.setRunningStatus(log, filepath.Join(vendoredServerStoragePath, "com.docker.llama-server"), desiredTag, latest) - return errLlamaCppUpToDate - } - - data, err = os.ReadFile(currentVersionFile) - if err != nil { - log.Warn("failed to read current llama.cpp version", "error", err) - log.Warn("proceeding to update llama.cpp binary") - } else if strings.TrimSpace(string(data)) == latest { - log.Info("current llama.cpp version is already up to date") - if _, statErr := os.Stat(llamaCppPath); statErr == nil { - l.setRunningStatus(log, llamaCppPath, desiredTag, latest) - return nil + data, err := os.ReadFile(versionFile) + if err == nil { + if strings.TrimSpace(string(data)) == latest { + log.Info("current llama.cpp version is already up to date") + if _, statErr := os.Stat(llamaCppPath); statErr == nil { + l.setRunningStatus(log, llamaCppPath, desiredTag, latest) + return errLlamaCppUpToDate + } + log.Info("llama.cpp binary missing despite version match, proceeding to download") + } else { + log.Info("current llama.cpp version is outdated, proceeding to update", "current", strings.TrimSpace(string(data)), "latest", latest) } - log.Info("llama.cpp binary must be updated, proceeding to update it") - } else { - log.Info("current llama.cpp version is outdated, proceeding to update it", "current", strings.TrimSpace(string(data)), "latest", latest) } image := fmt.Sprintf("registry-1.docker.io/%s/%s@%s", hubNamespace, hubRepo, latest) @@ -132,32 +129,33 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge return fmt.Errorf("could not extract image: %w", extractErr) } - if err := os.RemoveAll(filepath.Dir(llamaCppPath)); err != nil && !errors.Is(err, os.ErrNotExist) { + libDir := filepath.Join(filepath.Dir(l.installDir), "lib") + if err := os.RemoveAll(l.installDir); err != nil && !errors.Is(err, os.ErrNotExist) { return fmt.Errorf("failed to clear inference binary dir: %w", err) } - if err := os.RemoveAll(filepath.Join(filepath.Dir(filepath.Dir(llamaCppPath)), "lib")); err != nil && !errors.Is(err, os.ErrNotExist) { + if err := os.RemoveAll(libDir); err != nil && !errors.Is(err, os.ErrNotExist) { return fmt.Errorf("failed to clear inference library dir: %w", err) } - if err := os.MkdirAll(filepath.Dir(filepath.Dir(llamaCppPath)), 0o755); err != nil { + if err := os.MkdirAll(filepath.Dir(l.installDir), 0o755); err != nil { return fmt.Errorf("could not create directory for llama.cpp artifacts: %w", err) } rootDir := fmt.Sprintf("com.docker.llama-server.native.%s.%s.%s", runtime.GOOS, desiredVariant, runtime.GOARCH) - if err := os.Rename(filepath.Join(downloadDir, rootDir, "bin"), filepath.Dir(llamaCppPath)); err != nil { + if err := os.Rename(filepath.Join(downloadDir, rootDir, "bin"), l.installDir); err != nil { return fmt.Errorf("could not move llama.cpp binary: %w", err) } if err := os.Chmod(llamaCppPath, 0o755); err != nil { return fmt.Errorf("could not chmod llama.cpp binary: %w", err) } - libDir := filepath.Join(downloadDir, rootDir, "lib") - fi, err := os.Stat(libDir) + srcLibDir := filepath.Join(downloadDir, rootDir, "lib") + fi, err := os.Stat(srcLibDir) if err != nil && !errors.Is(err, os.ErrNotExist) { return fmt.Errorf("failed to stat llama.cpp lib dir: %w", err) } if err == nil && fi.IsDir() { - if err := os.Rename(libDir, filepath.Join(filepath.Dir(filepath.Dir(llamaCppPath)), "lib")); err != nil { + if err := os.Rename(srcLibDir, libDir); err != nil { return fmt.Errorf("could not move llama.cpp libs: %w", err) } } @@ -166,7 +164,7 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge l.setRunningStatus(log, llamaCppPath, desiredTag, latest) log.Info(l.status) - if err := os.WriteFile(currentVersionFile, []byte(latest), 0o644); err != nil { + if err := os.WriteFile(versionFile, []byte(latest), 0o644); err != nil { log.Warn("failed to save llama.cpp version", "error", err) } diff --git a/pkg/inference/backends/llamacpp/download_darwin.go b/pkg/inference/backends/llamacpp/download_darwin.go index a1c0c9d29..279ce67b0 100644 --- a/pkg/inference/backends/llamacpp/download_darwin.go +++ b/pkg/inference/backends/llamacpp/download_darwin.go @@ -7,11 +7,8 @@ import ( "github.com/docker/model-runner/pkg/logging" ) -func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client, - llamaCppPath, vendoredServerStoragePath string, -) error { +func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client) error { desiredVersion := GetDesiredServerVersion() desiredVariant := "metal" - return l.downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion, - desiredVariant) + return l.downloadLatestLlamaCpp(ctx, log, httpClient, desiredVersion, desiredVariant) } diff --git a/pkg/inference/backends/llamacpp/download_linux.go b/pkg/inference/backends/llamacpp/download_linux.go index de7662473..8e64fed5b 100644 --- a/pkg/inference/backends/llamacpp/download_linux.go +++ b/pkg/inference/backends/llamacpp/download_linux.go @@ -8,9 +8,7 @@ import ( "github.com/docker/model-runner/pkg/logging" ) -func (l *llamaCpp) ensureLatestLlamaCpp(_ context.Context, log logging.Logger, _ *http.Client, - _, vendoredServerStoragePath string, -) error { - l.setRunningStatus(log, filepath.Join(vendoredServerStoragePath, "com.docker.llama-server"), "", "") +func (l *llamaCpp) ensureLatestLlamaCpp(_ context.Context, log logging.Logger, _ *http.Client) error { + l.setRunningStatus(log, filepath.Join(l.installDir, "com.docker.llama-server"), "", "") return errLlamaCppUpdateDisabled } diff --git a/pkg/inference/backends/llamacpp/download_windows.go b/pkg/inference/backends/llamacpp/download_windows.go index c60574c71..480a30216 100644 --- a/pkg/inference/backends/llamacpp/download_windows.go +++ b/pkg/inference/backends/llamacpp/download_windows.go @@ -11,10 +11,8 @@ import ( "github.com/docker/model-runner/pkg/logging" ) -func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client, - llamaCppPath, vendoredServerStoragePath string, -) error { - nvGPUInfoBin := filepath.Join(vendoredServerStoragePath, "com.docker.nv-gpu-info.exe") +func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client) error { + nvGPUInfoBin := filepath.Join(l.installDir, "com.docker.nv-gpu-info.exe") var canUseCUDA11, canUseOpenCL bool var err error ShouldUseGPUVariantLock.Lock() @@ -43,6 +41,5 @@ func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, desiredVariant = "opencl" } l.status = inference.FormatInstalling(fmt.Sprintf("%s llama.cpp %s", inference.DetailCheckingForUpdates, desiredVariant)) - return l.downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion, - desiredVariant) + return l.downloadLatestLlamaCpp(ctx, log, httpClient, desiredVersion, desiredVariant) } diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go index 62cfbd637..c6438fd13 100644 --- a/pkg/inference/backends/llamacpp/llamacpp.go +++ b/pkg/inference/backends/llamacpp/llamacpp.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "net/http" + "os" "os/exec" "path/filepath" "regexp" @@ -38,13 +39,11 @@ type llamaCpp struct { // modelManager is the shared model manager. modelManager *models.Manager // serverLog is the logger to use for the llama.cpp server process. - serverLog logging.Logger - updatedLlamaCpp bool - // vendoredServerStoragePath is the parent path of the vendored version of com.docker.llama-server. - vendoredServerStoragePath string - // updatedServerStoragePath is the parent path of the updated version of com.docker.llama-server. - // It is also where updates will be stored when downloaded. - updatedServerStoragePath string + serverLog logging.Logger + // installDir is the directory containing the llama.cpp binary. + // On macOS/Windows this is ~/.docker/model-runner/llama.cpp/bin (downloaded on demand). + // On Linux this is the vendored bin path inside the container (e.g. /app/bin). + installDir string // status is the state in which the llama.cpp backend is in. status string // config is the configuration for the llama.cpp backend. @@ -58,8 +57,7 @@ func New( log logging.Logger, modelManager *models.Manager, serverLog logging.Logger, - vendoredServerStoragePath string, - updatedServerStoragePath string, + installDir string, conf config.BackendConfig, ) (inference.Backend, error) { // If no config is provided, use the default configuration @@ -68,12 +66,11 @@ func New( } return &llamaCpp{ - log: log, - modelManager: modelManager, - serverLog: serverLog, - vendoredServerStoragePath: vendoredServerStoragePath, - updatedServerStoragePath: updatedServerStoragePath, - config: conf, + log: log, + modelManager: modelManager, + serverLog: serverLog, + installDir: installDir, + config: conf, }, nil } @@ -95,8 +92,6 @@ func (l *llamaCpp) UsesTCP() bool { // Install implements inference.Backend.Install. func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error { - l.updatedLlamaCpp = false - // We don't currently support this backend on Windows. We'll likely // never support it on Intel Macs. if (runtime.GOOS == "darwin" && runtime.GOARCH == "amd64") || @@ -104,26 +99,33 @@ func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error { return errors.New("platform not supported") } - llamaServerBin := "com.docker.llama-server" - if runtime.GOOS == "windows" { - llamaServerBin = "com.docker.llama-server.exe" - } - - // Temporary workaround for dynamically downloading llama.cpp from Docker Hub. - // Internet access and an available docker/docker-model-backend-llamacpp:latest on Docker Hub are required. - // Even if docker/docker-model-backend-llamacpp:latest has been downloaded before, we still require its - // digest to be equal to the one on Docker Hub. - llamaCppPath := filepath.Join(l.updatedServerStoragePath, llamaServerBin) - if err := l.ensureLatestLlamaCpp(ctx, l.log, httpClient, llamaCppPath, l.vendoredServerStoragePath); err != nil { - l.log.Info("Failed to ensure latest llama.cpp", "error", err) - if !errors.Is(err, errLlamaCppUpToDate) && !errors.Is(err, errLlamaCppUpdateDisabled) { - l.status = inference.FormatError(fmt.Sprintf("failed to install llama.cpp: %v", err)) - } - if errors.Is(err, context.Canceled) { - return err + llamaServerBin := l.binaryName() + + if NeedsDeferredInstall() { + binPath := filepath.Join(l.installDir, llamaServerBin) + if _, err := os.Stat(binPath); err == nil { + l.setRunningStatus(l.log, binPath, "", "") + } else { + if err := l.ensureLatestLlamaCpp(ctx, l.log, httpClient); err != nil { + l.log.Info("Failed to download llama.cpp", "error", err) + if !errors.Is(err, errLlamaCppUpToDate) && !errors.Is(err, errLlamaCppUpdateDisabled) { + l.status = inference.FormatError(fmt.Sprintf("failed to install llama.cpp: %v", err)) + } + if errors.Is(err, context.Canceled) { + return err + } + } } } else { - l.updatedLlamaCpp = true + if err := l.ensureLatestLlamaCpp(ctx, l.log, httpClient); err != nil { + l.log.Info("Failed to ensure latest llama.cpp", "error", err) + if !errors.Is(err, errLlamaCppUpToDate) && !errors.Is(err, errLlamaCppUpdateDisabled) { + l.status = inference.FormatError(fmt.Sprintf("failed to install llama.cpp: %v", err)) + } + if errors.Is(err, context.Canceled) { + return err + } + } } l.gpuSupported = l.checkGPUSupport(ctx) @@ -132,6 +134,23 @@ func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error { return nil } +// UpdateBinary implements inference.BackendUpdater. +func (l *llamaCpp) UpdateBinary(ctx context.Context, httpClient *http.Client) error { + versionFile := filepath.Join(l.installDir, ".llamacpp_version") + if _, err := os.Stat(versionFile); errors.Is(err, os.ErrNotExist) { + return nil + } + + if err := l.ensureLatestLlamaCpp(ctx, l.log, httpClient); err != nil { + if errors.Is(err, errLlamaCppUpToDate) || errors.Is(err, errLlamaCppUpdateDisabled) { + return nil + } + return err + } + l.gpuSupported = l.checkGPUSupport(ctx) + return nil +} + // Run implements inference.Backend.Run. func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode inference.BackendMode, config *inference.BackendConfiguration) error { bundle, err := l.modelManager.GetBundle(model) @@ -147,11 +166,6 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode } } - binPath := l.vendoredServerStoragePath - if l.updatedLlamaCpp { - binPath = l.updatedServerStoragePath - } - args, err := l.config.GetArgs(bundle, socket, mode, config) if err != nil { return fmt.Errorf("failed to get args for llama.cpp: %w", err) @@ -173,8 +187,8 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode return backends.RunBackend(ctx, backends.RunnerConfig{ BackendName: "llama.cpp", Socket: socket, - BinaryPath: filepath.Join(binPath, "com.docker.llama-server"), - SandboxPath: binPath, + BinaryPath: filepath.Join(l.installDir, "com.docker.llama-server"), + SandboxPath: l.installDir, SandboxConfig: sandbox.ConfigurationLlamaCpp, Args: args, Logger: l.log, @@ -188,7 +202,7 @@ func (l *llamaCpp) Status() string { } func (l *llamaCpp) GetDiskUsage() (int64, error) { - size, err := diskusage.Size(l.updatedServerStoragePath) + size, err := diskusage.Size(l.installDir) if err != nil { return 0, fmt.Errorf("error while getting store size: %w", err) } @@ -332,11 +346,14 @@ func getGGUFLayers(layers []oci.Layer) []oci.Layer { return filtered } -func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool { - binPath := l.vendoredServerStoragePath - if l.updatedLlamaCpp { - binPath = l.updatedServerStoragePath +func (l *llamaCpp) binaryName() string { + if runtime.GOOS == "windows" { + return "com.docker.llama-server.exe" } + return "com.docker.llama-server" +} + +func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool { var output bytes.Buffer llamaCppSandbox, err := sandbox.Create( ctx, @@ -345,8 +362,8 @@ func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool { command.Stdout = &output command.Stderr = &output }, - binPath, - filepath.Join(binPath, "com.docker.llama-server"), + l.installDir, + filepath.Join(l.installDir, "com.docker.llama-server"), "--list-devices", ) if err != nil { diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go index a9f3077b9..4ed00ec17 100644 --- a/pkg/inference/scheduling/http_handler.go +++ b/pkg/inference/scheduling/http_handler.go @@ -397,6 +397,8 @@ func (h *HTTPHandler) InstallBackend(w http.ResponseWriter, r *http.Request) { return } + h.scheduler.log.Info("backend installation succeeded", "backend", req.Backend) + w.WriteHeader(http.StatusOK) } diff --git a/pkg/inference/scheduling/installer.go b/pkg/inference/scheduling/installer.go index a31a975ed..d15f2e6e5 100644 --- a/pkg/inference/scheduling/installer.go +++ b/pkg/inference/scheduling/installer.go @@ -153,6 +153,24 @@ func (i *installer) run(ctx context.Context) { close(status.installed) } } + + // Background binary updates for backends that support it. + for name, backend := range i.backends { + select { + case <-ctx.Done(): + return + default: + } + if updater, ok := backend.(inference.BackendUpdater); ok { + i.log.Info("Checking for backend binary update", "backend", name) + if err := updater.UpdateBinary(ctx, i.httpClient); err != nil { + if ctx.Err() != nil { + return + } + i.log.Warn("Backend binary update failed", "backend", name, "error", err) + } + } + } } // wait waits for installation of the specified backend to complete or fail. diff --git a/pkg/routing/backends.go b/pkg/routing/backends.go index 3d0231171..c9ef4a5f0 100644 --- a/pkg/routing/backends.go +++ b/pkg/routing/backends.go @@ -21,9 +21,8 @@ type BackendsConfig struct { ServerLogFactory func(backendName string) logging.Logger // LlamaCpp settings (always included). - LlamaCppVendoredPath string - LlamaCppUpdatedPath string - LlamaCppConfig config.BackendConfig + LlamaCppPath string + LlamaCppConfig config.BackendConfig // Optional backends and their custom server paths. IncludeMLX bool @@ -49,8 +48,8 @@ func DefaultBackendDefs(cfg BackendsConfig) []BackendDef { } defs := []BackendDef{ - {Name: llamacpp.Name, Init: func(mm *models.Manager) (inference.Backend, error) { - return llamacpp.New(cfg.Log, mm, sl(llamacpp.Name), cfg.LlamaCppVendoredPath, cfg.LlamaCppUpdatedPath, cfg.LlamaCppConfig) + {Name: llamacpp.Name, Deferred: llamacpp.NeedsDeferredInstall(), Init: func(mm *models.Manager) (inference.Backend, error) { + return llamacpp.New(cfg.Log, mm, sl(llamacpp.Name), cfg.LlamaCppPath, cfg.LlamaCppConfig) }}, }