Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ RUN --mount=type=cache,target=/go/pkg/mod \
CGO_ENABLED=1 GOOS=linux go build -tags=novllm -ldflags="-s -w -X main.Version=${VERSION}" -o model-runner .

# --- Get llama.cpp binary ---
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
FROM docker/model-runner:llamacpp-${LLAMA_SERVER_VARIANT}-${LLAMA_SERVER_VERSION} AS llama-server

# --- Final image ---
FROM docker.io/${BASE_IMAGE} AS llamacpp
Expand Down
11 changes: 11 additions & 0 deletions cmd/cli/commands/install-runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,17 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error
return nil
}

// On macOS/Windows, the llama.cpp backend uses deferred installation.
// Trigger installation (and binary update) via the running model runner.
if opts.backend == llamacpp.Name && llamacpp.NeedsDeferredInstall() {
cmd.Println("Installing llama.cpp backend...")
if err := desktopClient.InstallBackend(llamacpp.Name); err != nil {
return fmt.Errorf("failed to install llama.cpp backend: %w", err)
}
cmd.Println("llama.cpp backend installed successfully")
return nil
}

// The diffusers backend uses deferred installation: it pulls a Docker
// image, extracts a self-contained Python environment, and installs it
// to a well-known local folder. Trigger installation via the running
Expand Down
34 changes: 14 additions & 20 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"net/http"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
Expand Down Expand Up @@ -59,7 +58,10 @@ func main() {
llamacpp.SetDesiredServerVersion(v)
}

llamaServerPath := envconfig.LlamaServerPath()
llamaServerPath, err := envconfig.LlamaServerPath()
if err != nil {
log.Error("Failed to get llama.cpp server path", "error", err)
}
Comment on lines +62 to +64
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The program continues to run even if envconfig.LlamaServerPath() returns an error. If this happens, llamaServerPath will be an empty string, which will cause the model runner to attempt to write to the current working directory instead of the intended installation directory. This can lead to unexpected behavior and potential permission issues. This error should be treated as fatal, and the program should exit.

Suggested change
if err != nil {
log.Error("Failed to get llama.cpp server path", "error", err)
}
if err != nil {
log.Error("Failed to get llama.cpp server path", "error", err)
exitFunc(1)
}

vllmServerPath := envconfig.VLLMServerPath()
sglangServerPath := envconfig.SGLangServerPath()
mlxServerPath := envconfig.MLXServerPath()
Expand Down Expand Up @@ -101,13 +103,6 @@ func main() {
return
}

updatedServerPath := func() string {
wd, _ := os.Getwd()
d := filepath.Join(wd, "updated-inference", "bin")
_ = os.MkdirAll(d, 0o755)
return d
}()

svc, err := routing.NewService(routing.ServiceConfig{
Log: log,
ClientConfig: models.ClientConfig{
Expand All @@ -117,17 +112,16 @@ func main() {
},
Backends: append(
routing.DefaultBackendDefs(routing.BackendsConfig{
Log: log,
LlamaCppVendoredPath: llamaServerPath,
LlamaCppUpdatedPath: updatedServerPath,
LlamaCppConfig: llamaCppConfig,
IncludeMLX: true,
MLXPath: mlxServerPath,
IncludeVLLM: includeVLLM,
VLLMPath: vllmServerPath,
VLLMMetalPath: vllmMetalServerPath,
IncludeDiffusers: true,
DiffusersPath: diffusersServerPath,
Log: log,
LlamaCppPath: llamaServerPath,
LlamaCppConfig: llamaCppConfig,
IncludeMLX: true,
MLXPath: mlxServerPath,
IncludeVLLM: includeVLLM,
VLLMPath: vllmServerPath,
VLLMMetalPath: vllmMetalServerPath,
IncludeDiffusers: true,
DiffusersPath: diffusersServerPath,
}),
routing.BackendDef{Name: sglang.Name, Init: func(mm *models.Manager) (inference.Backend, error) {
return sglang.New(log, mm, log.With("component", sglang.Name), nil, sglangServerPath)
Expand Down
14 changes: 9 additions & 5 deletions pkg/envconfig/envconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,17 @@ func TCPPort() string {
return Var("MODEL_RUNNER_PORT")
}

// LlamaServerPath returns the path to the llama.cpp server binary.
// Configured via LLAMA_SERVER_PATH; defaults to the Docker Desktop bundle location.
func LlamaServerPath() string {
// LlamaServerPath returns the path to the directory containing the llama.cpp server binary.
// Configured via LLAMA_SERVER_PATH; defaults to ~/.docker/model-runner/llama.cpp/bin.
func LlamaServerPath() (string, error) {
if s := Var("LLAMA_SERVER_PATH"); s != "" {
return s
return s, nil
}
home, err := os.UserHomeDir()
if err != nil {
return "", err
}
return "/Applications/Docker.app/Contents/Resources/model-runner/bin"
return filepath.Join(home, ".docker", "model-runner", "llama.cpp", "bin"), nil
}

// LlamaArgs returns custom arguments to pass to the llama.cpp server.
Expand Down
6 changes: 6 additions & 0 deletions pkg/inference/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,3 +319,9 @@ type Backend interface {
// GetDiskUsage returns the disk usage of the backend.
GetDiskUsage() (int64, error)
}

// BackendUpdater is an optional interface for backends that support
// downloading updated binaries after Install() has succeeded. Backends that
// do not implement it are assumed to have no post-install update mechanism.
type BackendUpdater interface {
	// UpdateBinary downloads and installs the latest binary for this
	// backend, using httpClient for any network requests and honoring ctx
	// for cancellation.
	UpdateBinary(ctx context.Context, httpClient *http.Client) error
}
68 changes: 33 additions & 35 deletions pkg/inference/backends/llamacpp/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@ import (
"github.com/docker/model-runner/pkg/logging"
)

// NeedsDeferredInstall reports whether the llama.cpp binary is downloaded
// on-demand for the current platform rather than vendored in a container
// image. This is the case on macOS and Windows.
func NeedsDeferredInstall() bool {
	switch runtime.GOOS {
	case "darwin", "windows":
		return true
	default:
		return false
	}
}

//nolint:unused // Used in platform-specific files (download_darwin.go, download_windows.go)
const (
hubNamespace = "docker"
hubRepo = "docker-model-backend-llamacpp"
hubRepo = "model-runner"
)

var (
Expand Down Expand Up @@ -51,7 +57,7 @@ func SetDesiredServerVersion(version string) {

//nolint:unused // Used in platform-specific files (download_darwin.go, download_windows.go)
func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
llamaCppPath, vendoredServerStoragePath, desiredVersion, desiredVariant string,
desiredVersion, desiredVariant string,
) error {
ShouldUpdateServerLock.Lock()
shouldUpdateServer := ShouldUpdateServer
Expand All @@ -61,8 +67,11 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge
return errLlamaCppUpdateDisabled
}

log.Info("downloadLatestLlamaCpp", "desiredVersion", desiredVersion, "desiredVariant", desiredVariant, "vendoredServerStoragePath", vendoredServerStoragePath, "llamaCppPath", llamaCppPath)
desiredTag := desiredVersion + "-" + desiredVariant
llamaCppPath := filepath.Join(l.installDir, l.binaryName())
versionFile := filepath.Join(l.installDir, ".llamacpp_version")

log.Info("downloadLatestLlamaCpp", "desiredVersion", desiredVersion, "desiredVariant", desiredVariant, "installDir", l.installDir)
desiredTag := "llamacpp-" + desiredVariant + "-" + desiredVersion
url := fmt.Sprintf("https://hub.docker.com/v2/namespaces/%s/repositories/%s/tags/%s", hubNamespace, hubRepo, desiredTag)
resp, err := httpClient.Get(url)
if err != nil {
Expand Down Expand Up @@ -94,30 +103,18 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge
return fmt.Errorf("could not find the %s tag", desiredTag)
}

bundledVersionFile := filepath.Join(vendoredServerStoragePath, "com.docker.llama-server.digest")
currentVersionFile := filepath.Join(filepath.Dir(llamaCppPath), ".llamacpp_version")

data, err := os.ReadFile(bundledVersionFile)
if err != nil {
return fmt.Errorf("failed to read bundled llama.cpp version: %w", err)
} else if strings.TrimSpace(string(data)) == latest {
l.setRunningStatus(log, filepath.Join(vendoredServerStoragePath, "com.docker.llama-server"), desiredTag, latest)
return errLlamaCppUpToDate
}

data, err = os.ReadFile(currentVersionFile)
if err != nil {
log.Warn("failed to read current llama.cpp version", "error", err)
log.Warn("proceeding to update llama.cpp binary")
} else if strings.TrimSpace(string(data)) == latest {
log.Info("current llama.cpp version is already up to date")
if _, statErr := os.Stat(llamaCppPath); statErr == nil {
l.setRunningStatus(log, llamaCppPath, desiredTag, latest)
return nil
data, err := os.ReadFile(versionFile)
if err == nil {
if strings.TrimSpace(string(data)) == latest {
log.Info("current llama.cpp version is already up to date")
if _, statErr := os.Stat(llamaCppPath); statErr == nil {
l.setRunningStatus(log, llamaCppPath, desiredTag, latest)
return errLlamaCppUpToDate
}
log.Info("llama.cpp binary missing despite version match, proceeding to download")
} else {
log.Info("current llama.cpp version is outdated, proceeding to update", "current", strings.TrimSpace(string(data)), "latest", latest)
}
log.Info("llama.cpp binary must be updated, proceeding to update it")
} else {
log.Info("current llama.cpp version is outdated, proceeding to update it", "current", strings.TrimSpace(string(data)), "latest", latest)
}

image := fmt.Sprintf("registry-1.docker.io/%s/%s@%s", hubNamespace, hubRepo, latest)
Expand All @@ -132,32 +129,33 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge
return fmt.Errorf("could not extract image: %w", extractErr)
}

if err := os.RemoveAll(filepath.Dir(llamaCppPath)); err != nil && !errors.Is(err, os.ErrNotExist) {
libDir := filepath.Join(filepath.Dir(l.installDir), "lib")
if err := os.RemoveAll(l.installDir); err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("failed to clear inference binary dir: %w", err)
}
if err := os.RemoveAll(filepath.Join(filepath.Dir(filepath.Dir(llamaCppPath)), "lib")); err != nil && !errors.Is(err, os.ErrNotExist) {
if err := os.RemoveAll(libDir); err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("failed to clear inference library dir: %w", err)
}

if err := os.MkdirAll(filepath.Dir(filepath.Dir(llamaCppPath)), 0o755); err != nil {
if err := os.MkdirAll(filepath.Dir(l.installDir), 0o755); err != nil {
return fmt.Errorf("could not create directory for llama.cpp artifacts: %w", err)
}

rootDir := fmt.Sprintf("com.docker.llama-server.native.%s.%s.%s", runtime.GOOS, desiredVariant, runtime.GOARCH)
if err := os.Rename(filepath.Join(downloadDir, rootDir, "bin"), filepath.Dir(llamaCppPath)); err != nil {
if err := os.Rename(filepath.Join(downloadDir, rootDir, "bin"), l.installDir); err != nil {
return fmt.Errorf("could not move llama.cpp binary: %w", err)
}
if err := os.Chmod(llamaCppPath, 0o755); err != nil {
return fmt.Errorf("could not chmod llama.cpp binary: %w", err)
}

libDir := filepath.Join(downloadDir, rootDir, "lib")
fi, err := os.Stat(libDir)
srcLibDir := filepath.Join(downloadDir, rootDir, "lib")
fi, err := os.Stat(srcLibDir)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("failed to stat llama.cpp lib dir: %w", err)
}
if err == nil && fi.IsDir() {
if err := os.Rename(libDir, filepath.Join(filepath.Dir(filepath.Dir(llamaCppPath)), "lib")); err != nil {
if err := os.Rename(srcLibDir, libDir); err != nil {
return fmt.Errorf("could not move llama.cpp libs: %w", err)
}
}
Expand All @@ -166,7 +164,7 @@ func (l *llamaCpp) downloadLatestLlamaCpp(ctx context.Context, log logging.Logge
l.setRunningStatus(log, llamaCppPath, desiredTag, latest)
log.Info(l.status)

if err := os.WriteFile(currentVersionFile, []byte(latest), 0o644); err != nil {
if err := os.WriteFile(versionFile, []byte(latest), 0o644); err != nil {
log.Warn("failed to save llama.cpp version", "error", err)
}

Expand Down
7 changes: 2 additions & 5 deletions pkg/inference/backends/llamacpp/download_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@ import (
"github.com/docker/model-runner/pkg/logging"
)

func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
llamaCppPath, vendoredServerStoragePath string,
) error {
func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client) error {
desiredVersion := GetDesiredServerVersion()
desiredVariant := "metal"
return l.downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion,
desiredVariant)
return l.downloadLatestLlamaCpp(ctx, log, httpClient, desiredVersion, desiredVariant)
}
6 changes: 2 additions & 4 deletions pkg/inference/backends/llamacpp/download_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@ import (
"github.com/docker/model-runner/pkg/logging"
)

func (l *llamaCpp) ensureLatestLlamaCpp(_ context.Context, log logging.Logger, _ *http.Client,
_, vendoredServerStoragePath string,
) error {
l.setRunningStatus(log, filepath.Join(vendoredServerStoragePath, "com.docker.llama-server"), "", "")
func (l *llamaCpp) ensureLatestLlamaCpp(_ context.Context, log logging.Logger, _ *http.Client) error {
l.setRunningStatus(log, filepath.Join(l.installDir, "com.docker.llama-server"), "", "")
return errLlamaCppUpdateDisabled
}
9 changes: 3 additions & 6 deletions pkg/inference/backends/llamacpp/download_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@ import (
"github.com/docker/model-runner/pkg/logging"
)

func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client,
llamaCppPath, vendoredServerStoragePath string,
) error {
nvGPUInfoBin := filepath.Join(vendoredServerStoragePath, "com.docker.nv-gpu-info.exe")
func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger, httpClient *http.Client) error {
nvGPUInfoBin := filepath.Join(l.installDir, "com.docker.nv-gpu-info.exe")
var canUseCUDA11, canUseOpenCL bool
var err error
ShouldUseGPUVariantLock.Lock()
Expand Down Expand Up @@ -43,6 +41,5 @@ func (l *llamaCpp) ensureLatestLlamaCpp(ctx context.Context, log logging.Logger,
desiredVariant = "opencl"
}
l.status = inference.FormatInstalling(fmt.Sprintf("%s llama.cpp %s", inference.DetailCheckingForUpdates, desiredVariant))
return l.downloadLatestLlamaCpp(ctx, log, httpClient, llamaCppPath, vendoredServerStoragePath, desiredVersion,
desiredVariant)
return l.downloadLatestLlamaCpp(ctx, log, httpClient, desiredVersion, desiredVariant)
}
Loading