5 changes: 3 additions & 2 deletions invokeai/app/invocations/cogview4_text_encoder.py
@@ -10,7 +10,6 @@
CogView4ConditioningInfo,
ConditioningFieldData,
)
from invokeai.backend.util.devices import TorchDevice

# The CogView4 GLM Text Encoder max sequence length set based on the default in diffusers.
COGVIEW4_GLM_MAX_SEQ_LEN = 1024
@@ -37,6 +36,8 @@ class CogView4TextEncoderInvocation(BaseInvocation):
@torch.no_grad()
def invoke(self, context: InvocationContext) -> CogView4ConditioningOutput:
glm_embeds = self._glm_encode(context, max_seq_len=COGVIEW4_GLM_MAX_SEQ_LEN)
# Move embeddings to CPU for storage to save VRAM
glm_embeds = glm_embeds.detach().to("cpu")
conditioning_data = ConditioningFieldData(conditionings=[CogView4ConditioningInfo(glm_embeds=glm_embeds)])
conditioning_name = context.conditioning.save(conditioning_data)
return CogView4ConditioningOutput.build(conditioning_name)
@@ -85,7 +86,7 @@ def _glm_encode(self, context: InvocationContext, max_seq_len: int) -> torch.Tensor:
)
text_input_ids = torch.cat([pad_ids, text_input_ids], dim=1)
prompt_embeds = glm_text_encoder(
text_input_ids.to(TorchDevice.choose_torch_device()), output_hidden_states=True
text_input_ids.to(glm_text_encoder.device), output_hidden_states=True
).hidden_states[-2]

assert isinstance(prompt_embeds, torch.Tensor)
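The CogView4 change above establishes the pattern that the FLUX, SD3 and Z-Image encoders below repeat: encode on whatever device the text encoder actually occupies, then detach the conditioning to CPU before it is saved, so stored embeddings no longer pin VRAM. A minimal sketch of the idea with hypothetical helper names (the real nodes persist via `context.conditioning.save`, and the denoise nodes handle the reload):

```python
import torch

def encode_and_park(text_encoder: torch.nn.Module, token_ids: torch.Tensor) -> torch.Tensor:
    """Run the encoder on whatever device it lives on, then park the result on CPU."""
    device = next(text_encoder.parameters()).device
    with torch.no_grad():
        embeds = text_encoder(token_ids.to(device))
    # Detaching and moving to CPU means the stored conditioning no longer holds VRAM.
    return embeds.detach().to("cpu")

def reload_for_denoiser(stored_embeds: torch.Tensor, denoiser_device: torch.device) -> torch.Tensor:
    # The tensor only returns to the GPU (or wherever the denoiser runs) at the point of use.
    return stored_embeds.to(denoiser_device)

# Toy usage with an Embedding layer standing in for a real text encoder:
encoder = torch.nn.Embedding(1000, 64)
parked = encode_and_park(encoder, torch.tensor([[1, 2, 3]]))
assert parked.device.type == "cpu"
```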
4 changes: 2 additions & 2 deletions invokeai/app/invocations/compel.py
@@ -103,7 +103,7 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
textual_inversion_manager=ti_manager,
dtype_for_device_getter=TorchDevice.choose_torch_dtype,
truncate_long_prompts=False,
device=TorchDevice.choose_torch_device(),
device=text_encoder.device, # Use the device the model is actually on
split_long_text_mode=SplitLongTextMode.SENTENCES,
)

@@ -212,7 +212,7 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
truncate_long_prompts=False, # TODO:
returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, # TODO: clip skip
requires_pooled=get_pooled,
device=TorchDevice.choose_torch_device(),
device=text_encoder.device, # Use the device the model is actually on
split_long_text_mode=SplitLongTextMode.SENTENCES,
)

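Passing `text_encoder.device` instead of the globally chosen device matters once an encoder can be pinned to CPU while CUDA is available: Compel then tokenizes and embeds against the device the weights are actually on. A rough illustration of the mismatch the old behaviour could produce (a stand-in module, not the real encoder):

```python
import torch

encoder = torch.nn.Embedding(49408, 768)        # stand-in for a text encoder kept on CPU
token_ids = torch.tensor([[101, 2023, 102]])

if torch.cuda.is_available():
    try:
        # Old behaviour: inputs went to the globally chosen "best" device,
        # which breaks when the encoder's weights deliberately stay on CPU.
        encoder(token_ids.to("cuda"))
    except RuntimeError as err:
        print(f"device mismatch: {err}")

# New behaviour: inputs follow the model's actual device, so a CPU-only encoder just works.
hidden = encoder(token_ids.to(next(encoder.parameters()).device))
```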
6 changes: 6 additions & 0 deletions invokeai/app/invocations/flux_text_encoder.py
@@ -58,6 +58,12 @@ def invoke(self, context: InvocationContext) -> FluxConditioningOutput:
# scoped. This ensures that the T5 model can be freed and gc'd before loading the CLIP model (if necessary).
t5_embeddings = self._t5_encode(context)
clip_embeddings = self._clip_encode(context)

# Move embeddings to CPU for storage to save VRAM
# They will be moved to the appropriate device when used by the denoiser
t5_embeddings = t5_embeddings.detach().to("cpu")
clip_embeddings = clip_embeddings.detach().to("cpu")

conditioning_data = ConditioningFieldData(
conditionings=[FLUXConditioningInfo(clip_embeds=clip_embeddings, t5_embeds=t5_embeddings)]
)
14 changes: 11 additions & 3 deletions invokeai/app/invocations/sd3_text_encoder.py
@@ -21,7 +21,6 @@
from invokeai.backend.patches.lora_conversions.flux_lora_constants import FLUX_LORA_CLIP_PREFIX
from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData, SD3ConditioningInfo
from invokeai.backend.util.devices import TorchDevice

# The SD3 T5 Max Sequence Length set based on the default in diffusers.
SD3_T5_MAX_SEQ_LEN = 256
@@ -69,6 +68,15 @@ def invoke(self, context: InvocationContext) -> SD3ConditioningOutput:
if self.t5_encoder is not None:
t5_embeddings = self._t5_encode(context, SD3_T5_MAX_SEQ_LEN)

# Move all embeddings to CPU for storage to save VRAM
# They will be moved to the appropriate device when used by the denoiser
clip_l_embeddings = clip_l_embeddings.detach().to("cpu")
clip_l_pooled_embeddings = clip_l_pooled_embeddings.detach().to("cpu")
clip_g_embeddings = clip_g_embeddings.detach().to("cpu")
clip_g_pooled_embeddings = clip_g_pooled_embeddings.detach().to("cpu")
if t5_embeddings is not None:
t5_embeddings = t5_embeddings.detach().to("cpu")

conditioning_data = ConditioningFieldData(
conditionings=[
SD3ConditioningInfo(
@@ -117,7 +125,7 @@ def _t5_encode(self, context: InvocationContext, max_seq_len: int) -> torch.Tensor:
f" {max_seq_len} tokens: {removed_text}"
)

prompt_embeds = t5_text_encoder(text_input_ids.to(TorchDevice.choose_torch_device()))[0]
prompt_embeds = t5_text_encoder(text_input_ids.to(t5_text_encoder.device))[0]

assert isinstance(prompt_embeds, torch.Tensor)
return prompt_embeds
@@ -180,7 +188,7 @@ def _clip_encode(
f" {tokenizer_max_length} tokens: {removed_text}"
)
prompt_embeds = clip_text_encoder(
input_ids=text_input_ids.to(TorchDevice.choose_torch_device()), output_hidden_states=True
input_ids=text_input_ids.to(clip_text_encoder.device), output_hidden_states=True
)
pooled_prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.hidden_states[-2]
6 changes: 5 additions & 1 deletion invokeai/app/invocations/z_image_text_encoder.py
@@ -57,6 +57,8 @@ class ZImageTextEncoderInvocation(BaseInvocation):
@torch.no_grad()
def invoke(self, context: InvocationContext) -> ZImageConditioningOutput:
prompt_embeds = self._encode_prompt(context, max_seq_len=Z_IMAGE_MAX_SEQ_LEN)
# Move embeddings to CPU for storage to save VRAM
prompt_embeds = prompt_embeds.detach().to("cpu")
conditioning_data = ConditioningFieldData(conditionings=[ZImageConditioningInfo(prompt_embeds=prompt_embeds)])
conditioning_name = context.conditioning.save(conditioning_data)
return ZImageConditioningOutput(
@@ -69,7 +71,6 @@ def _encode_prompt(self, context: InvocationContext, max_seq_len: int) -> torch.Tensor:
Based on the ZImagePipeline._encode_prompt method from diffusers.
"""
prompt = self.prompt
device = TorchDevice.choose_torch_device()

text_encoder_info = context.models.load(self.qwen3_encoder.text_encoder)
tokenizer_info = context.models.load(self.qwen3_encoder.tokenizer)
Expand All @@ -78,6 +79,9 @@ def _encode_prompt(self, context: InvocationContext, max_seq_len: int) -> torch.
(_, text_encoder) = exit_stack.enter_context(text_encoder_info.model_on_device())
(_, tokenizer) = exit_stack.enter_context(tokenizer_info.model_on_device())

# Use the device that the text_encoder is actually on
device = text_encoder.device

# Apply LoRA models to the text encoder
lora_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
exit_stack.enter_context(
@@ -86,6 +86,7 @@ class ModelRecordChanges(BaseModelExcludeNull):
default_settings: Optional[MainModelDefaultSettings | LoraModelDefaultSettings | ControlAdapterDefaultSettings] = (
Field(description="Default settings for this model", default=None)
)
cpu_only: Optional[bool] = Field(description="Whether this model should run on CPU only", default=None)

# Checkpoint-specific changes
# TODO(MM2): Should we expose these? Feels footgun-y...
7 changes: 4 additions & 3 deletions invokeai/backend/flux/modules/conditioner.py
@@ -3,8 +3,6 @@
from torch import Tensor, nn
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast

from invokeai.backend.util.devices import TorchDevice


class HFEncoder(nn.Module):
def __init__(
@@ -33,8 +31,11 @@ def forward(self, text: list[str]) -> Tensor:
return_tensors="pt",
)

# Move inputs to the same device as the model to support cpu_only models
model_device = next(self.hf_module.parameters()).device

outputs = self.hf_module(
input_ids=batch_encoding["input_ids"].to(TorchDevice.choose_torch_device()),
input_ids=batch_encoding["input_ids"].to(model_device),
attention_mask=None,
output_hidden_states=False,
)
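The `next(self.hf_module.parameters()).device` lookup is the generic way to ask a module where its weights live; unlike the `.device` property on Hugging Face `PreTrainedModel`, it also works for plain or wrapped `nn.Module` instances. A tiny illustration:

```python
import torch

module = torch.nn.Linear(4, 4)
# A bare nn.Module has no .device attribute, so inspect any parameter instead.
model_device = next(module.parameters()).device
print(model_device)  # cpu, or cuda:0 after module.to("cuda")
```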
1 change: 1 addition & 0 deletions invokeai/backend/model_manager/configs/clip_embed.py
@@ -41,6 +41,7 @@ class CLIPEmbed_Diffusers_Config_Base(Diffusers_Config_Base):
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
type: Literal[ModelType.CLIPEmbed] = Field(default=ModelType.CLIPEmbed)
format: Literal[ModelFormat.Diffusers] = Field(default=ModelFormat.Diffusers)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
1 change: 1 addition & 0 deletions invokeai/backend/model_manager/configs/clip_vision.py
@@ -28,6 +28,7 @@ class CLIPVision_Diffusers_Config(Diffusers_Config_Base, Config_Base):
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
type: Literal[ModelType.CLIPVision] = Field(default=ModelType.CLIPVision)
format: Literal[ModelFormat.Diffusers] = Field(default=ModelFormat.Diffusers)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
@@ -25,6 +25,7 @@ class LlavaOnevision_Diffusers_Config(Diffusers_Config_Base, Config_Base):

type: Literal[ModelType.LlavaOnevision] = Field(default=ModelType.LlavaOnevision)
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
1 change: 1 addition & 0 deletions invokeai/backend/model_manager/configs/main.py
@@ -48,6 +48,7 @@ class MainModelDefaultSettings(BaseModel):
width: int | None = Field(default=None, multiple_of=8, ge=64, description="Default width for this model")
height: int | None = Field(default=None, multiple_of=8, ge=64, description="Default height for this model")
guidance: float | None = Field(default=None, ge=1, description="Default Guidance for this model")
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

model_config = ConfigDict(extra="forbid")

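On main models the flag rides along inside `default_settings` rather than on the config itself; `None` and `False` both leave the cache's normal device selection in place, and only an explicit `True` pins the text encoders to CPU. A trimmed-down Pydantic sketch of how the setting serializes (a stand-in class, not the real one):

```python
from pydantic import BaseModel, ConfigDict, Field

class DefaultSettingsSketch(BaseModel):
    width: int | None = Field(default=None, multiple_of=8, ge=64)
    height: int | None = Field(default=None, multiple_of=8, ge=64)
    cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

    model_config = ConfigDict(extra="forbid")

settings = DefaultSettingsSketch(width=1024, height=1024, cpu_only=True)
print(settings.model_dump(exclude_none=True))  # {'width': 1024, 'height': 1024, 'cpu_only': True}
```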
3 changes: 3 additions & 0 deletions invokeai/backend/model_manager/configs/qwen3_encoder.py
@@ -51,6 +51,7 @@ class Qwen3Encoder_Checkpoint_Config(Checkpoint_Config_Base, Config_Base):
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
type: Literal[ModelType.Qwen3Encoder] = Field(default=ModelType.Qwen3Encoder)
format: Literal[ModelFormat.Checkpoint] = Field(default=ModelFormat.Checkpoint)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
@@ -87,6 +88,7 @@ class Qwen3Encoder_Qwen3Encoder_Config(Config_Base):
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
type: Literal[ModelType.Qwen3Encoder] = Field(default=ModelType.Qwen3Encoder)
format: Literal[ModelFormat.Qwen3Encoder] = Field(default=ModelFormat.Qwen3Encoder)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
@@ -130,6 +132,7 @@ class Qwen3Encoder_GGUF_Config(Checkpoint_Config_Base, Config_Base):
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
type: Literal[ModelType.Qwen3Encoder] = Field(default=ModelType.Qwen3Encoder)
format: Literal[ModelFormat.GGUFQuantized] = Field(default=ModelFormat.GGUFQuantized)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
1 change: 1 addition & 0 deletions invokeai/backend/model_manager/configs/siglip.py
@@ -27,6 +27,7 @@ class SigLIP_Diffusers_Config(Diffusers_Config_Base, Config_Base):
type: Literal[ModelType.SigLIP] = Field(default=ModelType.SigLIP)
format: Literal[ModelFormat.Diffusers] = Field(default=ModelFormat.Diffusers)
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
2 changes: 2 additions & 0 deletions invokeai/backend/model_manager/configs/t5_encoder.py
@@ -21,6 +21,7 @@ class T5Encoder_T5Encoder_Config(Config_Base):
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
type: Literal[ModelType.T5Encoder] = Field(default=ModelType.T5Encoder)
format: Literal[ModelFormat.T5Encoder] = Field(default=ModelFormat.T5Encoder)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
@@ -50,6 +51,7 @@ class T5Encoder_BnBLLMint8_Config(Config_Base):
base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
type: Literal[ModelType.T5Encoder] = Field(default=ModelType.T5Encoder)
format: Literal[ModelFormat.BnbQuantizedLlmInt8b] = Field(default=ModelFormat.BnbQuantizedLlmInt8b)
cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

@classmethod
def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
30 changes: 30 additions & 0 deletions invokeai/backend/model_manager/load/load_default.py
@@ -5,6 +5,8 @@
from pathlib import Path
from typing import Optional

import torch

from invokeai.app.services.config import InvokeAIAppConfig
from invokeai.backend.model_manager.configs.base import Diffusers_Config_Base
from invokeai.backend.model_manager.configs.factory import AnyModelConfig
@@ -66,6 +68,30 @@ def _get_model_path(self, config: AnyModelConfig) -> Path:
model_base = self._app_config.models_path
return (model_base / config.path).resolve()

def _get_execution_device(
self, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None
) -> Optional[torch.device]:
"""Determine the execution device for a model based on its configuration.

CPU-only execution is only applied to text encoder submodels to save VRAM while keeping
the denoiser on GPU for performance. Conditioning tensors are moved to GPU after encoding.

Returns:
torch.device("cpu") if the model should run on CPU only, None otherwise (use cache default).
"""
# Check if this is a text encoder submodel of a main model with cpu_only setting
if hasattr(config, "default_settings") and config.default_settings is not None:
if hasattr(config.default_settings, "cpu_only") and config.default_settings.cpu_only is True:
# Only apply CPU execution to text encoder submodels
if submodel_type in [SubModelType.TextEncoder, SubModelType.TextEncoder2, SubModelType.TextEncoder3]:
return torch.device("cpu")

# Check if this is a standalone text encoder config with cpu_only field (T5Encoder, Qwen3Encoder, etc.)
if hasattr(config, "cpu_only") and config.cpu_only is True:
return torch.device("cpu")

return None

def _load_and_cache(self, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> CacheRecord:
stats_name = ":".join([config.base, config.type, config.name, (submodel_type or "")])
try:
@@ -77,9 +103,13 @@ def _load_and_cache(self, config: AnyModelConfig, submodel_type: Optional[SubModelType] = None) -> CacheRecord:
self._ram_cache.make_room(self.get_size_fs(config, Path(config.path), submodel_type))
loaded_model = self._load_model(config, submodel_type)

# Determine execution device from model config, considering submodel type
execution_device = self._get_execution_device(config, submodel_type)

self._ram_cache.put(
get_model_cache_key(config.key, submodel_type),
model=loaded_model,
execution_device=execution_device,
)

return self._ram_cache.get(key=get_model_cache_key(config.key, submodel_type), stats_name=stats_name)
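Restated outside the class, the device decision boils down to two checks: a main-model config with `default_settings.cpu_only=True` pins only its text-encoder submodels, while a standalone encoder config with a top-level `cpu_only=True` is always pinned; everything else returns `None` and defers to the cache. A self-contained sketch with plain strings in place of the `SubModelType` enum and `SimpleNamespace` stand-ins for the config objects:

```python
from types import SimpleNamespace
from typing import Optional

import torch

TEXT_ENCODER_SUBMODELS = {"text_encoder", "text_encoder_2", "text_encoder_3"}

def pick_execution_device(config, submodel_type: Optional[str] = None) -> Optional[torch.device]:
    settings = getattr(config, "default_settings", None)
    if settings is not None and getattr(settings, "cpu_only", None) is True:
        if submodel_type in TEXT_ENCODER_SUBMODELS:
            return torch.device("cpu")   # pin only the text encoders of a main model
    if getattr(config, "cpu_only", None) is True:
        return torch.device("cpu")       # standalone encoder configs (T5, Qwen3, CLIP, ...)
    return None                          # None = let the model cache choose as before

main_cfg = SimpleNamespace(default_settings=SimpleNamespace(cpu_only=True))
t5_cfg = SimpleNamespace(cpu_only=True)
assert pick_execution_device(main_cfg, "text_encoder") == torch.device("cpu")
assert pick_execution_device(main_cfg, "unet") is None   # the denoiser keeps the cache default
assert pick_execution_device(t5_cfg) == torch.device("cpu")
```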
@@ -60,6 +60,11 @@ def is_in_vram(self) -> bool:
"""Return true if the model is currently in VRAM."""
return self._is_in_vram

@property
def compute_device(self) -> torch.device:
"""Return the compute device for this model."""
return self._compute_device

def full_load_to_vram(self) -> int:
"""Load all weights into VRAM (if supported by the model).
Returns:
@@ -136,6 +136,11 @@ def cur_vram_bytes(self) -> int:
)
return self._cur_vram_bytes

@property
def compute_device(self) -> torch.device:
"""Return the compute device for this model."""
return self._compute_device

def full_load_to_vram(self) -> int:
"""Load all weights into VRAM."""
return self.partial_load_to_vram(self.total_bytes())
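The new `compute_device` property on both cached-model wrappers gives downstream nodes a single place to ask which device a cached model actually computes on, which is how CPU-parked conditioning finds its way back. A toy consumer-side sketch (the class here is a stand-in, not the real cache record):

```python
import torch

class CachedModelSketch:
    """Minimal stand-in for the cached-model wrappers above."""

    def __init__(self, model: torch.nn.Module, compute_device: torch.device):
        self.model = model.to(compute_device)
        self._compute_device = compute_device

    @property
    def compute_device(self) -> torch.device:
        return self._compute_device

# The text-encoder node parked the conditioning on CPU; the denoise step moves it
# onto whatever device the cached denoiser reports before running the model.
denoiser = CachedModelSketch(torch.nn.Linear(8, 8), torch.device("cpu"))
stored_embeds = torch.randn(1, 4, 8)
out = denoiser.model(stored_embeds.to(denoiser.compute_device))
```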