import math
import re
from typing import Annotated, Callable

import torch
import torch.nn as nn
from pydantic import BaseModel, Field

from modalities.nn.model_initialization.initialization_if import ModelInitializationIF
from modalities.utils.logger_utils import get_logger

logger = get_logger(name="llama3 initialization")


class Llama3InitializerConfig(BaseModel):
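    """Config for Llama3Initializer.

    Example (illustrative values): Llama3InitializerConfig(num_layers=12, n_embd=768)
    validates, while the strict positive-int constraints reject e.g. num_layers=0 or
    the string "12" with a pydantic ValidationError.
    """
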
    num_layers: Annotated[int, Field(strict=True, gt=0)]
    n_embd: Annotated[int, Field(strict=True, gt=0)]
    depth_init: bool = True


class Llama3Initializer(ModelInitializationIF):
    """
    Follows the weight initialization distributions and parameterization for Llama3 as described in TorchTitan.
    """

    def __init__(self, num_layers: int, n_embd: int, depth_init: bool) -> None:
        """
        Initializes the Llama3Initializer.

        Args:
            num_layers: The number of transformer layers in the model. Used to calculate the std for
                certain parameters.
            n_embd: The embedding dimension of the model. Used to calculate the std and truncation bounds
                for certain parameters.
            depth_init: Whether to use depth-aware initialization for certain parameters, where the std
                is scaled based on the layer's depth in the model. If False, a constant std based on
                num_layers is used for all layers.
        """
        super().__init__()
        self.depth_init = depth_init

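        # Maps parameter-FQN regexes to (init_fn, init_kwargs). Where "std" is given as a
        # callable, it is resolved per layer in _init_by_fqn_regex from the layer index
        # parsed out of the parameter's FQN (depth-aware initialization).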
        self.regex_to_init = {
            # embedding weights
            r"transformer\.wte\.weight": (nn.init.normal_, {"mean": 0.0, "std": 1.0}),
            # lm head weights
            r"transformer\.lm_head\.weight": (
                trunc_normal_,
                {
                    "mean": 0.0,
                    "std": 1 / math.sqrt(n_embd),
                    "a": -3 / math.sqrt(n_embd),
                    "b": 3 / math.sqrt(n_embd),
                },
            ),
            # qkv projections
            r"transformer\.h\.\d+\.attn\.(q_attn|k_attn|v_attn)\.weight": (
                trunc_normal_,
                {
                    "mean": 0.0,
                    "std": 0.02,
                    "a": -2,
                    "b": 2,
                },
            ),
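            # The c_proj entry below (and the V/W_2 entry further down) uses a depth-aware
            # std when depth_init=True; e.g. with num_layers=4, 0.02 / sqrt(2 * (layer_id + 1))
            # yields ~0.0141, 0.0100, 0.0082, 0.0071 for layers 0..3, whereas with
            # depth_init=False all layers share 0.02 / sqrt(2 * 4) ~= 0.0071.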
            # final attention projection in attention block
            r"transformer\.h\.\d+\.attn\.c_proj\.weight": (
                trunc_normal_,
                {
                    "mean": 0.0,
                    "std": (
                        (lambda layer_id: 0.02 / math.sqrt(2 * (layer_id + 1)))
                        if depth_init
                        else 0.02 / math.sqrt(2 * num_layers)
                    ),
                    "a": -2,
                    "b": 2,
                },
            ),
            # SwiGLU
            r"transformer\.h\.\d+\.mlp\.(W)\.weight": (
                trunc_normal_,
                {
                    "mean": 0.0,
                    "std": 0.02,
                    "a": -2,
                    "b": 2,
                },
            ),
            r"transformer\.h\.\d+\.mlp\.(V|W_2)\.weight": (
                trunc_normal_,
                {
                    "mean": 0.0,
                    "std": (
                        (lambda layer_id: 0.02 / math.sqrt(2 * (layer_id + 1)))
                        if depth_init
                        else 0.02 / math.sqrt(2 * num_layers)
                    ),
                    "a": -2,
                    "b": 2,
                },
            ),
        }

    def initialize_in_place(self, model: nn.Module) -> None:
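        """Initializes the model's parameters in place according to self.regex_to_init."""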
        self._init_by_fqn_regex(model, self.regex_to_init)

    @staticmethod
    def _init_by_fqn_regex(model: nn.Module, regex_to_init: dict[str, tuple[Callable, dict]]) -> None:
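        """Initializes every parameter whose fully qualified name (FQN) matches a regex in regex_to_init.

        Each parameter FQN must match exactly one regex. A parameter matching no regex is only
        logged as a warning, whereas a regex matching no parameter raises, since the model then
        does not follow the expected Llama3 module naming.
        """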
        hits = {k: 0 for k in regex_to_init}

        for parameter_name, p in model.named_parameters():
            if parameter_name.endswith("bias"):
                raise ValueError(
                    f"Bias initialization is not allowed for Llama3Initializer. Found bias parameter: {parameter_name}"
                )
            match_count = 0
            for weight_regex, (init_fn, arg_dict) in regex_to_init.items():
                if re.fullmatch(weight_regex, parameter_name):
                    if callable(arg_dict.get("std")):
                        # If std is a function, call it with the layer_id parsed from the FQN
                        layer_id_match = re.search(r"transformer\.h\.(\d+)\.", parameter_name)
                        if layer_id_match is not None:
                            layer_id = int(layer_id_match.group(1))
                            arg_dict = arg_dict.copy()  # create a copy of the arg_dict to avoid mutating the original
                            arg_dict["std"] = arg_dict["std"](layer_id)
                        else:
                            raise ValueError(
                                f"Could not extract layer_id from parameter name {parameter_name} "
                                "for dynamic std calculation"
                            )
                    init_fn(p, **arg_dict)
                    match_count += 1
                    hits[weight_regex] += 1

            if match_count == 0:
                logger.warning(f"Parameter {parameter_name} did not match any regex for initialization")
            elif match_count > 1:
                raise ValueError(
                    f"Parameter {parameter_name} matched multiple regexes for initialization, which is not allowed"
                )

        for k, count in hits.items():
            if count == 0:
                raise ValueError(
                    f"Regex {k} did not match any FQNs. The model specification probably does not match Llama3."
                )


def trunc_normal_(
    tensor: torch.Tensor,
    mean: float = 0.0,
    std: float = 1.0,
    a: float = -2.0,
    b: float = 2.0,
) -> torch.Tensor:
    """
    Fills the input tensor with values sampled from a truncated normal distribution.
    Values are drawn from a normal distribution with the given mean and standard
    deviation. Any sampled values outside the range defined by a and b are resampled
    until they fall within the bounds.

    To avoid numerical instability in torch.nn.init.trunc_normal_, the initialization
    is always performed using float32 precision. The result is then cast back to the
    original data type of the input tensor.

    Args:
        tensor: an n-dimensional torch Tensor
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the lower bound for truncation
        b: the upper bound for truncation

    Returns:
        The input tensor, filled in place with values from the truncated normal distribution.
    """
    # This function is adapted from Meta's open-source project TorchTitan,
    # licensed under the BSD 3-Clause License.
    with torch.no_grad():
        tmp = tensor.float()
        nn.init.trunc_normal_(tmp, mean=mean, std=std, a=a, b=b)
        # copy_ must run under no_grad: the target is typically a leaf parameter with
        # requires_grad=True, and an in-place copy outside no_grad would raise.
        tensor.copy_(tmp)
    return tensor
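

if __name__ == "__main__":
    # Minimal smoke-test sketch (illustrative only, not part of the library API): builds a
    # toy module whose parameter FQNs mimic the Llama3 naming expected by the regexes above
    # (all class names, shapes, and sizes here are made up), then initializes it in place.
    _n_embd, _num_layers, _vocab_size = 8, 2, 16

    class _ToyAttn(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.q_attn = nn.Linear(_n_embd, _n_embd, bias=False)
            self.k_attn = nn.Linear(_n_embd, _n_embd, bias=False)
            self.v_attn = nn.Linear(_n_embd, _n_embd, bias=False)
            self.c_proj = nn.Linear(_n_embd, _n_embd, bias=False)

    class _ToySwiGLU(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.W = nn.Linear(_n_embd, 4 * _n_embd, bias=False)
            self.V = nn.Linear(_n_embd, 4 * _n_embd, bias=False)
            self.W_2 = nn.Linear(4 * _n_embd, _n_embd, bias=False)

    class _ToyBlock(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.attn = _ToyAttn()
            self.mlp = _ToySwiGLU()

    class _ToyModel(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            # FQNs become e.g. "transformer.wte.weight" and "transformer.h.0.attn.q_attn.weight"
            self.transformer = nn.ModuleDict(
                {
                    "wte": nn.Embedding(_vocab_size, _n_embd),
                    "h": nn.ModuleList(_ToyBlock() for _ in range(_num_layers)),
                    "lm_head": nn.Linear(_n_embd, _vocab_size, bias=False),
                }
            )

    _model = _ToyModel()
    Llama3Initializer(num_layers=_num_layers, n_embd=_n_embd, depth_init=True).initialize_in_place(_model)
    # With depth_init=True, layer 1's c_proj std (0.02 / sqrt(4)) is smaller than layer 0's
    # (0.02 / sqrt(2)); unmatched parameters would be logged, and unmatched regexes raise.
    print("initialized", sum(p.numel() for p in _model.parameters()), "parameters")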