diff --git a/source/_data/SymbioticLab.bib b/source/_data/SymbioticLab.bib index 07eaca04..3e766826 100644 --- a/source/_data/SymbioticLab.bib +++ b/source/_data/SymbioticLab.bib @@ -2506,3 +2506,27 @@ @InProceedings{ara:agentskills26 Scientific publication compresses a branching, iterative research process into a linear narrative, discarding the majority of what was discovered along the way. This compilation imposes two structural costs: a Storytelling Tax, where failed experiments, rejected hypotheses, and the branching exploration process are discarded to fit a linear narrative; and an Engineering Tax, where the gap between reviewer-sufficient prose and agent-sufficient specification leaves critical implementation details unwritten. Tolerable for human readers, these costs become critical when AI agents must understand, reproduce, and extend published work. We introduce the Agent-Native Research Artifact (Ara), a protocol that replaces the narrative paper with an agent-executable research package structured around four layers: scientific logic, executable code with full specifications, an exploration graph that preserves the failures compilation discards, and evidence grounding every claim in raw outputs. We complement the protocol with the Ara ecosystem, a coordinated set of agent skills—the Live Research Manager (LRM) that captures decisions and dead ends during ordinary development, the Ara Compiler that translates legacy PDFs and repos into Aras, and the Ara Seal, a three-level review pipeline (analogous to a grammar checker for prose)—so artifacts are produced, imported, and verified automatically while human reviewers focus on significance, novelty, and taste. On PaperBench and RE-Bench, Ara raises question-answering accuracy from 72.4% to 93.7% and reproduction success from 57.4% to 64.4%. 
On RE-Bench's five open-ended extension tasks, preserved failure traces in Ara accelerate progress, but can also constrain a capable agent from stepping outside the prior-run box depending on the agent's capabilities. } } + +@PhDThesis{insujang:dissertation, + author = {Insu Jang}, + title = {Efficient Multimodal Model Training at Scale}, + year = {2026}, + month = {June}, + institution = {University of Michigan}, + + publist_link = {paper || insujang-dissertation.pdf}, + publist_confkey = {dissertation}, + publist_abstract = { +As large language models (LLMs) evolve into multimodal foundation models, distributed training across massive GPU clusters has become indispensable. Because distributed training necessitates frequent, collective state synchronizations across thousands of devices, any imbalance in execution time directly translates to systemic idle time, as the entire cluster must stall and wait for the slowest device. Consequently, maximizing training throughput requires consistently load-balancing the system across all participating accelerators. + +However, achieving and maintaining this optimal balance is severely disrupted by two fundamental challenges in large-scale multimodal training: resource imbalances and workload imbalances. On the resource side, operating at the scale of thousands of GPUs inherently increases hardware failure events. When failures occur, they dynamically alter the cluster topology, creating variability in the total number of available GPUs and heterogeneity in the number of active GPUs per model replica. On the workload side, distributing diverse modalities to be executed in parallel across the system inherently causes severe workload imbalance. Because different modalities impose vastly different computational footprints, distributing their execution across GPUs introduces profound workload heterogeneity. 
Furthermore, as the relative mixture of these modalities changes from batch to batch, it creates unpredictable workload variability throughout the training process. Together, this intertwined heterogeneity and variability -- spanning both the underlying hardware resources and the multimodal training workloads -- makes it exceedingly difficult to prevent straggler effects and sustain high training efficiency. + +This dissertation studies how to systematically address heterogeneity and variability in both resources and workloads to make distributed multimodal training highly efficient and robust. We have developed three solutions to provide fault-tolerance and workload balancing for multimodal training. First, we propose Oobleck, a fault-tolerant hybrid-parallel training framework that uses heterogeneous pipeline templates to address resource variability. A pipeline template is a specification of a model replica for a given number of GPUs, and Oobleck uses a composition of heterogeneous pipeline templates to utilize all available GPUs. When failures happen, Oobleck can reinstantiate pipelines from the pipeline templates and copy missing model states from other replicas to recover from failures. This approach allows Oobleck to achieve efficient workload rebalancing without checkpointing. + +Second, we present Cornstarch, a distributed multimodal training framework that addresses intra-batch multimodal workload heterogeneity and variability. Different from existing distributed multimodal training frameworks that focus only on first-order model and data heterogeneity, Cornstarch discovers latent higher-order heterogeneity and variability in two parallelization dimensions: pipeline parallelism and context parallelism. More specifically, Cornstarch considers the frozen status of model components to balance the pipeline stages in pipeline parallelism, and it analyzes variable imbalance of workload distribution in context parallelism. 
+ +Third, we introduce Entrain, which balances inter-batch multimodal workload variability. The relative workload ratio of heterogeneous modalities varies across batches, thus optimizing the parallel configuration for one batch may not be optimal for another one. A natural intuition would be to dynamically adapt the parallel configuration for each batch. However, Entrain shows a counterintuitive result that a single, static parallel configuration suffices for optimal load balancing with macroscopic batch-level profiling. While at the macroscopic scale the workload ratio between modalities converges to a stable constant, variability persists at the microscopic scale, which is exposed when a batch is split into microbatches in pipeline parallelism. Entrain addresses this with a hierarchical microbatch assignment and deferral optimization to stabilize variability across microbatches. + +Together, these works provide comprehensive solutions to address heterogeneity and variability for highly efficient and robust distributed multimodal training. + } +} \ No newline at end of file diff --git a/source/publications/files/insujang:dissertation/insujang-dissertation.pdf b/source/publications/files/insujang:dissertation/insujang-dissertation.pdf new file mode 100644 index 00000000..112edd60 Binary files /dev/null and b/source/publications/files/insujang:dissertation/insujang-dissertation.pdf differ