diff --git a/transformer_lens/benchmarks/main_benchmark.py b/transformer_lens/benchmarks/main_benchmark.py index 81d078f44..c527c5957 100644 --- a/transformer_lens/benchmarks/main_benchmark.py +++ b/transformer_lens/benchmarks/main_benchmark.py @@ -658,7 +658,6 @@ def run_benchmark_suite( test_weight_processing_individually: bool = False, phases: list[int] | None = None, trust_remote_code: bool = False, - conserve_memory: bool = False, scoring_model: PreTrainedModel | None = None, scoring_tokenizer: PreTrainedTokenizerBase | None = None, ) -> List[BenchmarkResult]: @@ -691,11 +690,6 @@ def run_benchmark_suite( tests that check each processing flag individually (default: False) phases: Optional list of phase numbers to run (e.g., [1, 2, 3]). If None, runs all phases. trust_remote_code: Whether to trust remote code for custom architectures. - conserve_memory: When True, Phase 1 avoids loading a separate HF model - and instead uses bridge.original_model for component benchmarks and - forward pass comparison. This halves Phase 1 peak memory (1.0x vs 2.0x) - at the cost of losing the independent HF loading cross-check (~5% - weakening). Default is False (full dual-load for maximum test coverage). scoring_model: Optional pre-loaded GPT-2 scoring model for Phase 4. When provided with scoring_tokenizer, avoids reloading for each model in batch. scoring_tokenizer: Optional pre-loaded tokenizer for Phase 4 scoring model. @@ -1024,24 +1018,10 @@ def cleanup_model(model, model_name_str: str): if verbose: print(f"⚠ Could not apply architecture patches: {patch_err}") - # ---------------------------------------------------------------- - # Phase 1 memory strategy (controlled by conserve_memory flag): - # - # conserve_memory=False (default): - # Load separate HF model, capture logits to CPU, load Bridge, - # run component benchmark with both models (brief 2.0x), delete - # HF immediately after, forward pass uses saved logits (1.0x). - # - # conserve_memory=True: - # Skip separate HF model entirely. Load Bridge only (1.0x - # throughout). Component benchmark uses bridge.original_model - # as the HF reference. Forward pass compares bridge output - # against bridge.original_model logits. - # ---------------------------------------------------------------- hf_saved_logits = None hf_saved_loss = None - if use_hf_reference and not conserve_memory and should_run_phase(1): + if use_hf_reference and should_run_phase(1): try: if verbose: print("Loading HuggingFace reference model...") @@ -1146,28 +1126,12 @@ def cleanup_model(model, model_name_str: str): # Run Phase 1 benchmarks if should_run_phase(1) and bridge_unprocessed: if verbose: - mode_label = " [conserve-memory]" if conserve_memory else "" - print(f"Running Phase 1 benchmarks{mode_label}...\n") + print("Running Phase 1 benchmarks...\n") # Component-level benchmarks if verbose: print("1. Component-Level Benchmarks") - if conserve_memory: - # conserve_memory mode: use bridge.original_model as the HF - # reference (no separate HF load, 1.0x peak throughout). 
- try: - component_result = benchmark_all_components( - bridge_unprocessed, bridge_unprocessed.original_model - ) - add_result(component_result) - if verbose: - status = "✓" if component_result.passed else "✗" - print(f"{status} {component_result.message}") - print(" (reference: bridge.original_model)\n") - except Exception as e: - if verbose: - print(f"✗ Component benchmark failed: {e}\n") - elif hf_model is not None: + if hf_model is not None: # Full mode: component benchmark with independent HF model (brief 2.0x) try: component_result = benchmark_all_components(bridge_unprocessed, hf_model) @@ -1242,27 +1206,7 @@ def cleanup_model(model, model_name_str: str): # matmul non-determinism can exceed the float32 default of 1e-3 p1_atol = 1e-3 if dtype == torch.float32 else 5e-3 - if conserve_memory: - # conserve_memory mode: capture reference logits from - # bridge.original_model (same tokenization as bridge). - try: - tokens = bridge_unprocessed.to_tokens(test_text) - with torch.no_grad(): - hf_out = bridge_unprocessed.original_model(tokens) - ref_logits = hf_out.logits.detach() - add_result( - benchmark_forward_pass( - bridge_unprocessed, - test_text, - reference_logits=ref_logits, - atol=p1_atol, - ) - ) - del ref_logits - except Exception as e: - if verbose: - print(f"✗ Forward pass benchmark failed: {e}\n") - elif hf_saved_logits is not None: + if hf_saved_logits is not None: # Full mode: use pre-captured HF logits (bridge only, 1.0x) try: add_result( @@ -2028,13 +1972,6 @@ def main(): action="store_true", help="Trust remote code for custom architectures (e.g., OpenELM)", ) - parser.add_argument( - "--conserve-memory", - action="store_true", - help="Reduce Phase 1 peak memory from 2.0x to 1.0x by using " - "bridge.original_model instead of loading a separate HF model", - ) - args = parser.parse_args() results = run_benchmark_suite( @@ -2045,7 +1982,6 @@ def main(): enable_compatibility_mode=not args.no_compat, verbose=not args.quiet, trust_remote_code=args.trust_remote_code, - conserve_memory=args.conserve_memory, ) if args.update_registry: diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index 4f8b2686e..ceb12d3ce 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -7,8 +7,8 @@ "scan_duration_seconds": 2.6 }, "total_architectures": 32, - "total_models": 5742, - "total_verified": 625, + "total_models": 5745, + "total_verified": 666, "models": [ { "architecture_id": "Qwen2ForCausalLM", @@ -1885,14 +1885,14 @@ { "architecture_id": "LlamaForCausalLM", "model_id": "NousResearch/Meta-Llama-3.1-8B-Instruct", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 99.4, "phase7_score": null }, { @@ -2002,14 +2002,14 @@ { "architecture_id": "LlamaForCausalLM", "model_id": "HuggingFaceTB/SmolLM-135M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 94.5, "phase7_score": null }, { @@ -2353,14 +2353,14 @@ { 
"architecture_id": "Gemma2ForCausalLM", "model_id": "google/gemma-2-9b-it", - "status": 2, - "verified_date": "2026-02-24", + "status": 1, + "verified_date": "2026-03-18", "metadata": null, - "note": "Estimated 64.3 GB exceeds 35.0 GB limit", - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 99.2, "phase7_score": null }, { @@ -4043,14 +4043,14 @@ { "architecture_id": "Gemma2ForCausalLM", "model_id": "google/gemma-2-9b", - "status": 2, - "verified_date": "2026-02-22", + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": "Estimated 124.5 GB exceeds 78.0 GB limit", - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 94.5, "phase7_score": null }, { @@ -5044,14 +5044,14 @@ { "architecture_id": "MistralForCausalLM", "model_id": "BioMistral/BioMistral-7B", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 96.0, "phase7_score": null }, { @@ -5213,14 +5213,14 @@ { "architecture_id": "MistralForCausalLM", "model_id": "Intel/neural-chat-7b-v3-3", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 97.1, "phase7_score": null }, { @@ -5447,14 +5447,14 @@ { "architecture_id": "LlamaForCausalLM", "model_id": "HuggingFaceTB/SmolLM2-360M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 93.0, "phase7_score": null }, { @@ -5811,14 +5811,14 @@ { "architecture_id": "Qwen2ForCausalLM", "model_id": "SakanaAI/TinySwallow-1.5B", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 98.6, "phase7_score": null }, { @@ -6188,14 +6188,14 @@ { "architecture_id": "LlamaForCausalLM", "model_id": "HuggingFaceTB/SmolLM-360M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 98.8, "phase7_score": null }, { @@ -10816,10 +10816,10 @@ { "architecture_id": "MixtralForCausalLM", "model_id": "hfl/chinese-mixtral-instruct", - "status": 0, - "verified_date": null, + "status": 2, + "verified_date": "2026-03-17", "metadata": null, - "note": null, + "note": "Estimated 106.4 GB exceeds 63.9 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -13728,14 +13728,14 @@ { "architecture_id": "Phi3ForCausalLM", "model_id": 
"microsoft/MediPhi-Instruct", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 99.2, "phase7_score": null }, { @@ -27133,14 +27133,14 @@ { "architecture_id": "GemmaForCausalLM", "model_id": "unsloth/gemma-7b", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 95.4, "phase7_score": null }, { @@ -28472,14 +28472,14 @@ { "architecture_id": "GemmaForCausalLM", "model_id": "unsloth/gemma-2b", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": null, + "phase4_score": 92.4, "phase7_score": null }, { @@ -63811,38 +63811,38 @@ "architecture_id": "LlavaForConditionalGeneration", "model_id": "llava-hf/llava-1.5-7b-hf", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 90.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 97.1, "phase7_score": 100.0 }, { "architecture_id": "LlavaForConditionalGeneration", "model_id": "llava-hf/llava-1.5-13b-hf", - "status": 2, - "verified_date": "2026-03-13", + "status": 1, + "verified_date": "2026-03-18", "metadata": null, - "note": "Estimated 114.9 GB exceeds 100.3 GB limit", + "note": "Core verification completed", "phase1_score": 100.0, "phase2_score": null, "phase3_score": null, - "phase4_score": 97.6, + "phase4_score": 98.1, "phase7_score": 100.0 }, { "architecture_id": "LlavaForConditionalGeneration", "model_id": "llava-hf/bakLlava-v1-hf", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 90.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 98.6, "phase7_score": 100.0 }, @@ -63850,12 +63850,12 @@ "architecture_id": "LlavaForConditionalGeneration", "model_id": "llava-hf/llava-interleave-qwen-0.5b-hf", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 91.7, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 89.0, "phase7_score": 100.0 }, @@ -63863,38 +63863,38 @@ "architecture_id": "LlavaForConditionalGeneration", "model_id": "llava-hf/llava-interleave-qwen-7b-hf", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 98.4, "phase7_score": 100.0 }, { "architecture_id": "LlavaForConditionalGeneration", "model_id": "fancyfeast/llama-joycaption-beta-one-hf-llava", - "status": 1, - "verified_date": "2026-03-12", + "status": 3, + "verified_date": 
"2026-03-18", "metadata": null, - "note": "Core verification completed", + "note": "Core verification failed: multimodal tests skipped (processor unavailable)", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 90.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 94.1, "phase7_score": 100.0 }, { "architecture_id": "LlavaForConditionalGeneration", "model_id": "fancyfeast/llama-joycaption-alpha-two-hf-llava", - "status": 1, - "verified_date": "2026-03-12", + "status": 3, + "verified_date": "2026-03-18", "metadata": null, - "note": "Core verification completed", + "note": "Core verification failed: multimodal tests skipped (processor unavailable)", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 90.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 96.8, "phase7_score": 100.0 }, @@ -63902,12 +63902,12 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "google/gemma-3-4b-it", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 97.6, "phase7_score": 100.0 }, @@ -63915,12 +63915,12 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "google/gemma-3-4b-pt", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 95.2, "phase7_score": 100.0 }, @@ -63928,12 +63928,12 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "google/gemma-3-12b-it", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 98.3, "phase7_score": 100.0 }, @@ -63941,12 +63941,12 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "google/gemma-3-12b-pt", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 95.7, "phase7_score": 100.0 }, @@ -63954,10 +63954,10 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "google/gemma-3-27b-it", "status": 2, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, - "note": "Estimated 189.6 GB exceeds 94.2 GB limit", - "phase1_score": 100.0, + "note": "Estimated 189.6 GB exceeds 121.0 GB limit", + "phase1_score": null, "phase2_score": null, "phase3_score": null, "phase4_score": 99.0, @@ -63967,10 +63967,10 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "google/gemma-3-27b-pt", "status": 2, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, - "note": "Estimated 189.6 GB exceeds 94.2 GB limit", - "phase1_score": 100.0, + "note": "Estimated 189.6 GB exceeds 121.0 GB limit", + "phase1_score": null, "phase2_score": null, "phase3_score": null, "phase4_score": 93.7, @@ -63980,12 +63980,12 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "mlabonne/gemma-3-12b-it-abliterated", "status": 1, - 
"verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 99.4, "phase7_score": 100.0 }, @@ -63993,12 +63993,12 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "unsloth/gemma-3-4b-it", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 97.6, "phase7_score": 100.0 }, @@ -64006,25 +64006,25 @@ "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "unsloth/gemma-3-12b-it", "status": 1, - "verified_date": "2026-03-12", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 98.3, "phase7_score": 100.0 }, { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "DreamFast/gemma-3-12b-it-heretic", - "status": 1, - "verified_date": "2026-03-12", + "status": 3, + "verified_date": "2026-03-18", "metadata": null, - "note": "Core verification completed (multimodal tests skipped \u2014 no processor)", + "note": "Core verification failed: multimodal tests skipped (processor unavailable)", "phase1_score": 100.0, - "phase2_score": 100.0, - "phase3_score": 85.0, + "phase2_score": null, + "phase3_score": null, "phase4_score": 99.4, "phase7_score": null }, @@ -64032,7 +64032,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "llava-hf/llava-v1.6-mistral-7b-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64045,7 +64045,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64058,7 +64058,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "llava-hf/llava-v1.6-vicuna-7b-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64071,7 +64071,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "llava-hf/llava-v1.6-vicuna-13b-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64084,7 +64084,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "llava-hf/llama3-llava-next-8b-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64097,7 +64097,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "ibm-granite/granite-vision-3.3-2b", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64110,7 +64110,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "ibm-granite/granite-vision-3.2-2b", 
"status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification passed, but text quality poor. Needs review", "phase1_score": 100.0, @@ -64123,7 +64123,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "ibm-granite/granite-vision-3.1-2b-preview", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification passed, but text quality poor. Needs review", "phase1_score": 100.0, @@ -64136,7 +64136,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "ibm-granite/granite-vision-3.3-2b-chart2csv-preview", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64149,7 +64149,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "royokong/e5-v", "status": 3, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification failed: multimodal tests skipped (processor unavailable)", "phase1_score": 100.0, @@ -64162,7 +64162,7 @@ "architecture_id": "LlavaNextForConditionalGeneration", "model_id": "tiiuae/falcon-11B-vlm", "status": 3, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "CORE FAILED: Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed)", "phase1_score": 0.0, @@ -64175,7 +64175,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "llava-hf/llava-onevision-qwen2-0.5b-si-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64188,7 +64188,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "llava-hf/llava-onevision-qwen2-7b-ov-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64201,7 +64201,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "llava-hf/llava-onevision-qwen2-7b-si-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64214,7 +64214,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "llava-hf/llava-onevision-qwen2-7b-ov-chat-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64227,7 +64227,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "NCSOFT/VARCO-VISION-2.0-1.7B", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64240,7 +64240,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "NCSOFT/VARCO-VISION-2.0-1.7B-OCR", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64253,7 +64253,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "AIML-TUDA/LlavaGuard-v1.2-7B-OV-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ 
-64266,7 +64266,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "AIML-TUDA/LlavaGuard-v1.2-0.5B-OV-hf", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64279,7 +64279,7 @@ "architecture_id": "LlavaOnevisionForConditionalGeneration", "model_id": "BSC-LT/Salamandra-VL-7B-2512", "status": 1, - "verified_date": "2026-03-13", + "verified_date": "2026-03-18", "metadata": null, "note": "Core verification completed", "phase1_score": 100.0, @@ -64406,13 +64406,14 @@ { "architecture_id": "GPTNeoXForCausalLM", "model_id": "EleutherAI/pythia-14m", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 75.7 }, { "architecture_id": "Qwen3ForCausalLM", @@ -64642,13 +64643,14 @@ { "architecture_id": "GraniteForCausalLM", "model_id": "ibm-granite/granite-3.2-8b-instruct", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 97.8 }, { "architecture_id": "LlamaForCausalLM", @@ -64774,13 +64776,14 @@ { "architecture_id": "GraniteForCausalLM", "model_id": "ibm-granite/granite-3.1-2b-instruct", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 96.5 }, { "architecture_id": "GraniteMoeHybridForCausalLM", @@ -64796,13 +64799,14 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "farbodtavakkoli/OTel-LLM-12B-Safety", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification failed: multimodal tests skipped (processor unavailable)", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 99.4 }, { "architecture_id": "Qwen2ForCausalLM", @@ -64818,13 +64822,14 @@ { "architecture_id": "T5ForConditionalGeneration", "model_id": "MBZUAI/LaMini-Flan-T5-77M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification passed, but text quality poor. 
Needs review", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 47.4 }, { "architecture_id": "GraniteMoeHybridForCausalLM", @@ -64840,13 +64845,14 @@ { "architecture_id": "T5ForConditionalGeneration", "model_id": "teapotai/tinyteapot", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 88.5 }, { "architecture_id": "Qwen3ForCausalLM", @@ -64931,13 +64937,14 @@ { "architecture_id": "GraniteForCausalLM", "model_id": "ibm-granite/granite-guardian-3.3-8b", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 97.6 }, { "architecture_id": "LlamaForCausalLM", @@ -64975,13 +64982,14 @@ { "architecture_id": "GraniteForCausalLM", "model_id": "ibm-granite/granite-3.0-2b-instruct", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 56.1 }, { "architecture_id": "Qwen3ForCausalLM", @@ -65030,13 +65038,14 @@ { "architecture_id": "GPTNeoXForCausalLM", "model_id": "EleutherAI/pythia-31m", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 88.6 }, { "architecture_id": "GraniteForCausalLM", @@ -65130,10 +65139,10 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "aisingapore/Gemma-SEA-LION-v4-27B-IT", - "status": 0, - "verified_date": null, + "status": 2, + "verified_date": "2026-03-18", "metadata": null, - "note": null, + "note": "Estimated 189.6 GB exceeds 121.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null @@ -65383,13 +65392,14 @@ { "architecture_id": "GPT2LMHeadModel", "model_id": "karpathy/gpt2_1558M_final4_hf", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 90.5 }, { "architecture_id": "GraniteForCausalLM", @@ -65549,13 +65559,14 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "DreamFast/gemma-3-12b-it-heretic-v2", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification failed: multimodal tests skipped (processor unavailable)", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 98.8 }, { "architecture_id": "GraniteForCausalLM", @@ -65692,13 +65703,14 @@ { "architecture_id": "GraniteMoeForCausalLM", "model_id": "ibm-granite/granite-guardian-3.2-3b-a800m", - "status": 0, - 
"verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 98.5 }, { "architecture_id": "LlamaForCausalLM", @@ -65791,13 +65803,14 @@ { "architecture_id": "Olmo2ForCausalLM", "model_id": "allenai/OLMo-2-0425-1B-RLVR1", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 93.3 }, { "architecture_id": "LlamaForCausalLM", @@ -65923,13 +65936,14 @@ { "architecture_id": "GraniteMoeForCausalLM", "model_id": "ibm-granite/granite-3.0-3b-a800m-instruct", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 91.4 }, { "architecture_id": "Gemma2ForCausalLM", @@ -66264,13 +66278,14 @@ { "architecture_id": "GraniteMoeForCausalLM", "model_id": "ibm-granite/granite-3.1-1b-a400m-base", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 99.4 }, { "architecture_id": "LlamaForCausalLM", @@ -66286,13 +66301,14 @@ { "architecture_id": "GPT2LMHeadModel", "model_id": "ckiplab/gpt2-base-chinese", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 72.9 }, { "architecture_id": "Qwen3ForCausalLM", @@ -66374,11 +66390,11 @@ { "architecture_id": "BloomForCausalLM", "model_id": "bigscience/bloom-1b7-intermediate", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "CORE FAILED: Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed)", + "phase1_score": 0.0, "phase2_score": null, "phase3_score": null }, @@ -66517,13 +66533,14 @@ { "architecture_id": "T5ForConditionalGeneration", "model_id": "MBZUAI/LaMini-Flan-T5-783M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification passed, but text quality poor. 
Needs review", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 47.6 }, { "architecture_id": "Qwen3ForCausalLM", @@ -66748,13 +66765,14 @@ { "architecture_id": "StableLmForCausalLM", "model_id": "stabilityai/stablelm-2-1_6b-chat", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 91.0 }, { "architecture_id": "LlamaForCausalLM", @@ -66803,13 +66821,14 @@ { "architecture_id": "GPTNeoForCausalLM", "model_id": "roneneldan/TinyStories-Instruct-33M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 93.5 }, { "architecture_id": "LlamaForCausalLM", @@ -66880,10 +66899,10 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "YanLabs/gemma-3-27b-it-abliterated-normpreserve", - "status": 0, - "verified_date": null, + "status": 2, + "verified_date": "2026-03-18", "metadata": null, - "note": null, + "note": "Estimated 189.6 GB exceeds 121.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null @@ -67892,13 +67911,14 @@ { "architecture_id": "Olmo2ForCausalLM", "model_id": "allenai/OLMo-2-0425-1B-early-training", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 91.5 }, { "architecture_id": "MistralForCausalLM", @@ -68607,11 +68627,11 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "lthn/LEM-Gemma3-4B", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "CORE FAILED: Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed)", + "phase1_score": 0.0, "phase2_score": null, "phase3_score": null }, @@ -68640,13 +68660,14 @@ { "architecture_id": "Olmo3ForCausalLM", "model_id": "allenai/Olmo-3-7B-RL-Zero-IF", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 96.2 }, { "architecture_id": "GPTNeoXForCausalLM", @@ -68981,13 +69002,15 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "McGill-NLP/AfriqueGemma-4B", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 95.3, + "phase7_score": 100.0 }, { "architecture_id": "MistralForCausalLM", @@ -69025,13 +69048,15 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "McGill-NLP/AfriqueGemma-12B", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": 
"2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 97.0, + "phase7_score": 100.0 }, { "architecture_id": "LlamaForCausalLM", @@ -69454,13 +69479,14 @@ { "architecture_id": "OlmoForCausalLM", "model_id": "allenai/OLMo-7B-Instruct-hf", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 96.2 }, { "architecture_id": "Qwen2ForCausalLM", @@ -69872,13 +69898,14 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "vanta-research/scout-4b", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification failed: multimodal tests skipped (processor unavailable)", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 97.9 }, { "architecture_id": "GptOssForCausalLM", @@ -70279,13 +70306,15 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "mshojaei77/gemma-3-4b-persian-v0", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 97.4, + "phase7_score": 100.0 }, { "architecture_id": "Qwen3ForCausalLM", @@ -70301,13 +70330,14 @@ { "architecture_id": "GPTNeoForCausalLM", "model_id": "roneneldan/TinyStories-Instruct-1M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 95.1 }, { "architecture_id": "LlamaForCausalLM", @@ -70444,13 +70474,14 @@ { "architecture_id": "GPTNeoForCausalLM", "model_id": "roneneldan/TinyStories-Instruct-28M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 92.6 }, { "architecture_id": "GPT2LMHeadModel", @@ -70477,13 +70508,14 @@ { "architecture_id": "GPTNeoForCausalLM", "model_id": "roneneldan/TinyStories-2Layers-33M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 93.6 }, { "architecture_id": "LlamaForCausalLM", @@ -70565,13 +70597,14 @@ { "architecture_id": "GPTNeoForCausalLM", "model_id": "roneneldan/TinyStories-Instruct-8M", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-17", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + 
"phase3_score": null, + "phase4_score": 91.1 }, { "architecture_id": "MistralForCausalLM", @@ -70598,13 +70631,14 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "vanta-research/atom-v1-preview-4b", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification failed: multimodal tests skipped (processor unavailable)", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 98.8 }, { "architecture_id": "GPTNeoXForCausalLM", @@ -70917,10 +70951,10 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "McG-221/gemma-3-27b-it-abliterated-refined-vision-mlx-8Bit", - "status": 0, - "verified_date": null, + "status": 2, + "verified_date": "2026-03-18", "metadata": null, - "note": null, + "note": "Estimated 189.6 GB exceeds 121.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null @@ -71863,13 +71897,15 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "soob3123/Veiled-Calla-12B", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification completed", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 100.0, + "phase7_score": 100.0 }, { "architecture_id": "Qwen3ForCausalLM", @@ -72215,13 +72251,14 @@ { "architecture_id": "Gemma3ForConditionalGeneration", "model_id": "ClinicDx1/ClinicDx", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-03-18", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Core verification failed: multimodal tests skipped (processor unavailable)", + "phase1_score": 100.0, "phase2_score": null, - "phase3_score": null + "phase3_score": null, + "phase4_score": 97.3 }, { "architecture_id": "GPT2LMHeadModel", @@ -73081,6 +73118,42 @@ "phase2_score": null, "phase3_score": null, "phase4_score": 85.7 + }, + { + "architecture_id": "GPTNeoXForCausalLM", + "model_id": "EleutherAI/pythia-70m", + "status": 1, + "verified_date": "2026-03-17", + "metadata": null, + "note": "Core verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": 78.7 + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "mistralai/Mistral-7B-v0.3", + "status": 1, + "verified_date": "2026-03-17", + "metadata": null, + "note": "Core verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": 94.3 + }, + { + "architecture_id": "MixtralForCausalLM", + "model_id": "mistralai/Mixtral-8x7B-v0.1", + "status": 1, + "verified_date": "2026-03-17", + "metadata": null, + "note": "Core verification completed", + "phase1_score": 100.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": 93.4 } ] } diff --git a/transformer_lens/tools/model_registry/verify_models.py b/transformer_lens/tools/model_registry/verify_models.py index 0af63b0bd..43e9da6e4 100644 --- a/transformer_lens/tools/model_registry/verify_models.py +++ b/transformer_lens/tools/model_registry/verify_models.py @@ -277,16 +277,13 @@ def estimate_benchmark_memory_gb( n_params: int, dtype: str = "float32", phases: Optional[list[int]] = None, - conserve_memory: bool = False, ) -> float: """Estimate peak memory needed for benchmark suite. 
Phases run sequentially, so peak memory is the maximum of any single phase, not the sum. The multiplier represents how many model copies exist at peak: - Phase 1 (conserve_memory=True): Bridge only (uses bridge.original_model - as reference) → 1.0x model + overhead - Phase 1 (conserve_memory=False): Briefly loads HF ref + Bridge → 2.0x peak + Phase 1: Briefly loads HF ref + Bridge → 2.0x peak Phase 2: Bridge + HookedTransformer (separate copy) → 2.0x model + overhead Phase 3: Same as Phase 2 (processed versions) → 2.0x model + overhead Phase 4: Bridge + GPT-2 scorer (~500MB) → ~1.0x model + 0.5 GB @@ -295,7 +292,6 @@ def estimate_benchmark_memory_gb( n_params: Number of model parameters dtype: Data type for memory calculation phases: Which phases will be run (None = all phases) - conserve_memory: Whether --conserve-memory mode is enabled Returns: Estimated peak memory in GB @@ -317,13 +313,10 @@ def estimate_benchmark_memory_gb( phases = [1, 2, 3, 4] for p in phases: - if p == 1: - copies = 1.0 if conserve_memory else 2.0 - phase_peaks.append(model_size_gb * copies * (1 + overhead_fraction)) - elif p in (2, 3): - # Bridge + HookedTransformer = 2 full model copies - copies = 2.0 - phase_peaks.append(model_size_gb * copies * (1 + overhead_fraction)) + if p in (1, 2, 3): + # Phase 1: HF ref + Bridge = 2 copies briefly + # Phase 2/3: Bridge + HookedTransformer = 2 copies + phase_peaks.append(model_size_gb * 2.0 * (1 + overhead_fraction)) elif p == 4: # Bridge + GPT-2 scorer phase_peaks.append(model_size_gb * (1 + overhead_fraction) + gpt2_overhead_gb) @@ -678,7 +671,6 @@ def verify_models( phases: Optional[list[int]] = None, quiet: bool = False, progress: Optional[VerificationProgress] = None, - conserve_memory: bool = False, ) -> VerificationProgress: """Run verification benchmarks on a list of model candidates. @@ -692,7 +684,6 @@ def verify_models( phases: Which benchmark phases to run (default: [1, 2, 3, 4]) quiet: Suppress verbose output progress: Existing progress for resume - conserve_memory: Reduce Phase 1 peak memory by using bridge.original_model Returns: VerificationProgress with results @@ -783,9 +774,7 @@ def verify_models( continue # Step 2: Check memory - estimated_mem = estimate_benchmark_memory_gb( - n_params, dtype, phases=phases, conserve_memory=conserve_memory - ) + estimated_mem = estimate_benchmark_memory_gb(n_params, dtype, phases=phases) candidate.estimated_memory_gb = estimated_mem if not quiet: print( @@ -828,10 +817,6 @@ def verify_models( } torch_dtype = _dtype_map[dtype] - # Multimodal models always use conserve_memory to avoid loading two - # large models simultaneously (causes MPS memory-pressure divergence). 
- effective_conserve_memory = conserve_memory or arch in _MULTIMODAL_ARCHITECTURES - if not quiet: print(f" Running phases {phases} in a single benchmark call...") try: @@ -843,7 +828,6 @@ def verify_models( use_ht_reference=use_ht_reference, verbose=not quiet, phases=phases, - conserve_memory=effective_conserve_memory, trust_remote_code=needs_remote_code, scoring_model=_scoring_model, scoring_tokenizer=_scoring_tokenizer, @@ -1081,7 +1065,6 @@ def _print_dry_run( dtype: str, max_memory_gb: float, phases: Optional[list[int]] = None, - conserve_memory: bool = False, ) -> None: """Print what would be tested in a dry run.""" print(f"\nDry run: {len(candidates)} models would be tested") @@ -1102,9 +1085,7 @@ def _print_dry_run( for c in models: try: n_params = estimate_model_params(c.model_id) - mem = estimate_benchmark_memory_gb( - n_params, dtype, phases=phases, conserve_memory=conserve_memory - ) + mem = estimate_benchmark_memory_gb(n_params, dtype, phases=phases) status = "OK" if mem <= max_memory_gb else "SKIP (too large)" if mem > max_memory_gb: skippable += 1 @@ -1238,12 +1219,6 @@ def main() -> None: action="store_true", help="Re-run previously failed models instead of skipping them", ) - parser.add_argument( - "--conserve-memory", - action="store_true", - help="Reduce Phase 1 peak memory from 2.0x to 1.0x by using " - "bridge.original_model instead of a separate HF model", - ) parser.add_argument( "--reverify", action="store_true", @@ -1342,7 +1317,6 @@ def main() -> None: args.dtype, max_memory_gb, phases=args.phases, - conserve_memory=args.conserve_memory, ) return @@ -1361,7 +1335,6 @@ def main() -> None: phases=args.phases, quiet=args.quiet, progress=progress, - conserve_memory=args.conserve_memory, ) elapsed = time.time() - start
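
Note on the surviving Phase 1 flow: with conserve_memory removed, Phase 1 always uses the full dual-load strategy that the deleted comment block described: load an independent HF reference, capture its logits to CPU, run the component benchmark while both copies are alive (the brief 2.0x peak), then delete the HF model so the forward-pass comparison runs at 1.0x against the saved logits. A minimal sketch of that flow, assuming benchmark_all_components and benchmark_forward_pass are importable from the module patched above (their call signatures and the atol rule are taken from the hunks; the loader callable and the rest of the glue are illustrative):

    import torch

    # Assumed import path: these helpers are called unqualified inside
    # transformer_lens/benchmarks/main_benchmark.py in the hunks above.
    from transformer_lens.benchmarks.main_benchmark import (
        benchmark_all_components,
        benchmark_forward_pass,
    )

    def run_phase1(bridge, load_hf_model, test_text: str, dtype=torch.float32):
        """Phase 1 sketch: full dual-load, then drop the HF reference early.

        `load_hf_model` is a caller-supplied zero-argument loader (e.g. a
        wrapped AutoModelForCausalLM.from_pretrained), so the HF model stays
        a local here and `del` really releases the last reference.
        """
        hf_model = load_hf_model()

        # Capture the HF reference logits to CPU while both copies are alive.
        tokens = bridge.to_tokens(test_text)
        with torch.no_grad():
            hf_saved_logits = hf_model(tokens).logits.detach().cpu()

        # The component benchmark needs both models: the brief 2.0x peak.
        component_result = benchmark_all_components(bridge, hf_model)

        # Free the HF model immediately; everything after runs at 1.0x.
        del hf_model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Looser tolerance off float32, since matmul non-determinism can
        # exceed the float32 default of 1e-3 (per the comment in the diff).
        p1_atol = 1e-3 if dtype == torch.float32 else 5e-3
        forward_result = benchmark_forward_pass(
            bridge, test_text, reference_logits=hf_saved_logits, atol=p1_atol
        )
        return component_result, forward_result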
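
Note on the simplified estimator: estimate_benchmark_memory_gb now prices Phases 1-3 identically at two model copies, and the peak is the maximum over the selected phases because phases run sequentially. A self-contained sketch of the post-patch logic; the 2.0x multiplier and the ~0.5 GB GPT-2 scorer term come from the hunks above, while BYTES_PER_PARAM and OVERHEAD_FRACTION are illustrative stand-ins for constants this diff does not show:

    from typing import Optional

    BYTES_PER_PARAM = {"float32": 4.0, "float16": 2.0, "bfloat16": 2.0}  # assumed
    OVERHEAD_FRACTION = 0.2  # assumed activation/workspace overhead
    GPT2_OVERHEAD_GB = 0.5   # Phase 4 scorer, "~500MB" per the docstring

    def estimate_peak_gb(
        n_params: int,
        dtype: str = "float32",
        phases: Optional[list[int]] = None,
    ) -> float:
        """Peak memory is the max over phases, since phases run sequentially."""
        model_size_gb = n_params * BYTES_PER_PARAM[dtype] / 1024**3
        if phases is None:
            phases = [1, 2, 3, 4]
        phase_peaks = []
        for p in phases:
            if p in (1, 2, 3):
                # Phase 1: HF ref + Bridge = 2 copies briefly.
                # Phases 2/3: Bridge + HookedTransformer = 2 copies.
                phase_peaks.append(model_size_gb * 2.0 * (1 + OVERHEAD_FRACTION))
            elif p == 4:
                # Bridge + small GPT-2 scoring model.
                phase_peaks.append(
                    model_size_gb * (1 + OVERHEAD_FRACTION) + GPT2_OVERHEAD_GB
                )
        return max(phase_peaks) if phase_peaks else model_size_gb

Under these assumed constants, a 7B-parameter float16 model carries ~13.0 GB of weights, so Phases 1-3 each peak near 13.0 × 2.0 × 1.2 ≈ 31.3 GB, while Phase 4 alone needs only ~16.1 GB; selecting phases=[4] therefore roughly halves the estimated requirement under the assumed 20% overhead.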
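
Note on the registry churn: the header moves total_models from 5742 to 5745 (three appended entries: EleutherAI/pythia-70m, mistralai/Mistral-7B-v0.3, mistralai/Mixtral-8x7B-v0.1) and total_verified from 625 to 666. A throwaway consistency check along the lines below can confirm the header still matches the entries; the status semantics (0 = unverified, 1 = verified, 2 = skipped as too large, 3 = failed) are inferred from the notes in this diff, and the assumption that total_verified counts status 1 entries is unconfirmed:

    import json
    from collections import Counter

    # Path as it appears in this diff.
    with open("transformer_lens/tools/model_registry/data/supported_models.json") as f:
        registry = json.load(f)

    counts = Counter(entry["status"] for entry in registry["models"])
    print("status counts:", dict(counts))
    print("total_models:", registry["total_models"], "entries:", len(registry["models"]))
    # Assumption: total_verified should equal the number of status == 1 entries.
    print("total_verified:", registry["total_verified"], "status-1:", counts.get(1, 0))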