Skip to content

Commit 4e428d6

Browse files
committed
Add test for CLM score parity to biofoundation
1 parent 2537367 commit 4e428d6

7 files changed

Lines changed: 168 additions & 58 deletions

File tree

experiments/plantcad/README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ Original tutorial: https://gist.github.com/eric-czech/31e5b79689d322f7becb94a109
1212
git clone https://github.com/marin-community/marin.git
1313
cd marin
1414
uv venv --python 3.11
15-
uv sync
1615
```
1716

1817
### Remote (SkyPilot)
@@ -29,7 +28,7 @@ envs:
2928
workdir: .
3029
setup: |
3130
uv venv --python 3.11
32-
uv sync --extra=cuda12
31+
uv sync --extra=cuda12 --extra=dna
3332
for var in HUGGING_FACE_HUB_TOKEN WANDB_API_KEY; do
3433
declare -n ref=$var
3534
grep -q "^export $var=" ~/.bashrc || echo "export $var=$ref" >> ~/.bashrc

experiments/plantcad/evaluation.py

Lines changed: 31 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,11 @@
3939
from levanter.callbacks import StepInfo
4040
from marin.utilities.json_encoder import CustomJsonEncoder
4141

42-
from experiments.plantcad.utils import get_available_gpus, get_nucleotide_token_ids, get_plantcad_tokenizer
42+
from experiments.plantcad.utils import get_available_gpus, get_nucleotide_token_ids
43+
from levanter.utils.hf_utils import HfTokenizer
4344
from marin.execution.executor import InputName, this_output_path, versioned
45+
from jax.sharding import Mesh
46+
from haliax.partitioning import ResourceMapping
4447

4548
logger = logging.getLogger("ray")
4649

@@ -49,9 +52,6 @@
4952
class DnaEvalBaseConfig:
5053
"""Base configuration for DNA evaluation with fields needed for training callbacks"""
5154

52-
model_config: str
53-
"""Model configuration size (e.g., '300m', '100m', etc.)"""
54-
5555
dataset_path: str = "plantcad/evolutionary-constraint-example"
5656
"""Dataset repository path"""
5757

@@ -292,12 +292,12 @@ def create_alternate_sequences(
292292
assert 0 <= ref_cts.max().item() <= 1
293293
if (invalid := ref_cts == 0).any().item():
294294
pos = nucleotide_positions[Batch, invalid]
295-
tok = tokens_expanded[Batch, invalid, Position, pos]
295+
tok = tokens_expanded[Batch, invalid][Position, pos]
296296
raise ValueError(
297-
"Found invalid sequences in batch with OOV nucleotides at target positions; "
298-
f"Target positions: {pos} "
299-
f"Valid nucleotide token IDs: {nucleotide_token_ids} "
300-
f"Invalid tokens: {tok} "
297+
"Found invalid sequences in batch with OOV nucleotides at target positions;\n"
298+
f"Target positions: {pos.array} \n"
299+
f"Valid nucleotide token IDs: {nucleotide_token_ids} \n"
300+
f"Invalid tokens: {tok.array} "
301301
)
302302
ref = hax.argmax(ref_mask, axis=Variant)
303303
assert ref.axes == (Batch,)
@@ -439,7 +439,7 @@ def score_eval_dataset(
439439
assert isinstance(pos, list)
440440

441441
# Tokenize and convert to JAX arrays
442-
tokenized = tokenizer(sequences, padding=True, truncation=True, max_length=512, return_tensors="np")
442+
tokenized = tokenizer(sequences, padding=False, add_special_tokens=False, truncation=False, return_tensors="np")
443443
tokens = hax.named(tokenized["input_ids"], ("batch", "position"))
444444
nucleotide_positions = hax.named(pos, ("batch",))
445445

@@ -473,12 +473,25 @@ def score_eval_dataset(
473473
# ------------------------------------------------------------------------------------------------
474474

475475

476-
def create_dna_eval_callback(config: DnaEvalBaseConfig) -> Callable[[StepInfo], None]:
477-
"""Create a training callback for DNA evaluation."""
476+
def create_dna_eval_callback(
477+
config: DnaEvalBaseConfig,
478+
tokenizer: HfTokenizer,
479+
device_mesh: Mesh,
480+
compute_axis_mapping: ResourceMapping,
481+
parameter_axis_mapping: ResourceMapping,
482+
) -> Callable[[StepInfo], None]:
483+
"""Create a training callback for DNA evaluation.
484+
485+
Args:
486+
config: DNA evaluation configuration
487+
tokenizer: Tokenizer provided by Levanter's training loop
488+
device_mesh: JAX device mesh for distributed computation
489+
compute_axis_mapping: Axis mapping for computation (used during model inference)
490+
parameter_axis_mapping: Axis mapping for parameter storage (used for model sharding)
478491
479-
# Load tokenizer
480-
# TODO: fix this; how can the tokenizer be referenced during training without reloading?
481-
tokenizer = get_plantcad_tokenizer()
492+
Returns:
493+
Callback function that evaluates DNA conservation at training steps
494+
"""
482495

483496
# Load and validate dataset once when creating the callback
484497
dataset = load_eval_dataset(config)
@@ -488,7 +501,7 @@ def dna_conservation_callback(step_info: StepInfo) -> None:
488501
logger.debug(f"Running DNA conservation evaluation ({step=})")
489502
eval_model = step_info.eval_model
490503

491-
# Create logit function for Levanter model
504+
# Create logit function for Levanter model with proper axis mapping
492505
def logit_function(
493506
tokens: ht.Int[ht.NamedArray, "batch position"],
494507
) -> ht.Float[ht.NamedArray, "batch position vocab"]:
@@ -501,7 +514,6 @@ def logit_function(
501514
logit_function=logit_function,
502515
eval_dataset=dataset,
503516
batch_size=config.batch_size,
504-
# TODO: make configurable or disable?
505517
log_progress=True,
506518
)
507519

@@ -710,18 +722,12 @@ def run_conservation_eval(config: DnaEvalConfig) -> dict[str, float]:
710722
# Usage examples:
711723

712724
# 1. Training callback (uses Levanter model from training state):
713-
# config = DnaEvalConfig(
714-
# checkpoint_path="/path/to/checkpoint", # Not used for callbacks
715-
# model_config="300m",
716-
# dataset_path="plantcad/evolutionary-constraint-example",
717-
# dataset_config="10k"
718-
# )
719-
# trainer.add_hook(create_dna_eval_callback(config), every=1000)
725+
# The plugin system handles this automatically via PlantCADEvaluationPlugin
726+
# See plugin.py for implementation details
720727

721728
# 2. Standalone evaluation with HuggingFace checkpoint:
722729
# config = DnaEvalConfig(
723730
# checkpoint_path="/path/to/hf/checkpoint",
724-
# model_config="300m",
725731
# device="cuda", # or "cpu" for CPU inference
726732
# num_workers=None, # defaults to number of GPUs
727733
# dataset_path="plantcad/evolutionary-constraint-example",

experiments/plantcad/plugin.py

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@
1616
"""PlantCAD evaluation plugin for Levanter training."""
1717

1818
import logging
19-
from typing import Any
2019
from collections.abc import Callable
2120

2221
from levanter.eval import EvalPlugin
2322
from levanter.callbacks import StepInfo
23+
from levanter.utils.hf_utils import HfTokenizer
24+
from jax.sharding import Mesh
25+
from haliax.partitioning import ResourceMapping
2426
from experiments.plantcad.evaluation import DnaEvalBaseConfig, create_dna_eval_callback
2527

2628
logger = logging.getLogger("ray")
@@ -29,11 +31,39 @@
2931
class PlantCADEvaluationPlugin(EvalPlugin):
3032
"""PlantCAD DNA conservation evaluation plugin for Levanter."""
3133

32-
def __init__(self, config: dict[str, Any]):
33-
# Store the dict config directly
34-
self.config = DnaEvalBaseConfig(**config)
35-
logger.info(f"Initialized PlantCAD evaluation plugin with config: {self.config}")
34+
def __init__(self):
35+
logger.info("Initialized PlantCAD evaluation plugin")
3636

37-
def create_callback(self, **kwargs) -> Callable[[StepInfo], None]:
38-
"""Create DNA conservation evaluation callback."""
39-
return create_dna_eval_callback(self.config)
37+
def create_callback(
38+
self,
39+
*,
40+
tokenizer: HfTokenizer,
41+
device_mesh: Mesh,
42+
compute_axis_mapping: ResourceMapping,
43+
parameter_axis_mapping: ResourceMapping,
44+
batch_size: int,
45+
) -> Callable[[StepInfo], None]:
46+
"""Create DNA conservation evaluation callback.
47+
48+
Args:
49+
tokenizer: Tokenizer for the model
50+
device_mesh: JAX device mesh for distributed computation
51+
compute_axis_mapping: Axis mapping for computation
52+
parameter_axis_mapping: Axis mapping for parameter storage
53+
batch_size: Evaluation batch size
54+
55+
Returns:
56+
Callback function for DNA conservation evaluation
57+
"""
58+
# Cut batch size in half because current eval runs with
59+
# model in full precision rather than bf16
60+
# TODO: implement mixed-precision in eval
61+
config = DnaEvalBaseConfig(batch_size=batch_size // 2)
62+
logger.info(f"Creating conservation evaluation callback with config: {config}")
63+
return create_dna_eval_callback(
64+
config=config,
65+
tokenizer=tokenizer,
66+
device_mesh=device_mesh,
67+
compute_axis_mapping=compute_axis_mapping,
68+
parameter_axis_mapping=parameter_axis_mapping,
69+
)

experiments/plantcad/scripts/exp_pc1_train.py

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,13 @@
3131
from experiments.simple_train_config import SimpleTrainConfig
3232
from marin.execution.executor import executor_main
3333
from marin.resources import GpuConfig
34+
from experiments.plantcad.plugin import PlantCADEvaluationPlugin
35+
from experiments.plantcad.evaluation import resolve_checkpoint_path
3436

3537
logger = logging.getLogger("ray")
3638

3739
# Run iteration
38-
run_number = 12
40+
run_number = 16
3941

4042
# Resources
4143
num_gpus = get_available_gpus(local_only=True)
@@ -45,8 +47,10 @@
4547
learning_rate = 3e-4
4648

4749
# Batch size
48-
# TODO: How do you tune micro/macro batch size instead of global batch?
49-
# This needs to not be a function of device count.
50+
# Ideally global batch would be fixed and device count wouldn't
51+
# matter (w/ grad accum); however, OOMs are unavoidable on GPUs
52+
# unless the global batch varies as a function of device count.
53+
# TODO: find out how to fix global batch on GPUs
5054
micro_batch_size = 256
5155
global_batch_size = micro_batch_size * num_gpus
5256

@@ -56,14 +60,22 @@
5660
# Training configuration
5761
num_train_steps = target_examples // global_batch_size
5862
steps_per_export = num_train_steps // 10
59-
steps_per_eval = num_train_steps // 10
63+
steps_per_cycle = num_train_steps // 10
64+
steps_per_eval = num_train_steps // 100
6065

6166
# Model configuration - use 600M by default
6267
model_size = "600m"
6368
plant_model_config = get_plantcad_config(model_size)
6469

6570
# PlantCAD1 training dataset
6671
plant_data_tokenized = get_plantcad_training_dataset(use_pretokenized=True)
72+
plugin_class = PlantCADEvaluationPlugin
73+
74+
hf_checkpoint_path = (
75+
"hf://plantcad/_dev_marin_plantcad1_v2_train/local_store/checkpoints/plantcad-train-600m-r12-7ea0fc/hf/step-26782"
76+
)
77+
if hf_checkpoint_path is not None:
78+
hf_checkpoint_path = resolve_checkpoint_path(hf_checkpoint_path)
6779

6880
# Training configuration
6981
train_config = SimpleTrainConfig(
@@ -73,27 +85,21 @@
7385
lr_schedule="inv",
7486
warmup=0.05,
7587
decay=0.1,
76-
cycle_length=steps_per_eval,
88+
cycle_length=steps_per_cycle,
7789
train_batch_size=global_batch_size,
7890
per_device_eval_parallelism=micro_batch_size,
7991
steps_per_eval=steps_per_eval,
8092
num_train_steps=num_train_steps,
8193
learning_rate=learning_rate,
8294
steps_per_export=steps_per_export,
83-
eval_plugins=[
84-
{
85-
"plugin": "experiments.plantcad.plugin.PlantCADEvaluationPlugin",
86-
"config": {
87-
"model_config": model_size,
88-
"dataset_config": "10k",
89-
# TODO: I regularly get OOMs with the same per-device batch size on this
90-
# eval even though it only runs on one device? Cut it down by half for now..
91-
"batch_size": micro_batch_size // 2,
92-
"max_samples": 10000,
93-
},
94-
"steps": steps_per_eval,
95-
}
96-
],
95+
initialize_from_hf=hf_checkpoint_path,
96+
# TODO: figure out why this is broken
97+
# eval_plugins=[
98+
# EvalPluginConfig(
99+
# plugin_class=f"{plugin_class.__module__}.{plugin_class.__qualname__}",
100+
# steps=steps_per_cycle,
101+
# )
102+
# ]
97103
)
98104

99105
# Create training step

experiments/plantcad/tests/test_evaluation.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,17 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import os
16+
import numpy as np
1517
import pytest
1618
import jax
1719
import jax.numpy as jnp
1820
import haliax as hax
21+
from datasets import load_dataset
22+
from huggingface_hub import snapshot_download
23+
from levanter.models.llama import LlamaConfig
24+
from levanter.utils.jax_utils import use_cpu_device
25+
from transformers import PretrainedConfig as HfConfig, AutoTokenizer
1926
from experiments.plantcad.evaluation import (
2027
create_alternate_sequences,
2128
compute_sequence_logprob,
@@ -277,6 +284,68 @@ def test_compute_causal_conservation():
277284
)
278285

279286

287+
def test_compute_causal_conservation_accuracy():
288+
"""End-to-end parity test against reference scores.
289+
290+
Reference scores come from https://github.com/Open-Athena/biofoundation/commit/23f6745defdd54cac09b43c066f249789bf74d56
291+
"""
292+
# Download model and dataset
293+
data_path = snapshot_download(
294+
repo_id="plantcad/ci",
295+
repo_type="dataset",
296+
allow_patterns="unit_tests/evolutionary_constraint/ref_logprob_clm_sim/*",
297+
)
298+
ds = load_dataset("plantcad/ci", name="ut_ec_ref_logprob_clm_sim", split="train")
299+
model_dir = os.path.join(data_path, "unit_tests/evolutionary_constraint/ref_logprob_clm_sim/model")
300+
301+
# Load tokenizer and config
302+
hf_config = HfConfig.from_pretrained(model_dir)
303+
config = LlamaConfig.from_hf_config(hf_config)
304+
tokenizer = AutoTokenizer.from_pretrained(model_dir)
305+
306+
# Load sequences and positions
307+
sequences = ds["seq"] if "seq" in ds.column_names else ds["sequence"]
308+
positions = np.asarray(ds["pos"], dtype=np.int32)
309+
tokens_np = np.asarray([tokenizer(s, add_special_tokens=False)["input_ids"] for s in sequences], dtype=np.int32)
310+
tokens = hax.named(jnp.array(tokens_np), ("batch", "position"))
311+
nucleotide_positions = hax.named(jnp.array(positions), ("batch",))
312+
nucleotide_token_ids = [int(tokenizer.convert_tokens_to_ids(nt)) for nt in "ACGT"]
313+
314+
# Load model
315+
converter = config.hf_checkpoint_converter().replaced(reference_checkpoint=model_dir, tokenizer=tokenizer)
316+
with use_cpu_device():
317+
model = converter.load_pretrained(
318+
config.model_type,
319+
ref=model_dir,
320+
resize_vocab_to_match_tokenizer=False,
321+
dtype=jnp.float32,
322+
)
323+
324+
def logit_fn(x):
325+
return model(x)
326+
327+
# Compute conservation scores
328+
actual = compute_causal_conservation(
329+
tokens=tokens,
330+
logit_function=logit_fn,
331+
nucleotide_positions=nucleotide_positions,
332+
nucleotide_token_ids=nucleotide_token_ids,
333+
)
334+
335+
# Compare with expected scores
336+
expected = np.asarray(ds["score"], dtype=np.float32)
337+
our_scores_np = np.asarray(actual.array, dtype=np.float32)
338+
339+
assert len(our_scores_np) == len(expected) == 8
340+
assert jnp.all(jnp.isfinite(actual.array))
341+
assert np.all(np.isfinite(expected))
342+
343+
# Order parity
344+
assert np.array_equal(np.argsort(-expected), np.argsort(-our_scores_np))
345+
# Value parity within tolerance
346+
np.testing.assert_allclose(our_scores_np, expected, rtol=1e-3, atol=1e-3)
347+
348+
280349
def _assert_batch_variants(alt_array, batch_idx, expected_variants, seq_length, batch_name):
281350
"""Helper to assert variant sequences match expected values for a batch."""
282351
for variant_idx in range(4):

experiments/simple_train_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414

1515
import dataclasses
1616
from dataclasses import dataclass
17-
from typing import Any
1817

1918
from levanter.callbacks.watch import WatchConfig
2019
from levanter.optim import OptimizerConfig
2120
from levanter.schedule import IntSchedule
2221

22+
from levanter.eval import EvalPluginConfig
2323
from marin.resources import ResourceConfig, TpuPodConfig
2424

2525

@@ -88,8 +88,8 @@ class SimpleTrainConfig:
8888
watch: WatchConfig = dataclasses.field(default_factory=WatchConfig)
8989
"""Config for watching gradients, parameters, etc. Default is to log norms of gradients and parameters."""
9090

91-
eval_plugins: list[dict[str, Any]] | None = None
92-
"""List of evaluation plugin configs. Each should have 'plugin' (module.class) and 'config' keys."""
91+
eval_plugins: list[EvalPluginConfig] | None = None
92+
"""List of evaluation plugin configs."""
9393

9494
@property
9595
def tpu_type(self) -> str | None:

0 commit comments

Comments (0)