Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .env_example
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ AZURE_OPENAI_GPT4_CHAT_KEY="xxxxx"
AZURE_OPENAI_GPT4_CHAT_MODEL="deployment-name"
AZURE_OPENAI_GPT4_CHAT_UNDERLYING_MODEL=""

AZURE_OPENAI_GPT5_4_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"
AZURE_OPENAI_GPT5_4_KEY="xxxxx"
AZURE_OPENAI_GPT5_4_MODEL="gpt-5.4"
AZURE_OPENAI_GPT5_4_UNDERLYING_MODEL="gpt-5.4"

# Endpoints that host models with fewer safety mechanisms (e.g. via adversarial fine tuning
# or content filters turned off) can be defined below and used in adversarial attack testing scenarios.
AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"
Expand Down
47 changes: 43 additions & 4 deletions build_scripts/evaluate_scorers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@

Usage:
python build_scripts/evaluate_scorers.py
python build_scripts/evaluate_scorers.py --tags refusal
python build_scripts/evaluate_scorers.py --tags refusal,default
"""

import argparse
import asyncio
import sys
import time
Expand All @@ -23,16 +26,21 @@
from pyrit.setup.initializers import ScorerInitializer, TargetInitializer


async def evaluate_scorers() -> None:
async def evaluate_scorers(tags: list[str] | None = None) -> None:
"""
Evaluate multiple scorers against their configured datasets.

This will:
1. Initialize PyRIT with in-memory database
2. Register all scorers from ScorerInitializer into the ScorerRegistry
3. Iterate through all registered scorers
3. Iterate through registered scorers (optionally filtered by tags)
4. Run evaluate_async() on each scorer
5. Save results to scorer_evals directory

Args:
tags: Optional list of tags to filter which scorers to evaluate.
When provided, only scorers matching any of the tags are evaluated.
When None, all scorers are evaluated.
"""
print("Initializing PyRIT...")
target_init = TargetInitializer()
Expand All @@ -43,10 +51,22 @@ async def evaluate_scorers() -> None:
)

registry = ScorerRegistry.get_registry_singleton()
scorer_names = registry.get_names()

# Filter scorers by tags if specified
if tags:
scorer_names: list[str] = []
for tag in tags:
entries = registry.get_by_tag(tag=tag)
scorer_names.extend(entry.name for entry in entries if entry.name not in scorer_names)
scorer_names.sort()
print(f"\nFiltering by tags: {tags}")
else:
scorer_names = registry.get_names()

if not scorer_names:
print("No scorers registered. Check environment variable configuration.")
if tags:
print(f" (filtered by tags: {tags})")
return

print(f"\nEvaluating {len(scorer_names)} scorer(s)...\n")
Expand Down Expand Up @@ -95,21 +115,40 @@ async def evaluate_scorers() -> None:
print("=" * 60)


def parse_args() -> argparse.Namespace:
"""Parse command-line arguments."""
parser = argparse.ArgumentParser(
description="Evaluate PyRIT scorers against human-labeled datasets.",
)
parser.add_argument(
"--tags",
type=str,
default=None,
help="Comma-separated list of tags to filter which scorers to evaluate (e.g., --tags refusal,default)",
)
return parser.parse_args()


if __name__ == "__main__":
    # Parse CLI options, then split the comma-separated --tags value into a
    # list of stripped tag names. tag_list stays None when --tags was omitted,
    # which evaluate_scorers() treats as "evaluate all registered scorers".
    args = parse_args()
    tag_list = [t.strip() for t in args.tags.split(",")] if args.tags else None

    # Up-front banner: warn the user that this is a long-running batch job
    # before any expensive initialization or evaluation starts.
    print("=" * 60)
    print("PyRIT Scorer Evaluation Script")
    print("=" * 60)
    print("This script will evaluate multiple scorers against human-labeled")
    print("datasets. This is a long-running process that may take several")
    print("minutes to hours depending on the number of scorers and datasets.")
    print()
    if tag_list:
        print(f"Filtering by tags: {tag_list}")
    print("Results will be saved to the registry files in:")
    print(f"  {SCORER_EVALS_PATH}")
    print("=" * 60)
    print()

    try:
        # evaluate_scorers is async; asyncio.run owns the event loop for the
        # whole evaluation pass.
        asyncio.run(evaluate_scorers(tags=tag_list))
    except KeyboardInterrupt:
        # Exit non-zero on Ctrl-C so shells/CI see the run as not completed.
        print("\n\nEvaluation interrupted by user.")
        sys.exit(1)
Expand Down
145 changes: 110 additions & 35 deletions doc/code/scoring/8_scorer_metrics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Found 11 scorer configurations in the registry\n",
"Found 10 scorer configurations in the registry\n",
"\n",
"Top 5 configurations by F1 Score:\n",
"--------------------------------------------------------------------------------\n",
Expand All @@ -295,12 +295,12 @@
"\u001b[36m • model_name: gpt-4o\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 84.84%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0185\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8606\u001b[0m\n",
"\u001b[36m • Precision: 0.7928\u001b[0m\n",
"\u001b[32m • Recall: 0.9412\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.27s\u001b[0m\n",
"\u001b[36m • Accuracy: 83.29%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0188\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8472\u001b[0m\n",
"\u001b[36m • Precision: 0.7593\u001b[0m\n",
"\u001b[32m • Recall: 0.9581\u001b[0m\n",
"\u001b[36m • Average Score Time: 0.80s\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
Expand All @@ -314,12 +314,12 @@
"\u001b[36m • model_name: gpt-4o-unsafe\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 79.26%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8259\u001b[0m\n",
"\u001b[36m • Precision: 0.7088\u001b[0m\n",
"\u001b[32m • Recall: 0.9893\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.52s\u001b[0m\n",
"\u001b[36m • Accuracy: 79.24%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8210\u001b[0m\n",
"\u001b[36m • Precision: 0.7041\u001b[0m\n",
"\u001b[32m • Recall: 0.9843\u001b[0m\n",
"\u001b[36m • Average Score Time: 0.99s\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
Expand All @@ -334,12 +334,12 @@
"\u001b[36m • temperature: 0.9\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 78.46%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8204\u001b[0m\n",
"\u001b[36m • Precision: 0.7008\u001b[0m\n",
"\u001b[32m • Recall: 0.9893\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.77s\u001b[0m\n",
"\u001b[36m • Accuracy: 77.72%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8095\u001b[0m\n",
"\u001b[31m • Precision: 0.6900\u001b[0m\n",
"\u001b[32m • Recall: 0.9791\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.36s\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
Expand All @@ -350,12 +350,12 @@
"\u001b[36m • temperature: 0.9\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 78.46%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n",
"\u001b[36m • F1 Score: 0.7582\u001b[0m\n",
"\u001b[36m • Precision: 0.8581\u001b[0m\n",
"\u001b[31m • Recall: 0.6791\u001b[0m\n",
"\u001b[36m • Average Score Time: 2.39s\u001b[0m\n",
"\u001b[36m • Accuracy: 81.27%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0196\u001b[0m\n",
"\u001b[36m • F1 Score: 0.7836\u001b[0m\n",
"\u001b[36m • Precision: 0.8874\u001b[0m\n",
"\u001b[36m • Recall: 0.7016\u001b[0m\n",
"\u001b[36m • Average Score Time: 2.01s\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
Expand All @@ -366,19 +366,19 @@
"\u001b[36m • temperature: 0.9\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 73.40%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0228\u001b[0m\n",
"\u001b[31m • F1 Score: 0.6732\u001b[0m\n",
"\u001b[36m • Precision: 0.8655\u001b[0m\n",
"\u001b[31m • Recall: 0.5508\u001b[0m\n",
"\u001b[36m • Average Score Time: 2.23s\u001b[0m\n",
"\u001b[36m • Accuracy: 74.18%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0220\u001b[0m\n",
"\u001b[31m • F1 Score: 0.6731\u001b[0m\n",
"\u001b[36m • Precision: 0.8678\u001b[0m\n",
"\u001b[31m • Recall: 0.5497\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.83s\u001b[0m\n",
"\n",
"================================================================================\n",
"Best Accuracy: 84.84%\n",
"Best Accuracy: 83.29%\n",
"Best Precision: 0.989\n",
"Best Recall: 0.989\n",
"Fastest: 0.129 seconds\n",
"Slowest: 3.520 seconds\n"
"Best Recall: 0.984\n",
"Fastest: 0.134 seconds\n",
"Slowest: 2.390 seconds\n"
]
}
],
Expand Down Expand Up @@ -650,6 +650,81 @@
" - For harm scorers: 0.0-1.0 float values\n",
"- `data_type`: Type of content (defaults to \"text\")"
]
},
{
"cell_type": "markdown",
"id": "18",
"metadata": {},
"source": [
"## Batch Evaluation with `evaluate_scorers.py`\n",
"\n",
"While `evaluate_async()` runs evaluations for a single scorer, the `evaluate_scorers.py` script\n",
"evaluates **all registered scorers** in bulk. This is useful for benchmarking after changing scorer\n",
"prompts, adding new variants, or updating human-labeled datasets.\n",
"\n",
"The script initializes PyRIT with `ScorerInitializer` (which registers all configured scorers),\n",
"then runs `evaluate_async()` on each one. Results are saved to the JSONL registry files in\n",
"`pyrit/datasets/scorer_evals/`.\n",
"\n",
"### Basic Usage\n",
"\n",
"```bash\n",
"# Evaluate all registered scorers (long-running — can take hours)\n",
"python build_scripts/evaluate_scorers.py\n",
"\n",
"# Evaluate only scorers with specific tags\n",
"python build_scripts/evaluate_scorers.py --tags refusal\n",
"python build_scripts/evaluate_scorers.py --tags refusal,default\n",
"```\n",
"\n",
"### Tags\n",
"\n",
"`ScorerInitializer` applies tags to scorers during registration. These tags let you target\n",
"specific subsets for evaluation:\n",
"\n",
"- `refusal` — The 4 standalone refusal scorer variants\n",
"- `default` — All scorers registered by default\n",
"- `best_refusal_f1` — The refusal variant with the highest F1 (set dynamically from metrics)\n",
"- `best_objective_f1` — The objective scorer with the highest F1\n",
"\n",
"### Recommended Workflow: Refusal → Dependent Scorers\n",
"\n",
"When refusal scorer prompts or datasets change, the recommended workflow is:\n",
"\n",
"**Step 1: Evaluate refusal scorers first**\n",
"\n",
"```bash\n",
"python build_scripts/evaluate_scorers.py --tags refusal\n",
"```\n",
"\n",
"This evaluates only the 4 refusal variants and writes results to\n",
"`refusal_scorer/refusal_metrics.jsonl`. After this step, `ScorerInitializer` can determine which\n",
"refusal variant has the best F1 and tag it as `best_refusal_f1`.\n",
"\n",
"**Step 2: Re-evaluate all scorers**\n",
"\n",
"```bash\n",
"python build_scripts/evaluate_scorers.py\n",
"```\n",
"\n",
"On the next full run, `ScorerInitializer` reads the refusal metrics from Step 1, picks the best\n",
"refusal variant, and uses it to build dependent scorers (e.g., `TrueFalseInverterScorer` wrapping\n",
"the best refusal scorer). This ensures objective scorers that depend on refusal detection use the\n",
"best-performing refusal prompt.\n",
"\n",
"Scorers whose metrics are already up-to-date (same dataset version, sufficient trials) are\n",
"automatically skipped, so re-running the full script is efficient.\n",
"\n",
"**Step 3: Commit updated metrics**\n",
"\n",
"```bash\n",
"git add pyrit/datasets/scorer_evals/\n",
"git commit -m \"chore: update scorer metrics\"\n",
"```\n",
"\n",
"The updated JSONL files should be checked in so that `ScorerInitializer` can read them at runtime\n",
"to select the best scorers."
]
}
],
"metadata": {
Expand Down
70 changes: 70 additions & 0 deletions doc/code/scoring/8_scorer_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,3 +354,73 @@
# - For objective scorers: 0 or 1 (converted to bool)
# - For harm scorers: 0.0-1.0 float values
# - `data_type`: Type of content (defaults to "text")

# %% [markdown]
# ## Batch Evaluation with `evaluate_scorers.py`
#
# While `evaluate_async()` runs evaluations for a single scorer, the `evaluate_scorers.py` script
# evaluates **all registered scorers** in bulk. This is useful for benchmarking after changing scorer
# prompts, adding new variants, or updating human-labeled datasets.
#
# The script initializes PyRIT with `ScorerInitializer` (which registers all configured scorers),
# then runs `evaluate_async()` on each one. Results are saved to the JSONL registry files in
# `pyrit/datasets/scorer_evals/`.
#
# ### Basic Usage
#
# ```bash
# # Evaluate all registered scorers (long-running — can take hours)
# python build_scripts/evaluate_scorers.py
#
# # Evaluate only scorers with specific tags
# python build_scripts/evaluate_scorers.py --tags refusal
# python build_scripts/evaluate_scorers.py --tags refusal,default
# ```
#
# ### Tags
#
# `ScorerInitializer` applies tags to scorers during registration. These tags let you target
# specific subsets for evaluation:
#
# - `refusal` — The 4 standalone refusal scorer variants
# - `default` — All scorers registered by default
# - `best_refusal_f1` — The refusal variant with the highest F1 (set dynamically from metrics)
# - `best_objective_f1` — The objective scorer with the highest F1
#
# ### Recommended Workflow: Refusal → Dependent Scorers
#
# When refusal scorer prompts or datasets change, the recommended workflow is:
#
# **Step 1: Evaluate refusal scorers first**
#
# ```bash
# python build_scripts/evaluate_scorers.py --tags refusal
# ```
#
# This evaluates only the 4 refusal variants and writes results to
# `refusal_scorer/refusal_metrics.jsonl`. After this step, `ScorerInitializer` can determine which
# refusal variant has the best F1 and tag it as `best_refusal_f1`.
#
# **Step 2: Re-evaluate all scorers**
#
# ```bash
# python build_scripts/evaluate_scorers.py
# ```
#
# On the next full run, `ScorerInitializer` reads the refusal metrics from Step 1, picks the best
# refusal variant, and uses it to build dependent scorers (e.g., `TrueFalseInverterScorer` wrapping
# the best refusal scorer). This ensures objective scorers that depend on refusal detection use the
# best-performing refusal prompt.
#
# Scorers whose metrics are already up-to-date (same dataset version, sufficient trials) are
# automatically skipped, so re-running the full script is efficient.
#
# **Step 3: Commit updated metrics**
#
# ```bash
# git add pyrit/datasets/scorer_evals/
# git commit -m "chore: update scorer metrics"
# ```
#
# The updated JSONL files should be checked in so that `ScorerInitializer` can read them at runtime
# to select the best scorers.
Loading
Loading