Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .env_example
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ AZURE_OPENAI_GPT4_CHAT_KEY="xxxxx"
AZURE_OPENAI_GPT4_CHAT_MODEL="deployment-name"
AZURE_OPENAI_GPT4_CHAT_UNDERLYING_MODEL=""

AZURE_OPENAI_GPT5_4_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"
AZURE_OPENAI_GPT5_4_KEY="xxxxx"
AZURE_OPENAI_GPT5_4_MODEL="gpt-5.4"
AZURE_OPENAI_GPT5_4_UNDERLYING_MODEL="gpt-5.4"

# Endpoints that host models with fewer safety mechanisms (e.g. via adversarial fine tuning
# or content filters turned off) can be defined below and used in adversarial attack testing scenarios.
AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"
Expand Down
47 changes: 43 additions & 4 deletions build_scripts/evaluate_scorers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@

Usage:
python build_scripts/evaluate_scorers.py
python build_scripts/evaluate_scorers.py --tags refusal
python build_scripts/evaluate_scorers.py --tags refusal,default
"""

import argparse
import asyncio
import sys
import time
Expand All @@ -23,16 +26,21 @@
from pyrit.setup.initializers import ScorerInitializer, TargetInitializer


async def evaluate_scorers() -> None:
async def evaluate_scorers(tags: list[str] | None = None) -> None:
"""
Evaluate multiple scorers against their configured datasets.

This will:
1. Initialize PyRIT with in-memory database
2. Register all scorers from ScorerInitializer into the ScorerRegistry
3. Iterate through all registered scorers
3. Iterate through registered scorers (optionally filtered by tags)
4. Run evaluate_async() on each scorer
5. Save results to scorer_evals directory

Args:
tags: Optional list of tags to filter which scorers to evaluate.
When provided, only scorers matching any of the tags are evaluated.
When None, all scorers are evaluated.
"""
print("Initializing PyRIT...")
target_init = TargetInitializer()
Expand All @@ -43,10 +51,22 @@ async def evaluate_scorers() -> None:
)

registry = ScorerRegistry.get_registry_singleton()
scorer_names = registry.get_names()

# Filter scorers by tags if specified
if tags:
scorer_names: list[str] = []
for tag in tags:
entries = registry.get_by_tag(tag=tag)
scorer_names.extend(entry.name for entry in entries if entry.name not in scorer_names)
scorer_names.sort()
print(f"\nFiltering by tags: {tags}")
else:
scorer_names = registry.get_names()

if not scorer_names:
print("No scorers registered. Check environment variable configuration.")
if tags:
print(f" (filtered by tags: {tags})")
return

print(f"\nEvaluating {len(scorer_names)} scorer(s)...\n")
Expand Down Expand Up @@ -95,21 +115,40 @@ async def evaluate_scorers() -> None:
print("=" * 60)


def parse_args() -> argparse.Namespace:
"""Parse command-line arguments."""
parser = argparse.ArgumentParser(
description="Evaluate PyRIT scorers against human-labeled datasets.",
)
parser.add_argument(
"--tags",
type=str,
default=None,
help="Comma-separated list of tags to filter which scorers to evaluate (e.g., --tags refusal,default)",
)
return parser.parse_args()


if __name__ == "__main__":
    # Parse CLI options, then split the comma-separated --tags value into a
    # list of stripped tag names. tag_list stays None when --tags was omitted,
    # which evaluate_scorers() treats as "evaluate all registered scorers".
    args = parse_args()
    tag_list = [t.strip() for t in args.tags.split(",")] if args.tags else None

    # Up-front banner: warn the user that this is a long-running batch job
    # before any expensive initialization or evaluation starts.
    print("=" * 60)
    print("PyRIT Scorer Evaluation Script")
    print("=" * 60)
    print("This script will evaluate multiple scorers against human-labeled")
    print("datasets. This is a long-running process that may take several")
    print("minutes to hours depending on the number of scorers and datasets.")
    print()
    if tag_list:
        print(f"Filtering by tags: {tag_list}")
    print("Results will be saved to the registry files in:")
    print(f"  {SCORER_EVALS_PATH}")
    print("=" * 60)
    print()

    try:
        # evaluate_scorers is async; asyncio.run owns the event loop for the
        # whole evaluation pass.
        asyncio.run(evaluate_scorers(tags=tag_list))
    except KeyboardInterrupt:
        # Exit non-zero on Ctrl-C so shells/CI see the run as not completed.
        print("\n\nEvaluation interrupted by user.")
        sys.exit(1)
Expand Down
145 changes: 110 additions & 35 deletions doc/code/scoring/8_scorer_metrics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Found 11 scorer configurations in the registry\n",
"Found 10 scorer configurations in the registry\n",
"\n",
"Top 5 configurations by F1 Score:\n",
"--------------------------------------------------------------------------------\n",
Expand All @@ -295,12 +295,12 @@
"\u001b[36m • model_name: gpt-4o\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 84.84%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0185\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8606\u001b[0m\n",
"\u001b[36m • Precision: 0.7928\u001b[0m\n",
"\u001b[32m • Recall: 0.9412\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.27s\u001b[0m\n",
"\u001b[36m • Accuracy: 83.29%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0188\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8472\u001b[0m\n",
"\u001b[36m • Precision: 0.7593\u001b[0m\n",
"\u001b[32m • Recall: 0.9581\u001b[0m\n",
"\u001b[36m • Average Score Time: 0.80s\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
Expand All @@ -314,12 +314,12 @@
"\u001b[36m • model_name: gpt-4o-unsafe\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 79.26%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8259\u001b[0m\n",
"\u001b[36m • Precision: 0.7088\u001b[0m\n",
"\u001b[32m • Recall: 0.9893\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.52s\u001b[0m\n",
"\u001b[36m • Accuracy: 79.24%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8210\u001b[0m\n",
"\u001b[36m • Precision: 0.7041\u001b[0m\n",
"\u001b[32m • Recall: 0.9843\u001b[0m\n",
"\u001b[36m • Average Score Time: 0.99s\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
Expand All @@ -334,12 +334,12 @@
"\u001b[36m • temperature: 0.9\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 78.46%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8204\u001b[0m\n",
"\u001b[36m • Precision: 0.7008\u001b[0m\n",
"\u001b[32m • Recall: 0.9893\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.77s\u001b[0m\n",
"\u001b[36m • Accuracy: 77.72%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n",
"\u001b[36m • F1 Score: 0.8095\u001b[0m\n",
"\u001b[31m • Precision: 0.6900\u001b[0m\n",
"\u001b[32m • Recall: 0.9791\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.36s\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
Expand All @@ -350,12 +350,12 @@
"\u001b[36m • temperature: 0.9\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 78.46%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n",
"\u001b[36m • F1 Score: 0.7582\u001b[0m\n",
"\u001b[36m • Precision: 0.8581\u001b[0m\n",
"\u001b[31m • Recall: 0.6791\u001b[0m\n",
"\u001b[36m • Average Score Time: 2.39s\u001b[0m\n",
"\u001b[36m • Accuracy: 81.27%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0196\u001b[0m\n",
"\u001b[36m • F1 Score: 0.7836\u001b[0m\n",
"\u001b[36m • Precision: 0.8874\u001b[0m\n",
"\u001b[36m • Recall: 0.7016\u001b[0m\n",
"\u001b[36m • Average Score Time: 2.01s\u001b[0m\n",
"\n",
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
Expand All @@ -366,19 +366,19 @@
"\u001b[36m • temperature: 0.9\u001b[0m\n",
"\n",
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
"\u001b[36m • Accuracy: 73.40%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0228\u001b[0m\n",
"\u001b[31m • F1 Score: 0.6732\u001b[0m\n",
"\u001b[36m • Precision: 0.8655\u001b[0m\n",
"\u001b[31m • Recall: 0.5508\u001b[0m\n",
"\u001b[36m • Average Score Time: 2.23s\u001b[0m\n",
"\u001b[36m • Accuracy: 74.18%\u001b[0m\n",
"\u001b[36m • Accuracy Std Error: ±0.0220\u001b[0m\n",
"\u001b[31m • F1 Score: 0.6731\u001b[0m\n",
"\u001b[36m • Precision: 0.8678\u001b[0m\n",
"\u001b[31m • Recall: 0.5497\u001b[0m\n",
"\u001b[36m • Average Score Time: 1.83s\u001b[0m\n",
"\n",
"================================================================================\n",
"Best Accuracy: 84.84%\n",
"Best Accuracy: 83.29%\n",
"Best Precision: 0.989\n",
"Best Recall: 0.989\n",
"Fastest: 0.129 seconds\n",
"Slowest: 3.520 seconds\n"
"Best Recall: 0.984\n",
"Fastest: 0.134 seconds\n",
"Slowest: 2.390 seconds\n"
]
}
],
Expand Down Expand Up @@ -650,6 +650,81 @@
" - For harm scorers: 0.0-1.0 float values\n",
"- `data_type`: Type of content (defaults to \"text\")"
]
},
{
"cell_type": "markdown",
"id": "18",
"metadata": {},
"source": [
"## Batch Evaluation with `evaluate_scorers.py`\n",
"\n",
"While `evaluate_async()` runs evaluations for a single scorer, the `evaluate_scorers.py` script\n",
"evaluates **all registered scorers** in bulk. This is useful for benchmarking after changing scorer\n",
"prompts, adding new variants, or updating human-labeled datasets.\n",
"\n",
"The script initializes PyRIT with `ScorerInitializer` (which registers all configured scorers),\n",
"then runs `evaluate_async()` on each one. Results are saved to the JSONL registry files in\n",
"`pyrit/datasets/scorer_evals/`.\n",
"\n",
"### Basic Usage\n",
"\n",
"```bash\n",
"# Evaluate all registered scorers (long-running — can take hours)\n",
"python build_scripts/evaluate_scorers.py\n",
"\n",
"# Evaluate only scorers with specific tags\n",
"python build_scripts/evaluate_scorers.py --tags refusal\n",
"python build_scripts/evaluate_scorers.py --tags refusal,default\n",
"```\n",
"\n",
"### Tags\n",
"\n",
"`ScorerInitializer` applies tags to scorers during registration. These tags let you target\n",
"specific subsets for evaluation:\n",
"\n",
"- `refusal` — The 4 standalone refusal scorer variants\n",
"- `default` — All scorers registered by default\n",
"- `best_refusal_f1` — The refusal variant with the highest F1 (set dynamically from metrics)\n",
"- `best_objective_f1` — The objective scorer with the highest F1\n",
"\n",
"### Recommended Workflow: Refusal → Dependent Scorers\n",
"\n",
"When refusal scorer prompts or datasets change, the recommended workflow is:\n",
"\n",
"**Step 1: Evaluate refusal scorers first**\n",
"\n",
"```bash\n",
"python build_scripts/evaluate_scorers.py --tags refusal\n",
"```\n",
"\n",
"This evaluates only the 4 refusal variants and writes results to\n",
"`refusal_scorer/refusal_metrics.jsonl`. After this step, `ScorerInitializer` can determine which\n",
"refusal variant has the best F1 and tag it as `best_refusal_f1`.\n",
"\n",
"**Step 2: Re-evaluate all scorers**\n",
"\n",
"```bash\n",
"python build_scripts/evaluate_scorers.py\n",
"```\n",
"\n",
"On the next full run, `ScorerInitializer` reads the refusal metrics from Step 1, picks the best\n",
"refusal variant, and uses it to build dependent scorers (e.g., `TrueFalseInverterScorer` wrapping\n",
"the best refusal scorer). This ensures objective scorers that depend on refusal detection use the\n",
"best-performing refusal prompt.\n",
"\n",
"Scorers whose metrics are already up-to-date (same dataset version, sufficient trials) are\n",
"automatically skipped, so re-running the full script is efficient.\n",
"\n",
"**Step 3: Commit updated metrics**\n",
"\n",
"```bash\n",
"git add pyrit/datasets/scorer_evals/\n",
"git commit -m \"chore: update scorer metrics\"\n",
"```\n",
"\n",
"The updated JSONL files should be checked in so that `ScorerInitializer` can read them at runtime\n",
"to select the best scorers."
]
}
],
"metadata": {
Expand Down
70 changes: 70 additions & 0 deletions doc/code/scoring/8_scorer_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,3 +354,73 @@
# - For objective scorers: 0 or 1 (converted to bool)
# - For harm scorers: 0.0-1.0 float values
# - `data_type`: Type of content (defaults to "text")

# %% [markdown]
# ## Batch Evaluation with `evaluate_scorers.py`
#
# While `evaluate_async()` runs evaluations for a single scorer, the `evaluate_scorers.py` script
# evaluates **all registered scorers** in bulk. This is useful for benchmarking after changing scorer
# prompts, adding new variants, or updating human-labeled datasets.
#
# The script initializes PyRIT with `ScorerInitializer` (which registers all configured scorers),
# then runs `evaluate_async()` on each one. Results are saved to the JSONL registry files in
# `pyrit/datasets/scorer_evals/`.
#
# ### Basic Usage
#
# ```bash
# # Evaluate all registered scorers (long-running — can take hours)
# python build_scripts/evaluate_scorers.py
#
# # Evaluate only scorers with specific tags
# python build_scripts/evaluate_scorers.py --tags refusal
# python build_scripts/evaluate_scorers.py --tags refusal,default
# ```
#
# ### Tags
#
# `ScorerInitializer` applies tags to scorers during registration. These tags let you target
# specific subsets for evaluation:
#
# - `refusal` — The 4 standalone refusal scorer variants
# - `default` — All scorers registered by default
# - `best_refusal_f1` — The refusal variant with the highest F1 (set dynamically from metrics)
# - `best_objective_f1` — The objective scorer with the highest F1
#
# ### Recommended Workflow: Refusal → Dependent Scorers
#
# When refusal scorer prompts or datasets change, the recommended workflow is:
#
# **Step 1: Evaluate refusal scorers first**
#
# ```bash
# python build_scripts/evaluate_scorers.py --tags refusal
# ```
#
# This evaluates only the 4 refusal variants and writes results to
# `refusal_scorer/refusal_metrics.jsonl`. After this step, `ScorerInitializer` can determine which
# refusal variant has the best F1 and tag it as `best_refusal_f1`.
#
# **Step 2: Re-evaluate all scorers**
#
# ```bash
# python build_scripts/evaluate_scorers.py
# ```
#
# On the next full run, `ScorerInitializer` reads the refusal metrics from Step 1, picks the best
# refusal variant, and uses it to build dependent scorers (e.g., `TrueFalseInverterScorer` wrapping
# the best refusal scorer). This ensures objective scorers that depend on refusal detection use the
# best-performing refusal prompt.
#
# Scorers whose metrics are already up-to-date (same dataset version, sufficient trials) are
# automatically skipped, so re-running the full script is efficient.
#
# **Step 3: Commit updated metrics**
#
# ```bash
# git add pyrit/datasets/scorer_evals/
# git commit -m "chore: update scorer metrics"
# ```
#
# The updated JSONL files should be checked in so that `ScorerInitializer` can read them at runtime
# to select the best scorers.
Loading
Loading