Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -174,3 +174,4 @@ werewolves_swarm
.claude
tensorboard_log
tutorial/**/*.json
node_modules
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ repos:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
exclude: ^tutorial/example_deep_finance/

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Excluding the entire tutorial/example_deep_finance/ directory from check-yaml is a bit broad. This could lead to new, valid YAML files in this directory being ignored by the linter in the future. It's better to be more specific and only exclude the file that contains template syntax.

        exclude: ^tutorial/example_deep_finance/deep_finance\.yaml$

- id: check-added-large-files
- id: check-ast
- id: check-json
Expand Down
15 changes: 8 additions & 7 deletions tutorial/example_deep_finance/deep_finance_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def _load(path, key):
_load(train_ref_ans_path, "train")
_load(val_ref_ans_path, "val")

def _get_reference_data(self, task_id: str) -> Tuple[str, str]:
def _get_reference_data(self, task_id: str) -> Tuple[str, str | None]:
"""获取任务的参考答案和领域"""
cache_key = "val" if task_id.startswith("val_") else "train"
ans = DeepFinanceJudgeByOpenJudge._ref_answers_cache.get(cache_key, {}).get(task_id, "")
Expand Down Expand Up @@ -301,8 +301,8 @@ def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowO

# 1. 提取输入数据
history = metadata.get("conversation_history", [])
query = metadata.get("query") or getattr(workflow_task.task, "main_query", "")
task_id = metadata.get("task_id") or getattr(workflow_task.task, "task_id", "")
query: str = metadata.get("query") or getattr(workflow_task.task, "main_query", "")
task_id: str = metadata.get("task_id") or getattr(workflow_task.task, "task_id", "")
rubrics = metadata.get("rubrics") # 可能是 None 或 list of dicts
step_reward = metadata.get("reward_stats", {}).get("step_reward", 0.0)
chat_date = metadata.get("chat_date") if metadata else datetime.now().strftime("%Y-%m-%d")
Expand All @@ -318,7 +318,7 @@ def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowO
# RM Gallery 耗时记录
rm_start_time = time.time()
if self._rm_enabled and self.rm_evaluator:
rm_raw = self._evaluate_with_rm_gallery(query, assistants[-1] if assistants else "", ref_ans, task_id, domain)
rm_raw = self._evaluate_with_rm_gallery(query, assistants[-1] if assistants else "", ref_ans, task_id, domain or "")
else:
rm_raw = 0.0
rm_time = time.time() - rm_start_time
Expand Down Expand Up @@ -788,19 +788,20 @@ def _save_evaluation_log(self, task_id: str, grader_results: Dict[str, List[Any]
保存 OpenJudge 评估日志(可选)
"""
try:
grader_results_log: Dict[str, List[Dict[str, Any]]] = {}
log = {
"task_id": task_id,
"query": query,
"timestamp": datetime.now().isoformat(),
"grader_results": {}
"grader_results": grader_results_log
}

# 简化 grader_results 以便序列化
for grader_name, score_list in grader_results.items():
log["grader_results"][grader_name] = []
grader_results_log[grader_name] = []
for score in score_list:
if hasattr(score, "score"):
log["grader_results"][grader_name].append({
grader_results_log[grader_name].append({
"score": score.score,
"reason": score.reason[:200] if hasattr(score, "reason") else "",
})
Expand Down
2 changes: 1 addition & 1 deletion tutorial/example_deep_finance/judge/cgcv/json_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def validate_cgcv_schema(obj: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]],
# 验证 status
if normalized["status"] not in VALID_STATUSES:
# 尝试模糊匹配
status_lower = normalized["status"]
status_lower: str = normalized["status"]
matched = False
for valid_status in VALID_STATUSES:
if valid_status in status_lower or status_lower in valid_status:
Expand Down
46 changes: 42 additions & 4 deletions tutorial/opencode_build_openclaw_agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,19 @@ In a new terminal:

```bash
cd tutorial/opencode_build_openclaw_agent

# Option 1: Use OpenJudge pointwise grading (default)
export AJET_SWARM_URL="http://localhost:10086"
export NUM_REPEAT=4
export REWARD_MODE=pointwise
export DASHSCOPE_API_KEY=your_api_key_here
python fake_vllm_endpoint.py

# Option 2: Use OpenJudge listwise ranking
export AJET_SWARM_URL="http://localhost:10086"
export NUM_REPEAT=4
export REWARD_MODE=listwise
export DASHSCOPE_API_KEY=your_api_key_here
python fake_vllm_endpoint.py
```

Expand Down Expand Up @@ -113,13 +124,40 @@ Key parameters in `fake_vllm_endpoint.py`:
- `num_repeat=4` - GRPO N parameter (responses per query)
- `model` - Base model path

Environment variables for reward computation:

- `REWARD_MODE` - Reward computation mode: `pointwise` (default) or `listwise`
- `DASHSCOPE_API_KEY` - API key for OpenJudge LLM grader
- `JUDGE_BASE_URL` - Base URL for judge model API (default: DashScope)
- `JUDGE_MODEL` - Judge model name (default: `qwen-plus`)

## Reward Function

The `ExtraversionGrader` evaluates responses on a 1-10 scale:
- 1 = Highly introverted (reserved, quiet)
- 10 = Highly extraverted (energetic, enthusiastic)
Two OpenJudge-based reward modes are available:

### 1. Pointwise Mode (Default)

Scores are normalized to [-1, 1] for GRPO training.
Uses OpenJudge LLM grader to evaluate each response independently:
- Evaluates extraversion traits on 1-10 scale
- Provides detailed reasoning for each score
- Scores normalized to [-1, 1] for GRPO training

```bash
export REWARD_MODE=pointwise
export DASHSCOPE_API_KEY=your_api_key_here
```

### 2. Listwise Mode

Uses OpenJudge to rank all responses together:
- Compares responses directly against each other
- Produces relative rankings
- Best for capturing subtle differences

```bash
export REWARD_MODE=listwise
export DASHSCOPE_API_KEY=your_api_key_here
```

## Monitoring

Expand Down
138 changes: 113 additions & 25 deletions tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,129 @@
# -*- coding: utf-8 -*-
"""Compute relative rewards based on extraversion personality alignment."""
"""Compute relative rewards based on extraversion personality alignment using OpenJudge."""

import os
from typing import List, Dict
from beast_logger import print_listofdict
from openjudge.graders.base_grader import GraderMode, GraderScore, GraderRank
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models import OpenAIChatModel

def score_extraversion(response_text: str) -> float:
"""Score response for extraversion traits (1-10 scale)."""
extraversion_keywords = [
'excited', 'love', 'amazing', 'awesome', 'fantastic', 'great',
'wonderful', 'thrilled', 'energetic', 'enthusiastic', 'fun',
'social', 'outgoing', 'active', 'lively', 'vibrant', 'happy',
'enjoy', 'delighted', 'cheerful', 'positive'
]
# Configuration
REWARD_MODE = os.getenv("REWARD_MODE", "pointwise") # Options: pointwise, listwise
API_KEY = os.getenv("DASHSCOPE_API_KEY", "sk-xxx")
BASE_URL = os.getenv("JUDGE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")
JUDGE_MODEL = os.getenv("JUDGE_MODEL", "qwen-plus")

text_lower = response_text.lower()
score = 5.0
# OpenJudge grader setup
judge_model = OpenAIChatModel(
model=JUDGE_MODEL,
api_key=API_KEY,
base_url=BASE_URL,
)

for keyword in extraversion_keywords:
if keyword in text_lower:
score += 0.5
EXTRAVERSION_PROMPT = """You are evaluating responses for extraversion personality traits.

score += min(response_text.count('!') * 0.3, 2.0)
Extraversion characteristics include:
- Outgoing, energetic, enthusiastic tone
- Social engagement and excitement
- Positive, upbeat language
- Action-oriented expressions
- Use of exclamation marks and emotional words

if len(response_text) < 50:
score -= 1.0
Rate the response on a scale of 0.0-1.0:
0.0 = Highly introverted (reserved, quiet, minimal emotion)
1.0 = Highly extraverted (energetic, enthusiastic, very expressive)

return max(1.0, min(10.0, score))
Question: {question}
Response: {response}

async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]:
"""Compute relative rewards for extraversion alignment."""
Return a json object with exactly two fields:
- "score": float between 0.0 and 1.0
- "reason": brief explanation"""

def build_listwise_template(n: int) -> str:
    """Construct the listwise ranking prompt for *n* candidate answers.

    ``{question}`` and ``{answer_i}`` are left as literal placeholders so the
    grader can substitute them later; only the numbered answer slots are
    generated here.
    """
    numbered_slots = [f"{idx}. {{answer_{idx}}}" for idx in range(1, n + 1)]
    header = (
        "You are ranking multiple responses based on extraversion personality traits.\n"
        "\n"
        "Extraversion characteristics include:\n"
        "- Outgoing, energetic, enthusiastic tone\n"
        "- Social engagement and excitement\n"
        "- Positive, upbeat language\n"
        "- Action-oriented expressions\n"
        "\n"
        "Question: {question}\n"
        "\n"
        "Responses to rank:\n"
    )
    footer = (
        "\n\nRank these responses from most extraverted to least extraverted.\n"
        "Return a json object with exactly two fields:\n"
        '- "rank": list of integers (1-indexed) ordered from most to least extraverted, e.g. [2, 1, 3]\n'
        '- "reason": brief explanation of the ranking'
    )
    return header + "\n".join(numbered_slots) + footer

# Module-level pointwise grader: evaluates each response independently with
# the configured judge model, using EXTRAVERSION_PROMPT as the rubric.
# Built once at import time and reused across calls.
pointwise_grader = LLMGrader(
    name="extraversion_pointwise",
    mode=GraderMode.POINTWISE,
    description="Evaluate extraversion traits",
    model=judge_model,
    template=EXTRAVERSION_PROMPT,
)


async def compute_pointwise_rewards(question: str, all_answers: List[Dict]) -> List[float]:
    """Compute rewards using OpenJudge pointwise grading.

    Each answer is graded independently against the extraversion rubric.
    Fix: removes leftover merge-artifact lines that still called the deleted
    ``score_extraversion`` helper and appended a second score per answer.

    Args:
        question: The prompt the answers respond to.
        all_answers: List of dicts; each answer's text is read from "content".

    Returns:
        One score per answer. Also writes the score back into each answer
        dict under the "reward" key (side effect used by the caller).
    """
    scores: List[float] = []
    for answer in all_answers:
        content = answer.get("content", "")
        result = await pointwise_grader.aevaluate(question=question, response=content)
        if isinstance(result, GraderScore):
            # score is already normalized 0-1 by OpenJudge
            score = result.score
        else:
            # Defensive fallback: an unexpected result type yields a neutral 0.0
            # instead of aborting the whole batch.
            score = 0.0
        scores.append(score)
        answer["reward"] = score
    return scores


async def compute_listwise_rewards(question: str, all_answers: List[Dict]) -> List[float]:
    """Compute rewards using OpenJudge listwise ranking.

    Builds an n-slot ranking prompt, asks the judge model for a best-to-worst
    ordering, then maps each rank position linearly onto [0.0, 1.0].

    Side effect: writes each reward back into its answer dict under "reward".
    """
    n = len(all_answers)
    ranking_grader = LLMGrader(
        name="extraversion_listwise",
        mode=GraderMode.LISTWISE,
        description="Rank responses by extraversion",
        model=judge_model,
        # Template is rebuilt per call because it depends on the batch size n.
        template=build_listwise_template(n),
    )
    # Fill answer_1 .. answer_n placeholders plus the shared question.
    fill_args = {f"answer_{pos + 1}": item.get("content", "") for pos, item in enumerate(all_answers)}
    fill_args["question"] = question

    outcome = await ranking_grader.aevaluate(**fill_args)

    scores = [0.0] * n
    if isinstance(outcome, GraderRank):
        # outcome.rank lists 1-indexed answer numbers from best to worst;
        # best -> 1.0, worst -> 0.0 (single answer gets the midpoint 0.5).
        for position, one_based in enumerate(outcome.rank):
            scores[one_based - 1] = 1.0 - (position / (n - 1)) if n > 1 else 0.5

    for item, reward in zip(all_answers, scores):
        item["reward"] = reward
    return scores


async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]:
    """Compute relative rewards for extraversion alignment.

    Dispatches to listwise or pointwise scoring based on REWARD_MODE (read
    from the environment at import time). Falls back to an empty question
    when no valid results are available.

    Fix: removes the duplicated ``print_listofdict`` call left over from the
    diff (old untagged header line alongside the new mode-tagged one); only
    the mode-tagged log line is kept.
    """
    question = valid_results[0].get("question", "") if valid_results else ""

    if REWARD_MODE == "listwise":
        scores = await compute_listwise_rewards(question, all_answers)
    else:  # pointwise (default)
        scores = await compute_pointwise_rewards(question, all_answers)

    print_listofdict(all_answers, header=f"on_compute_relative_reward (mode={REWARD_MODE})")
    return scores
93 changes: 93 additions & 0 deletions tutorial/opencode_build_openclaw_agent/test_reward.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""Test script for on_compute_relative_reward.py using real OpenJudge API."""

import asyncio
import sys
import os

sys.path.insert(0, os.path.dirname(__file__))
os.environ["DASHSCOPE_API_KEY"] = os.getenv("DASHSCOPE_API_KEY", "sk-xxx")


async def test_pointwise():
    """Test pointwise reward mode with real API."""
    print("\n=== Testing Pointwise Mode (real API) ===")
    os.environ["REWARD_MODE"] = "pointwise"

    import importlib
    import on_compute_relative_reward as mod
    importlib.reload(mod)  # re-read REWARD_MODE, which is bound at import time

    query_batch = [{"question": "What are your thoughts on Paris?"}]
    candidates = [
        {"content": text}
        for text in (
            "I'm so excited about Paris! It's amazing and wonderful!",
            "Paris is a city in France.",
            "I absolutely love Paris! The energy is fantastic and vibrant!",
        )
    ]

    try:
        scores = await mod.on_compute_relative_reward(query_batch, candidates)
        print(f"Scores: {scores}")
        assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}"
        assert all(isinstance(s, float) for s in scores), "All scores should be floats"
        # extraverted responses should score higher than neutral
        assert scores[0] > scores[1], f"Extraverted response should score higher than neutral: {scores}"
        assert scores[2] > scores[1], f"Extraverted response should score higher than neutral: {scores}"
        print("✓ Pointwise mode test passed")
        return True
    except Exception as e:
        print(f"✗ Pointwise mode test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


async def test_listwise():
    """Test listwise reward mode with real API."""
    print("\n=== Testing Listwise Mode (real API) ===")
    os.environ["REWARD_MODE"] = "listwise"

    import importlib
    import on_compute_relative_reward as mod
    importlib.reload(mod)  # re-read REWARD_MODE, which is bound at import time

    query_batch = [{"question": "What are your thoughts on Paris?"}]
    candidates = [
        {"content": text}
        for text in (
            "I'm so excited about Paris! It's amazing and wonderful!",
            "Paris is a city in France.",
            "I absolutely love Paris! The energy is fantastic and vibrant!",
        )
    ]

    try:
        scores = await mod.on_compute_relative_reward(query_batch, candidates)
        print(f"Scores: {scores}")
        assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}"
        assert all(isinstance(s, float) for s in scores), "All scores should be floats"
        # neutral response should score lowest
        assert scores[1] < scores[0] or scores[1] < scores[2], \
            f"Neutral response should score lower than at least one extraverted response: {scores}"
        print("✓ Listwise mode test passed")
        return True
    except Exception as e:
        print(f"✗ Listwise mode test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


async def main():
    """Run both reward-mode tests sequentially and report a summary."""
    print("Testing on_compute_relative_reward.py (real API)")
    print("=" * 50)

    # Evaluation order is left-to-right: pointwise first, then listwise.
    outcomes = [await test_pointwise(), await test_listwise()]

    print("\n" + "=" * 50)
    print(f"Tests passed: {sum(outcomes)}/{len(outcomes)}")
    return all(outcomes)


if __name__ == "__main__":
success = asyncio.run(main())
sys.exit(0 if success else 1)
Loading