From 6ce1492b35d2c5042752e953019dfe84beadd7ea Mon Sep 17 00:00:00 2001 From: "qingxu.fu" Date: Thu, 12 Mar 2026 16:42:04 +0800 Subject: [PATCH 1/2] deep-fin-pre-commit-patch --- .pre-commit-config.yaml | 1 + .../example_deep_finance/deep_finance_judge.py | 15 ++++++++------- .../example_deep_finance/judge/cgcv/json_utils.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b97f95..15cebb4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,6 +5,7 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml + exclude: ^tutorial/example_deep_finance/ - id: check-added-large-files - id: check-ast - id: check-json diff --git a/tutorial/example_deep_finance/deep_finance_judge.py b/tutorial/example_deep_finance/deep_finance_judge.py index 8a8e354..071eccf 100644 --- a/tutorial/example_deep_finance/deep_finance_judge.py +++ b/tutorial/example_deep_finance/deep_finance_judge.py @@ -200,7 +200,7 @@ def _load(path, key): _load(train_ref_ans_path, "train") _load(val_ref_ans_path, "val") - def _get_reference_data(self, task_id: str) -> Tuple[str, str]: + def _get_reference_data(self, task_id: str) -> Tuple[str, str | None]: """获取任务的参考答案和领域""" cache_key = "val" if task_id.startswith("val_") else "train" ans = DeepFinanceJudgeByOpenJudge._ref_answers_cache.get(cache_key, {}).get(task_id, "") @@ -301,8 +301,8 @@ def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowO # 1. 
提取输入数据 history = metadata.get("conversation_history", []) - query = metadata.get("query") or getattr(workflow_task.task, "main_query", "") - task_id = metadata.get("task_id") or getattr(workflow_task.task, "task_id", "") + query: str = metadata.get("query") or getattr(workflow_task.task, "main_query", "") + task_id: str = metadata.get("task_id") or getattr(workflow_task.task, "task_id", "") rubrics = metadata.get("rubrics") # 可能是 None 或 list of dicts step_reward = metadata.get("reward_stats", {}).get("step_reward", 0.0) chat_date = metadata.get("chat_date") if metadata else datetime.now().strftime("%Y-%m-%d") @@ -318,7 +318,7 @@ def compute_reward(self, workflow_task: WorkflowTask, workflow_output: WorkflowO # RM Gallery 耗时记录 rm_start_time = time.time() if self._rm_enabled and self.rm_evaluator: - rm_raw = self._evaluate_with_rm_gallery(query, assistants[-1] if assistants else "", ref_ans, task_id, domain) + rm_raw = self._evaluate_with_rm_gallery(query, assistants[-1] if assistants else "", ref_ans, task_id, domain or "") else: rm_raw = 0.0 rm_time = time.time() - rm_start_time @@ -788,19 +788,20 @@ def _save_evaluation_log(self, task_id: str, grader_results: Dict[str, List[Any] 保存 OpenJudge 评估日志(可选) """ try: + grader_results_log: Dict[str, List[Dict[str, Any]]] = {} log = { "task_id": task_id, "query": query, "timestamp": datetime.now().isoformat(), - "grader_results": {} + "grader_results": grader_results_log } # 简化 grader_results 以便序列化 for grader_name, score_list in grader_results.items(): - log["grader_results"][grader_name] = [] + grader_results_log[grader_name] = [] for score in score_list: if hasattr(score, "score"): - log["grader_results"][grader_name].append({ + grader_results_log[grader_name].append({ "score": score.score, "reason": score.reason[:200] if hasattr(score, "reason") else "", }) diff --git a/tutorial/example_deep_finance/judge/cgcv/json_utils.py b/tutorial/example_deep_finance/judge/cgcv/json_utils.py index 7301401..fe6c810 100644 --- 
a/tutorial/example_deep_finance/judge/cgcv/json_utils.py +++ b/tutorial/example_deep_finance/judge/cgcv/json_utils.py @@ -299,7 +299,7 @@ def validate_cgcv_schema(obj: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], # 验证 status if normalized["status"] not in VALID_STATUSES: # 尝试模糊匹配 - status_lower = normalized["status"] + status_lower: str = normalized["status"] matched = False for valid_status in VALID_STATUSES: if valid_status in status_lower or status_lower in valid_status: From 9707fafcd362ea68c28cab45781045558a7edc54 Mon Sep 17 00:00:00 2001 From: "qingxu.fu" Date: Fri, 13 Mar 2026 16:04:29 +0800 Subject: [PATCH 2/2] revise openclaw training --- .gitignore | 1 + .../opencode_build_openclaw_agent/README.md | 46 +++++- .../on_compute_relative_reward.py | 138 ++++++++++++++---- .../test_reward.py | 93 ++++++++++++ 4 files changed, 249 insertions(+), 29 deletions(-) create mode 100644 tutorial/opencode_build_openclaw_agent/test_reward.py diff --git a/.gitignore b/.gitignore index db79fdf..00da513 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,4 @@ werewolves_swarm .claude tensorboard_log tutorial/**/*.json +node_modules diff --git a/tutorial/opencode_build_openclaw_agent/README.md b/tutorial/opencode_build_openclaw_agent/README.md index 5c69a53..eefb51d 100644 --- a/tutorial/opencode_build_openclaw_agent/README.md +++ b/tutorial/opencode_build_openclaw_agent/README.md @@ -75,8 +75,19 @@ In a new terminal: ```bash cd tutorial/opencode_build_openclaw_agent + +# Option 1: Use OpenJudge pointwise grading (default) +export AJET_SWARM_URL="http://localhost:10086" +export NUM_REPEAT=4 +export REWARD_MODE=pointwise +export DASHSCOPE_API_KEY=your_api_key_here +python fake_vllm_endpoint.py + +# Option 2: Use OpenJudge listwise ranking export AJET_SWARM_URL="http://localhost:10086" export NUM_REPEAT=4 +export REWARD_MODE=listwise +export DASHSCOPE_API_KEY=your_api_key_here python fake_vllm_endpoint.py ``` @@ -113,13 +124,40 @@ Key parameters in 
`fake_vllm_endpoint.py`: - `num_repeat=4` - GRPO N parameter (responses per query) - `model` - Base model path +Environment variables for reward computation: + +- `REWARD_MODE` - Reward computation mode: `pointwise` (default) or `listwise` +- `DASHSCOPE_API_KEY` - API key for OpenJudge LLM grader +- `JUDGE_BASE_URL` - Base URL for judge model API (default: DashScope) +- `JUDGE_MODEL` - Judge model name (default: `qwen-plus`) + ## Reward Function -The `ExtraversionGrader` evaluates responses on a 1-10 scale: -- 1 = Highly introverted (reserved, quiet) -- 10 = Highly extraverted (energetic, enthusiastic) +Two OpenJudge-based reward modes are available: + +### 1. Pointwise Mode (Default) -Scores are normalized to [-1, 1] for GRPO training. +Uses OpenJudge LLM grader to evaluate each response independently: +- Evaluates extraversion traits on a 0.0-1.0 scale +- Provides detailed reasoning for each score +- Scores are used directly as rewards in [0, 1] for GRPO training + +```bash +export REWARD_MODE=pointwise +export DASHSCOPE_API_KEY=your_api_key_here +``` + +### 2. 
Listwise Mode + +Uses OpenJudge to rank all responses together: +- Compares responses directly against each other +- Produces relative rankings +- Best for capturing subtle differences + +```bash +export REWARD_MODE=listwise +export DASHSCOPE_API_KEY=your_api_key_here +``` ## Monitoring diff --git a/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py b/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py index ea7c164..5bafd2f 100644 --- a/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py +++ b/tutorial/opencode_build_openclaw_agent/on_compute_relative_reward.py @@ -1,41 +1,129 @@ # -*- coding: utf-8 -*- -"""Compute relative rewards based on extraversion personality alignment.""" +"""Compute relative rewards based on extraversion personality alignment using OpenJudge.""" +import os from typing import List, Dict from beast_logger import print_listofdict +from openjudge.graders.base_grader import GraderMode, GraderScore, GraderRank +from openjudge.graders.llm_grader import LLMGrader +from openjudge.models import OpenAIChatModel -def score_extraversion(response_text: str) -> float: - """Score response for extraversion traits (1-10 scale).""" - extraversion_keywords = [ - 'excited', 'love', 'amazing', 'awesome', 'fantastic', 'great', - 'wonderful', 'thrilled', 'energetic', 'enthusiastic', 'fun', - 'social', 'outgoing', 'active', 'lively', 'vibrant', 'happy', - 'enjoy', 'delighted', 'cheerful', 'positive' - ] +# Configuration +REWARD_MODE = os.getenv("REWARD_MODE", "pointwise") # Options: pointwise, listwise +API_KEY = os.getenv("DASHSCOPE_API_KEY", "sk-xxx") +BASE_URL = os.getenv("JUDGE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1") +JUDGE_MODEL = os.getenv("JUDGE_MODEL", "qwen-plus") - text_lower = response_text.lower() - score = 5.0 +# OpenJudge grader setup +judge_model = OpenAIChatModel( + model=JUDGE_MODEL, + api_key=API_KEY, + base_url=BASE_URL, +) - for keyword in extraversion_keywords: - if 
keyword in text_lower: - score += 0.5 +EXTRAVERSION_PROMPT = """You are evaluating responses for extraversion personality traits. - score += min(response_text.count('!') * 0.3, 2.0) +Extraversion characteristics include: +- Outgoing, energetic, enthusiastic tone +- Social engagement and excitement +- Positive, upbeat language +- Action-oriented expressions +- Use of exclamation marks and emotional words - if len(response_text) < 50: - score -= 1.0 +Rate the response on a scale of 0.0-1.0: +0.0 = Highly introverted (reserved, quiet, minimal emotion) +1.0 = Highly extraverted (energetic, enthusiastic, very expressive) - return max(1.0, min(10.0, score)) +Question: {question} +Response: {response} -async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]: - """Compute relative rewards for extraversion alignment.""" +Return a json object with exactly two fields: +- "score": float between 0.0 and 1.0 +- "reason": brief explanation""" + +def build_listwise_template(n: int) -> str: + """Build a listwise prompt template for n responses.""" + answers_block = "\n".join([f"{i+1}. {{answer_{i+1}}}" for i in range(n)]) + return f"""You are ranking multiple responses based on extraversion personality traits. + +Extraversion characteristics include: +- Outgoing, energetic, enthusiastic tone +- Social engagement and excitement +- Positive, upbeat language +- Action-oriented expressions + +Question: {{question}} + +Responses to rank: +{answers_block} + +Rank these responses from most extraverted to least extraverted. +Return a json object with exactly two fields: +- "rank": list of integers (1-indexed) ordered from most to least extraverted, e.g. 
[2, 1, 3] +- "reason": brief explanation of the ranking""" + +pointwise_grader = LLMGrader( + name="extraversion_pointwise", + mode=GraderMode.POINTWISE, + description="Evaluate extraversion traits", + model=judge_model, + template=EXTRAVERSION_PROMPT, +) + + +async def compute_pointwise_rewards(question: str, all_answers: List[Dict]) -> List[float]: + """Compute rewards using OpenJudge pointwise grading.""" scores = [] for answer in all_answers: content = answer.get("content", "") - raw_score = score_extraversion(content) - normalized = (raw_score - 5.5) / 4.5 - scores.append(normalized) - answer["reward"] = normalized + result = await pointwise_grader.aevaluate(question=question, response=content) + if isinstance(result, GraderScore): + # score is already normalized 0-1 by OpenJudge + score = result.score + else: + score = 0.0 + scores.append(score) + answer["reward"] = score + return scores + + +async def compute_listwise_rewards(question: str, all_answers: List[Dict]) -> List[float]: + """Compute rewards using OpenJudge listwise ranking.""" + n = len(all_answers) + template = build_listwise_template(n) + grader = LLMGrader( + name="extraversion_listwise", + mode=GraderMode.LISTWISE, + description="Rank responses by extraversion", + model=judge_model, + template=template, + ) + kwargs = {"question": question} + for i, ans in enumerate(all_answers): + kwargs[f"answer_{i+1}"] = ans.get("content", "") + + result = await grader.aevaluate(**kwargs) + + scores = [0.0] * n + if isinstance(result, GraderRank): + # rank is a list of 1-indexed positions ordered best to worst + # convert to reward: rank 1 (best) -> 1.0, rank n (worst) -> 0.0 + for position, idx in enumerate(result.rank): + scores[idx - 1] = 1.0 - (position / (n - 1)) if n > 1 else 0.5 + + for answer, score in zip(all_answers, scores): + answer["reward"] = score + return scores + + +async def on_compute_relative_reward(valid_results: List, all_answers: List[Dict]) -> List[float]: + """Compute relative 
rewards for extraversion alignment.""" + question = valid_results[0].get("question", "") if valid_results else "" + + if REWARD_MODE == "listwise": + scores = await compute_listwise_rewards(question, all_answers) + else: # pointwise (default) + scores = await compute_pointwise_rewards(question, all_answers) - print_listofdict(all_answers, header="on_compute_relative_reward") + print_listofdict(all_answers, header=f"on_compute_relative_reward (mode={REWARD_MODE})") return scores diff --git a/tutorial/opencode_build_openclaw_agent/test_reward.py b/tutorial/opencode_build_openclaw_agent/test_reward.py new file mode 100644 index 0000000..a731b25 --- /dev/null +++ b/tutorial/opencode_build_openclaw_agent/test_reward.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +"""Test script for on_compute_relative_reward.py using real OpenJudge API.""" + +import asyncio +import sys +import os + +sys.path.insert(0, os.path.dirname(__file__)) +os.environ["DASHSCOPE_API_KEY"] = os.getenv("DASHSCOPE_API_KEY", "sk-xxx") + + +async def test_pointwise(): + """Test pointwise reward mode with real API.""" + print("\n=== Testing Pointwise Mode (real API) ===") + os.environ["REWARD_MODE"] = "pointwise" + + import importlib + import on_compute_relative_reward as mod + importlib.reload(mod) + + valid_results = [{"question": "What are your thoughts on Paris?"}] + all_answers = [ + {"content": "I'm so excited about Paris! It's amazing and wonderful!"}, + {"content": "Paris is a city in France."}, + {"content": "I absolutely love Paris! 
The energy is fantastic and vibrant!"}, + ] + + try: + scores = await mod.on_compute_relative_reward(valid_results, all_answers) + print(f"Scores: {scores}") + assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}" + assert all(isinstance(s, float) for s in scores), "All scores should be floats" + # extraverted responses should score higher than neutral + assert scores[0] > scores[1], f"Extraverted response should score higher than neutral: {scores}" + assert scores[2] > scores[1], f"Extraverted response should score higher than neutral: {scores}" + print("✓ Pointwise mode test passed") + return True + except Exception as e: + print(f"✗ Pointwise mode test failed: {e}") + import traceback + traceback.print_exc() + return False + + +async def test_listwise(): + """Test listwise reward mode with real API.""" + print("\n=== Testing Listwise Mode (real API) ===") + os.environ["REWARD_MODE"] = "listwise" + + import importlib + import on_compute_relative_reward as mod + importlib.reload(mod) + + valid_results = [{"question": "What are your thoughts on Paris?"}] + all_answers = [ + {"content": "I'm so excited about Paris! It's amazing and wonderful!"}, + {"content": "Paris is a city in France."}, + {"content": "I absolutely love Paris! 
The energy is fantastic and vibrant!"}, + ] + + try: + scores = await mod.on_compute_relative_reward(valid_results, all_answers) + print(f"Scores: {scores}") + assert len(scores) == 3, f"Expected 3 scores, got {len(scores)}" + assert all(isinstance(s, float) for s in scores), "All scores should be floats" + # neutral response should score lowest + assert scores[1] < scores[0] or scores[1] < scores[2], \ + f"Neutral response should score lower than at least one extraverted response: {scores}" + print("✓ Listwise mode test passed") + return True + except Exception as e: + print(f"✗ Listwise mode test failed: {e}") + import traceback + traceback.print_exc() + return False + + +async def main(): + print("Testing on_compute_relative_reward.py (real API)") + print("=" * 50) + + results = [] + results.append(await test_pointwise()) + results.append(await test_listwise()) + + print("\n" + "=" * 50) + print(f"Tests passed: {sum(results)}/{len(results)}") + return all(results) + + +if __name__ == "__main__": + success = asyncio.run(main()) + sys.exit(0 if success else 1)